firecrawl-4.12.0-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (92)
  1. firecrawl/__init__.py +87 -0
  2. firecrawl/__tests__/e2e/v2/aio/conftest.py +62 -0
  3. firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +69 -0
  4. firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
  5. firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +39 -0
  6. firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +41 -0
  7. firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +138 -0
  8. firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +249 -0
  9. firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +42 -0
  10. firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
  11. firecrawl/__tests__/e2e/v2/conftest.py +73 -0
  12. firecrawl/__tests__/e2e/v2/test_async.py +73 -0
  13. firecrawl/__tests__/e2e/v2/test_batch_scrape.py +106 -0
  14. firecrawl/__tests__/e2e/v2/test_crawl.py +278 -0
  15. firecrawl/__tests__/e2e/v2/test_extract.py +55 -0
  16. firecrawl/__tests__/e2e/v2/test_map.py +61 -0
  17. firecrawl/__tests__/e2e/v2/test_scrape.py +191 -0
  18. firecrawl/__tests__/e2e/v2/test_search.py +270 -0
  19. firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
  20. firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
  21. firecrawl/__tests__/unit/test_recursive_schema_v1.py +1209 -0
  22. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
  23. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +79 -0
  24. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
  25. firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +20 -0
  26. firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
  27. firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +64 -0
  28. firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
  29. firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
  30. firecrawl/__tests__/unit/v2/methods/test_agent.py +367 -0
  31. firecrawl/__tests__/unit/v2/methods/test_agent_request_preparation.py +226 -0
  32. firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
  33. firecrawl/__tests__/unit/v2/methods/test_branding.py +214 -0
  34. firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
  35. firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
  36. firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
  37. firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +54 -0
  38. firecrawl/__tests__/unit/v2/methods/test_pagination.py +671 -0
  39. firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +109 -0
  40. firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +169 -0
  41. firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
  42. firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
  43. firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
  44. firecrawl/__tests__/unit/v2/utils/test_metadata_extras.py +94 -0
  45. firecrawl/__tests__/unit/v2/utils/test_metadata_extras_multivalue.py +22 -0
  46. firecrawl/__tests__/unit/v2/utils/test_recursive_schema.py +1133 -0
  47. firecrawl/__tests__/unit/v2/utils/test_validation.py +311 -0
  48. firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
  49. firecrawl/client.py +281 -0
  50. firecrawl/firecrawl.backup.py +4635 -0
  51. firecrawl/types.py +167 -0
  52. firecrawl/v1/__init__.py +14 -0
  53. firecrawl/v1/client.py +5164 -0
  54. firecrawl/v2/__init__.py +4 -0
  55. firecrawl/v2/client.py +967 -0
  56. firecrawl/v2/client_async.py +408 -0
  57. firecrawl/v2/methods/agent.py +144 -0
  58. firecrawl/v2/methods/aio/__init__.py +1 -0
  59. firecrawl/v2/methods/aio/agent.py +137 -0
  60. firecrawl/v2/methods/aio/batch.py +188 -0
  61. firecrawl/v2/methods/aio/crawl.py +351 -0
  62. firecrawl/v2/methods/aio/extract.py +133 -0
  63. firecrawl/v2/methods/aio/map.py +65 -0
  64. firecrawl/v2/methods/aio/scrape.py +33 -0
  65. firecrawl/v2/methods/aio/search.py +176 -0
  66. firecrawl/v2/methods/aio/usage.py +89 -0
  67. firecrawl/v2/methods/batch.py +499 -0
  68. firecrawl/v2/methods/crawl.py +592 -0
  69. firecrawl/v2/methods/extract.py +161 -0
  70. firecrawl/v2/methods/map.py +83 -0
  71. firecrawl/v2/methods/scrape.py +64 -0
  72. firecrawl/v2/methods/search.py +215 -0
  73. firecrawl/v2/methods/usage.py +84 -0
  74. firecrawl/v2/types.py +1143 -0
  75. firecrawl/v2/utils/__init__.py +9 -0
  76. firecrawl/v2/utils/error_handler.py +107 -0
  77. firecrawl/v2/utils/get_version.py +15 -0
  78. firecrawl/v2/utils/http_client.py +178 -0
  79. firecrawl/v2/utils/http_client_async.py +69 -0
  80. firecrawl/v2/utils/normalize.py +125 -0
  81. firecrawl/v2/utils/validation.py +692 -0
  82. firecrawl/v2/watcher.py +301 -0
  83. firecrawl/v2/watcher_async.py +243 -0
  84. firecrawl-4.12.0.dist-info/METADATA +234 -0
  85. firecrawl-4.12.0.dist-info/RECORD +92 -0
  86. firecrawl-4.12.0.dist-info/WHEEL +5 -0
  87. firecrawl-4.12.0.dist-info/licenses/LICENSE +21 -0
  88. firecrawl-4.12.0.dist-info/top_level.txt +2 -0
  89. tests/test_agent_integration.py +277 -0
  90. tests/test_api_key_handling.py +44 -0
  91. tests/test_change_tracking.py +98 -0
  92. tests/test_timeout_conversion.py +117 -0
firecrawl/__tests__/e2e/v2/test_search.py
@@ -0,0 +1,270 @@
+ from firecrawl import Firecrawl
+ import os
+ from dotenv import load_dotenv
+ from firecrawl.types import SearchData, Document, ScrapeOptions, SearchResultWeb, SearchResultNews, SearchResultImages
+
+ load_dotenv()
+
+ firecrawl = Firecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+
+ def _collect_texts(entries):
+     texts = []
+     for r in entries or []:
+         title = getattr(r, 'title', None) if hasattr(r, 'title') else None
+         desc = getattr(r, 'description', None) if hasattr(r, 'description') else None
+         if title:
+             texts.append(str(title).lower())
+         if desc:
+             texts.append(str(desc).lower())
+     return texts
+
+ def _is_document(entry) -> bool:
+     try:
+         from firecrawl.v2.types import Document
+         return isinstance(entry, Document) or \
+             hasattr(entry, 'markdown') or \
+             hasattr(entry, 'html') or \
+             hasattr(entry, 'raw_html') or \
+             hasattr(entry, 'json') or \
+             hasattr(entry, 'screenshot') or \
+             hasattr(entry, 'change_tracking') or \
+             hasattr(entry, 'summary')
+     except Exception:
+         return hasattr(entry, 'markdown') or \
+             hasattr(entry, 'html') or \
+             hasattr(entry, 'raw_html') or \
+             hasattr(entry, 'json') or \
+             hasattr(entry, 'screenshot') or \
+             hasattr(entry, 'change_tracking') or \
+             hasattr(entry, 'summary')
+
+ def test_search_minimal_request():
+     results = firecrawl.search(
+         query="What is the capital of France?"
+     )
+
+     assert isinstance(results, SearchData)
+     assert hasattr(results, 'web')
+     assert results.web is not None
+     assert len(results.web) > 0
+     assert hasattr(results, 'news')
+     assert results.news is None
+     assert hasattr(results, 'images')
+     assert results.images is None
+
+     for result in results.web:
+         assert isinstance(result, SearchResultWeb)
+         assert hasattr(result, 'url')
+         assert hasattr(result, 'title')
+         assert hasattr(result, 'description')
+         assert result.url.startswith('http')
+         assert result.title is not None
+         assert result.description is not None
+
+     all_text = ' '.join(_collect_texts(results.web))
+
+     assert 'paris' in all_text
+
+     assert results.news is None
+     assert results.images is None
+
+
+ def test_search_with_sources():
+     """Test search with specific sources."""
+     results = firecrawl.search(
+         query="firecrawl",
+         sources=["web", "news", "images"],
+         limit=3
+     )
+
+     assert isinstance(results, SearchData)
+
+     assert results.web is not None
+     assert len(results.web) <= 3
+     assert isinstance(results.web[0], SearchResultWeb)
+
+     if results.news is not None:
+         assert len(results.news) <= 3
+         assert isinstance(results.news[0], SearchResultNews)
+
+     if results.images is not None:
+         assert len(results.images) <= 3
+         assert isinstance(results.images[0], SearchResultImages)
+
+     web_titles = [result.title.lower() for result in results.web]
+     web_descriptions = [result.description.lower() for result in results.web]
+     all_web_text = ' '.join(web_titles + web_descriptions)
+
+     assert 'firecrawl' in all_web_text
+
+ def test_search_result_structure():
+     """Test that SearchResult objects have the correct structure."""
+     results = firecrawl.search(
+         query="test query",
+         limit=1
+     )
+
+     if results.web and len(results.web) > 0:
+         result = results.web[0]
+
+         assert hasattr(result, 'url')
+         assert hasattr(result, 'title')
+         assert hasattr(result, 'description')
+
+         assert isinstance(result.url, str)
+         assert isinstance(result.title, str) or result.title is None
+         assert isinstance(result.description, str) or result.description is None
+
+         # Test URL format
+         assert result.url.startswith('http')
+
+ def test_search_all_parameters():
+     """Test search with all available parameters (comprehensive e2e test)."""
+     from firecrawl.types import ScrapeOptions, JsonFormat, Location, WaitAction
+
+     # Define a schema for JSON extraction
+     schema = {
+         "type": "object",
+         "properties": {
+             "title": {"type": "string"},
+             "description": {"type": "string"},
+             "url": {"type": "string"}
+         },
+         "required": ["title", "description"]
+     }
+
+     results = firecrawl.search(
+         query="artificial intelligence",
+         sources=[
+             {"type": "web"},
+             {"type": "news"}
+         ],
+         limit=3,
+         tbs="qdr:m",  # Last month
+         location="US",
+         ignore_invalid_urls=True,
+         timeout=60000,
+         integration="_e2e-test",
+         scrape_options=ScrapeOptions(
+             formats=[
+                 "markdown",
+                 "html",
+                 {
+                     "type": "json",
+                     "prompt": "Extract the title and description from the page",
+                     "schema": schema
+                 },
+                 {"type": "summary"}
+             ],
+             headers={"User-Agent": "Firecrawl-Test/1.0"},
+             include_tags=["h1", "h2", "p"],
+             exclude_tags=["nav", "footer"],
+             only_main_content=True,
+             wait_for=2000,
+             mobile=False,
+             skip_tls_verification=False,
+             remove_base64_images=True,
+             block_ads=True,
+             proxy="basic",
+             max_age=3600000,  # 1 hour cache
+             store_in_cache=True,
+             location=Location(
+                 country="US",
+                 languages=["en"]
+             ),
+             actions=[
+                 WaitAction(milliseconds=1000)
+             ]
+             # Note: raw_html and screenshot_full_page are not supported by v2 API yet
+         )
+     )
+
+     # Test structure
+     assert isinstance(results, SearchData)
+     assert hasattr(results, 'web')
+     assert hasattr(results, 'news')
+     assert hasattr(results, 'images')
+
+     # Test that web results exist
+     assert results.web is not None
+     assert len(results.web) <= 3  # Should respect limit
+
+     # Test that results contain expected content for non-document entries only
+     non_doc_entries = [r for r in (results.web or []) if not _is_document(r)]
+     if non_doc_entries:
+         all_web_text = ' '.join(_collect_texts(non_doc_entries))
+         ai_terms = ['artificial', 'intelligence', 'ai', 'machine', 'learning']
+         assert any(term in all_web_text for term in ai_terms)
+
+     # Test that each result has proper structure
+     for result in results.web:
+         assert isinstance(result, (SearchResultWeb, Document))
+         if isinstance(result, Document):
+             # Document path: ensure content present
+             assert (result.markdown is not None) or (result.html is not None)
+         else:
+             # LinkResult path
+             assert hasattr(result, 'url')
+             assert isinstance(result.url, str) and result.url.startswith('http')
+
+     # Test that news results exist (if API supports it)
+     if results.news is not None:
+         assert len(results.news) <= 3
+         for result in results.news:
+             assert isinstance(result, (SearchResultNews, Document))
+             if isinstance(result, Document):
+                 assert (result.markdown is not None) or (result.html is not None)
+             else:
+                 assert hasattr(result, 'url')
+                 assert isinstance(result.url, str) and result.url.startswith('http')
+
+     # Test that unspecified sources are None
+     assert results.images is None
+
+
+ def test_search_formats_flexibility():
+     """Test that both list and ScrapeFormats work for formats."""
+     from firecrawl.types import ScrapeFormats
+
+     # Test with list format
+     results1 = firecrawl.search(
+         query="python programming",
+         limit=1,
+         scrape_options=ScrapeOptions(
+             formats=["markdown"]
+         )
+     )
+
+     # Test with ScrapeFormats object
+     results2 = firecrawl.search(
+         query="python programming",
+         limit=1,
+         scrape_options=ScrapeOptions(
+             formats=ScrapeFormats(markdown=True)
+         )
+     )
+
+     # Both should work without errors
+     assert isinstance(results1, SearchData)
+     assert isinstance(results2, SearchData)
+     assert results1.web is not None
+     assert results2.web is not None
+
+ def test_search_with_json_format_object():
+     """Search with scrape_options including a JSON format object (prompt + schema)."""
+     json_schema = {
+         "type": "object",
+         "properties": {
+             "title": {"type": "string"}
+         },
+         "required": ["title"],
+     }
+     results = firecrawl.search(
+         query="site:docs.firecrawl.dev",
+         limit=1,
+         scrape_options=ScrapeOptions(
+             formats=[{"type": "json", "prompt": "Extract page title", "schema": json_schema}]
+         ),
+     )
+     assert isinstance(results, SearchData)
+     assert results.web is not None and len(results.web) >= 0
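
Taken together, the search tests above reduce to a small call pattern. The sketch below condenses it, assuming the same API_KEY/API_URL environment variables the test module reads; every name in it is taken from the tests themselves, not from any additional API surface.

import os
from firecrawl import Firecrawl

# Client construction as in the test module; both values come from the environment.
client = Firecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))

# Only `query` is required; the result is a SearchData whose `web` list
# holds SearchResultWeb entries with url/title/description attributes.
results = client.search(query="What is the capital of France?", limit=3)
for hit in results.web or []:
    print(hit.url, hit.title)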
firecrawl/__tests__/e2e/v2/test_usage.py
@@ -0,0 +1,26 @@
+ import os
+ from dotenv import load_dotenv
+ from firecrawl import Firecrawl
+
+ load_dotenv()
+
+
+ class TestUsageE2E:
+     def setup_method(self):
+         # Environment is exported by conftest at import time
+         self.client = Firecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+
+     def test_get_concurrency(self):
+         resp = self.client.get_concurrency()
+         # Shape assertions (endpoint not live yet, but types are defined)
+         assert hasattr(resp, "concurrency")
+         assert hasattr(resp, "max_concurrency")
+
+     def test_get_credit_usage(self):
+         resp = self.client.get_credit_usage()
+         assert hasattr(resp, "remaining_credits")
+
+     def test_get_token_usage(self):
+         resp = self.client.get_token_usage()
+         assert hasattr(resp, "remaining_tokens")
+
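
As a quick reference, a sketch of the three account-usage calls exercised above, under the same environment assumptions; the attribute names are exactly the ones the tests assert on (note the test comment saying the concurrency endpoint is not live yet).

import os
from firecrawl import Firecrawl

client = Firecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))

# Attribute names mirror the assertions in TestUsageE2E.
concurrency = client.get_concurrency()
print(concurrency.concurrency, concurrency.max_concurrency)

credits = client.get_credit_usage()
print(credits.remaining_credits)

tokens = client.get_token_usage()
print(tokens.remaining_tokens)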
firecrawl/__tests__/e2e/v2/test_watcher.py
@@ -0,0 +1,65 @@
+ import os
+ import time
+ from dotenv import load_dotenv
+ from firecrawl import Firecrawl
+
+ load_dotenv()
+
+ if not os.getenv("API_KEY"):
+     raise ValueError("API_KEY is not set")
+
+ if not os.getenv("API_URL"):
+     raise ValueError("API_URL is not set")
+
+
+ class TestWatcherE2E:
+     def setup_method(self):
+         from firecrawl import Firecrawl
+         self.client = Firecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
+
+     def test_crawl_watcher(self):
+         # Start a small crawl job
+         start_job = self.client.start_crawl("https://docs.firecrawl.dev", limit=2)
+         job_id = start_job.id
+
+         statuses = []
+         w = self.client.watcher(job_id, kind="crawl", poll_interval=1, timeout=120)
+         w.add_listener(lambda s: statuses.append(s.status))
+         w.start()
+
+         # Wait for terminal state up to 180 seconds
+         deadline = time.time() + 180
+         while time.time() < deadline:
+             if statuses and statuses[-1] in ["completed", "failed"]:
+                 break
+             time.sleep(1)
+
+         w.stop()
+
+         assert len(statuses) > 0
+         assert statuses[-1] in ["completed", "failed"]
+
+     def test_batch_watcher(self):
+         urls = [
+             "https://docs.firecrawl.dev",
+             "https://firecrawl.dev",
+         ]
+         start_resp = self.client.start_batch_scrape(urls, formats=["markdown"], max_concurrency=1)
+         job_id = start_resp.id
+
+         statuses = []
+         w = self.client.watcher(job_id, kind="batch", poll_interval=1, timeout=180)
+         w.add_listener(lambda s: statuses.append(s.status))
+         w.start()
+
+         deadline = time.time() + 240
+         while time.time() < deadline:
+             if statuses and statuses[-1] in ["completed", "failed", "cancelled"]:
+                 break
+             time.sleep(1)
+
+         w.stop()
+
+         assert len(statuses) > 0
+         assert statuses[-1] in ["completed", "failed", "cancelled"]
+
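
The watcher flow in test_crawl_watcher condenses to the sketch below: start a job, attach a status listener, and poll until a terminal state. It assumes the same API_KEY/API_URL environment variables; all methods and keyword arguments are the ones the test itself uses.

import os
import time
from firecrawl import Firecrawl

client = Firecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))

# Start a small crawl, then watch it; the listener receives status snapshots.
job = client.start_crawl("https://docs.firecrawl.dev", limit=2)
statuses = []
watcher = client.watcher(job.id, kind="crawl", poll_interval=1, timeout=120)
watcher.add_listener(lambda snapshot: statuses.append(snapshot.status))
watcher.start()

# Poll until the job reaches a terminal state (the test allows up to 180 s).
deadline = time.time() + 180
while time.time() < deadline:
    if statuses and statuses[-1] in ("completed", "failed"):
        break
    time.sleep(1)
watcher.stop()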