firecrawl-py 3.3.1__py3-none-any.whl → 3.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of firecrawl-py might be problematic.
- firecrawl/__init__.py +1 -1
- firecrawl/__tests__/e2e/v2/test_scrape.py +37 -1
- firecrawl/client.py +8 -4
- firecrawl/v2/types.py +19 -2
- {firecrawl_py-3.3.1.dist-info → firecrawl_py-3.3.3.dist-info}/METADATA +7 -3
- firecrawl_py-3.3.3.dist-info/RECORD +79 -0
- {firecrawl_py-3.3.1.dist-info → firecrawl_py-3.3.3.dist-info}/WHEEL +1 -1
- {firecrawl_py-3.3.1.dist-info → firecrawl_py-3.3.3.dist-info/licenses}/LICENSE +0 -0
- {firecrawl_py-3.3.1.dist-info → firecrawl_py-3.3.3.dist-info}/top_level.txt +0 -2
- build/lib/firecrawl/__init__.py +0 -87
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +0 -79
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +0 -188
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +0 -38
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +0 -40
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +0 -137
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +0 -248
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +0 -35
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +0 -43
- build/lib/firecrawl/__tests__/e2e/v2/conftest.py +0 -73
- build/lib/firecrawl/__tests__/e2e/v2/test_async.py +0 -73
- build/lib/firecrawl/__tests__/e2e/v2/test_batch_scrape.py +0 -105
- build/lib/firecrawl/__tests__/e2e/v2/test_crawl.py +0 -276
- build/lib/firecrawl/__tests__/e2e/v2/test_extract.py +0 -54
- build/lib/firecrawl/__tests__/e2e/v2/test_map.py +0 -60
- build/lib/firecrawl/__tests__/e2e/v2/test_scrape.py +0 -154
- build/lib/firecrawl/__tests__/e2e/v2/test_search.py +0 -269
- build/lib/firecrawl/__tests__/e2e/v2/test_usage.py +0 -26
- build/lib/firecrawl/__tests__/e2e/v2/test_watcher.py +0 -65
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +0 -12
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +0 -61
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +0 -12
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +0 -19
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +0 -50
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +0 -63
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +0 -28
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +0 -117
- build/lib/firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +0 -90
- build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +0 -70
- build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +0 -240
- build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +0 -107
- build/lib/firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +0 -53
- build/lib/firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +0 -92
- build/lib/firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +0 -167
- build/lib/firecrawl/__tests__/unit/v2/methods/test_search_validation.py +0 -236
- build/lib/firecrawl/__tests__/unit/v2/methods/test_usage_types.py +0 -18
- build/lib/firecrawl/__tests__/unit/v2/methods/test_webhook.py +0 -123
- build/lib/firecrawl/__tests__/unit/v2/utils/test_validation.py +0 -290
- build/lib/firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +0 -332
- build/lib/firecrawl/client.py +0 -242
- build/lib/firecrawl/firecrawl.backup.py +0 -4635
- build/lib/firecrawl/types.py +0 -161
- build/lib/firecrawl/v1/__init__.py +0 -14
- build/lib/firecrawl/v1/client.py +0 -4653
- build/lib/firecrawl/v2/__init__.py +0 -4
- build/lib/firecrawl/v2/client.py +0 -805
- build/lib/firecrawl/v2/client_async.py +0 -250
- build/lib/firecrawl/v2/methods/aio/__init__.py +0 -1
- build/lib/firecrawl/v2/methods/aio/batch.py +0 -85
- build/lib/firecrawl/v2/methods/aio/crawl.py +0 -171
- build/lib/firecrawl/v2/methods/aio/extract.py +0 -126
- build/lib/firecrawl/v2/methods/aio/map.py +0 -59
- build/lib/firecrawl/v2/methods/aio/scrape.py +0 -33
- build/lib/firecrawl/v2/methods/aio/search.py +0 -172
- build/lib/firecrawl/v2/methods/aio/usage.py +0 -42
- build/lib/firecrawl/v2/methods/batch.py +0 -417
- build/lib/firecrawl/v2/methods/crawl.py +0 -469
- build/lib/firecrawl/v2/methods/extract.py +0 -131
- build/lib/firecrawl/v2/methods/map.py +0 -77
- build/lib/firecrawl/v2/methods/scrape.py +0 -64
- build/lib/firecrawl/v2/methods/search.py +0 -197
- build/lib/firecrawl/v2/methods/usage.py +0 -41
- build/lib/firecrawl/v2/types.py +0 -665
- build/lib/firecrawl/v2/utils/__init__.py +0 -9
- build/lib/firecrawl/v2/utils/error_handler.py +0 -107
- build/lib/firecrawl/v2/utils/get_version.py +0 -15
- build/lib/firecrawl/v2/utils/http_client.py +0 -153
- build/lib/firecrawl/v2/utils/http_client_async.py +0 -65
- build/lib/firecrawl/v2/utils/normalize.py +0 -107
- build/lib/firecrawl/v2/utils/validation.py +0 -324
- build/lib/firecrawl/v2/watcher.py +0 -301
- build/lib/firecrawl/v2/watcher_async.py +0 -242
- build/lib/tests/test_change_tracking.py +0 -98
- build/lib/tests/test_timeout_conversion.py +0 -117
- firecrawl_py-3.3.1.dist-info/RECORD +0 -153
firecrawl/__init__.py
CHANGED

-__version__ = "3.3.1"
+__version__ = "3.3.3"

firecrawl/__tests__/e2e/v2/test_scrape.py
CHANGED
@@ -151,4 +151,40 @@ class TestScrapeE2E:
             max_age=0,
             store_in_cache=False,
         )
-        assert isinstance(doc, Document)
+        assert isinstance(doc, Document)
+
+    def test_scrape_images_format(self):
+        """Test images format extraction."""
+        doc = self.client.scrape(
+            "https://firecrawl.dev",
+            formats=["images"]
+        )
+        assert isinstance(doc, Document)
+        assert doc.images is not None
+        assert isinstance(doc.images, list)
+        assert len(doc.images) > 0
+        # Should find firecrawl logo/branding images
+        assert any("firecrawl" in img.lower() or "logo" in img.lower() for img in doc.images)
+
+    def test_scrape_images_with_multiple_formats(self):
+        """Test images format works with other formats."""
+        doc = self.client.scrape(
+            "https://github.com",
+            formats=["markdown", "links", "images"]
+        )
+        assert isinstance(doc, Document)
+        assert doc.markdown is not None
+        assert doc.links is not None
+        assert doc.images is not None
+        assert isinstance(doc.images, list)
+        assert len(doc.images) > 0
+
+        # Images should find content not available in links format
+        image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.ico']
+        link_images = [
+            link for link in (doc.links or [])
+            if any(ext in link.lower() for ext in image_extensions)
+        ]
+
+        # Should discover additional images beyond those with obvious extensions
+        assert len(doc.images) >= len(link_images)
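The hunk above adds coverage for the new "images" scrape format, which populates Document.images with image URLs found on the page. A minimal usage sketch based on these tests (the API key is a placeholder):

    from firecrawl import Firecrawl

    client = Firecrawl(api_key="fc-YOUR-API-KEY")
    doc = client.scrape("https://firecrawl.dev", formats=["images"])
    for img in doc.images or []:
        print(img)  # image URLs discovered on the page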
firecrawl/client.py
CHANGED
@@ -56,7 +56,6 @@ class V2Proxy:
         self._client = client_instance

         if client_instance:
-            # self.scrape = client_instance.scrape
             self.search = client_instance.search
             self.crawl = client_instance.crawl
             self.get_crawl_status = client_instance.get_crawl_status

@@ -168,14 +167,17 @@ class Firecrawl:
         self.v1 = V1Proxy(self._v1_client) if self._v1_client else None
         self.v2 = V2Proxy(self._v2_client)

-
         self.scrape = self._v2_client.scrape
+        self.search = self._v2_client.search
+        self.map = self._v2_client.map
+
         self.crawl = self._v2_client.crawl
         self.start_crawl = self._v2_client.start_crawl
         self.crawl_params_preview = self._v2_client.crawl_params_preview
         self.get_crawl_status = self._v2_client.get_crawl_status
         self.cancel_crawl = self._v2_client.cancel_crawl
         self.get_crawl_errors = self._v2_client.get_crawl_errors
+        self.get_active_crawls = self._v2_client.get_active_crawls
         self.active_crawls = self._v2_client.active_crawls

         self.start_batch_scrape = self._v2_client.start_batch_scrape

@@ -183,13 +185,15 @@ class Firecrawl:
         self.cancel_batch_scrape = self._v2_client.cancel_batch_scrape
         self.batch_scrape = self._v2_client.batch_scrape
         self.get_batch_scrape_errors = self._v2_client.get_batch_scrape_errors
+
+        self.start_extract = self._v2_client.start_extract
         self.get_extract_status = self._v2_client.get_extract_status
-        self.map = self._v2_client.map
-        self.search = self._v2_client.search
         self.extract = self._v2_client.extract
+
         self.get_concurrency = self._v2_client.get_concurrency
         self.get_credit_usage = self._v2_client.get_credit_usage
         self.get_token_usage = self._v2_client.get_token_usage
+
         self.watcher = self._v2_client.watcher

 class AsyncFirecrawl:
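Net effect of these client.py hunks: search, map, start_extract, and get_active_crawls are now bound directly on the top-level Firecrawl object, alongside scrape and the crawl/batch methods. A short sketch of the flattened surface (the key is a placeholder, and the argument shapes follow the v2 client):

    from firecrawl import Firecrawl

    client = Firecrawl(api_key="fc-YOUR-API-KEY")
    results = client.search("firecrawl sdk")          # previously reachable via client.v2.search
    links = client.map("https://docs.firecrawl.dev")  # now bound in __init__ next to scrape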
firecrawl/v2/types.py
CHANGED
@@ -114,6 +114,12 @@ class DocumentMetadata(BaseModel):
     def coerce_status_code_to_int(cls, v):
         return cls._coerce_string_to_int(v)

+class AttributeResult(BaseModel):
+    """Result of attribute extraction."""
+    selector: str
+    attribute: str
+    values: List[str]
+
 class Document(BaseModel):
     """A scraped document."""
     markdown: Optional[str] = None

@@ -123,6 +129,7 @@ class Document(BaseModel):
     summary: Optional[str] = None
     metadata: Optional[DocumentMetadata] = None
     links: Optional[List[str]] = None
+    images: Optional[List[str]] = None
     screenshot: Optional[str] = None
     actions: Optional[Dict[str, Any]] = None
     warning: Optional[str] = None

@@ -182,7 +189,7 @@ CategoryOption = Union[str, Category]

 FormatString = Literal[
     # camelCase versions (API format)
-    "markdown", "html", "rawHtml", "links", "screenshot", "summary", "changeTracking", "json",
+    "markdown", "html", "rawHtml", "links", "images", "screenshot", "summary", "changeTracking", "json", "attributes",
     # snake_case versions (user-friendly)
     "raw_html", "change_tracking"
 ]

@@ -214,9 +221,18 @@ class ScreenshotFormat(BaseModel):
     full_page: Optional[bool] = None
     quality: Optional[int] = None
     viewport: Optional[Union[Dict[str, int], Viewport]] = None
+
+class AttributeSelector(BaseModel):
+    """Selector and attribute pair for attribute extraction."""
+    selector: str
+    attribute: str

-
+class AttributesFormat(Format):
+    """Configuration for attribute extraction."""
+    type: Literal["attributes"] = "attributes"
+    selectors: List[AttributeSelector]

+FormatOption = Union[Dict[str, Any], FormatString, JsonFormat, ChangeTrackingFormat, ScreenshotFormat, AttributesFormat, Format]
 # Scrape types
 class ScrapeFormats(BaseModel):
     """Output formats for scraping."""

@@ -226,6 +242,7 @@ class ScrapeFormats(BaseModel):
     raw_html: bool = False
     summary: bool = False
     links: bool = False
+    images: bool = False
     screenshot: bool = False
     change_tracking: bool = False
     json: bool = False
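Together with the new AttributeResult, AttributeSelector, and AttributesFormat models, the "attributes" format lets a scrape request pull raw HTML attribute values by CSS selector. A minimal sketch using the types added above (the URL and selectors are illustrative; passing the format object in a formats list relies on the widened FormatOption union):

    from firecrawl.v2.types import AttributeSelector, AttributesFormat

    attrs = AttributesFormat(
        selectors=[
            AttributeSelector(selector="a.docs-link", attribute="href"),
            AttributeSelector(selector="img", attribute="alt"),
        ]
    )
    # e.g. client.scrape("https://firecrawl.dev", formats=["markdown", attrs])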
{firecrawl_py-3.3.1.dist-info → firecrawl_py-3.3.3.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: firecrawl-py
-Version: 3.3.1
+Version: 3.3.3
 Summary: Python SDK for Firecrawl API
 Home-page: https://github.com/firecrawl/firecrawl
 Author: Mendable.ai

@@ -38,8 +38,12 @@ Requires-Dist: httpx
 Requires-Dist: python-dotenv
 Requires-Dist: websockets
 Requires-Dist: nest-asyncio
-Requires-Dist: pydantic
+Requires-Dist: pydantic>=2.0
 Requires-Dist: aiohttp
+Dynamic: author
+Dynamic: home-page
+Dynamic: license-file
+Dynamic: requires-python

 # Firecrawl Python SDK

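The metadata now pins pydantic to v2 (the SDK's models are pydantic BaseModel subclasses, and the pin suggests they target pydantic 2 APIs) and records the Dynamic fields emitted by newer setuptools. To confirm what resolves after upgrading, a quick check (commands are illustrative):

    pip install --upgrade firecrawl-py
    python -c "import pydantic, firecrawl; print(pydantic.VERSION, firecrawl.__version__)"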
firecrawl_py-3.3.3.dist-info/RECORD
ADDED

@@ -0,0 +1,79 @@
+firecrawl/__init__.py,sha256=1MYT5_7-p8sfruL_5y1m1n9AoWG_6aNduWGW4NId86M,2192
+firecrawl/client.py,sha256=tp3mUo_3aGPuZ53kpU4bhM-5EtwD_IUWrJ7wm0GMuCc,11159
+firecrawl/firecrawl.backup.py,sha256=v1FEN3jR4g5Aupg4xp6SLkuFvYMQuUKND2YELbYjE6c,200430
+firecrawl/types.py,sha256=W9N2pqQuevEIIjYHN9rbDf31E-nwdCECqIn11Foz2T8,2836
+firecrawl/__tests__/e2e/v2/conftest.py,sha256=I28TUpN5j0-9gM79NlbrDS8Jlsheao657od2f-2xK0Y,2587
+firecrawl/__tests__/e2e/v2/test_async.py,sha256=ZXpf1FVOJgNclITglrxIyFwP4cOiqzWLicGaxIm70BQ,2526
+firecrawl/__tests__/e2e/v2/test_batch_scrape.py,sha256=H9GtuwHIFdOQ958SOVThi_kvDDxcXAK_ECRh95ogonQ,3265
+firecrawl/__tests__/e2e/v2/test_crawl.py,sha256=cOssZvIwtghAtLiM1QdNLhPEwAxZ9j9umTrBUPtJjpU,9951
+firecrawl/__tests__/e2e/v2/test_extract.py,sha256=HgvGiDlyWtFygiPo5EP44Dem1oWrwgRF-hfc1LfeVSU,1670
+firecrawl/__tests__/e2e/v2/test_map.py,sha256=9sT-Yq8V_8c9esl_bv5hnTA9WXb2Dg81kj6M-s0484c,1618
+firecrawl/__tests__/e2e/v2/test_scrape.py,sha256=oyroF_WaEdxgD8t_SHkLBBfDRv1_6xZ_7vSTQpwlmA8,7198
+firecrawl/__tests__/e2e/v2/test_search.py,sha256=tvU9_eg_3H5em0fhIwPPjuYe9BRAQ5St-BLM0l_FfVs,9079
+firecrawl/__tests__/e2e/v2/test_usage.py,sha256=JlBkYblhThua5qF2crRjsPpq4Ja0cBsdzxZ5zxXnQ_Y,805
+firecrawl/__tests__/e2e/v2/test_watcher.py,sha256=OPTKLhVAKWqXl2Tieo6zCN1xpEwZDsz-B977CVJgLMA,1932
+firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py,sha256=gJv_mLzzoAYftETB2TLkrpSfB5c04kaYgkD4hQTYsIg,2639
+firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py,sha256=X-nk5tkYUYIkM6kTYl7GDjvxh2JT9GxJqk2KlO8xpWw,7282
+firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py,sha256=3CNRIFzgBMcOYOLhnKcK1k5a3Gy--u08EGDkL31uieM,1199
+firecrawl/__tests__/e2e/v2/aio/test_aio_map.py,sha256=nckl1kbiEaaTdu5lm__tOoTDG-txTYwwSH3KZEvyKzc,1199
+firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py,sha256=b17A7advBEjxrjdait2w8GHztZeKy_P3zZ3ixm5H7xw,4453
+firecrawl/__tests__/e2e/v2/aio/test_aio_search.py,sha256=ehV0Ai_hknAkaoE551j2lbktV4bi_J0h3FKzC7G15Iw,8246
+firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py,sha256=Dh9BVo48NKSZOKgLbO7n8fpMjvYmeMXDFzbIhnCTMhE,1014
+firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py,sha256=hwES4Nu5c0hniZ9heIPDfvh_2JmJ2wPoX9ULTZ0Asjs,1471
+firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py,sha256=HeOxN-sPYSssytcIRAEicJSZsFt_Oa5qGXAtdumR54c,4040
+firecrawl/__tests__/unit/v2/methods/test_crawl_params.py,sha256=p9hzg14uAs1iHKXPDSXhGU6hEzPBF_Ae34RAf5XYa10,2387
+firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py,sha256=PEKbooNXfQwPpvcPHXABJnveztgAA-RFBhtlSs8uPro,8780
+firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py,sha256=kErOmHSD01eMjXiMd4rgsMVGd_aU2G9uVymBjbAFoGw,3918
+firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py,sha256=toVcgnMp_cFeYsIUuyKGEWZGp0nAAkzaeFGUbY0zY0o,1868
+firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py,sha256=wDOslsA5BN4kyezlaT5GeMv_Ifn8f461EaA7i5ujnaQ,3482
+firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py,sha256=14lUgFpQsiosgMKjDustBRVE0zXnHujBI76F8BC5PZ4,6072
+firecrawl/__tests__/unit/v2/methods/test_search_validation.py,sha256=7UGcNHpQzCpZbAPYjthfdPFWmAPcoApY-ED-khtuANs,9498
+firecrawl/__tests__/unit/v2/methods/test_usage_types.py,sha256=cCHHfa6agSjD0brQ9rcAcw2kaI9riUH5C0dXV-fqktg,591
+firecrawl/__tests__/unit/v2/methods/test_webhook.py,sha256=AvvW-bKpUA--Lvtif2bmUIp-AxiaMJ29ie1i9dk8WbI,4586
+firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py,sha256=9azJxVvDOBqUevLp-wBF9gF7Ptj-7nN6LOkPQncFX2M,456
+firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py,sha256=RkIKt7uxBzVhAkrLQwXYjmC-9sj32SUNQrJZgF2WEMs,2565
+firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py,sha256=WMgltdrrT2HOflqGyahC4v-Wb29_8sypN0hwS9lYXe8,403
+firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py,sha256=PdUJrR0JLWqrithAnRXwuRrnsIN2h_DTu6-xvTOn_UU,725
+firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py,sha256=A5DT4wpH4vrIPvFxKVHrtDH5A3bgJ_ad4fmVQ8LN1t0,1993
+firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py,sha256=hFk4XgqF3aFPGFJe0ikB1uwf_0FsppNGA088OrWUXvg,2091
+firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py,sha256=E26UnUhpbjG-EG0ab4WRD94AxA5IBWmIHq8ZLBOWoAA,1202
+firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py,sha256=pUwuWhRbVUTbgsZn4hgZesMkTMesTv_NPmvFW--ls-Y,3815
+firecrawl/__tests__/unit/v2/utils/test_validation.py,sha256=E4n4jpBhH_W7E0ikI5r8KMAKiOhbfGD3i_B8-dv3PlI,10803
+firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py,sha256=87w47n0iOihtu4jTR4-4rw1-xVKWmLg2BOBGxjQPnUk,9517
+firecrawl/v1/__init__.py,sha256=aP1oisPeZVGGZynvENc07JySMOZfv_4zAlxQ0ecMJXA,481
+firecrawl/v1/client.py,sha256=sydurfEFTsXyowyaGryA1lkPxN_r9Nf6iQpM43OwJyM,201672
+firecrawl/v2/__init__.py,sha256=Jc6a8tBjYG5OPkjDM5pl-notyys-7DEj7PLEfepv3fc,137
+firecrawl/v2/client.py,sha256=_DZFZO1aWvODzznK0g2Svcd2-xxXgWGR0d9vniNlk1w,30621
+firecrawl/v2/client_async.py,sha256=zwxHis1bSh0tSF1480ze-4XDQEDJ5yDur1ZqtL94dwc,10127
+firecrawl/v2/types.py,sha256=F-RCADQFdpAmF5t8LUabLOgyIV02Ol34yNa9y3S3ZMg,22667
+firecrawl/v2/watcher.py,sha256=FOU71tqSKxgeuGycu4ye0SLc2dw7clIcoQjPsi-4Csc,14229
+firecrawl/v2/watcher_async.py,sha256=AVjW2mgABniolSsauK4u0FW8ya6WzRUdyEg2R-8vGCw,10278
+firecrawl/v2/methods/batch.py,sha256=us7zUGl7u9ZDIEk2J3rNqj87bkaNjXU27SMFW_fdcg8,11932
+firecrawl/v2/methods/crawl.py,sha256=4ZUmanHNuNtq9wbKMAZ3lenuPcNdOaV0kYXqMI5XJJ8,15485
+firecrawl/v2/methods/extract.py,sha256=-Jr4BtraU3b7hd3JIY73V-S69rUclxyXyUpoQb6DCQk,4274
+firecrawl/v2/methods/map.py,sha256=4SADb0-lkbdOWDmO6k8_TzK0yRti5xsN40N45nUl9uA,2592
+firecrawl/v2/methods/scrape.py,sha256=CSHBwC-P91UfrW3zHirjNAs2h899FKcWvd1DY_4fJdo,1921
+firecrawl/v2/methods/search.py,sha256=6BKiQ1aKJjWBKm9BBtKxFKGD74kCKBeMIp_OgjcDFAw,7673
+firecrawl/v2/methods/usage.py,sha256=OJlkxwaB-AAtgO3WLr9QiqBRmjdh6GVhroCgleegupQ,1460
+firecrawl/v2/methods/aio/__init__.py,sha256=RocMJnGwnLIvGu3G8ZvY8INkipC7WHZiu2bE31eSyJs,35
+firecrawl/v2/methods/aio/batch.py,sha256=GS_xsd_Uib1fxFITBK1sH88VGzFMrIcqJVQqOvMQ540,3735
+firecrawl/v2/methods/aio/crawl.py,sha256=pC6bHVk30Hj1EJdAChxpMOg0Xx_GVqq4tIlvU2e5RQ4,6688
+firecrawl/v2/methods/aio/extract.py,sha256=IfNr2ETqt4dR73JFzrEYI4kk5vpKnJOG0BmPEjGEoO4,4217
+firecrawl/v2/methods/aio/map.py,sha256=EuT-5A0cQr_e5SBfEZ6pnl8u0JUwEEvSwhyT2N-QoKU,2326
+firecrawl/v2/methods/aio/scrape.py,sha256=ilA9qco8YGwCFpE0PN1XBQUyuHPQwH2QioZ-xsfxhgU,1386
+firecrawl/v2/methods/aio/search.py,sha256=_TqTFGQLlOCCLNdWcOvakTqPGD2r9AOlBg8RasOgmvw,6177
+firecrawl/v2/methods/aio/usage.py,sha256=OtBi6X-aT09MMR2dpm3vBCm9JrJZIJLCQ8jJ3L7vie4,1606
+firecrawl/v2/utils/__init__.py,sha256=i1GgxySmqEXpWSBQCu3iZBPIJG7fXj0QXCDWGwerWNs,338
+firecrawl/v2/utils/error_handler.py,sha256=Iuf916dHphDY8ObNNlWy75628DFeJ0Rv8ljRp4LttLE,4199
+firecrawl/v2/utils/get_version.py,sha256=0CxW_41q2hlzIxEWOivUCaYw3GFiSIH32RPUMcIgwAY,492
+firecrawl/v2/utils/http_client.py,sha256=_n8mp4xi6GGihg662Lsv6TSlvw9zykyADwEk0fg8mYA,4873
+firecrawl/v2/utils/http_client_async.py,sha256=iy89_bk2HS3afSRHZ8016eMCa9Fk-5MFTntcOHfbPgE,1936
+firecrawl/v2/utils/normalize.py,sha256=nlTU6QRghT1YKZzNZlIQj4STSRuSUGrS9cCErZIcY5w,3636
+firecrawl/v2/utils/validation.py,sha256=L8by7z-t6GuMGIYkK7il1BM8d-4_-sAdG9hDMF_LeG4,14518
+firecrawl_py-3.3.3.dist-info/licenses/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
+tests/test_change_tracking.py,sha256=_IJ5ShLcoj2fHDBaw-nE4I4lHdmDB617ocK_XMHhXps,4177
+tests/test_timeout_conversion.py,sha256=PWlIEMASQNhu4cp1OW_ebklnE9NCiigPnEFCtI5N3w0,3996
+firecrawl_py-3.3.3.dist-info/METADATA,sha256=_5tGMWJrCEIJy1UCLKbPAgV6iczF2_T4aRpLhpoL0F4,7395
+firecrawl_py-3.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+firecrawl_py-3.3.3.dist-info/top_level.txt,sha256=8T3jOaSN5mtLghO-R3MQ8KO290gIX8hmfxQmglBPdLE,16
+firecrawl_py-3.3.3.dist-info/RECORD,,
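RECORD entries follow the standard wheel layout: path, then sha256= plus the URL-safe base64 digest with padding stripped, then the byte size. A small sketch for checking an installed file against its RECORD line (the path is illustrative):

    import base64
    import hashlib

    def wheel_record_hash(path: str) -> str:
        # sha256 digest encoded as unpadded URL-safe base64, as used in RECORD files
        digest = hashlib.sha256(open(path, "rb").read()).digest()
        return base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

    # wheel_record_hash(".../site-packages/firecrawl/__init__.py") should match
    # the RECORD entry, e.g. "1MYT5_7-p8sfruL_5y1m1n9AoWG_6aNduWGW4NId86M" (size 2192)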
{firecrawl_py-3.3.1.dist-info → firecrawl_py-3.3.3.dist-info/licenses}/LICENSE
RENAMED

File without changes
build/lib/firecrawl/__init__.py
DELETED
@@ -1,87 +0,0 @@
-"""
-Firecrawl Python SDK
-
-"""
-
-import logging
-import os
-
-from .client import Firecrawl, AsyncFirecrawl, FirecrawlApp, AsyncFirecrawlApp
-from .v2.watcher import Watcher
-from .v2.watcher_async import AsyncWatcher
-from .v1 import (
-    V1FirecrawlApp,
-    AsyncV1FirecrawlApp,
-    V1JsonConfig,
-    V1ScrapeOptions,
-    V1ChangeTrackingOptions,
-)
-
-__version__ = "3.3.1"
-
-# Define the logger for the Firecrawl project
-logger: logging.Logger = logging.getLogger("firecrawl")
-
-
-def _configure_logger() -> None:
-    """
-    Configure the firecrawl logger for console output.
-
-    The function attaches a handler for console output with a specific format and date
-    format to the firecrawl logger.
-    """
-    try:
-        formatter = logging.Formatter(
-            "[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
-            datefmt="%Y-%m-%d %H:%M:%S",
-        )
-
-        console_handler = logging.StreamHandler()
-        console_handler.setFormatter(formatter)
-
-        logger.addHandler(console_handler)
-    except Exception as e:
-        logger.error("Failed to configure logging: %s", e)
-
-
-def setup_logging() -> None:
-    """Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable."""
-    if logger.hasHandlers():
-        return
-
-    if not (env := os.getenv("FIRECRAWL_LOGGING_LEVEL", "").upper()):
-        logger.addHandler(logging.NullHandler())
-        return
-
-    _configure_logger()
-
-    if env == "DEBUG":
-        logger.setLevel(logging.DEBUG)
-    elif env == "INFO":
-        logger.setLevel(logging.INFO)
-    elif env == "WARNING":
-        logger.setLevel(logging.WARNING)
-    elif env == "ERROR":
-        logger.setLevel(logging.ERROR)
-    elif env == "CRITICAL":
-        logger.setLevel(logging.CRITICAL)
-    else:
-        logger.setLevel(logging.INFO)
-        logger.warning("Unknown logging level: %s, defaulting to INFO", env)
-
-setup_logging()
-logger.debug("Debugging logger setup")
-
-__all__ = [
-    'Firecrawl',
-    'AsyncFirecrawl',
-    'FirecrawlApp',
-    'AsyncFirecrawlApp',
-    'Watcher',
-    'AsyncWatcher',
-    'V1FirecrawlApp',
-    'AsyncV1FirecrawlApp',
-    'V1JsonConfig',
-    'V1ScrapeOptions',
-    'V1ChangeTrackingOptions',
-]
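The deleted build artifact above duplicates the shipped firecrawl/__init__.py, which wires package logging off the FIRECRAWL_LOGGING_LEVEL environment variable at import time. A minimal way to opt into debug output (the env var name comes from the source above):

    import os

    # must be set before the first import, since setup_logging() runs at import time
    os.environ["FIRECRAWL_LOGGING_LEVEL"] = "DEBUG"

    import firecrawl  # attaches a console handler and sets the "firecrawl" logger to DEBUG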
build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py
DELETED

@@ -1,79 +0,0 @@
-import os
-import asyncio
-import pytest
-from dotenv import load_dotenv
-from firecrawl import AsyncFirecrawl
-
-
-load_dotenv()
-
-if not os.getenv("API_KEY"):
-    raise ValueError("API_KEY is not set")
-
-if not os.getenv("API_URL"):
-    raise ValueError("API_URL is not set")
-
-
-@pytest.mark.asyncio
-async def test_async_batch_start_and_status():
-    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
-    start = await client.start_batch_scrape([
-        "https://docs.firecrawl.dev",
-        "https://firecrawl.dev",
-    ], formats=["markdown"], max_concurrency=1)
-    job_id = start.id
-
-    deadline = asyncio.get_event_loop().time() + 240
-    status = await client.get_batch_scrape_status(job_id)
-    while status.status not in ("completed", "failed", "cancelled") and asyncio.get_event_loop().time() < deadline:
-        await asyncio.sleep(2)
-        status = await client.get_batch_scrape_status(job_id)
-
-    assert status.status in ("completed", "failed", "cancelled")
-
-
-@pytest.mark.asyncio
-async def test_async_batch_wait_minimal():
-    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
-    job = await client.batch_scrape([
-        "https://docs.firecrawl.dev",
-        "https://firecrawl.dev",
-    ], formats=["markdown"], poll_interval=1, timeout=120)
-    assert job.status in ("completed", "failed")
-
-
-@pytest.mark.asyncio
-async def test_async_batch_wait_with_all_params():
-    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
-    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}, "required": ["title"]}
-    job = await client.batch_scrape(
-        [
-            "https://docs.firecrawl.dev",
-            "https://firecrawl.dev",
-        ],
-        formats=[
-            "markdown",
-            {"type": "json", "prompt": "Extract page title", "schema": json_schema},
-            {"type": "changeTracking", "prompt": "Track changes", "modes": ["json"]},
-        ],
-        only_main_content=True,
-        mobile=False,
-        ignore_invalid_urls=True,
-        max_concurrency=2,
-        zero_data_retention=False,
-        poll_interval=1,
-        timeout=180,
-    )
-    assert job.status in ("completed", "failed")
-
-
-@pytest.mark.asyncio
-async def test_async_cancel_batch():
-    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
-    start = await client.start_batch_scrape([
-        "https://docs.firecrawl.dev",
-        "https://firecrawl.dev",
-    ], formats=["markdown"], max_concurrency=1)
-    ok = await client.cancel_batch_scrape(start.id)
-    assert ok is True
-
build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py
DELETED

@@ -1,188 +0,0 @@
-import os
-import asyncio
-import pytest
-from dotenv import load_dotenv
-from firecrawl import AsyncFirecrawl
-from firecrawl.v2.types import ScrapeOptions
-
-
-load_dotenv()
-
-if not os.getenv("API_KEY"):
-    raise ValueError("API_KEY is not set")
-
-if not os.getenv("API_URL"):
-    raise ValueError("API_URL is not set")
-
-
-@pytest.mark.asyncio
-async def test_async_crawl_start_and_status():
-    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
-    start = await client.start_crawl("https://docs.firecrawl.dev", limit=2)
-    job_id = start.id
-
-    deadline = asyncio.get_event_loop().time() + 180
-    status = await client.get_crawl_status(job_id)
-    while status.status not in ("completed", "failed") and asyncio.get_event_loop().time() < deadline:
-        await asyncio.sleep(2)
-        status = await client.get_crawl_status(job_id)
-
-    assert status.status in ("completed", "failed")
-
-
-@pytest.mark.asyncio
-async def test_async_crawl_with_all_params():
-    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
-    # rich scrape options including json format
-    json_schema = {
-        "type": "object",
-        "properties": {"title": {"type": "string"}},
-        "required": ["title"],
-    }
-    status = await client.crawl(
-        url="https://docs.firecrawl.dev",
-        prompt="Extract docs and blog",
-        include_paths=["/docs/*", "/blog/*"],
-        exclude_paths=["/admin/*"],
-        max_discovery_depth=2,
-        ignore_sitemap=False,
-        ignore_query_parameters=True,
-        limit=5,
-        crawl_entire_domain=False,
-        allow_external_links=True,
-        allow_subdomains=True,
-        delay=1,
-        max_concurrency=2,
-        webhook="https://example.com/hook",
-        scrape_options=ScrapeOptions(
-            formats=[
-                "markdown",
-                "rawHtml",
-                {"type": "json", "prompt": "Extract title", "schema": json_schema},
-            ],
-            only_main_content=True,
-            mobile=False,
-            timeout=20000,
-            wait_for=500,
-            skip_tls_verification=False,
-            remove_base64_images=False,
-        ),
-        zero_data_retention=False,
-        poll_interval=2,
-        timeout=180,
-    )
-    assert status.status in ("completed", "failed")
-
-
-@pytest.mark.asyncio
-async def test_async_start_crawl_with_options():
-    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
-    start = await client.start_crawl("https://docs.firecrawl.dev", limit=5, max_discovery_depth=2)
-    assert start.id is not None and start.url is not None
-
-
-@pytest.mark.asyncio
-async def test_async_start_crawl_with_prompt():
-    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
-    start = await client.start_crawl("https://firecrawl.dev", prompt="Extract all blog posts", limit=3)
-    assert start.id is not None and start.url is not None
-
-
-@pytest.mark.asyncio
-async def test_async_get_crawl_status_shape():
-    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
-    start = await client.start_crawl("https://docs.firecrawl.dev", limit=3)
-    status = await client.get_crawl_status(start.id)
-    assert status.status in ("scraping", "completed", "failed")
-    assert status.completed >= 0
-    assert status.expires_at is not None
-    assert isinstance(status.data, list)
-
-
-@pytest.mark.asyncio
-async def test_async_crawl_with_wait():
-    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
-    job = await client.crawl(url="https://docs.firecrawl.dev", limit=3, max_discovery_depth=2, poll_interval=1, timeout=120)
-    assert job.status in ("completed", "failed")
-    assert job.completed >= 0 and job.total >= 0 and isinstance(job.data, list)
-
-
-@pytest.mark.asyncio
-async def test_async_crawl_with_prompt_and_wait():
-    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
-    job = await client.crawl(url="https://docs.firecrawl.dev", prompt="Extract all blog posts", limit=3, poll_interval=1, timeout=120)
-    assert job.status in ("completed", "failed")
-    assert job.completed >= 0 and job.total >= 0 and isinstance(job.data, list)
-
-
-@pytest.mark.asyncio
-async def test_async_crawl_with_scrape_options():
-    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
-    scrape_opts = ScrapeOptions(formats=["markdown", "links"], only_main_content=False, mobile=True)
-    start = await client.start_crawl("https://docs.firecrawl.dev", limit=2, scrape_options=scrape_opts)
-    assert start.id is not None
-
-
-@pytest.mark.asyncio
-async def test_async_crawl_with_json_format_object():
-    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
-    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}, "required": ["title"]}
-    scrape_opts = ScrapeOptions(formats=[{"type": "json", "prompt": "Extract page title", "schema": json_schema}])
-    start = await client.start_crawl("https://docs.firecrawl.dev", limit=2, scrape_options=scrape_opts)
-    assert start.id is not None
-
-
-@pytest.mark.asyncio
-async def test_async_cancel_crawl():
-    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
-    start = await client.start_crawl("https://docs.firecrawl.dev", limit=3)
-    cancelled = await client.cancel_crawl(start.id)
-    assert cancelled is True
-
-
-@pytest.mark.asyncio
-async def test_async_get_crawl_errors_and_invalid_job():
-    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
-    start = await client.start_crawl("https://docs.firecrawl.dev", limit=2)
-    errs = await client.get_crawl_errors(start.id)
-    assert hasattr(errs, "errors") and hasattr(errs, "robots_blocked")
-    with pytest.raises(Exception):
-        await client.get_crawl_errors("invalid-job-id-12345")
-
-
-@pytest.mark.asyncio
-async def test_async_active_crawls():
-    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
-    resp = await client.active_crawls()
-    assert hasattr(resp, "success") and hasattr(resp, "crawls")
-
-
-@pytest.mark.asyncio
-async def test_async_active_crawls_with_running_crawl():
-    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
-    start = await client.start_crawl("https://docs.firecrawl.dev", limit=3)
-    # fetch active crawls and assert our ID is listed
-    active = await client.active_crawls()
-    ids = [c.id for c in active.crawls]
-    assert start.id in ids
-    # cleanup
-    await client.cancel_crawl(start.id)
-
-
-@pytest.mark.asyncio
-async def test_async_crawl_params_preview():
-    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
-    params = await client.crawl_params_preview(
-        url="https://docs.firecrawl.dev",
-        prompt="Extract all blog posts and documentation",
-    )
-    assert params is not None
-    # basic sanity: at least one field should be suggested
-    has_any = any([
-        getattr(params, "limit", None) is not None,
-        getattr(params, "include_paths", None) is not None,
-        getattr(params, "max_discovery_depth", None) is not None,
-    ])
-    assert has_any
-
-
build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py
DELETED

@@ -1,38 +0,0 @@
-import os
-import pytest
-from dotenv import load_dotenv
-from firecrawl import AsyncFirecrawl
-
-
-load_dotenv()
-
-if not os.getenv("API_KEY"):
-    raise ValueError("API_KEY is not set")
-
-if not os.getenv("API_URL"):
-    raise ValueError("API_URL is not set")
-
-
-@pytest.mark.asyncio
-async def test_async_extract_minimal():
-    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
-    res = await client.extract(urls=["https://docs.firecrawl.dev"], prompt="Extract title")
-    assert res is not None
-
-
-@pytest.mark.asyncio
-async def test_async_extract_with_schema_and_options():
-    client = AsyncFirecrawl(api_key=os.getenv("API_KEY"), api_url=os.getenv("API_URL"))
-    schema = {"type": "object", "properties": {"title": {"type": "string"}}, "required": ["title"]}
-    res = await client.extract(
-        urls=["https://docs.firecrawl.dev"],
-        prompt="Extract title",
-        schema=schema,
-        system_prompt="You are a helpful extractor",
-        allow_external_links=False,
-        enable_web_search=False,
-        show_sources=False,
-        # agent={"model": "FIRE-1", "prompt": "Extract title"},  # Skipping agent test in CI
-    )
-    assert res is not None
-