firecrawl 3.3.0__py3-none-any.whl → 3.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of firecrawl might be problematic. Click here for more details.
- firecrawl/__init__.py +1 -1
- firecrawl/__tests__/e2e/v2/test_scrape.py +37 -1
- firecrawl/client.py +8 -4
- firecrawl/v2/client.py +3 -0
- firecrawl/v2/types.py +19 -2
- {firecrawl-3.3.0.dist-info → firecrawl-3.3.3.dist-info}/METADATA +7 -3
- {firecrawl-3.3.0.dist-info → firecrawl-3.3.3.dist-info}/RECORD +10 -10
- {firecrawl-3.3.0.dist-info → firecrawl-3.3.3.dist-info}/WHEEL +1 -1
- {firecrawl-3.3.0.dist-info → firecrawl-3.3.3.dist-info/licenses}/LICENSE +0 -0
- {firecrawl-3.3.0.dist-info → firecrawl-3.3.3.dist-info}/top_level.txt +0 -0
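firecrawl/__init__.py
CHANGED
|
(hunk not captured in this extract; the file list reports +1 -1)
|
firecrawl/__tests__/e2e/v2/test_scrape.py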
CHANGED
|
@@ -151,4 +151,40 @@ class TestScrapeE2E:
|
|
|
151
151
|
max_age=0,
|
|
152
152
|
store_in_cache=False,
|
|
153
153
|
)
|
|
154
|
-
assert isinstance(doc, Document)
|
|
154
|
+
assert isinstance(doc, Document)
|
|
155
|
+
|
|
156
|
+
def test_scrape_images_format(self):
|
|
157
|
+
"""Test images format extraction."""
|
|
158
|
+
doc = self.client.scrape(
|
|
159
|
+
"https://firecrawl.dev",
|
|
160
|
+
formats=["images"]
|
|
161
|
+
)
|
|
162
|
+
assert isinstance(doc, Document)
|
|
163
|
+
assert doc.images is not None
|
|
164
|
+
assert isinstance(doc.images, list)
|
|
165
|
+
assert len(doc.images) > 0
|
|
166
|
+
# Should find firecrawl logo/branding images
|
|
167
|
+
assert any("firecrawl" in img.lower() or "logo" in img.lower() for img in doc.images)
|
|
168
|
+
|
|
169
|
+
def test_scrape_images_with_multiple_formats(self):
|
|
170
|
+
"""Test images format works with other formats."""
|
|
171
|
+
doc = self.client.scrape(
|
|
172
|
+
"https://github.com",
|
|
173
|
+
formats=["markdown", "links", "images"]
|
|
174
|
+
)
|
|
175
|
+
assert isinstance(doc, Document)
|
|
176
|
+
assert doc.markdown is not None
|
|
177
|
+
assert doc.links is not None
|
|
178
|
+
assert doc.images is not None
|
|
179
|
+
assert isinstance(doc.images, list)
|
|
180
|
+
assert len(doc.images) > 0
|
|
181
|
+
|
|
182
|
+
# Images should find content not available in links format
|
|
183
|
+
image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.ico']
|
|
184
|
+
link_images = [
|
|
185
|
+
link for link in (doc.links or [])
|
|
186
|
+
if any(ext in link.lower() for ext in image_extensions)
|
|
187
|
+
]
|
|
188
|
+
|
|
189
|
+
# Should discover additional images beyond those with obvious extensions
|
|
190
|
+
assert len(doc.images) >= len(link_images)
|
firecrawl/client.py
CHANGED
|
@@ -56,7 +56,6 @@ class V2Proxy:
|
|
|
56
56
|
self._client = client_instance
|
|
57
57
|
|
|
58
58
|
if client_instance:
|
|
59
|
-
# self.scrape = client_instance.scrape
|
|
60
59
|
self.search = client_instance.search
|
|
61
60
|
self.crawl = client_instance.crawl
|
|
62
61
|
self.get_crawl_status = client_instance.get_crawl_status
|
|
@@ -168,14 +167,17 @@ class Firecrawl:
|
|
|
168
167
|
self.v1 = V1Proxy(self._v1_client) if self._v1_client else None
|
|
169
168
|
self.v2 = V2Proxy(self._v2_client)
|
|
170
169
|
|
|
171
|
-
|
|
172
170
|
self.scrape = self._v2_client.scrape
|
|
171
|
+
self.search = self._v2_client.search
|
|
172
|
+
self.map = self._v2_client.map
|
|
173
|
+
|
|
173
174
|
self.crawl = self._v2_client.crawl
|
|
174
175
|
self.start_crawl = self._v2_client.start_crawl
|
|
175
176
|
self.crawl_params_preview = self._v2_client.crawl_params_preview
|
|
176
177
|
self.get_crawl_status = self._v2_client.get_crawl_status
|
|
177
178
|
self.cancel_crawl = self._v2_client.cancel_crawl
|
|
178
179
|
self.get_crawl_errors = self._v2_client.get_crawl_errors
|
|
180
|
+
self.get_active_crawls = self._v2_client.get_active_crawls
|
|
179
181
|
self.active_crawls = self._v2_client.active_crawls
|
|
180
182
|
|
|
181
183
|
self.start_batch_scrape = self._v2_client.start_batch_scrape
|
|
@@ -183,13 +185,15 @@ class Firecrawl:
|
|
|
183
185
|
self.cancel_batch_scrape = self._v2_client.cancel_batch_scrape
|
|
184
186
|
self.batch_scrape = self._v2_client.batch_scrape
|
|
185
187
|
self.get_batch_scrape_errors = self._v2_client.get_batch_scrape_errors
|
|
188
|
+
|
|
189
|
+
self.start_extract = self._v2_client.start_extract
|
|
186
190
|
self.get_extract_status = self._v2_client.get_extract_status
|
|
187
|
-
self.map = self._v2_client.map
|
|
188
|
-
self.search = self._v2_client.search
|
|
189
191
|
self.extract = self._v2_client.extract
|
|
192
|
+
|
|
190
193
|
self.get_concurrency = self._v2_client.get_concurrency
|
|
191
194
|
self.get_credit_usage = self._v2_client.get_credit_usage
|
|
192
195
|
self.get_token_usage = self._v2_client.get_token_usage
|
|
196
|
+
|
|
193
197
|
self.watcher = self._v2_client.watcher
|
|
194
198
|
|
|
195
199
|
class AsyncFirecrawl:
|
firecrawl/v2/client.py
CHANGED
|
@@ -13,6 +13,7 @@ from .types import (
|
|
|
13
13
|
SearchRequest,
|
|
14
14
|
SearchData,
|
|
15
15
|
SourceOption,
|
|
16
|
+
CategoryOption,
|
|
16
17
|
CrawlRequest,
|
|
17
18
|
CrawlResponse,
|
|
18
19
|
CrawlJob,
|
|
@@ -171,6 +172,7 @@ class FirecrawlClient:
|
|
|
171
172
|
query: str,
|
|
172
173
|
*,
|
|
173
174
|
sources: Optional[List[SourceOption]] = None,
|
|
175
|
+
categories: Optional[List[CategoryOption]] = None,
|
|
174
176
|
limit: Optional[int] = None,
|
|
175
177
|
tbs: Optional[str] = None,
|
|
176
178
|
location: Optional[str] = None,
|
|
@@ -195,6 +197,7 @@ class FirecrawlClient:
|
|
|
195
197
|
request = SearchRequest(
|
|
196
198
|
query=query,
|
|
197
199
|
sources=sources,
|
|
200
|
+
categories=categories,
|
|
198
201
|
limit=limit,
|
|
199
202
|
tbs=tbs,
|
|
200
203
|
location=location,
|
firecrawl/v2/types.py
CHANGED
|
@@ -114,6 +114,12 @@ class DocumentMetadata(BaseModel):
|
|
|
114
114
|
def coerce_status_code_to_int(cls, v):
|
|
115
115
|
return cls._coerce_string_to_int(v)
|
|
116
116
|
|
|
117
|
+
class AttributeResult(BaseModel):
|
|
118
|
+
"""Result of attribute extraction."""
|
|
119
|
+
selector: str
|
|
120
|
+
attribute: str
|
|
121
|
+
values: List[str]
|
|
122
|
+
|
|
117
123
|
class Document(BaseModel):
|
|
118
124
|
"""A scraped document."""
|
|
119
125
|
markdown: Optional[str] = None
|
|
@@ -123,6 +129,7 @@ class Document(BaseModel):
|
|
|
123
129
|
summary: Optional[str] = None
|
|
124
130
|
metadata: Optional[DocumentMetadata] = None
|
|
125
131
|
links: Optional[List[str]] = None
|
|
132
|
+
images: Optional[List[str]] = None
|
|
126
133
|
screenshot: Optional[str] = None
|
|
127
134
|
actions: Optional[Dict[str, Any]] = None
|
|
128
135
|
warning: Optional[str] = None
|
|
@@ -182,7 +189,7 @@ CategoryOption = Union[str, Category]
|
|
|
182
189
|
|
|
183
190
|
FormatString = Literal[
|
|
184
191
|
# camelCase versions (API format)
|
|
185
|
-
"markdown", "html", "rawHtml", "links", "screenshot", "summary", "changeTracking", "json",
|
|
192
|
+
"markdown", "html", "rawHtml", "links", "images", "screenshot", "summary", "changeTracking", "json", "attributes",
|
|
186
193
|
# snake_case versions (user-friendly)
|
|
187
194
|
"raw_html", "change_tracking"
|
|
188
195
|
]
|
|
@@ -214,9 +221,18 @@ class ScreenshotFormat(BaseModel):
|
|
|
214
221
|
full_page: Optional[bool] = None
|
|
215
222
|
quality: Optional[int] = None
|
|
216
223
|
viewport: Optional[Union[Dict[str, int], Viewport]] = None
|
|
224
|
+
|
|
225
|
+
class AttributeSelector(BaseModel):
|
|
226
|
+
"""Selector and attribute pair for attribute extraction."""
|
|
227
|
+
selector: str
|
|
228
|
+
attribute: str
|
|
217
229
|
|
|
218
|
-
|
|
230
|
+
class AttributesFormat(Format):
|
|
231
|
+
"""Configuration for attribute extraction."""
|
|
232
|
+
type: Literal["attributes"] = "attributes"
|
|
233
|
+
selectors: List[AttributeSelector]
|
|
219
234
|
|
|
235
|
+
FormatOption = Union[Dict[str, Any], FormatString, JsonFormat, ChangeTrackingFormat, ScreenshotFormat, AttributesFormat, Format]
|
|
220
236
|
# Scrape types
|
|
221
237
|
class ScrapeFormats(BaseModel):
|
|
222
238
|
"""Output formats for scraping."""
|
|
@@ -226,6 +242,7 @@ class ScrapeFormats(BaseModel):
|
|
|
226
242
|
raw_html: bool = False
|
|
227
243
|
summary: bool = False
|
|
228
244
|
links: bool = False
|
|
245
|
+
images: bool = False
|
|
229
246
|
screenshot: bool = False
|
|
230
247
|
change_tracking: bool = False
|
|
231
248
|
json: bool = False
|
|
{firecrawl-3.3.0.dist-info → firecrawl-3.3.3.dist-info}/METADATA
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: firecrawl
|
|
3
|
-
Version: 3.3.0
|
|
3
|
+
Version: 3.3.3
|
|
4
4
|
Summary: Python SDK for Firecrawl API
|
|
5
5
|
Home-page: https://github.com/firecrawl/firecrawl
|
|
6
6
|
Author: Mendable.ai
|
|
@@ -38,8 +38,12 @@ Requires-Dist: httpx
|
|
|
38
38
|
Requires-Dist: python-dotenv
|
|
39
39
|
Requires-Dist: websockets
|
|
40
40
|
Requires-Dist: nest-asyncio
|
|
41
|
-
Requires-Dist: pydantic
|
|
41
|
+
Requires-Dist: pydantic>=2.0
|
|
42
42
|
Requires-Dist: aiohttp
|
|
43
|
+
Dynamic: author
|
|
44
|
+
Dynamic: home-page
|
|
45
|
+
Dynamic: license-file
|
|
46
|
+
Dynamic: requires-python
|
|
43
47
|
|
|
44
48
|
# Firecrawl Python SDK
|
|
45
49
|
|
|
{firecrawl-3.3.0.dist-info → firecrawl-3.3.3.dist-info}/RECORD
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
firecrawl/__init__.py,sha256=
|
|
2
|
-
firecrawl/client.py,sha256=
|
|
1
|
+
firecrawl/__init__.py,sha256=1MYT5_7-p8sfruL_5y1m1n9AoWG_6aNduWGW4NId86M,2192
|
|
2
|
+
firecrawl/client.py,sha256=tp3mUo_3aGPuZ53kpU4bhM-5EtwD_IUWrJ7wm0GMuCc,11159
|
|
3
3
|
firecrawl/firecrawl.backup.py,sha256=v1FEN3jR4g5Aupg4xp6SLkuFvYMQuUKND2YELbYjE6c,200430
|
|
4
4
|
firecrawl/types.py,sha256=W9N2pqQuevEIIjYHN9rbDf31E-nwdCECqIn11Foz2T8,2836
|
|
5
5
|
firecrawl/__tests__/e2e/v2/conftest.py,sha256=I28TUpN5j0-9gM79NlbrDS8Jlsheao657od2f-2xK0Y,2587
|
|
@@ -8,7 +8,7 @@ firecrawl/__tests__/e2e/v2/test_batch_scrape.py,sha256=H9GtuwHIFdOQ958SOVThi_kvD
|
|
|
8
8
|
firecrawl/__tests__/e2e/v2/test_crawl.py,sha256=cOssZvIwtghAtLiM1QdNLhPEwAxZ9j9umTrBUPtJjpU,9951
|
|
9
9
|
firecrawl/__tests__/e2e/v2/test_extract.py,sha256=HgvGiDlyWtFygiPo5EP44Dem1oWrwgRF-hfc1LfeVSU,1670
|
|
10
10
|
firecrawl/__tests__/e2e/v2/test_map.py,sha256=9sT-Yq8V_8c9esl_bv5hnTA9WXb2Dg81kj6M-s0484c,1618
|
|
11
|
-
firecrawl/__tests__/e2e/v2/test_scrape.py,sha256=
|
|
11
|
+
firecrawl/__tests__/e2e/v2/test_scrape.py,sha256=oyroF_WaEdxgD8t_SHkLBBfDRv1_6xZ_7vSTQpwlmA8,7198
|
|
12
12
|
firecrawl/__tests__/e2e/v2/test_search.py,sha256=tvU9_eg_3H5em0fhIwPPjuYe9BRAQ5St-BLM0l_FfVs,9079
|
|
13
13
|
firecrawl/__tests__/e2e/v2/test_usage.py,sha256=JlBkYblhThua5qF2crRjsPpq4Ja0cBsdzxZ5zxXnQ_Y,805
|
|
14
14
|
firecrawl/__tests__/e2e/v2/test_watcher.py,sha256=OPTKLhVAKWqXl2Tieo6zCN1xpEwZDsz-B977CVJgLMA,1932
|
|
@@ -43,9 +43,9 @@ firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py,sha256=87w47n0iOihtu4jTR4
|
|
|
43
43
|
firecrawl/v1/__init__.py,sha256=aP1oisPeZVGGZynvENc07JySMOZfv_4zAlxQ0ecMJXA,481
|
|
44
44
|
firecrawl/v1/client.py,sha256=sydurfEFTsXyowyaGryA1lkPxN_r9Nf6iQpM43OwJyM,201672
|
|
45
45
|
firecrawl/v2/__init__.py,sha256=Jc6a8tBjYG5OPkjDM5pl-notyys-7DEj7PLEfepv3fc,137
|
|
46
|
-
firecrawl/v2/client.py,sha256=
|
|
46
|
+
firecrawl/v2/client.py,sha256=_DZFZO1aWvODzznK0g2Svcd2-xxXgWGR0d9vniNlk1w,30621
|
|
47
47
|
firecrawl/v2/client_async.py,sha256=zwxHis1bSh0tSF1480ze-4XDQEDJ5yDur1ZqtL94dwc,10127
|
|
48
|
-
firecrawl/v2/types.py,sha256=
|
|
48
|
+
firecrawl/v2/types.py,sha256=F-RCADQFdpAmF5t8LUabLOgyIV02Ol34yNa9y3S3ZMg,22667
|
|
49
49
|
firecrawl/v2/watcher.py,sha256=FOU71tqSKxgeuGycu4ye0SLc2dw7clIcoQjPsi-4Csc,14229
|
|
50
50
|
firecrawl/v2/watcher_async.py,sha256=AVjW2mgABniolSsauK4u0FW8ya6WzRUdyEg2R-8vGCw,10278
|
|
51
51
|
firecrawl/v2/methods/batch.py,sha256=us7zUGl7u9ZDIEk2J3rNqj87bkaNjXU27SMFW_fdcg8,11932
|
|
@@ -70,10 +70,10 @@ firecrawl/v2/utils/http_client.py,sha256=_n8mp4xi6GGihg662Lsv6TSlvw9zykyADwEk0fg
|
|
|
70
70
|
firecrawl/v2/utils/http_client_async.py,sha256=iy89_bk2HS3afSRHZ8016eMCa9Fk-5MFTntcOHfbPgE,1936
|
|
71
71
|
firecrawl/v2/utils/normalize.py,sha256=nlTU6QRghT1YKZzNZlIQj4STSRuSUGrS9cCErZIcY5w,3636
|
|
72
72
|
firecrawl/v2/utils/validation.py,sha256=L8by7z-t6GuMGIYkK7il1BM8d-4_-sAdG9hDMF_LeG4,14518
|
|
73
|
+
firecrawl-3.3.3.dist-info/licenses/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
|
|
73
74
|
tests/test_change_tracking.py,sha256=_IJ5ShLcoj2fHDBaw-nE4I4lHdmDB617ocK_XMHhXps,4177
|
|
74
75
|
tests/test_timeout_conversion.py,sha256=PWlIEMASQNhu4cp1OW_ebklnE9NCiigPnEFCtI5N3w0,3996
|
|
75
|
-
firecrawl-3.3.
|
|
76
|
-
firecrawl-3.3.
|
|
77
|
-
firecrawl-3.3.
|
|
78
|
-
firecrawl-3.3.
|
|
79
|
-
firecrawl-3.3.0.dist-info/RECORD,,
|
|
76
|
+
firecrawl-3.3.3.dist-info/METADATA,sha256=4KbMa3eOGJ3SXkPWvDwQP92ceCcEfuPp3AQMNj3o7lE,7392
|
|
77
|
+
firecrawl-3.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
78
|
+
firecrawl-3.3.3.dist-info/top_level.txt,sha256=8T3jOaSN5mtLghO-R3MQ8KO290gIX8hmfxQmglBPdLE,16
|
|
79
|
+
firecrawl-3.3.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|