firecrawl 2.9.0__py3-none-any.whl → 2.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- firecrawl/__init__.py +1 -1
- firecrawl/__tests__/v1/e2e_withAuth/test.py +25 -0
- firecrawl/firecrawl.py +25 -10
- {firecrawl-2.9.0.dist-info → firecrawl-2.11.0.dist-info}/LICENSE +0 -0
- {firecrawl-2.9.0.dist-info → firecrawl-2.11.0.dist-info}/METADATA +1 -1
- firecrawl-2.11.0.dist-info/RECORD +12 -0
- firecrawl-2.9.0.dist-info/RECORD +0 -12
- {firecrawl-2.9.0.dist-info → firecrawl-2.11.0.dist-info}/WHEEL +0 -0
- {firecrawl-2.9.0.dist-info → firecrawl-2.11.0.dist-info}/top_level.txt +0 -0
firecrawl/__init__.py CHANGED

@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
 
-__version__ = "2.9.0"
+__version__ = "2.11.0"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")

firecrawl/__tests__/v1/e2e_withAuth/test.py CHANGED

@@ -437,4 +437,29 @@ def test_search_with_invalid_params():
         app.search("test query", {"invalid_param": "value"})
     assert "ValidationError" in str(e.value)
 
+# def test_scrape_url_with_parse_pdf_true():
+#     if TEST_API_KEY:
+#         app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+#         response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf', parse_pdf=True)
+#         assert response is not None
+#         assert 'markdown' in response
+#         assert len(response['markdown']) > 100
+
+# def test_scrape_url_with_parse_pdf_false():
+#     if TEST_API_KEY:
+#         app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+#         response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf', parse_pdf=False)
+#         assert response is not None
+#         assert 'markdown' in response
+#         assert 'h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm' in response['markdown']
+
+# def test_scrape_options_with_parse_pdf():
+#     if TEST_API_KEY:
+#         from firecrawl.firecrawl import ScrapeOptions
+#         app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+#         scrape_options = ScrapeOptions(parsePDF=False, formats=['markdown'])
+#         response = app.search("firecrawl", limit=1, scrape_options=scrape_options)
+#         assert response is not None
+#         assert 'data' in response
+
 
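The commented-out tests above exercise the new parse_pdf flag end to end. A minimal usage sketch along the same lines (hedged: the API key handling and output checks are illustrative; only the parse_pdf keyword, ScrapeOptions.parsePDF field, and the search signature come from this diff):

import os
from firecrawl import FirecrawlApp, ScrapeOptions

# Assumes FIRECRAWL_API_KEY is set in the environment; the arXiv URL is the one
# used in the commented-out tests above.
app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])

# New in this release range: parse_pdf toggles server-side PDF parsing for scrape_url.
doc = app.scrape_url("https://arxiv.org/pdf/astro-ph/9301001.pdf", parse_pdf=True)
print(getattr(doc, "markdown", doc))

# The same switch is exposed on ScrapeOptions as the camelCase field parsePDF,
# e.g. when search results are scraped.
results = app.search(
    "firecrawl",
    limit=1,
    scrape_options=ScrapeOptions(formats=["markdown"], parsePDF=False),
)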
firecrawl/firecrawl.py CHANGED

@@ -96,6 +96,7 @@ class AgentOptionsExtract(pydantic.BaseModel):
 class ActionsResult(pydantic.BaseModel):
     """Result of actions performed during scraping."""
     screenshots: List[str]
+    pdfs: List[str]
 
 class ChangeTrackingData(pydantic.BaseModel):
     """
@@ -160,6 +161,7 @@ class ScrapeOptions(pydantic.BaseModel):
     changeTrackingOptions: Optional[ChangeTrackingOptions] = None
     maxAge: Optional[int] = None
     storeInCache: Optional[bool] = None
+    parsePDF: Optional[bool] = None
 
 class WaitAction(pydantic.BaseModel):
     """Wait action to perform during scraping."""
@@ -171,6 +173,7 @@ class ScreenshotAction(pydantic.BaseModel):
     """Screenshot action to perform during scraping."""
     type: Literal["screenshot"]
     fullPage: Optional[bool] = None
+    quality: Optional[int] = None
 
 class ClickAction(pydantic.BaseModel):
     """Click action to perform during scraping."""
@@ -202,6 +205,12 @@ class ExecuteJavascriptAction(pydantic.BaseModel):
     type: Literal["executeJavascript"]
     script: str
 
+class PDFAction(pydantic.BaseModel):
+    """PDF action to perform during scraping."""
+    type: Literal["pdf"]
+    format: Optional[Literal["A0", "A1", "A2", "A3", "A4", "A5", "A6", "Letter", "Legal", "Tabloid", "Ledger"]] = None
+    landscape: Optional[bool] = None
+    scale: Optional[float] = None
 
 class ExtractAgent(pydantic.BaseModel):
     """Configuration for the agent in extract operations."""
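Taken together, the hunks above add a pdfs field to ActionsResult, a parsePDF switch to ScrapeOptions, an optional screenshot quality, and a new PDFAction model. A hedged sketch of how PDFAction might be used (the URL and API key are placeholders; only the model fields and the actions parameter are taken from the diff):

from firecrawl.firecrawl import FirecrawlApp, PDFAction

app = FirecrawlApp(api_key="fc-...")  # placeholder key

# PDFAction mirrors the model above: an optional paper format, orientation and scale.
pdf_action = PDFAction(type="pdf", format="A4", landscape=True, scale=0.8)

doc = app.scrape_url("https://example.com", actions=[pdf_action])

# ActionsResult now carries a pdfs list, so generated PDFs should surface under the
# actions part of the response (exact response shape not verified here).
print(getattr(getattr(doc, "actions", None), "pdfs", None))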
@@ -218,7 +227,7 @@ class ScrapeParams(ScrapeOptions):
     """Parameters for scraping operations."""
     extract: Optional[JsonConfig] = None
     jsonOptions: Optional[JsonConfig] = None
-    actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None
+    actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None
     agent: Optional[AgentOptions] = None
     webhook: Optional[WebhookConfig] = None
 
@@ -465,9 +474,10 @@ class FirecrawlApp:
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
         proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
+        parse_pdf: Optional[bool] = None,
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
-        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
+        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
         change_tracking_options: Optional[ChangeTrackingOptions] = None,
         max_age: Optional[int] = None,
         store_in_cache: Optional[bool] = None,
@@ -491,7 +501,7 @@ class FirecrawlApp:
             proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth)
             extract (Optional[JsonConfig]): Content extraction settings
             json_options (Optional[JsonConfig]): JSON extraction settings
-            actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
+            actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]]): Actions to perform
             change_tracking_options (Optional[ChangeTrackingOptions]): Change tracking settings
 
 
@@ -538,6 +548,8 @@ class FirecrawlApp:
             scrape_params['blockAds'] = block_ads
         if proxy:
             scrape_params['proxy'] = proxy
+        if parse_pdf is not None:
+            scrape_params['parsePDF'] = parse_pdf
         if extract is not None:
             extract = self._ensure_schema_dict(extract)
             if isinstance(extract, dict) and "schema" in extract:
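Note the naming convention in the hunk above: the snake_case parse_pdf keyword argument is only forwarded when it is explicitly set, and it becomes the camelCase parsePDF field of the request body. A tiny sketch of that mapping (build_scrape_payload is a hypothetical helper, not part of the SDK; only the parsePDF field name is confirmed by the diff):

from typing import Any, Dict, Optional

def build_scrape_payload(url: str, parse_pdf: Optional[bool] = None, **scrape_params: Any) -> Dict[str, Any]:
    # Mirrors the conditional above: the field is only sent when explicitly set.
    payload: Dict[str, Any] = {"url": url, **scrape_params}
    if parse_pdf is not None:
        payload["parsePDF"] = parse_pdf  # snake_case kwarg -> camelCase API field
    return payload

print(build_scrape_payload("https://example.com/whitepaper.pdf", parse_pdf=False))
# -> {'url': 'https://example.com/whitepaper.pdf', 'parsePDF': False}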
@@ -1232,7 +1244,7 @@ class FirecrawlApp:
         proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
-        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
+        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
         agent: Optional[AgentOptions] = None,
         poll_interval: Optional[int] = 2,
         max_concurrency: Optional[int] = None,
@@ -1370,7 +1382,7 @@ class FirecrawlApp:
         proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
-        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
+        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
         agent: Optional[AgentOptions] = None,
         max_concurrency: Optional[int] = None,
         idempotency_key: Optional[str] = None,
@@ -1506,7 +1518,7 @@ class FirecrawlApp:
         proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
-        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
+        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
         agent: Optional[AgentOptions] = None,
         max_concurrency: Optional[int] = None,
         idempotency_key: Optional[str] = None,
@@ -2904,9 +2916,10 @@ class AsyncFirecrawlApp(FirecrawlApp):
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
         proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
+        parse_pdf: Optional[bool] = None,
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
-        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
+        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
         **kwargs) -> ScrapeResponse[Any]:
         """
         Scrape a single URL asynchronously.
@@ -2927,7 +2940,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth)
             extract (Optional[JsonConfig]): Content extraction settings
             json_options (Optional[JsonConfig]): JSON extraction settings
-            actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
+            actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]]): Actions to perform
             **kwargs: Additional parameters to pass to the API
 
         Returns:
@@ -2981,6 +2994,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             scrape_params['blockAds'] = block_ads
         if proxy:
             scrape_params['proxy'] = proxy
+        if parse_pdf is not None:
+            scrape_params['parsePDF'] = parse_pdf
         if extract is not None:
             extract = self._ensure_schema_dict(extract)
             if isinstance(extract, dict) and "schema" in extract:
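AsyncFirecrawlApp gets the same parse_pdf handling as the synchronous client. A hedged async sketch (the API key is a placeholder; the URL is the one from the commented-out tests):

import asyncio
from firecrawl import AsyncFirecrawlApp

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-...")  # placeholder key
    # Forwarded as the 'parsePDF' field, exactly as in the hunk above.
    doc = await app.scrape_url("https://arxiv.org/pdf/astro-ph/9301001.pdf", parse_pdf=False)
    print(getattr(doc, "markdown", doc))

asyncio.run(main())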
@@ -3035,7 +3050,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
-        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
+        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
         agent: Optional[AgentOptions] = None,
         poll_interval: Optional[int] = 2,
         idempotency_key: Optional[str] = None,
@@ -3174,7 +3189,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
-        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
+        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
         agent: Optional[AgentOptions] = None,
         idempotency_key: Optional[str] = None,
         **kwargs
{firecrawl-2.9.0.dist-info → firecrawl-2.11.0.dist-info}/LICENSE
File without changes

firecrawl-2.11.0.dist-info/RECORD ADDED

@@ -0,0 +1,12 @@
+firecrawl/__init__.py,sha256=uHnpcSPL_UX_PvPYv7sphq_21zQmzqYILej7FZnMEO4,2613
+firecrawl/firecrawl.py,sha256=2fzg1wKvbH6_KYnEhwPaGv6hYMmW1f9o0mSoCRv0PHw,194334
+firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+firecrawl/__tests__/e2e_withAuth/test.py,sha256=-Fq2vPcMo0iQi4dwsUkkCd931ybDaTxMBnZbRfGdDcA,7931
+firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=k9IsEbdTHL9Cu49M4FpnQDEo2rnG6RqwmZAsK_EVJr4,21069
+tests/test_change_tracking.py,sha256=_IJ5ShLcoj2fHDBaw-nE4I4lHdmDB617ocK_XMHhXps,4177
+firecrawl-2.11.0.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
+firecrawl-2.11.0.dist-info/METADATA,sha256=Ukyy5PHWLtVjdiwjyRLWxuAy9VLqRwjEjc4iOdTeWro,7166
+firecrawl-2.11.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
+firecrawl-2.11.0.dist-info/top_level.txt,sha256=8T3jOaSN5mtLghO-R3MQ8KO290gIX8hmfxQmglBPdLE,16
+firecrawl-2.11.0.dist-info/RECORD,,
firecrawl-2.9.0.dist-info/RECORD DELETED

@@ -1,12 +0,0 @@
-firecrawl/__init__.py,sha256=nWGTmoKRj6qHs3mzjKE4d3giXsTeeXIO-Ujw0S0oy7k,2612
-firecrawl/firecrawl.py,sha256=ICCfDvhpsV3OT5kwwuiS2_6tiq9kmCca4Elum7mKhxg,193573
-firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-firecrawl/__tests__/e2e_withAuth/test.py,sha256=-Fq2vPcMo0iQi4dwsUkkCd931ybDaTxMBnZbRfGdDcA,7931
-firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=DcCw-cohtnL-t9XPekUtRoQrgg3UCWu8Ikqudf9ory8,19880
-tests/test_change_tracking.py,sha256=_IJ5ShLcoj2fHDBaw-nE4I4lHdmDB617ocK_XMHhXps,4177
-firecrawl-2.9.0.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
-firecrawl-2.9.0.dist-info/METADATA,sha256=7V6RGueUF-gnebxMeXVW6Lpc22vcRyU8Fe6xa58Ep7Q,7165
-firecrawl-2.9.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
-firecrawl-2.9.0.dist-info/top_level.txt,sha256=8T3jOaSN5mtLghO-R3MQ8KO290gIX8hmfxQmglBPdLE,16
-firecrawl-2.9.0.dist-info/RECORD,,
{firecrawl-2.9.0.dist-info → firecrawl-2.11.0.dist-info}/WHEEL
File without changes

{firecrawl-2.9.0.dist-info → firecrawl-2.11.0.dist-info}/top_level.txt
File without changes