firecrawl 2.9.0__py3-none-any.whl → 2.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of firecrawl might be problematic. Click here for more details.

firecrawl/__init__.py CHANGED
@@ -13,7 +13,7 @@ import os
13
13
 
14
14
  from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
15
15
 
16
- __version__ = "2.9.0"
16
+ __version__ = "2.11.0"
17
17
 
18
18
  # Define the logger for the Firecrawl project
19
19
  logger: logging.Logger = logging.getLogger("firecrawl")
@@ -437,4 +437,29 @@ def test_search_with_invalid_params():
437
437
  app.search("test query", {"invalid_param": "value"})
438
438
  assert "ValidationError" in str(e.value)
439
439
 
440
+ # def test_scrape_url_with_parse_pdf_true():
441
+ # if TEST_API_KEY:
442
+ # app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
443
+ # response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf', parse_pdf=True)
444
+ # assert response is not None
445
+ # assert 'markdown' in response
446
+ # assert len(response['markdown']) > 100
447
+
448
+ # def test_scrape_url_with_parse_pdf_false():
449
+ # if TEST_API_KEY:
450
+ # app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
451
+ # response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf', parse_pdf=False)
452
+ # assert response is not None
453
+ # assert 'markdown' in response
454
+ # assert 'h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm' in response['markdown']
455
+
456
+ # def test_scrape_options_with_parse_pdf():
457
+ # if TEST_API_KEY:
458
+ # from firecrawl.firecrawl import ScrapeOptions
459
+ # app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
460
+ # scrape_options = ScrapeOptions(parsePDF=False, formats=['markdown'])
461
+ # response = app.search("firecrawl", limit=1, scrape_options=scrape_options)
462
+ # assert response is not None
463
+ # assert 'data' in response
464
+
440
465
 
firecrawl/firecrawl.py CHANGED
@@ -96,6 +96,7 @@ class AgentOptionsExtract(pydantic.BaseModel):
96
96
  class ActionsResult(pydantic.BaseModel):
97
97
  """Result of actions performed during scraping."""
98
98
  screenshots: List[str]
99
+ pdfs: List[str]
99
100
 
100
101
  class ChangeTrackingData(pydantic.BaseModel):
101
102
  """
@@ -160,6 +161,7 @@ class ScrapeOptions(pydantic.BaseModel):
160
161
  changeTrackingOptions: Optional[ChangeTrackingOptions] = None
161
162
  maxAge: Optional[int] = None
162
163
  storeInCache: Optional[bool] = None
164
+ parsePDF: Optional[bool] = None
163
165
 
164
166
  class WaitAction(pydantic.BaseModel):
165
167
  """Wait action to perform during scraping."""
@@ -171,6 +173,7 @@ class ScreenshotAction(pydantic.BaseModel):
171
173
  """Screenshot action to perform during scraping."""
172
174
  type: Literal["screenshot"]
173
175
  fullPage: Optional[bool] = None
176
+ quality: Optional[int] = None
174
177
 
175
178
  class ClickAction(pydantic.BaseModel):
176
179
  """Click action to perform during scraping."""
@@ -202,6 +205,12 @@ class ExecuteJavascriptAction(pydantic.BaseModel):
202
205
  type: Literal["executeJavascript"]
203
206
  script: str
204
207
 
208
+ class PDFAction(pydantic.BaseModel):
209
+ """PDF action to perform during scraping."""
210
+ type: Literal["pdf"]
211
+ format: Optional[Literal["A0", "A1", "A2", "A3", "A4", "A5", "A6", "Letter", "Legal", "Tabloid", "Ledger"]] = None
212
+ landscape: Optional[bool] = None
213
+ scale: Optional[float] = None
205
214
 
206
215
  class ExtractAgent(pydantic.BaseModel):
207
216
  """Configuration for the agent in extract operations."""
@@ -218,7 +227,7 @@ class ScrapeParams(ScrapeOptions):
218
227
  """Parameters for scraping operations."""
219
228
  extract: Optional[JsonConfig] = None
220
229
  jsonOptions: Optional[JsonConfig] = None
221
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None
230
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None
222
231
  agent: Optional[AgentOptions] = None
223
232
  webhook: Optional[WebhookConfig] = None
224
233
 
@@ -465,9 +474,10 @@ class FirecrawlApp:
465
474
  remove_base64_images: Optional[bool] = None,
466
475
  block_ads: Optional[bool] = None,
467
476
  proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
477
+ parse_pdf: Optional[bool] = None,
468
478
  extract: Optional[JsonConfig] = None,
469
479
  json_options: Optional[JsonConfig] = None,
470
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
480
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
471
481
  change_tracking_options: Optional[ChangeTrackingOptions] = None,
472
482
  max_age: Optional[int] = None,
473
483
  store_in_cache: Optional[bool] = None,
@@ -491,7 +501,7 @@ class FirecrawlApp:
491
501
  proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth)
492
502
  extract (Optional[JsonConfig]): Content extraction settings
493
503
  json_options (Optional[JsonConfig]): JSON extraction settings
494
- actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
504
+ actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]]): Actions to perform
495
505
  change_tracking_options (Optional[ChangeTrackingOptions]): Change tracking settings
496
506
 
497
507
 
@@ -538,6 +548,8 @@ class FirecrawlApp:
538
548
  scrape_params['blockAds'] = block_ads
539
549
  if proxy:
540
550
  scrape_params['proxy'] = proxy
551
+ if parse_pdf is not None:
552
+ scrape_params['parsePDF'] = parse_pdf
541
553
  if extract is not None:
542
554
  extract = self._ensure_schema_dict(extract)
543
555
  if isinstance(extract, dict) and "schema" in extract:
@@ -1232,7 +1244,7 @@ class FirecrawlApp:
1232
1244
  proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
1233
1245
  extract: Optional[JsonConfig] = None,
1234
1246
  json_options: Optional[JsonConfig] = None,
1235
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1247
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
1236
1248
  agent: Optional[AgentOptions] = None,
1237
1249
  poll_interval: Optional[int] = 2,
1238
1250
  max_concurrency: Optional[int] = None,
@@ -1370,7 +1382,7 @@ class FirecrawlApp:
1370
1382
  proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
1371
1383
  extract: Optional[JsonConfig] = None,
1372
1384
  json_options: Optional[JsonConfig] = None,
1373
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1385
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
1374
1386
  agent: Optional[AgentOptions] = None,
1375
1387
  max_concurrency: Optional[int] = None,
1376
1388
  idempotency_key: Optional[str] = None,
@@ -1506,7 +1518,7 @@ class FirecrawlApp:
1506
1518
  proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
1507
1519
  extract: Optional[JsonConfig] = None,
1508
1520
  json_options: Optional[JsonConfig] = None,
1509
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
1521
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
1510
1522
  agent: Optional[AgentOptions] = None,
1511
1523
  max_concurrency: Optional[int] = None,
1512
1524
  idempotency_key: Optional[str] = None,
@@ -2904,9 +2916,10 @@ class AsyncFirecrawlApp(FirecrawlApp):
2904
2916
  remove_base64_images: Optional[bool] = None,
2905
2917
  block_ads: Optional[bool] = None,
2906
2918
  proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
2919
+ parse_pdf: Optional[bool] = None,
2907
2920
  extract: Optional[JsonConfig] = None,
2908
2921
  json_options: Optional[JsonConfig] = None,
2909
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
2922
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
2910
2923
  **kwargs) -> ScrapeResponse[Any]:
2911
2924
  """
2912
2925
  Scrape a single URL asynchronously.
@@ -2927,7 +2940,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
2927
2940
  proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth)
2928
2941
  extract (Optional[JsonConfig]): Content extraction settings
2929
2942
  json_options (Optional[JsonConfig]): JSON extraction settings
2930
- actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
2943
+ actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]]): Actions to perform
2931
2944
  **kwargs: Additional parameters to pass to the API
2932
2945
 
2933
2946
  Returns:
@@ -2981,6 +2994,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
2981
2994
  scrape_params['blockAds'] = block_ads
2982
2995
  if proxy:
2983
2996
  scrape_params['proxy'] = proxy
2997
+ if parse_pdf is not None:
2998
+ scrape_params['parsePDF'] = parse_pdf
2984
2999
  if extract is not None:
2985
3000
  extract = self._ensure_schema_dict(extract)
2986
3001
  if isinstance(extract, dict) and "schema" in extract:
@@ -3035,7 +3050,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
3035
3050
  proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
3036
3051
  extract: Optional[JsonConfig] = None,
3037
3052
  json_options: Optional[JsonConfig] = None,
3038
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
3053
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
3039
3054
  agent: Optional[AgentOptions] = None,
3040
3055
  poll_interval: Optional[int] = 2,
3041
3056
  idempotency_key: Optional[str] = None,
@@ -3174,7 +3189,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
3174
3189
  proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
3175
3190
  extract: Optional[JsonConfig] = None,
3176
3191
  json_options: Optional[JsonConfig] = None,
3177
- actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
3192
+ actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
3178
3193
  agent: Optional[AgentOptions] = None,
3179
3194
  idempotency_key: Optional[str] = None,
3180
3195
  **kwargs
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: firecrawl
3
- Version: 2.9.0
3
+ Version: 2.11.0
4
4
  Summary: Python SDK for Firecrawl API
5
5
  Home-page: https://github.com/mendableai/firecrawl
6
6
  Author: Mendable.ai
@@ -0,0 +1,12 @@
1
+ firecrawl/__init__.py,sha256=uHnpcSPL_UX_PvPYv7sphq_21zQmzqYILej7FZnMEO4,2613
2
+ firecrawl/firecrawl.py,sha256=2fzg1wKvbH6_KYnEhwPaGv6hYMmW1f9o0mSoCRv0PHw,194334
3
+ firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ firecrawl/__tests__/e2e_withAuth/test.py,sha256=-Fq2vPcMo0iQi4dwsUkkCd931ybDaTxMBnZbRfGdDcA,7931
5
+ firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=k9IsEbdTHL9Cu49M4FpnQDEo2rnG6RqwmZAsK_EVJr4,21069
7
+ tests/test_change_tracking.py,sha256=_IJ5ShLcoj2fHDBaw-nE4I4lHdmDB617ocK_XMHhXps,4177
8
+ firecrawl-2.11.0.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
9
+ firecrawl-2.11.0.dist-info/METADATA,sha256=Ukyy5PHWLtVjdiwjyRLWxuAy9VLqRwjEjc4iOdTeWro,7166
10
+ firecrawl-2.11.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
11
+ firecrawl-2.11.0.dist-info/top_level.txt,sha256=8T3jOaSN5mtLghO-R3MQ8KO290gIX8hmfxQmglBPdLE,16
12
+ firecrawl-2.11.0.dist-info/RECORD,,
@@ -1,12 +0,0 @@
1
- firecrawl/__init__.py,sha256=nWGTmoKRj6qHs3mzjKE4d3giXsTeeXIO-Ujw0S0oy7k,2612
2
- firecrawl/firecrawl.py,sha256=ICCfDvhpsV3OT5kwwuiS2_6tiq9kmCca4Elum7mKhxg,193573
3
- firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- firecrawl/__tests__/e2e_withAuth/test.py,sha256=-Fq2vPcMo0iQi4dwsUkkCd931ybDaTxMBnZbRfGdDcA,7931
5
- firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=DcCw-cohtnL-t9XPekUtRoQrgg3UCWu8Ikqudf9ory8,19880
7
- tests/test_change_tracking.py,sha256=_IJ5ShLcoj2fHDBaw-nE4I4lHdmDB617ocK_XMHhXps,4177
8
- firecrawl-2.9.0.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
9
- firecrawl-2.9.0.dist-info/METADATA,sha256=7V6RGueUF-gnebxMeXVW6Lpc22vcRyU8Fe6xa58Ep7Q,7165
10
- firecrawl-2.9.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
11
- firecrawl-2.9.0.dist-info/top_level.txt,sha256=8T3jOaSN5mtLghO-R3MQ8KO290gIX8hmfxQmglBPdLE,16
12
- firecrawl-2.9.0.dist-info/RECORD,,