firecrawl-py 2.8.0__py3-none-any.whl → 2.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of firecrawl-py might be problematic.

Files changed (37)
  1. firecrawl/__init__.py +1 -1
  2. firecrawl/__tests__/v1/e2e_withAuth/test.py +25 -0
  3. firecrawl/firecrawl.py +68 -15
  4. {firecrawl_py-2.8.0.dist-info → firecrawl_py-2.10.0.dist-info}/LICENSE +0 -0
  5. {firecrawl_py-2.8.0.dist-info → firecrawl_py-2.10.0.dist-info}/METADATA +1 -1
  6. firecrawl_py-2.10.0.dist-info/RECORD +12 -0
  7. {firecrawl_py-2.8.0.dist-info → firecrawl_py-2.10.0.dist-info}/top_level.txt +0 -2
  8. build/lib/build/lib/build/lib/build/lib/firecrawl/__init__.py +0 -79
  9. build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  10. build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  11. build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  12. build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
  13. build/lib/build/lib/build/lib/build/lib/firecrawl/firecrawl.py +0 -4480
  14. build/lib/build/lib/build/lib/build/lib/tests/test_change_tracking.py +0 -98
  15. build/lib/build/lib/build/lib/firecrawl/__init__.py +0 -79
  16. build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  17. build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  18. build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  19. build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
  20. build/lib/build/lib/build/lib/firecrawl/firecrawl.py +0 -4480
  21. build/lib/build/lib/build/lib/tests/test_change_tracking.py +0 -98
  22. build/lib/build/lib/firecrawl/__init__.py +0 -79
  23. build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  24. build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  25. build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  26. build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
  27. build/lib/build/lib/firecrawl/firecrawl.py +0 -4480
  28. build/lib/build/lib/tests/test_change_tracking.py +0 -98
  29. build/lib/firecrawl/__init__.py +0 -79
  30. build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  31. build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  32. build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  33. build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
  34. build/lib/firecrawl/firecrawl.py +0 -4480
  35. build/lib/tests/test_change_tracking.py +0 -98
  36. firecrawl_py-2.8.0.dist-info/RECORD +0 -40
  37. {firecrawl_py-2.8.0.dist-info → firecrawl_py-2.10.0.dist-info}/WHEEL +0 -0
firecrawl/__init__.py CHANGED
@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
 
-__version__ = "2.8.0"
+__version__ = "2.10.0"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
firecrawl/__tests__/v1/e2e_withAuth/test.py CHANGED
@@ -437,4 +437,29 @@ def test_search_with_invalid_params():
         app.search("test query", {"invalid_param": "value"})
     assert "ValidationError" in str(e.value)
 
+# def test_scrape_url_with_parse_pdf_true():
+#     if TEST_API_KEY:
+#         app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+#         response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf', parse_pdf=True)
+#         assert response is not None
+#         assert 'markdown' in response
+#         assert len(response['markdown']) > 100
+
+# def test_scrape_url_with_parse_pdf_false():
+#     if TEST_API_KEY:
+#         app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+#         response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf', parse_pdf=False)
+#         assert response is not None
+#         assert 'markdown' in response
+#         assert 'h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm' in response['markdown']
+
+# def test_scrape_options_with_parse_pdf():
+#     if TEST_API_KEY:
+#         from firecrawl.firecrawl import ScrapeOptions
+#         app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+#         scrape_options = ScrapeOptions(parsePDF=False, formats=['markdown'])
+#         response = app.search("firecrawl", limit=1, scrape_options=scrape_options)
+#         assert response is not None
+#         assert 'data' in response
+
 
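The three commented-out tests above exercise the new parse_pdf flag introduced in this release. A minimal usage sketch based on those tests (the API key is a placeholder; per the tests, parse_pdf=False leaves raw, unparsed PDF content in the markdown field):

    from firecrawl import FirecrawlApp

    app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

    # Parsed PDF: 'markdown' holds the extracted document text
    parsed = app.scrape_url("https://arxiv.org/pdf/astro-ph/9301001.pdf", parse_pdf=True)
    print(parsed["markdown"][:200])

    # Unparsed PDF: 'markdown' holds the raw engine output instead
    raw = app.scrape_url("https://arxiv.org/pdf/astro-ph/9301001.pdf", parse_pdf=False)
    print(len(raw["markdown"]))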
firecrawl/firecrawl.py CHANGED
@@ -160,6 +160,7 @@ class ScrapeOptions(pydantic.BaseModel):
     changeTrackingOptions: Optional[ChangeTrackingOptions] = None
     maxAge: Optional[int] = None
     storeInCache: Optional[bool] = None
+    parsePDF: Optional[bool] = None
 
 class WaitAction(pydantic.BaseModel):
     """Wait action to perform during scraping."""
@@ -263,6 +264,7 @@ class CrawlParams(pydantic.BaseModel):
     ignoreQueryParameters: Optional[bool] = None
     regexOnFullURL: Optional[bool] = None
     delay: Optional[int] = None  # Delay in seconds between scrapes
+    maxConcurrency: Optional[int] = None
 
 class CrawlResponse(pydantic.BaseModel):
     """Response from crawling operations."""
@@ -464,6 +466,7 @@ class FirecrawlApp:
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
         proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
+        parse_pdf: Optional[bool] = None,
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
@@ -537,6 +540,8 @@ class FirecrawlApp:
             scrape_params['blockAds'] = block_ads
         if proxy:
             scrape_params['proxy'] = proxy
+        if parse_pdf is not None:
+            scrape_params['parsePDF'] = parse_pdf
         if extract is not None:
             extract = self._ensure_schema_dict(extract)
             if isinstance(extract, dict) and "schema" in extract:
@@ -686,6 +691,7 @@ class FirecrawlApp:
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,
@@ -694,6 +700,7 @@ class FirecrawlApp:
         ignore_query_parameters: Optional[bool] = None,
         regex_on_full_url: Optional[bool] = None,
         delay: Optional[int] = None,
+        max_concurrency: Optional[int] = None,
         poll_interval: Optional[int] = 2,
         idempotency_key: Optional[str] = None,
         **kwargs
@@ -708,7 +715,8 @@ class FirecrawlApp:
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration
@@ -717,6 +725,7 @@ class FirecrawlApp:
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
             regex_on_full_url (Optional[bool]): Apply regex to full URLs
             delay (Optional[int]): Delay in seconds between scrapes
+            max_concurrency (Optional[int]): Maximum number of concurrent scrapes
             poll_interval (Optional[int]): Seconds between status checks (default: 2)
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API
@@ -746,7 +755,9 @@ class FirecrawlApp:
             crawl_params['maxDiscoveryDepth'] = max_discovery_depth
         if limit is not None:
             crawl_params['limit'] = limit
-        if allow_backward_links is not None:
+        if crawl_entire_domain is not None:
+            crawl_params['crawlEntireDomain'] = crawl_entire_domain
+        elif allow_backward_links is not None:
             crawl_params['allowBackwardLinks'] = allow_backward_links
         if allow_external_links is not None:
             crawl_params['allowExternalLinks'] = allow_external_links
@@ -764,7 +775,9 @@ class FirecrawlApp:
             crawl_params['regexOnFullURL'] = regex_on_full_url
         if delay is not None:
             crawl_params['delay'] = delay
-
+        if max_concurrency is not None:
+            crawl_params['maxConcurrency'] = max_concurrency
+
         # Add any additional kwargs
         crawl_params.update(kwargs)
 
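From the caller's side the crawl changes compose as follows; note the precedence in the body above: when both are supplied, crawl_entire_domain wins and the deprecated allow_backward_links is ignored. A sketch with placeholder URL and limits (the status/data fields on the response are assumed from earlier releases):

    from firecrawl import FirecrawlApp

    app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
    status = app.crawl_url(
        "https://example.com",
        limit=25,
        crawl_entire_domain=True,  # replaces the deprecated allow_backward_links
        delay=1,                   # seconds between scrapes
        max_concurrency=5,         # new in 2.10.0: cap on concurrent scrapes
    )
    print(status.status, len(status.data))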
@@ -797,6 +810,7 @@ class FirecrawlApp:
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,
@@ -818,7 +832,8 @@ class FirecrawlApp:
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration
@@ -826,6 +841,8 @@ class FirecrawlApp:
             deduplicate_similar_urls (Optional[bool]): Remove similar URLs
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
             regex_on_full_url (Optional[bool]): Apply regex to full URLs
+            delay (Optional[int]): Delay in seconds between scrapes
+            max_concurrency (Optional[int]): Maximum number of concurrent scrapes
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API
 
@@ -855,7 +872,9 @@ class FirecrawlApp:
             crawl_params['maxDiscoveryDepth'] = max_discovery_depth
         if limit is not None:
             crawl_params['limit'] = limit
-        if allow_backward_links is not None:
+        if crawl_entire_domain is not None:
+            crawl_params['crawlEntireDomain'] = crawl_entire_domain
+        elif allow_backward_links is not None:
             crawl_params['allowBackwardLinks'] = allow_backward_links
         if allow_external_links is not None:
             crawl_params['allowExternalLinks'] = allow_external_links
@@ -873,7 +892,9 @@ class FirecrawlApp:
             crawl_params['regexOnFullURL'] = regex_on_full_url
         if delay is not None:
             crawl_params['delay'] = delay
-
+        if max_concurrency is not None:
+            crawl_params['maxConcurrency'] = max_concurrency
+
         # Add any additional kwargs
         crawl_params.update(kwargs)
 
@@ -1042,6 +1063,7 @@ class FirecrawlApp:
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,
@@ -1049,6 +1071,8 @@ class FirecrawlApp:
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,
         regex_on_full_url: Optional[bool] = None,
+        delay: Optional[int] = None,
+        max_concurrency: Optional[int] = None,
         idempotency_key: Optional[str] = None,
         **kwargs
     ) -> 'CrawlWatcher':
@@ -1062,7 +1086,8 @@ class FirecrawlApp:
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration
@@ -1070,6 +1095,8 @@ class FirecrawlApp:
             deduplicate_similar_urls (Optional[bool]): Remove similar URLs
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
             regex_on_full_url (Optional[bool]): Apply regex to full URLs
+            delay (Optional[int]): Delay in seconds between scrapes
+            max_concurrency (Optional[int]): Maximum number of concurrent scrapes
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API
 
@@ -1094,6 +1121,8 @@ class FirecrawlApp:
             deduplicate_similar_urls=deduplicate_similar_urls,
             ignore_query_parameters=ignore_query_parameters,
             regex_on_full_url=regex_on_full_url,
+            delay=delay,
+            max_concurrency=max_concurrency,
             idempotency_key=idempotency_key,
             **kwargs
         )
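The watch variant now forwards delay and max_concurrency into the underlying async crawl (the two added keyword arguments above). A hypothetical sketch, assuming the method is crawl_url_and_watch and that CrawlWatcher keeps the event-listener API of earlier releases; neither name is confirmed by this diff:

    import asyncio
    from firecrawl import FirecrawlApp

    async def main():
        app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
        # Hypothetical names: crawl_url_and_watch / add_event_listener / connect
        watcher = app.crawl_url_and_watch(
            "https://example.com",
            limit=10,
            max_concurrency=3,  # forwarded via max_concurrency=max_concurrency above
        )
        watcher.add_event_listener("document", lambda detail: print("page received"))
        watcher.add_event_listener("done", lambda detail: print("crawl finished"))
        await watcher.connect()

    asyncio.run(main())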
@@ -1210,6 +1239,7 @@ class FirecrawlApp:
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
         agent: Optional[AgentOptions] = None,
         poll_interval: Optional[int] = 2,
+        max_concurrency: Optional[int] = None,
         idempotency_key: Optional[str] = None,
         **kwargs
     ) -> BatchScrapeStatusResponse:
@@ -1235,6 +1265,7 @@ class FirecrawlApp:
             json_options (Optional[JsonConfig]): JSON extraction config
             actions (Optional[List[Union]]): Actions to perform
             agent (Optional[AgentOptions]): Agent configuration
+            max_concurrency (Optional[int]): Maximum number of concurrent scrapes
             poll_interval (Optional[int]): Seconds between status checks (default: 2)
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API
@@ -1294,7 +1325,9 @@ class FirecrawlApp:
             scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
         if agent is not None:
             scrape_params['agent'] = agent.dict(exclude_none=True)
-
+        if max_concurrency is not None:
+            scrape_params['maxConcurrency'] = max_concurrency
+
         # Add any additional kwargs
         scrape_params.update(kwargs)
 
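Batch scraping gains the same concurrency cap; maxConcurrency is attached to the request payload alongside the per-URL scrape options. A sketch (the batch_scrape_urls name and formats parameter are assumed from earlier releases; URLs are placeholders):

    from firecrawl import FirecrawlApp

    app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
    job = app.batch_scrape_urls(
        ["https://example.com", "https://example.org"],
        formats=["markdown"],
        max_concurrency=2,  # new in 2.10.0: at most two URLs scraped at once
    )
    print(job.status, len(job.data))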
@@ -1343,6 +1376,7 @@ class FirecrawlApp:
         json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
         agent: Optional[AgentOptions] = None,
+        max_concurrency: Optional[int] = None,
         idempotency_key: Optional[str] = None,
         **kwargs
     ) -> BatchScrapeResponse:
@@ -1368,6 +1402,7 @@ class FirecrawlApp:
             json_options (Optional[JsonConfig]): JSON extraction config
             actions (Optional[List[Union]]): Actions to perform
             agent (Optional[AgentOptions]): Agent configuration
+            max_concurrency (Optional[int]): Maximum number of concurrent scrapes
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API
 
@@ -1427,7 +1462,9 @@ class FirecrawlApp:
             scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
         if agent is not None:
             scrape_params['agent'] = agent.dict(exclude_none=True)
-
+        if max_concurrency is not None:
+            scrape_params['maxConcurrency'] = max_concurrency
+
         # Add any additional kwargs
         scrape_params.update(kwargs)
 
@@ -1475,6 +1512,7 @@ class FirecrawlApp:
         json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
         agent: Optional[AgentOptions] = None,
+        max_concurrency: Optional[int] = None,
         idempotency_key: Optional[str] = None,
         **kwargs
     ) -> 'CrawlWatcher':
@@ -1500,6 +1538,7 @@ class FirecrawlApp:
             json_options (Optional[JsonConfig]): JSON extraction config
             actions (Optional[List[Union]]): Actions to perform
             agent (Optional[AgentOptions]): Agent configuration
+            max_concurrency (Optional[int]): Maximum number of concurrent scrapes
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API
 
@@ -1555,7 +1594,9 @@ class FirecrawlApp:
             scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
         if agent is not None:
             scrape_params['agent'] = agent.dict(exclude_none=True)
-
+        if max_concurrency is not None:
+            scrape_params['maxConcurrency'] = max_concurrency
+
         # Add any additional kwargs
         scrape_params.update(kwargs)
 
@@ -2784,7 +2825,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
         * limit - Maximum pages to crawl
 
         Link Following:
-        * allowBackwardLinks - Follow parent directory links
+        * allowBackwardLinks - DEPRECATED: Use crawlEntireDomain instead
+        * crawlEntireDomain - Follow parent directory links
         * allowExternalLinks - Follow external domain links
         * ignoreSitemap - Skip sitemap.xml processing
 
@@ -2866,6 +2908,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
         proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
+        parse_pdf: Optional[bool] = None,
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
@@ -2943,6 +2986,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             scrape_params['blockAds'] = block_ads
         if proxy:
             scrape_params['proxy'] = proxy
+        if parse_pdf is not None:
+            scrape_params['parsePDF'] = parse_pdf
         if extract is not None:
             extract = self._ensure_schema_dict(extract)
             if isinstance(extract, dict) and "schema" in extract:
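The async client accepts the same flag; a sketch reusing the arxiv PDF from the tests (item access on the response mirrors those tests):

    import asyncio
    from firecrawl import AsyncFirecrawlApp

    async def main():
        app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
        doc = await app.scrape_url(
            "https://arxiv.org/pdf/astro-ph/9301001.pdf",
            parse_pdf=True,  # sent to the API as parsePDF
        )
        print(doc["markdown"][:200])

    asyncio.run(main())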
@@ -3263,6 +3308,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,
@@ -3285,7 +3331,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration
@@ -3323,7 +3370,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
             crawl_params['maxDiscoveryDepth'] = max_discovery_depth
         if limit is not None:
             crawl_params['limit'] = limit
-        if allow_backward_links is not None:
+        if crawl_entire_domain is not None:
+            crawl_params['crawlEntireDomain'] = crawl_entire_domain
+        elif allow_backward_links is not None:
             crawl_params['allowBackwardLinks'] = allow_backward_links
         if allow_external_links is not None:
             crawl_params['allowExternalLinks'] = allow_external_links
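The async crawl applies the identical precedence, sending crawlEntireDomain and ignoring the deprecated allowBackwardLinks when both are set. A sketch (the status field on the response is assumed from earlier releases):

    import asyncio
    from firecrawl import AsyncFirecrawlApp

    async def main():
        app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
        status = await app.crawl_url(
            "https://example.com",
            limit=10,
            crawl_entire_domain=True,  # preferred over allow_backward_links
        )
        print(status.status)

    asyncio.run(main())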
@@ -3375,6 +3424,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,
@@ -3397,7 +3447,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration
@@ -3431,7 +3482,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
             crawl_params['maxDiscoveryDepth'] = max_discovery_depth
         if limit is not None:
             crawl_params['limit'] = limit
-        if allow_backward_links is not None:
+        if crawl_entire_domain is not None:
+            crawl_params['crawlEntireDomain'] = crawl_entire_domain
+        elif allow_backward_links is not None:
             crawl_params['allowBackwardLinks'] = allow_backward_links
         if allow_external_links is not None:
             crawl_params['allowExternalLinks'] = allow_external_links
{firecrawl_py-2.8.0.dist-info → firecrawl_py-2.10.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: firecrawl-py
-Version: 2.8.0
+Version: 2.10.0
 Summary: Python SDK for Firecrawl API
 Home-page: https://github.com/mendableai/firecrawl
 Author: Mendable.ai
firecrawl_py-2.10.0.dist-info/RECORD ADDED
@@ -0,0 +1,12 @@
+firecrawl/__init__.py,sha256=qDOTVOIN0WXrkEEWPqy2UfFzbNDbimvD7HOPhXvTkC4,2613
+firecrawl/firecrawl.py,sha256=Bi7n0U94YJicUYnbjKKOmbkrpWh-kSe1ttPpil3rZl4,193869
+firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+firecrawl/__tests__/e2e_withAuth/test.py,sha256=-Fq2vPcMo0iQi4dwsUkkCd931ybDaTxMBnZbRfGdDcA,7931
+firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=k9IsEbdTHL9Cu49M4FpnQDEo2rnG6RqwmZAsK_EVJr4,21069
+tests/test_change_tracking.py,sha256=_IJ5ShLcoj2fHDBaw-nE4I4lHdmDB617ocK_XMHhXps,4177
+firecrawl_py-2.10.0.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
+firecrawl_py-2.10.0.dist-info/METADATA,sha256=k_qij9hylsX7bmsCfslPrYl0xaQ1B356E7gqVcqTsa4,7169
+firecrawl_py-2.10.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
+firecrawl_py-2.10.0.dist-info/top_level.txt,sha256=8T3jOaSN5mtLghO-R3MQ8KO290gIX8hmfxQmglBPdLE,16
+firecrawl_py-2.10.0.dist-info/RECORD,,
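Each RECORD row is path,sha256=<digest>,size, where the digest is the urlsafe-base64-encoded SHA-256 of the file with trailing '=' padding stripped (PEP 376 / the wheel spec). A quick way to verify an entry against an unpacked wheel:

    import base64
    import hashlib
    import pathlib

    def record_digest(path: str) -> str:
        # urlsafe base64 of the SHA-256 digest, '=' padding removed,
        # as wheel RECORD entries require
        raw = hashlib.sha256(pathlib.Path(path).read_bytes()).digest()
        return "sha256=" + base64.urlsafe_b64encode(raw).rstrip(b"=").decode("ascii")

    # Should print the digest recorded above for firecrawl/__init__.py
    print(record_digest("firecrawl/__init__.py"))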
{firecrawl_py-2.8.0.dist-info → firecrawl_py-2.10.0.dist-info}/top_level.txt CHANGED
@@ -1,4 +1,2 @@
-build
-dist
 firecrawl
 tests
build/lib/build/lib/build/lib/build/lib/firecrawl/__init__.py DELETED
@@ -1,79 +0,0 @@
-"""
-This is the Firecrawl package.
-
-This package provides a Python SDK for interacting with the Firecrawl API.
-It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
-and check the status of these jobs.
-
-For more information visit https://github.com/firecrawl/
-"""
-
-import logging
-import os
-
-from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
-
-__version__ = "2.8.0"
-
-# Define the logger for the Firecrawl project
-logger: logging.Logger = logging.getLogger("firecrawl")
-
-
-def _configure_logger() -> None:
-    """
-    Configure the firecrawl logger for console output.
-
-    The function attaches a handler for console output with a specific format and date
-    format to the firecrawl logger.
-    """
-    try:
-        # Create the formatter
-        formatter = logging.Formatter(
-            "[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
-            datefmt="%Y-%m-%d %H:%M:%S",
-        )
-
-        # Create the console handler and set the formatter
-        console_handler = logging.StreamHandler()
-        console_handler.setFormatter(formatter)
-
-        # Add the console handler to the firecrawl logger
-        logger.addHandler(console_handler)
-    except Exception as e:
-        logger.error("Failed to configure logging: %s", e)
-
-
-def setup_logging() -> None:
-    """Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable."""
-    # Check if the firecrawl logger already has a handler
-    if logger.hasHandlers():
-        return  # To prevent duplicate logging
-
-    # Check if the FIRECRAWL_LOGGING_LEVEL environment variable is set
-    if not (env := os.getenv("FIRECRAWL_LOGGING_LEVEL", "").upper()):
-        # Attach a no-op handler to prevent warnings about no handlers
-        logger.addHandler(logging.NullHandler())
-        return
-
-    # Attach the console handler to the firecrawl logger
-    _configure_logger()
-
-    # Set the logging level based on the FIRECRAWL_LOGGING_LEVEL environment variable
-    if env == "DEBUG":
-        logger.setLevel(logging.DEBUG)
-    elif env == "INFO":
-        logger.setLevel(logging.INFO)
-    elif env == "WARNING":
-        logger.setLevel(logging.WARNING)
-    elif env == "ERROR":
-        logger.setLevel(logging.ERROR)
-    elif env == "CRITICAL":
-        logger.setLevel(logging.CRITICAL)
-    else:
-        logger.setLevel(logging.INFO)
-        logger.warning("Unknown logging level: %s, defaulting to INFO", env)
-
-
-# Initialize logging configuration when the module is imported
-setup_logging()
-logger.debug("Debugging logger setup")
build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py DELETED
@@ -1,170 +0,0 @@
-import importlib.util
-import pytest
-import time
-import os
-from uuid import uuid4
-from dotenv import load_dotenv
-
-load_dotenv()
-
-API_URL = "http://127.0.0.1:3002"
-ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"
-TEST_API_KEY = os.getenv('TEST_API_KEY')
-
-print(f"ABSOLUTE_FIRECRAWL_PATH: {ABSOLUTE_FIRECRAWL_PATH}")
-
-spec = importlib.util.spec_from_file_location("FirecrawlApp", ABSOLUTE_FIRECRAWL_PATH)
-firecrawl = importlib.util.module_from_spec(spec)
-spec.loader.exec_module(firecrawl)
-FirecrawlApp = firecrawl.FirecrawlApp
-
-def test_no_api_key():
-    with pytest.raises(Exception) as excinfo:
-        invalid_app = FirecrawlApp(api_url=API_URL, version='v0')
-    assert "No API key provided" in str(excinfo.value)
-
-def test_scrape_url_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
-    with pytest.raises(Exception) as excinfo:
-        invalid_app.scrape_url('https://firecrawl.dev')
-    assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
-
-# def test_blocklisted_url():
-#     blocklisted_url = "https://facebook.com/fake-test"
-#     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-#     with pytest.raises(Exception) as excinfo:
-#         app.scrape_url(blocklisted_url)
-#     assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
-
-def test_successful_response_with_valid_preview_token():
-    app = FirecrawlApp(api_url=API_URL, api_key=os.getenv('PREVIEW_TOKEN'), version='v0')
-    response = app.scrape_url('https://roastmywebsite.ai')
-    assert response is not None
-    assert 'content' in response
-    assert "_Roast_" in response['content']
-
-def test_scrape_url_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url('https://roastmywebsite.ai')
-    print(response)
-
-    assert response is not None
-    assert 'content' in response
-    assert 'markdown' in response
-    assert 'metadata' in response
-    assert 'html' not in response
-    assert "_Roast_" in response['content']
-
-def test_successful_response_with_valid_api_key_and_include_html():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url('https://roastmywebsite.ai', {'pageOptions': {'includeHtml': True}})
-    assert response is not None
-    assert 'content' in response
-    assert 'markdown' in response
-    assert 'html' in response
-    assert 'metadata' in response
-    assert "_Roast_" in response['content']
-    assert "_Roast_" in response['markdown']
-    assert "<h1" in response['html']
-
-def test_successful_response_for_valid_scrape_with_pdf_file():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf')
-    assert response is not None
-    assert 'content' in response
-    assert 'metadata' in response
-    assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
-
-def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001')
-    time.sleep(6) # wait for 6 seconds
-    assert response is not None
-    assert 'content' in response
-    assert 'metadata' in response
-    assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
-
-def test_crawl_url_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
-    with pytest.raises(Exception) as excinfo:
-        invalid_app.crawl_url('https://firecrawl.dev')
-    assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
-
-# def test_should_return_error_for_blocklisted_url():
-#     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-#     blocklisted_url = "https://twitter.com/fake-test"
-#     with pytest.raises(Exception) as excinfo:
-#         app.crawl_url(blocklisted_url)
-#     assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
-
-def test_crawl_url_wait_for_completion_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True)
-    assert response is not None
-    assert len(response) > 0
-    assert 'content' in response[0]
-    assert "_Roast_" in response[0]['content']
-
-def test_crawl_url_with_idempotency_key_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    uniqueIdempotencyKey = str(uuid4())
-    response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
-    assert response is not None
-    assert len(response) > 0
-    assert 'content' in response[0]
-    assert "_Roast_" in response[0]['content']
-
-    with pytest.raises(Exception) as excinfo:
-        app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
-    assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value)
-
-def test_check_crawl_status_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False)
-    assert response is not None
-    assert 'jobId' in response
-
-    time.sleep(30) # wait for 30 seconds
-    status_response = app.check_crawl_status(response['jobId'])
-    assert status_response is not None
-    assert 'status' in status_response
-    assert status_response['status'] == 'completed'
-    assert 'data' in status_response
-    assert len(status_response['data']) > 0
-
-def test_search_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.search("test query")
-    assert response is not None
-    assert 'content' in response[0]
-    assert len(response) > 2
-
-def test_search_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
-    with pytest.raises(Exception) as excinfo:
-        invalid_app.search("test query")
-    assert "Unexpected error during search: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
-
-def test_llm_extraction():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url("https://firecrawl.dev", {
-        'extractorOptions': {
-            'mode': 'llm-extraction',
-            'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
-            'extractionSchema': {
-                'type': 'object',
-                'properties': {
-                    'company_mission': {'type': 'string'},
-                    'supports_sso': {'type': 'boolean'},
-                    'is_open_source': {'type': 'boolean'}
-                },
-                'required': ['company_mission', 'supports_sso', 'is_open_source']
-            }
-        }
-    })
-    assert response is not None
-    assert 'llm_extraction' in response
-    llm_extraction = response['llm_extraction']
-    assert 'company_mission' in llm_extraction
-    assert isinstance(llm_extraction['supports_sso'], bool)
-    assert isinstance(llm_extraction['is_open_source'], bool)