firecrawl-py 2.8.0__py3-none-any.whl → 2.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of firecrawl-py might be problematic. Click here for more details.
- build/lib/firecrawl/__init__.py +1 -1
- build/lib/firecrawl/firecrawl.py +61 -15
- firecrawl/__init__.py +1 -1
- firecrawl/firecrawl.py +61 -15
- {firecrawl_py-2.8.0.dist-info → firecrawl_py-2.9.0.dist-info}/METADATA +1 -1
- firecrawl_py-2.9.0.dist-info/RECORD +19 -0
- build/lib/build/lib/build/lib/build/lib/firecrawl/__init__.py +0 -79
- build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
- build/lib/build/lib/build/lib/build/lib/firecrawl/firecrawl.py +0 -4480
- build/lib/build/lib/build/lib/build/lib/tests/test_change_tracking.py +0 -98
- build/lib/build/lib/build/lib/firecrawl/__init__.py +0 -79
- build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
- build/lib/build/lib/build/lib/firecrawl/firecrawl.py +0 -4480
- build/lib/build/lib/build/lib/tests/test_change_tracking.py +0 -98
- build/lib/build/lib/firecrawl/__init__.py +0 -79
- build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
- build/lib/build/lib/firecrawl/firecrawl.py +0 -4480
- build/lib/build/lib/tests/test_change_tracking.py +0 -98
- firecrawl_py-2.8.0.dist-info/RECORD +0 -40
- {firecrawl_py-2.8.0.dist-info → firecrawl_py-2.9.0.dist-info}/LICENSE +0 -0
- {firecrawl_py-2.8.0.dist-info → firecrawl_py-2.9.0.dist-info}/WHEEL +0 -0
- {firecrawl_py-2.8.0.dist-info → firecrawl_py-2.9.0.dist-info}/top_level.txt +0 -0
build/lib/firecrawl/__init__.py
CHANGED
|
@@ -13,7 +13,7 @@ import os
|
|
|
13
13
|
|
|
14
14
|
from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
|
|
15
15
|
|
|
16
|
-
__version__ = "2.8.0"
|
|
16
|
+
__version__ = "2.9.0"
|
|
17
17
|
|
|
18
18
|
# Define the logger for the Firecrawl project
|
|
19
19
|
logger: logging.Logger = logging.getLogger("firecrawl")
|
build/lib/firecrawl/firecrawl.py
CHANGED
|
@@ -263,6 +263,7 @@ class CrawlParams(pydantic.BaseModel):
|
|
|
263
263
|
ignoreQueryParameters: Optional[bool] = None
|
|
264
264
|
regexOnFullURL: Optional[bool] = None
|
|
265
265
|
delay: Optional[int] = None # Delay in seconds between scrapes
|
|
266
|
+
maxConcurrency: Optional[int] = None
|
|
266
267
|
|
|
267
268
|
class CrawlResponse(pydantic.BaseModel):
|
|
268
269
|
"""Response from crawling operations."""
|
|
@@ -686,6 +687,7 @@ class FirecrawlApp:
|
|
|
686
687
|
max_discovery_depth: Optional[int] = None,
|
|
687
688
|
limit: Optional[int] = None,
|
|
688
689
|
allow_backward_links: Optional[bool] = None,
|
|
690
|
+
crawl_entire_domain: Optional[bool] = None,
|
|
689
691
|
allow_external_links: Optional[bool] = None,
|
|
690
692
|
ignore_sitemap: Optional[bool] = None,
|
|
691
693
|
scrape_options: Optional[ScrapeOptions] = None,
|
|
@@ -694,6 +696,7 @@ class FirecrawlApp:
|
|
|
694
696
|
ignore_query_parameters: Optional[bool] = None,
|
|
695
697
|
regex_on_full_url: Optional[bool] = None,
|
|
696
698
|
delay: Optional[int] = None,
|
|
699
|
+
max_concurrency: Optional[int] = None,
|
|
697
700
|
poll_interval: Optional[int] = 2,
|
|
698
701
|
idempotency_key: Optional[str] = None,
|
|
699
702
|
**kwargs
|
|
@@ -708,7 +711,8 @@ class FirecrawlApp:
|
|
|
708
711
|
max_depth (Optional[int]): Maximum crawl depth
|
|
709
712
|
max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
|
|
710
713
|
limit (Optional[int]): Maximum pages to crawl
|
|
711
|
-
allow_backward_links (Optional[bool]): Follow parent directory links
|
|
714
|
+
allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
|
|
715
|
+
crawl_entire_domain (Optional[bool]): Follow parent directory links
|
|
712
716
|
allow_external_links (Optional[bool]): Follow external domain links
|
|
713
717
|
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
714
718
|
scrape_options (Optional[ScrapeOptions]): Page scraping configuration
|
|
@@ -717,6 +721,7 @@ class FirecrawlApp:
|
|
|
717
721
|
ignore_query_parameters (Optional[bool]): Ignore URL parameters
|
|
718
722
|
regex_on_full_url (Optional[bool]): Apply regex to full URLs
|
|
719
723
|
delay (Optional[int]): Delay in seconds between scrapes
|
|
724
|
+
max_concurrency (Optional[int]): Maximum number of concurrent scrapes
|
|
720
725
|
poll_interval (Optional[int]): Seconds between status checks (default: 2)
|
|
721
726
|
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
722
727
|
**kwargs: Additional parameters to pass to the API
|
|
@@ -746,7 +751,9 @@ class FirecrawlApp:
|
|
|
746
751
|
crawl_params['maxDiscoveryDepth'] = max_discovery_depth
|
|
747
752
|
if limit is not None:
|
|
748
753
|
crawl_params['limit'] = limit
|
|
749
|
-
if allow_backward_links is not None:
|
|
754
|
+
if crawl_entire_domain is not None:
|
|
755
|
+
crawl_params['crawlEntireDomain'] = crawl_entire_domain
|
|
756
|
+
elif allow_backward_links is not None:
|
|
750
757
|
crawl_params['allowBackwardLinks'] = allow_backward_links
|
|
751
758
|
if allow_external_links is not None:
|
|
752
759
|
crawl_params['allowExternalLinks'] = allow_external_links
|
|
@@ -764,7 +771,9 @@ class FirecrawlApp:
|
|
|
764
771
|
crawl_params['regexOnFullURL'] = regex_on_full_url
|
|
765
772
|
if delay is not None:
|
|
766
773
|
crawl_params['delay'] = delay
|
|
767
|
-
|
|
774
|
+
if max_concurrency is not None:
|
|
775
|
+
crawl_params['maxConcurrency'] = max_concurrency
|
|
776
|
+
|
|
768
777
|
# Add any additional kwargs
|
|
769
778
|
crawl_params.update(kwargs)
|
|
770
779
|
|
|
@@ -797,6 +806,7 @@ class FirecrawlApp:
|
|
|
797
806
|
max_discovery_depth: Optional[int] = None,
|
|
798
807
|
limit: Optional[int] = None,
|
|
799
808
|
allow_backward_links: Optional[bool] = None,
|
|
809
|
+
crawl_entire_domain: Optional[bool] = None,
|
|
800
810
|
allow_external_links: Optional[bool] = None,
|
|
801
811
|
ignore_sitemap: Optional[bool] = None,
|
|
802
812
|
scrape_options: Optional[ScrapeOptions] = None,
|
|
@@ -818,7 +828,8 @@ class FirecrawlApp:
|
|
|
818
828
|
max_depth (Optional[int]): Maximum crawl depth
|
|
819
829
|
max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
|
|
820
830
|
limit (Optional[int]): Maximum pages to crawl
|
|
821
|
-
allow_backward_links (Optional[bool]): Follow parent directory links
|
|
831
|
+
allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
|
|
832
|
+
crawl_entire_domain (Optional[bool]): Follow parent directory links
|
|
822
833
|
allow_external_links (Optional[bool]): Follow external domain links
|
|
823
834
|
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
824
835
|
scrape_options (Optional[ScrapeOptions]): Page scraping configuration
|
|
@@ -826,6 +837,8 @@ class FirecrawlApp:
|
|
|
826
837
|
deduplicate_similar_urls (Optional[bool]): Remove similar URLs
|
|
827
838
|
ignore_query_parameters (Optional[bool]): Ignore URL parameters
|
|
828
839
|
regex_on_full_url (Optional[bool]): Apply regex to full URLs
|
|
840
|
+
delay (Optional[int]): Delay in seconds between scrapes
|
|
841
|
+
max_concurrency (Optional[int]): Maximum number of concurrent scrapes
|
|
829
842
|
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
830
843
|
**kwargs: Additional parameters to pass to the API
|
|
831
844
|
|
|
@@ -855,7 +868,9 @@ class FirecrawlApp:
|
|
|
855
868
|
crawl_params['maxDiscoveryDepth'] = max_discovery_depth
|
|
856
869
|
if limit is not None:
|
|
857
870
|
crawl_params['limit'] = limit
|
|
858
|
-
if allow_backward_links is not None:
|
|
871
|
+
if crawl_entire_domain is not None:
|
|
872
|
+
crawl_params['crawlEntireDomain'] = crawl_entire_domain
|
|
873
|
+
elif allow_backward_links is not None:
|
|
859
874
|
crawl_params['allowBackwardLinks'] = allow_backward_links
|
|
860
875
|
if allow_external_links is not None:
|
|
861
876
|
crawl_params['allowExternalLinks'] = allow_external_links
|
|
@@ -873,7 +888,9 @@ class FirecrawlApp:
|
|
|
873
888
|
crawl_params['regexOnFullURL'] = regex_on_full_url
|
|
874
889
|
if delay is not None:
|
|
875
890
|
crawl_params['delay'] = delay
|
|
876
|
-
|
|
891
|
+
if max_concurrency is not None:
|
|
892
|
+
crawl_params['maxConcurrency'] = max_concurrency
|
|
893
|
+
|
|
877
894
|
# Add any additional kwargs
|
|
878
895
|
crawl_params.update(kwargs)
|
|
879
896
|
|
|
@@ -1042,6 +1059,7 @@ class FirecrawlApp:
|
|
|
1042
1059
|
max_discovery_depth: Optional[int] = None,
|
|
1043
1060
|
limit: Optional[int] = None,
|
|
1044
1061
|
allow_backward_links: Optional[bool] = None,
|
|
1062
|
+
crawl_entire_domain: Optional[bool] = None,
|
|
1045
1063
|
allow_external_links: Optional[bool] = None,
|
|
1046
1064
|
ignore_sitemap: Optional[bool] = None,
|
|
1047
1065
|
scrape_options: Optional[ScrapeOptions] = None,
|
|
@@ -1049,6 +1067,8 @@ class FirecrawlApp:
|
|
|
1049
1067
|
deduplicate_similar_urls: Optional[bool] = None,
|
|
1050
1068
|
ignore_query_parameters: Optional[bool] = None,
|
|
1051
1069
|
regex_on_full_url: Optional[bool] = None,
|
|
1070
|
+
delay: Optional[int] = None,
|
|
1071
|
+
max_concurrency: Optional[int] = None,
|
|
1052
1072
|
idempotency_key: Optional[str] = None,
|
|
1053
1073
|
**kwargs
|
|
1054
1074
|
) -> 'CrawlWatcher':
|
|
@@ -1062,7 +1082,8 @@ class FirecrawlApp:
|
|
|
1062
1082
|
max_depth (Optional[int]): Maximum crawl depth
|
|
1063
1083
|
max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
|
|
1064
1084
|
limit (Optional[int]): Maximum pages to crawl
|
|
1065
|
-
allow_backward_links (Optional[bool]): Follow parent directory links
|
|
1085
|
+
allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
|
|
1086
|
+
crawl_entire_domain (Optional[bool]): Follow parent directory links
|
|
1066
1087
|
allow_external_links (Optional[bool]): Follow external domain links
|
|
1067
1088
|
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
1068
1089
|
scrape_options (Optional[ScrapeOptions]): Page scraping configuration
|
|
@@ -1070,6 +1091,8 @@ class FirecrawlApp:
|
|
|
1070
1091
|
deduplicate_similar_urls (Optional[bool]): Remove similar URLs
|
|
1071
1092
|
ignore_query_parameters (Optional[bool]): Ignore URL parameters
|
|
1072
1093
|
regex_on_full_url (Optional[bool]): Apply regex to full URLs
|
|
1094
|
+
delay (Optional[int]): Delay in seconds between scrapes
|
|
1095
|
+
max_concurrency (Optional[int]): Maximum number of concurrent scrapes
|
|
1073
1096
|
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
1074
1097
|
**kwargs: Additional parameters to pass to the API
|
|
1075
1098
|
|
|
@@ -1094,6 +1117,8 @@ class FirecrawlApp:
|
|
|
1094
1117
|
deduplicate_similar_urls=deduplicate_similar_urls,
|
|
1095
1118
|
ignore_query_parameters=ignore_query_parameters,
|
|
1096
1119
|
regex_on_full_url=regex_on_full_url,
|
|
1120
|
+
delay=delay,
|
|
1121
|
+
max_concurrency=max_concurrency,
|
|
1097
1122
|
idempotency_key=idempotency_key,
|
|
1098
1123
|
**kwargs
|
|
1099
1124
|
)
|
|
@@ -1210,6 +1235,7 @@ class FirecrawlApp:
|
|
|
1210
1235
|
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
|
|
1211
1236
|
agent: Optional[AgentOptions] = None,
|
|
1212
1237
|
poll_interval: Optional[int] = 2,
|
|
1238
|
+
max_concurrency: Optional[int] = None,
|
|
1213
1239
|
idempotency_key: Optional[str] = None,
|
|
1214
1240
|
**kwargs
|
|
1215
1241
|
) -> BatchScrapeStatusResponse:
|
|
@@ -1235,6 +1261,7 @@ class FirecrawlApp:
|
|
|
1235
1261
|
json_options (Optional[JsonConfig]): JSON extraction config
|
|
1236
1262
|
actions (Optional[List[Union]]): Actions to perform
|
|
1237
1263
|
agent (Optional[AgentOptions]): Agent configuration
|
|
1264
|
+
max_concurrency (Optional[int]): Maximum number of concurrent scrapes
|
|
1238
1265
|
poll_interval (Optional[int]): Seconds between status checks (default: 2)
|
|
1239
1266
|
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
1240
1267
|
**kwargs: Additional parameters to pass to the API
|
|
@@ -1294,7 +1321,9 @@ class FirecrawlApp:
|
|
|
1294
1321
|
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
|
|
1295
1322
|
if agent is not None:
|
|
1296
1323
|
scrape_params['agent'] = agent.dict(exclude_none=True)
|
|
1297
|
-
|
|
1324
|
+
if max_concurrency is not None:
|
|
1325
|
+
scrape_params['maxConcurrency'] = max_concurrency
|
|
1326
|
+
|
|
1298
1327
|
# Add any additional kwargs
|
|
1299
1328
|
scrape_params.update(kwargs)
|
|
1300
1329
|
|
|
@@ -1343,6 +1372,7 @@ class FirecrawlApp:
|
|
|
1343
1372
|
json_options: Optional[JsonConfig] = None,
|
|
1344
1373
|
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
|
|
1345
1374
|
agent: Optional[AgentOptions] = None,
|
|
1375
|
+
max_concurrency: Optional[int] = None,
|
|
1346
1376
|
idempotency_key: Optional[str] = None,
|
|
1347
1377
|
**kwargs
|
|
1348
1378
|
) -> BatchScrapeResponse:
|
|
@@ -1368,6 +1398,7 @@ class FirecrawlApp:
|
|
|
1368
1398
|
json_options (Optional[JsonConfig]): JSON extraction config
|
|
1369
1399
|
actions (Optional[List[Union]]): Actions to perform
|
|
1370
1400
|
agent (Optional[AgentOptions]): Agent configuration
|
|
1401
|
+
max_concurrency (Optional[int]): Maximum number of concurrent scrapes
|
|
1371
1402
|
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
1372
1403
|
**kwargs: Additional parameters to pass to the API
|
|
1373
1404
|
|
|
@@ -1427,7 +1458,9 @@ class FirecrawlApp:
|
|
|
1427
1458
|
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
|
|
1428
1459
|
if agent is not None:
|
|
1429
1460
|
scrape_params['agent'] = agent.dict(exclude_none=True)
|
|
1430
|
-
|
|
1461
|
+
if max_concurrency is not None:
|
|
1462
|
+
scrape_params['maxConcurrency'] = max_concurrency
|
|
1463
|
+
|
|
1431
1464
|
# Add any additional kwargs
|
|
1432
1465
|
scrape_params.update(kwargs)
|
|
1433
1466
|
|
|
@@ -1475,6 +1508,7 @@ class FirecrawlApp:
|
|
|
1475
1508
|
json_options: Optional[JsonConfig] = None,
|
|
1476
1509
|
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
|
|
1477
1510
|
agent: Optional[AgentOptions] = None,
|
|
1511
|
+
max_concurrency: Optional[int] = None,
|
|
1478
1512
|
idempotency_key: Optional[str] = None,
|
|
1479
1513
|
**kwargs
|
|
1480
1514
|
) -> 'CrawlWatcher':
|
|
@@ -1500,6 +1534,7 @@ class FirecrawlApp:
|
|
|
1500
1534
|
json_options (Optional[JsonConfig]): JSON extraction config
|
|
1501
1535
|
actions (Optional[List[Union]]): Actions to perform
|
|
1502
1536
|
agent (Optional[AgentOptions]): Agent configuration
|
|
1537
|
+
max_concurrency (Optional[int]): Maximum number of concurrent scrapes
|
|
1503
1538
|
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
1504
1539
|
**kwargs: Additional parameters to pass to the API
|
|
1505
1540
|
|
|
@@ -1555,7 +1590,9 @@ class FirecrawlApp:
|
|
|
1555
1590
|
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
|
|
1556
1591
|
if agent is not None:
|
|
1557
1592
|
scrape_params['agent'] = agent.dict(exclude_none=True)
|
|
1558
|
-
|
|
1593
|
+
if max_concurrency is not None:
|
|
1594
|
+
scrape_params['maxConcurrency'] = max_concurrency
|
|
1595
|
+
|
|
1559
1596
|
# Add any additional kwargs
|
|
1560
1597
|
scrape_params.update(kwargs)
|
|
1561
1598
|
|
|
@@ -2784,7 +2821,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
2784
2821
|
* limit - Maximum pages to crawl
|
|
2785
2822
|
|
|
2786
2823
|
Link Following:
|
|
2787
|
-
* allowBackwardLinks - Follow parent directory links
|
|
2824
|
+
* allowBackwardLinks - DEPRECATED: Use crawlEntireDomain instead
|
|
2825
|
+
* crawlEntireDomain - Follow parent directory links
|
|
2788
2826
|
* allowExternalLinks - Follow external domain links
|
|
2789
2827
|
* ignoreSitemap - Skip sitemap.xml processing
|
|
2790
2828
|
|
|
@@ -3263,6 +3301,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3263
3301
|
max_discovery_depth: Optional[int] = None,
|
|
3264
3302
|
limit: Optional[int] = None,
|
|
3265
3303
|
allow_backward_links: Optional[bool] = None,
|
|
3304
|
+
crawl_entire_domain: Optional[bool] = None,
|
|
3266
3305
|
allow_external_links: Optional[bool] = None,
|
|
3267
3306
|
ignore_sitemap: Optional[bool] = None,
|
|
3268
3307
|
scrape_options: Optional[ScrapeOptions] = None,
|
|
@@ -3285,7 +3324,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3285
3324
|
max_depth (Optional[int]): Maximum crawl depth
|
|
3286
3325
|
max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
|
|
3287
3326
|
limit (Optional[int]): Maximum pages to crawl
|
|
3288
|
-
allow_backward_links (Optional[bool]): Follow parent directory links
|
|
3327
|
+
allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
|
|
3328
|
+
crawl_entire_domain (Optional[bool]): Follow parent directory links
|
|
3289
3329
|
allow_external_links (Optional[bool]): Follow external domain links
|
|
3290
3330
|
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
3291
3331
|
scrape_options (Optional[ScrapeOptions]): Page scraping configuration
|
|
@@ -3323,7 +3363,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3323
3363
|
crawl_params['maxDiscoveryDepth'] = max_discovery_depth
|
|
3324
3364
|
if limit is not None:
|
|
3325
3365
|
crawl_params['limit'] = limit
|
|
3326
|
-
if allow_backward_links is not None:
|
|
3366
|
+
if crawl_entire_domain is not None:
|
|
3367
|
+
crawl_params['crawlEntireDomain'] = crawl_entire_domain
|
|
3368
|
+
elif allow_backward_links is not None:
|
|
3327
3369
|
crawl_params['allowBackwardLinks'] = allow_backward_links
|
|
3328
3370
|
if allow_external_links is not None:
|
|
3329
3371
|
crawl_params['allowExternalLinks'] = allow_external_links
|
|
@@ -3375,6 +3417,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3375
3417
|
max_discovery_depth: Optional[int] = None,
|
|
3376
3418
|
limit: Optional[int] = None,
|
|
3377
3419
|
allow_backward_links: Optional[bool] = None,
|
|
3420
|
+
crawl_entire_domain: Optional[bool] = None,
|
|
3378
3421
|
allow_external_links: Optional[bool] = None,
|
|
3379
3422
|
ignore_sitemap: Optional[bool] = None,
|
|
3380
3423
|
scrape_options: Optional[ScrapeOptions] = None,
|
|
@@ -3397,7 +3440,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3397
3440
|
max_depth (Optional[int]): Maximum crawl depth
|
|
3398
3441
|
max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
|
|
3399
3442
|
limit (Optional[int]): Maximum pages to crawl
|
|
3400
|
-
allow_backward_links (Optional[bool]): Follow parent directory links
|
|
3443
|
+
allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
|
|
3444
|
+
crawl_entire_domain (Optional[bool]): Follow parent directory links
|
|
3401
3445
|
allow_external_links (Optional[bool]): Follow external domain links
|
|
3402
3446
|
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
3403
3447
|
scrape_options (Optional[ScrapeOptions]): Page scraping configuration
|
|
@@ -3431,7 +3475,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3431
3475
|
crawl_params['maxDiscoveryDepth'] = max_discovery_depth
|
|
3432
3476
|
if limit is not None:
|
|
3433
3477
|
crawl_params['limit'] = limit
|
|
3434
|
-
if allow_backward_links is not None:
|
|
3478
|
+
if crawl_entire_domain is not None:
|
|
3479
|
+
crawl_params['crawlEntireDomain'] = crawl_entire_domain
|
|
3480
|
+
elif allow_backward_links is not None:
|
|
3435
3481
|
crawl_params['allowBackwardLinks'] = allow_backward_links
|
|
3436
3482
|
if allow_external_links is not None:
|
|
3437
3483
|
crawl_params['allowExternalLinks'] = allow_external_links
|
firecrawl/__init__.py
CHANGED
|
@@ -13,7 +13,7 @@ import os
|
|
|
13
13
|
|
|
14
14
|
from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
|
|
15
15
|
|
|
16
|
-
__version__ = "2.8.0"
|
|
16
|
+
__version__ = "2.9.0"
|
|
17
17
|
|
|
18
18
|
# Define the logger for the Firecrawl project
|
|
19
19
|
logger: logging.Logger = logging.getLogger("firecrawl")
|
firecrawl/firecrawl.py
CHANGED
|
@@ -263,6 +263,7 @@ class CrawlParams(pydantic.BaseModel):
|
|
|
263
263
|
ignoreQueryParameters: Optional[bool] = None
|
|
264
264
|
regexOnFullURL: Optional[bool] = None
|
|
265
265
|
delay: Optional[int] = None # Delay in seconds between scrapes
|
|
266
|
+
maxConcurrency: Optional[int] = None
|
|
266
267
|
|
|
267
268
|
class CrawlResponse(pydantic.BaseModel):
|
|
268
269
|
"""Response from crawling operations."""
|
|
@@ -686,6 +687,7 @@ class FirecrawlApp:
|
|
|
686
687
|
max_discovery_depth: Optional[int] = None,
|
|
687
688
|
limit: Optional[int] = None,
|
|
688
689
|
allow_backward_links: Optional[bool] = None,
|
|
690
|
+
crawl_entire_domain: Optional[bool] = None,
|
|
689
691
|
allow_external_links: Optional[bool] = None,
|
|
690
692
|
ignore_sitemap: Optional[bool] = None,
|
|
691
693
|
scrape_options: Optional[ScrapeOptions] = None,
|
|
@@ -694,6 +696,7 @@ class FirecrawlApp:
|
|
|
694
696
|
ignore_query_parameters: Optional[bool] = None,
|
|
695
697
|
regex_on_full_url: Optional[bool] = None,
|
|
696
698
|
delay: Optional[int] = None,
|
|
699
|
+
max_concurrency: Optional[int] = None,
|
|
697
700
|
poll_interval: Optional[int] = 2,
|
|
698
701
|
idempotency_key: Optional[str] = None,
|
|
699
702
|
**kwargs
|
|
@@ -708,7 +711,8 @@ class FirecrawlApp:
|
|
|
708
711
|
max_depth (Optional[int]): Maximum crawl depth
|
|
709
712
|
max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
|
|
710
713
|
limit (Optional[int]): Maximum pages to crawl
|
|
711
|
-
allow_backward_links (Optional[bool]): Follow parent directory links
|
|
714
|
+
allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
|
|
715
|
+
crawl_entire_domain (Optional[bool]): Follow parent directory links
|
|
712
716
|
allow_external_links (Optional[bool]): Follow external domain links
|
|
713
717
|
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
714
718
|
scrape_options (Optional[ScrapeOptions]): Page scraping configuration
|
|
@@ -717,6 +721,7 @@ class FirecrawlApp:
|
|
|
717
721
|
ignore_query_parameters (Optional[bool]): Ignore URL parameters
|
|
718
722
|
regex_on_full_url (Optional[bool]): Apply regex to full URLs
|
|
719
723
|
delay (Optional[int]): Delay in seconds between scrapes
|
|
724
|
+
max_concurrency (Optional[int]): Maximum number of concurrent scrapes
|
|
720
725
|
poll_interval (Optional[int]): Seconds between status checks (default: 2)
|
|
721
726
|
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
722
727
|
**kwargs: Additional parameters to pass to the API
|
|
@@ -746,7 +751,9 @@ class FirecrawlApp:
|
|
|
746
751
|
crawl_params['maxDiscoveryDepth'] = max_discovery_depth
|
|
747
752
|
if limit is not None:
|
|
748
753
|
crawl_params['limit'] = limit
|
|
749
|
-
if allow_backward_links is not None:
|
|
754
|
+
if crawl_entire_domain is not None:
|
|
755
|
+
crawl_params['crawlEntireDomain'] = crawl_entire_domain
|
|
756
|
+
elif allow_backward_links is not None:
|
|
750
757
|
crawl_params['allowBackwardLinks'] = allow_backward_links
|
|
751
758
|
if allow_external_links is not None:
|
|
752
759
|
crawl_params['allowExternalLinks'] = allow_external_links
|
|
@@ -764,7 +771,9 @@ class FirecrawlApp:
|
|
|
764
771
|
crawl_params['regexOnFullURL'] = regex_on_full_url
|
|
765
772
|
if delay is not None:
|
|
766
773
|
crawl_params['delay'] = delay
|
|
767
|
-
|
|
774
|
+
if max_concurrency is not None:
|
|
775
|
+
crawl_params['maxConcurrency'] = max_concurrency
|
|
776
|
+
|
|
768
777
|
# Add any additional kwargs
|
|
769
778
|
crawl_params.update(kwargs)
|
|
770
779
|
|
|
@@ -797,6 +806,7 @@ class FirecrawlApp:
|
|
|
797
806
|
max_discovery_depth: Optional[int] = None,
|
|
798
807
|
limit: Optional[int] = None,
|
|
799
808
|
allow_backward_links: Optional[bool] = None,
|
|
809
|
+
crawl_entire_domain: Optional[bool] = None,
|
|
800
810
|
allow_external_links: Optional[bool] = None,
|
|
801
811
|
ignore_sitemap: Optional[bool] = None,
|
|
802
812
|
scrape_options: Optional[ScrapeOptions] = None,
|
|
@@ -818,7 +828,8 @@ class FirecrawlApp:
|
|
|
818
828
|
max_depth (Optional[int]): Maximum crawl depth
|
|
819
829
|
max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
|
|
820
830
|
limit (Optional[int]): Maximum pages to crawl
|
|
821
|
-
allow_backward_links (Optional[bool]): Follow parent directory links
|
|
831
|
+
allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
|
|
832
|
+
crawl_entire_domain (Optional[bool]): Follow parent directory links
|
|
822
833
|
allow_external_links (Optional[bool]): Follow external domain links
|
|
823
834
|
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
824
835
|
scrape_options (Optional[ScrapeOptions]): Page scraping configuration
|
|
@@ -826,6 +837,8 @@ class FirecrawlApp:
|
|
|
826
837
|
deduplicate_similar_urls (Optional[bool]): Remove similar URLs
|
|
827
838
|
ignore_query_parameters (Optional[bool]): Ignore URL parameters
|
|
828
839
|
regex_on_full_url (Optional[bool]): Apply regex to full URLs
|
|
840
|
+
delay (Optional[int]): Delay in seconds between scrapes
|
|
841
|
+
max_concurrency (Optional[int]): Maximum number of concurrent scrapes
|
|
829
842
|
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
830
843
|
**kwargs: Additional parameters to pass to the API
|
|
831
844
|
|
|
@@ -855,7 +868,9 @@ class FirecrawlApp:
|
|
|
855
868
|
crawl_params['maxDiscoveryDepth'] = max_discovery_depth
|
|
856
869
|
if limit is not None:
|
|
857
870
|
crawl_params['limit'] = limit
|
|
858
|
-
if allow_backward_links is not None:
|
|
871
|
+
if crawl_entire_domain is not None:
|
|
872
|
+
crawl_params['crawlEntireDomain'] = crawl_entire_domain
|
|
873
|
+
elif allow_backward_links is not None:
|
|
859
874
|
crawl_params['allowBackwardLinks'] = allow_backward_links
|
|
860
875
|
if allow_external_links is not None:
|
|
861
876
|
crawl_params['allowExternalLinks'] = allow_external_links
|
|
@@ -873,7 +888,9 @@ class FirecrawlApp:
|
|
|
873
888
|
crawl_params['regexOnFullURL'] = regex_on_full_url
|
|
874
889
|
if delay is not None:
|
|
875
890
|
crawl_params['delay'] = delay
|
|
876
|
-
|
|
891
|
+
if max_concurrency is not None:
|
|
892
|
+
crawl_params['maxConcurrency'] = max_concurrency
|
|
893
|
+
|
|
877
894
|
# Add any additional kwargs
|
|
878
895
|
crawl_params.update(kwargs)
|
|
879
896
|
|
|
@@ -1042,6 +1059,7 @@ class FirecrawlApp:
|
|
|
1042
1059
|
max_discovery_depth: Optional[int] = None,
|
|
1043
1060
|
limit: Optional[int] = None,
|
|
1044
1061
|
allow_backward_links: Optional[bool] = None,
|
|
1062
|
+
crawl_entire_domain: Optional[bool] = None,
|
|
1045
1063
|
allow_external_links: Optional[bool] = None,
|
|
1046
1064
|
ignore_sitemap: Optional[bool] = None,
|
|
1047
1065
|
scrape_options: Optional[ScrapeOptions] = None,
|
|
@@ -1049,6 +1067,8 @@ class FirecrawlApp:
|
|
|
1049
1067
|
deduplicate_similar_urls: Optional[bool] = None,
|
|
1050
1068
|
ignore_query_parameters: Optional[bool] = None,
|
|
1051
1069
|
regex_on_full_url: Optional[bool] = None,
|
|
1070
|
+
delay: Optional[int] = None,
|
|
1071
|
+
max_concurrency: Optional[int] = None,
|
|
1052
1072
|
idempotency_key: Optional[str] = None,
|
|
1053
1073
|
**kwargs
|
|
1054
1074
|
) -> 'CrawlWatcher':
|
|
@@ -1062,7 +1082,8 @@ class FirecrawlApp:
|
|
|
1062
1082
|
max_depth (Optional[int]): Maximum crawl depth
|
|
1063
1083
|
max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
|
|
1064
1084
|
limit (Optional[int]): Maximum pages to crawl
|
|
1065
|
-
allow_backward_links (Optional[bool]): Follow parent directory links
|
|
1085
|
+
allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
|
|
1086
|
+
crawl_entire_domain (Optional[bool]): Follow parent directory links
|
|
1066
1087
|
allow_external_links (Optional[bool]): Follow external domain links
|
|
1067
1088
|
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
1068
1089
|
scrape_options (Optional[ScrapeOptions]): Page scraping configuration
|
|
@@ -1070,6 +1091,8 @@ class FirecrawlApp:
|
|
|
1070
1091
|
deduplicate_similar_urls (Optional[bool]): Remove similar URLs
|
|
1071
1092
|
ignore_query_parameters (Optional[bool]): Ignore URL parameters
|
|
1072
1093
|
regex_on_full_url (Optional[bool]): Apply regex to full URLs
|
|
1094
|
+
delay (Optional[int]): Delay in seconds between scrapes
|
|
1095
|
+
max_concurrency (Optional[int]): Maximum number of concurrent scrapes
|
|
1073
1096
|
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
1074
1097
|
**kwargs: Additional parameters to pass to the API
|
|
1075
1098
|
|
|
@@ -1094,6 +1117,8 @@ class FirecrawlApp:
|
|
|
1094
1117
|
deduplicate_similar_urls=deduplicate_similar_urls,
|
|
1095
1118
|
ignore_query_parameters=ignore_query_parameters,
|
|
1096
1119
|
regex_on_full_url=regex_on_full_url,
|
|
1120
|
+
delay=delay,
|
|
1121
|
+
max_concurrency=max_concurrency,
|
|
1097
1122
|
idempotency_key=idempotency_key,
|
|
1098
1123
|
**kwargs
|
|
1099
1124
|
)
|
|
@@ -1210,6 +1235,7 @@ class FirecrawlApp:
|
|
|
1210
1235
|
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
|
|
1211
1236
|
agent: Optional[AgentOptions] = None,
|
|
1212
1237
|
poll_interval: Optional[int] = 2,
|
|
1238
|
+
max_concurrency: Optional[int] = None,
|
|
1213
1239
|
idempotency_key: Optional[str] = None,
|
|
1214
1240
|
**kwargs
|
|
1215
1241
|
) -> BatchScrapeStatusResponse:
|
|
@@ -1235,6 +1261,7 @@ class FirecrawlApp:
|
|
|
1235
1261
|
json_options (Optional[JsonConfig]): JSON extraction config
|
|
1236
1262
|
actions (Optional[List[Union]]): Actions to perform
|
|
1237
1263
|
agent (Optional[AgentOptions]): Agent configuration
|
|
1264
|
+
max_concurrency (Optional[int]): Maximum number of concurrent scrapes
|
|
1238
1265
|
poll_interval (Optional[int]): Seconds between status checks (default: 2)
|
|
1239
1266
|
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
1240
1267
|
**kwargs: Additional parameters to pass to the API
|
|
@@ -1294,7 +1321,9 @@ class FirecrawlApp:
|
|
|
1294
1321
|
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
|
|
1295
1322
|
if agent is not None:
|
|
1296
1323
|
scrape_params['agent'] = agent.dict(exclude_none=True)
|
|
1297
|
-
|
|
1324
|
+
if max_concurrency is not None:
|
|
1325
|
+
scrape_params['maxConcurrency'] = max_concurrency
|
|
1326
|
+
|
|
1298
1327
|
# Add any additional kwargs
|
|
1299
1328
|
scrape_params.update(kwargs)
|
|
1300
1329
|
|
|
@@ -1343,6 +1372,7 @@ class FirecrawlApp:
|
|
|
1343
1372
|
json_options: Optional[JsonConfig] = None,
|
|
1344
1373
|
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
|
|
1345
1374
|
agent: Optional[AgentOptions] = None,
|
|
1375
|
+
max_concurrency: Optional[int] = None,
|
|
1346
1376
|
idempotency_key: Optional[str] = None,
|
|
1347
1377
|
**kwargs
|
|
1348
1378
|
) -> BatchScrapeResponse:
|
|
@@ -1368,6 +1398,7 @@ class FirecrawlApp:
|
|
|
1368
1398
|
json_options (Optional[JsonConfig]): JSON extraction config
|
|
1369
1399
|
actions (Optional[List[Union]]): Actions to perform
|
|
1370
1400
|
agent (Optional[AgentOptions]): Agent configuration
|
|
1401
|
+
max_concurrency (Optional[int]): Maximum number of concurrent scrapes
|
|
1371
1402
|
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
1372
1403
|
**kwargs: Additional parameters to pass to the API
|
|
1373
1404
|
|
|
@@ -1427,7 +1458,9 @@ class FirecrawlApp:
|
|
|
1427
1458
|
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
|
|
1428
1459
|
if agent is not None:
|
|
1429
1460
|
scrape_params['agent'] = agent.dict(exclude_none=True)
|
|
1430
|
-
|
|
1461
|
+
if max_concurrency is not None:
|
|
1462
|
+
scrape_params['maxConcurrency'] = max_concurrency
|
|
1463
|
+
|
|
1431
1464
|
# Add any additional kwargs
|
|
1432
1465
|
scrape_params.update(kwargs)
|
|
1433
1466
|
|
|
@@ -1475,6 +1508,7 @@ class FirecrawlApp:
|
|
|
1475
1508
|
json_options: Optional[JsonConfig] = None,
|
|
1476
1509
|
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
|
|
1477
1510
|
agent: Optional[AgentOptions] = None,
|
|
1511
|
+
max_concurrency: Optional[int] = None,
|
|
1478
1512
|
idempotency_key: Optional[str] = None,
|
|
1479
1513
|
**kwargs
|
|
1480
1514
|
) -> 'CrawlWatcher':
|
|
@@ -1500,6 +1534,7 @@ class FirecrawlApp:
|
|
|
1500
1534
|
json_options (Optional[JsonConfig]): JSON extraction config
|
|
1501
1535
|
actions (Optional[List[Union]]): Actions to perform
|
|
1502
1536
|
agent (Optional[AgentOptions]): Agent configuration
|
|
1537
|
+
max_concurrency (Optional[int]): Maximum number of concurrent scrapes
|
|
1503
1538
|
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
1504
1539
|
**kwargs: Additional parameters to pass to the API
|
|
1505
1540
|
|
|
@@ -1555,7 +1590,9 @@ class FirecrawlApp:
|
|
|
1555
1590
|
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
|
|
1556
1591
|
if agent is not None:
|
|
1557
1592
|
scrape_params['agent'] = agent.dict(exclude_none=True)
|
|
1558
|
-
|
|
1593
|
+
if max_concurrency is not None:
|
|
1594
|
+
scrape_params['maxConcurrency'] = max_concurrency
|
|
1595
|
+
|
|
1559
1596
|
# Add any additional kwargs
|
|
1560
1597
|
scrape_params.update(kwargs)
|
|
1561
1598
|
|
|
@@ -2784,7 +2821,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
2784
2821
|
* limit - Maximum pages to crawl
|
|
2785
2822
|
|
|
2786
2823
|
Link Following:
|
|
2787
|
-
* allowBackwardLinks - Follow parent directory links
|
|
2824
|
+
* allowBackwardLinks - DEPRECATED: Use crawlEntireDomain instead
|
|
2825
|
+
* crawlEntireDomain - Follow parent directory links
|
|
2788
2826
|
* allowExternalLinks - Follow external domain links
|
|
2789
2827
|
* ignoreSitemap - Skip sitemap.xml processing
|
|
2790
2828
|
|
|
@@ -3263,6 +3301,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3263
3301
|
max_discovery_depth: Optional[int] = None,
|
|
3264
3302
|
limit: Optional[int] = None,
|
|
3265
3303
|
allow_backward_links: Optional[bool] = None,
|
|
3304
|
+
crawl_entire_domain: Optional[bool] = None,
|
|
3266
3305
|
allow_external_links: Optional[bool] = None,
|
|
3267
3306
|
ignore_sitemap: Optional[bool] = None,
|
|
3268
3307
|
scrape_options: Optional[ScrapeOptions] = None,
|
|
@@ -3285,7 +3324,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3285
3324
|
max_depth (Optional[int]): Maximum crawl depth
|
|
3286
3325
|
max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
|
|
3287
3326
|
limit (Optional[int]): Maximum pages to crawl
|
|
3288
|
-
allow_backward_links (Optional[bool]): Follow parent directory links
|
|
3327
|
+
allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
|
|
3328
|
+
crawl_entire_domain (Optional[bool]): Follow parent directory links
|
|
3289
3329
|
allow_external_links (Optional[bool]): Follow external domain links
|
|
3290
3330
|
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
3291
3331
|
scrape_options (Optional[ScrapeOptions]): Page scraping configuration
|
|
@@ -3323,7 +3363,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3323
3363
|
crawl_params['maxDiscoveryDepth'] = max_discovery_depth
|
|
3324
3364
|
if limit is not None:
|
|
3325
3365
|
crawl_params['limit'] = limit
|
|
3326
|
-
if allow_backward_links is not None:
|
|
3366
|
+
if crawl_entire_domain is not None:
|
|
3367
|
+
crawl_params['crawlEntireDomain'] = crawl_entire_domain
|
|
3368
|
+
elif allow_backward_links is not None:
|
|
3327
3369
|
crawl_params['allowBackwardLinks'] = allow_backward_links
|
|
3328
3370
|
if allow_external_links is not None:
|
|
3329
3371
|
crawl_params['allowExternalLinks'] = allow_external_links
|
|
@@ -3375,6 +3417,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3375
3417
|
max_discovery_depth: Optional[int] = None,
|
|
3376
3418
|
limit: Optional[int] = None,
|
|
3377
3419
|
allow_backward_links: Optional[bool] = None,
|
|
3420
|
+
crawl_entire_domain: Optional[bool] = None,
|
|
3378
3421
|
allow_external_links: Optional[bool] = None,
|
|
3379
3422
|
ignore_sitemap: Optional[bool] = None,
|
|
3380
3423
|
scrape_options: Optional[ScrapeOptions] = None,
|
|
@@ -3397,7 +3440,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3397
3440
|
max_depth (Optional[int]): Maximum crawl depth
|
|
3398
3441
|
max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
|
|
3399
3442
|
limit (Optional[int]): Maximum pages to crawl
|
|
3400
|
-
allow_backward_links (Optional[bool]): Follow parent directory links
|
|
3443
|
+
allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
|
|
3444
|
+
crawl_entire_domain (Optional[bool]): Follow parent directory links
|
|
3401
3445
|
allow_external_links (Optional[bool]): Follow external domain links
|
|
3402
3446
|
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
3403
3447
|
scrape_options (Optional[ScrapeOptions]): Page scraping configuration
|
|
@@ -3431,7 +3475,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3431
3475
|
crawl_params['maxDiscoveryDepth'] = max_discovery_depth
|
|
3432
3476
|
if limit is not None:
|
|
3433
3477
|
crawl_params['limit'] = limit
|
|
3434
|
-
if allow_backward_links is not None:
|
|
3478
|
+
if crawl_entire_domain is not None:
|
|
3479
|
+
crawl_params['crawlEntireDomain'] = crawl_entire_domain
|
|
3480
|
+
elif allow_backward_links is not None:
|
|
3435
3481
|
crawl_params['allowBackwardLinks'] = allow_backward_links
|
|
3436
3482
|
if allow_external_links is not None:
|
|
3437
3483
|
crawl_params['allowExternalLinks'] = allow_external_links
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
build/lib/firecrawl/__init__.py,sha256=nWGTmoKRj6qHs3mzjKE4d3giXsTeeXIO-Ujw0S0oy7k,2612
|
|
2
|
+
build/lib/firecrawl/firecrawl.py,sha256=ICCfDvhpsV3OT5kwwuiS2_6tiq9kmCca4Elum7mKhxg,193573
|
|
3
|
+
build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
build/lib/firecrawl/__tests__/e2e_withAuth/test.py,sha256=-Fq2vPcMo0iQi4dwsUkkCd931ybDaTxMBnZbRfGdDcA,7931
|
|
5
|
+
build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=DcCw-cohtnL-t9XPekUtRoQrgg3UCWu8Ikqudf9ory8,19880
|
|
7
|
+
build/lib/tests/test_change_tracking.py,sha256=_IJ5ShLcoj2fHDBaw-nE4I4lHdmDB617ocK_XMHhXps,4177
|
|
8
|
+
firecrawl/__init__.py,sha256=nWGTmoKRj6qHs3mzjKE4d3giXsTeeXIO-Ujw0S0oy7k,2612
|
|
9
|
+
firecrawl/firecrawl.py,sha256=ICCfDvhpsV3OT5kwwuiS2_6tiq9kmCca4Elum7mKhxg,193573
|
|
10
|
+
firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
|
+
firecrawl/__tests__/e2e_withAuth/test.py,sha256=-Fq2vPcMo0iQi4dwsUkkCd931ybDaTxMBnZbRfGdDcA,7931
|
|
12
|
+
firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
13
|
+
firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=DcCw-cohtnL-t9XPekUtRoQrgg3UCWu8Ikqudf9ory8,19880
|
|
14
|
+
tests/test_change_tracking.py,sha256=_IJ5ShLcoj2fHDBaw-nE4I4lHdmDB617ocK_XMHhXps,4177
|
|
15
|
+
firecrawl_py-2.9.0.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
|
|
16
|
+
firecrawl_py-2.9.0.dist-info/METADATA,sha256=naTyW9fM6Da64cF2NkI9i8U0EpAvyW4ZbfvZv-QsZUs,7168
|
|
17
|
+
firecrawl_py-2.9.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
|
|
18
|
+
firecrawl_py-2.9.0.dist-info/top_level.txt,sha256=ytN_R30g2U2qZYFyIm710Z8QeK9FO1Uwa-WPGHXyqjE,27
|
|
19
|
+
firecrawl_py-2.9.0.dist-info/RECORD,,
|