firecrawl 2.7.0__py3-none-any.whl → 2.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of firecrawl might be problematic. Click here for more details.
- firecrawl/__init__.py +1 -1
- firecrawl/firecrawl.py +85 -26
- {firecrawl-2.7.0.dist-info → firecrawl-2.9.0.dist-info}/METADATA +1 -1
- {firecrawl-2.7.0.dist-info → firecrawl-2.9.0.dist-info}/RECORD +7 -7
- {firecrawl-2.7.0.dist-info → firecrawl-2.9.0.dist-info}/LICENSE +0 -0
- {firecrawl-2.7.0.dist-info → firecrawl-2.9.0.dist-info}/WHEEL +0 -0
- {firecrawl-2.7.0.dist-info → firecrawl-2.9.0.dist-info}/top_level.txt +0 -0
firecrawl/__init__.py
CHANGED
|
@@ -13,7 +13,7 @@ import os
|
|
|
13
13
|
|
|
14
14
|
from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
|
|
15
15
|
|
|
16
|
-
__version__ = "2.7.0"
|
|
16
|
+
__version__ = "2.9.0"
|
|
17
17
|
|
|
18
18
|
# Define the logger for the Firecrawl project
|
|
19
19
|
logger: logging.Logger = logging.getLogger("firecrawl")
|
firecrawl/firecrawl.py
CHANGED
|
@@ -140,6 +140,7 @@ class ChangeTrackingOptions(pydantic.BaseModel):
|
|
|
140
140
|
modes: Optional[List[Literal["git-diff", "json"]]] = None
|
|
141
141
|
schema: Optional[Any] = None
|
|
142
142
|
prompt: Optional[str] = None
|
|
143
|
+
tag: Optional[str] = None
|
|
143
144
|
|
|
144
145
|
class ScrapeOptions(pydantic.BaseModel):
|
|
145
146
|
"""Parameters for scraping operations."""
|
|
@@ -155,8 +156,10 @@ class ScrapeOptions(pydantic.BaseModel):
|
|
|
155
156
|
skipTlsVerification: Optional[bool] = None
|
|
156
157
|
removeBase64Images: Optional[bool] = None
|
|
157
158
|
blockAds: Optional[bool] = None
|
|
158
|
-
proxy: Optional[Literal["basic", "stealth"]] = None
|
|
159
|
+
proxy: Optional[Literal["basic", "stealth", "auto"]] = None
|
|
159
160
|
changeTrackingOptions: Optional[ChangeTrackingOptions] = None
|
|
161
|
+
maxAge: Optional[int] = None
|
|
162
|
+
storeInCache: Optional[bool] = None
|
|
160
163
|
|
|
161
164
|
class WaitAction(pydantic.BaseModel):
|
|
162
165
|
"""Wait action to perform during scraping."""
|
|
@@ -260,6 +263,7 @@ class CrawlParams(pydantic.BaseModel):
|
|
|
260
263
|
ignoreQueryParameters: Optional[bool] = None
|
|
261
264
|
regexOnFullURL: Optional[bool] = None
|
|
262
265
|
delay: Optional[int] = None # Delay in seconds between scrapes
|
|
266
|
+
maxConcurrency: Optional[int] = None
|
|
263
267
|
|
|
264
268
|
class CrawlResponse(pydantic.BaseModel):
|
|
265
269
|
"""Response from crawling operations."""
|
|
@@ -292,6 +296,7 @@ class MapParams(pydantic.BaseModel):
|
|
|
292
296
|
sitemapOnly: Optional[bool] = None
|
|
293
297
|
limit: Optional[int] = None
|
|
294
298
|
timeout: Optional[int] = None
|
|
299
|
+
useIndex: Optional[bool] = None
|
|
295
300
|
|
|
296
301
|
class MapResponse(pydantic.BaseModel):
|
|
297
302
|
"""Response from mapping operations."""
|
|
@@ -459,11 +464,13 @@ class FirecrawlApp:
|
|
|
459
464
|
skip_tls_verification: Optional[bool] = None,
|
|
460
465
|
remove_base64_images: Optional[bool] = None,
|
|
461
466
|
block_ads: Optional[bool] = None,
|
|
462
|
-
proxy: Optional[Literal["basic", "stealth"]] = None,
|
|
467
|
+
proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
|
|
463
468
|
extract: Optional[JsonConfig] = None,
|
|
464
469
|
json_options: Optional[JsonConfig] = None,
|
|
465
470
|
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
|
|
466
471
|
change_tracking_options: Optional[ChangeTrackingOptions] = None,
|
|
472
|
+
max_age: Optional[int] = None,
|
|
473
|
+
store_in_cache: Optional[bool] = None,
|
|
467
474
|
**kwargs) -> ScrapeResponse[Any]:
|
|
468
475
|
"""
|
|
469
476
|
Scrape and extract content from a URL.
|
|
@@ -481,7 +488,7 @@ class FirecrawlApp:
|
|
|
481
488
|
skip_tls_verification (Optional[bool]): Skip TLS verification
|
|
482
489
|
remove_base64_images (Optional[bool]): Remove base64 images
|
|
483
490
|
block_ads (Optional[bool]): Block ads
|
|
484
|
-
proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
|
|
491
|
+
proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth)
|
|
485
492
|
extract (Optional[JsonConfig]): Content extraction settings
|
|
486
493
|
json_options (Optional[JsonConfig]): JSON extraction settings
|
|
487
494
|
actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
|
|
@@ -545,6 +552,10 @@ class FirecrawlApp:
|
|
|
545
552
|
scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
|
|
546
553
|
if change_tracking_options:
|
|
547
554
|
scrape_params['changeTrackingOptions'] = change_tracking_options if isinstance(change_tracking_options, dict) else change_tracking_options.dict(exclude_none=True)
|
|
555
|
+
if max_age is not None:
|
|
556
|
+
scrape_params['maxAge'] = max_age
|
|
557
|
+
if store_in_cache is not None:
|
|
558
|
+
scrape_params['storeInCache'] = store_in_cache
|
|
548
559
|
|
|
549
560
|
scrape_params.update(kwargs)
|
|
550
561
|
|
|
@@ -676,6 +687,7 @@ class FirecrawlApp:
|
|
|
676
687
|
max_discovery_depth: Optional[int] = None,
|
|
677
688
|
limit: Optional[int] = None,
|
|
678
689
|
allow_backward_links: Optional[bool] = None,
|
|
690
|
+
crawl_entire_domain: Optional[bool] = None,
|
|
679
691
|
allow_external_links: Optional[bool] = None,
|
|
680
692
|
ignore_sitemap: Optional[bool] = None,
|
|
681
693
|
scrape_options: Optional[ScrapeOptions] = None,
|
|
@@ -684,6 +696,7 @@ class FirecrawlApp:
|
|
|
684
696
|
ignore_query_parameters: Optional[bool] = None,
|
|
685
697
|
regex_on_full_url: Optional[bool] = None,
|
|
686
698
|
delay: Optional[int] = None,
|
|
699
|
+
max_concurrency: Optional[int] = None,
|
|
687
700
|
poll_interval: Optional[int] = 2,
|
|
688
701
|
idempotency_key: Optional[str] = None,
|
|
689
702
|
**kwargs
|
|
@@ -698,7 +711,8 @@ class FirecrawlApp:
|
|
|
698
711
|
max_depth (Optional[int]): Maximum crawl depth
|
|
699
712
|
max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
|
|
700
713
|
limit (Optional[int]): Maximum pages to crawl
|
|
701
|
-
allow_backward_links (Optional[bool]): Follow parent directory links
|
|
714
|
+
allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
|
|
715
|
+
crawl_entire_domain (Optional[bool]): Follow parent directory links
|
|
702
716
|
allow_external_links (Optional[bool]): Follow external domain links
|
|
703
717
|
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
704
718
|
scrape_options (Optional[ScrapeOptions]): Page scraping configuration
|
|
@@ -707,6 +721,7 @@ class FirecrawlApp:
|
|
|
707
721
|
ignore_query_parameters (Optional[bool]): Ignore URL parameters
|
|
708
722
|
regex_on_full_url (Optional[bool]): Apply regex to full URLs
|
|
709
723
|
delay (Optional[int]): Delay in seconds between scrapes
|
|
724
|
+
max_concurrency (Optional[int]): Maximum number of concurrent scrapes
|
|
710
725
|
poll_interval (Optional[int]): Seconds between status checks (default: 2)
|
|
711
726
|
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
712
727
|
**kwargs: Additional parameters to pass to the API
|
|
@@ -736,7 +751,9 @@ class FirecrawlApp:
|
|
|
736
751
|
crawl_params['maxDiscoveryDepth'] = max_discovery_depth
|
|
737
752
|
if limit is not None:
|
|
738
753
|
crawl_params['limit'] = limit
|
|
739
|
-
if allow_backward_links is not None:
|
|
754
|
+
if crawl_entire_domain is not None:
|
|
755
|
+
crawl_params['crawlEntireDomain'] = crawl_entire_domain
|
|
756
|
+
elif allow_backward_links is not None:
|
|
740
757
|
crawl_params['allowBackwardLinks'] = allow_backward_links
|
|
741
758
|
if allow_external_links is not None:
|
|
742
759
|
crawl_params['allowExternalLinks'] = allow_external_links
|
|
@@ -754,7 +771,9 @@ class FirecrawlApp:
|
|
|
754
771
|
crawl_params['regexOnFullURL'] = regex_on_full_url
|
|
755
772
|
if delay is not None:
|
|
756
773
|
crawl_params['delay'] = delay
|
|
757
|
-
|
|
774
|
+
if max_concurrency is not None:
|
|
775
|
+
crawl_params['maxConcurrency'] = max_concurrency
|
|
776
|
+
|
|
758
777
|
# Add any additional kwargs
|
|
759
778
|
crawl_params.update(kwargs)
|
|
760
779
|
|
|
@@ -787,6 +806,7 @@ class FirecrawlApp:
|
|
|
787
806
|
max_discovery_depth: Optional[int] = None,
|
|
788
807
|
limit: Optional[int] = None,
|
|
789
808
|
allow_backward_links: Optional[bool] = None,
|
|
809
|
+
crawl_entire_domain: Optional[bool] = None,
|
|
790
810
|
allow_external_links: Optional[bool] = None,
|
|
791
811
|
ignore_sitemap: Optional[bool] = None,
|
|
792
812
|
scrape_options: Optional[ScrapeOptions] = None,
|
|
@@ -808,7 +828,8 @@ class FirecrawlApp:
|
|
|
808
828
|
max_depth (Optional[int]): Maximum crawl depth
|
|
809
829
|
max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
|
|
810
830
|
limit (Optional[int]): Maximum pages to crawl
|
|
811
|
-
allow_backward_links (Optional[bool]): Follow parent directory links
|
|
831
|
+
allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
|
|
832
|
+
crawl_entire_domain (Optional[bool]): Follow parent directory links
|
|
812
833
|
allow_external_links (Optional[bool]): Follow external domain links
|
|
813
834
|
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
814
835
|
scrape_options (Optional[ScrapeOptions]): Page scraping configuration
|
|
@@ -816,6 +837,8 @@ class FirecrawlApp:
|
|
|
816
837
|
deduplicate_similar_urls (Optional[bool]): Remove similar URLs
|
|
817
838
|
ignore_query_parameters (Optional[bool]): Ignore URL parameters
|
|
818
839
|
regex_on_full_url (Optional[bool]): Apply regex to full URLs
|
|
840
|
+
delay (Optional[int]): Delay in seconds between scrapes
|
|
841
|
+
max_concurrency (Optional[int]): Maximum number of concurrent scrapes
|
|
819
842
|
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
820
843
|
**kwargs: Additional parameters to pass to the API
|
|
821
844
|
|
|
@@ -845,7 +868,9 @@ class FirecrawlApp:
|
|
|
845
868
|
crawl_params['maxDiscoveryDepth'] = max_discovery_depth
|
|
846
869
|
if limit is not None:
|
|
847
870
|
crawl_params['limit'] = limit
|
|
848
|
-
if allow_backward_links is not None:
|
|
871
|
+
if crawl_entire_domain is not None:
|
|
872
|
+
crawl_params['crawlEntireDomain'] = crawl_entire_domain
|
|
873
|
+
elif allow_backward_links is not None:
|
|
849
874
|
crawl_params['allowBackwardLinks'] = allow_backward_links
|
|
850
875
|
if allow_external_links is not None:
|
|
851
876
|
crawl_params['allowExternalLinks'] = allow_external_links
|
|
@@ -863,7 +888,9 @@ class FirecrawlApp:
|
|
|
863
888
|
crawl_params['regexOnFullURL'] = regex_on_full_url
|
|
864
889
|
if delay is not None:
|
|
865
890
|
crawl_params['delay'] = delay
|
|
866
|
-
|
|
891
|
+
if max_concurrency is not None:
|
|
892
|
+
crawl_params['maxConcurrency'] = max_concurrency
|
|
893
|
+
|
|
867
894
|
# Add any additional kwargs
|
|
868
895
|
crawl_params.update(kwargs)
|
|
869
896
|
|
|
@@ -1032,6 +1059,7 @@ class FirecrawlApp:
|
|
|
1032
1059
|
max_discovery_depth: Optional[int] = None,
|
|
1033
1060
|
limit: Optional[int] = None,
|
|
1034
1061
|
allow_backward_links: Optional[bool] = None,
|
|
1062
|
+
crawl_entire_domain: Optional[bool] = None,
|
|
1035
1063
|
allow_external_links: Optional[bool] = None,
|
|
1036
1064
|
ignore_sitemap: Optional[bool] = None,
|
|
1037
1065
|
scrape_options: Optional[ScrapeOptions] = None,
|
|
@@ -1039,6 +1067,8 @@ class FirecrawlApp:
|
|
|
1039
1067
|
deduplicate_similar_urls: Optional[bool] = None,
|
|
1040
1068
|
ignore_query_parameters: Optional[bool] = None,
|
|
1041
1069
|
regex_on_full_url: Optional[bool] = None,
|
|
1070
|
+
delay: Optional[int] = None,
|
|
1071
|
+
max_concurrency: Optional[int] = None,
|
|
1042
1072
|
idempotency_key: Optional[str] = None,
|
|
1043
1073
|
**kwargs
|
|
1044
1074
|
) -> 'CrawlWatcher':
|
|
@@ -1052,7 +1082,8 @@ class FirecrawlApp:
|
|
|
1052
1082
|
max_depth (Optional[int]): Maximum crawl depth
|
|
1053
1083
|
max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
|
|
1054
1084
|
limit (Optional[int]): Maximum pages to crawl
|
|
1055
|
-
allow_backward_links (Optional[bool]): Follow parent directory links
|
|
1085
|
+
allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
|
|
1086
|
+
crawl_entire_domain (Optional[bool]): Follow parent directory links
|
|
1056
1087
|
allow_external_links (Optional[bool]): Follow external domain links
|
|
1057
1088
|
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
1058
1089
|
scrape_options (Optional[ScrapeOptions]): Page scraping configuration
|
|
@@ -1060,6 +1091,8 @@ class FirecrawlApp:
|
|
|
1060
1091
|
deduplicate_similar_urls (Optional[bool]): Remove similar URLs
|
|
1061
1092
|
ignore_query_parameters (Optional[bool]): Ignore URL parameters
|
|
1062
1093
|
regex_on_full_url (Optional[bool]): Apply regex to full URLs
|
|
1094
|
+
delay (Optional[int]): Delay in seconds between scrapes
|
|
1095
|
+
max_concurrency (Optional[int]): Maximum number of concurrent scrapes
|
|
1063
1096
|
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
1064
1097
|
**kwargs: Additional parameters to pass to the API
|
|
1065
1098
|
|
|
@@ -1084,6 +1117,8 @@ class FirecrawlApp:
|
|
|
1084
1117
|
deduplicate_similar_urls=deduplicate_similar_urls,
|
|
1085
1118
|
ignore_query_parameters=ignore_query_parameters,
|
|
1086
1119
|
regex_on_full_url=regex_on_full_url,
|
|
1120
|
+
delay=delay,
|
|
1121
|
+
max_concurrency=max_concurrency,
|
|
1087
1122
|
idempotency_key=idempotency_key,
|
|
1088
1123
|
**kwargs
|
|
1089
1124
|
)
|
|
@@ -1102,6 +1137,7 @@ class FirecrawlApp:
|
|
|
1102
1137
|
sitemap_only: Optional[bool] = None,
|
|
1103
1138
|
limit: Optional[int] = None,
|
|
1104
1139
|
timeout: Optional[int] = None,
|
|
1140
|
+
use_index: Optional[bool] = None,
|
|
1105
1141
|
**kwargs) -> MapResponse:
|
|
1106
1142
|
"""
|
|
1107
1143
|
Map and discover links from a URL.
|
|
@@ -1144,7 +1180,9 @@ class FirecrawlApp:
|
|
|
1144
1180
|
map_params['limit'] = limit
|
|
1145
1181
|
if timeout is not None:
|
|
1146
1182
|
map_params['timeout'] = timeout
|
|
1147
|
-
|
|
1183
|
+
if use_index is not None:
|
|
1184
|
+
map_params['useIndex'] = use_index
|
|
1185
|
+
|
|
1148
1186
|
# Add any additional kwargs
|
|
1149
1187
|
map_params.update(kwargs)
|
|
1150
1188
|
|
|
@@ -1191,12 +1229,13 @@ class FirecrawlApp:
|
|
|
1191
1229
|
skip_tls_verification: Optional[bool] = None,
|
|
1192
1230
|
remove_base64_images: Optional[bool] = None,
|
|
1193
1231
|
block_ads: Optional[bool] = None,
|
|
1194
|
-
proxy: Optional[Literal["basic", "stealth"]] = None,
|
|
1232
|
+
proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
|
|
1195
1233
|
extract: Optional[JsonConfig] = None,
|
|
1196
1234
|
json_options: Optional[JsonConfig] = None,
|
|
1197
1235
|
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
|
|
1198
1236
|
agent: Optional[AgentOptions] = None,
|
|
1199
1237
|
poll_interval: Optional[int] = 2,
|
|
1238
|
+
max_concurrency: Optional[int] = None,
|
|
1200
1239
|
idempotency_key: Optional[str] = None,
|
|
1201
1240
|
**kwargs
|
|
1202
1241
|
) -> BatchScrapeStatusResponse:
|
|
@@ -1222,6 +1261,7 @@ class FirecrawlApp:
|
|
|
1222
1261
|
json_options (Optional[JsonConfig]): JSON extraction config
|
|
1223
1262
|
actions (Optional[List[Union]]): Actions to perform
|
|
1224
1263
|
agent (Optional[AgentOptions]): Agent configuration
|
|
1264
|
+
max_concurrency (Optional[int]): Maximum number of concurrent scrapes
|
|
1225
1265
|
poll_interval (Optional[int]): Seconds between status checks (default: 2)
|
|
1226
1266
|
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
1227
1267
|
**kwargs: Additional parameters to pass to the API
|
|
@@ -1281,7 +1321,9 @@ class FirecrawlApp:
|
|
|
1281
1321
|
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
|
|
1282
1322
|
if agent is not None:
|
|
1283
1323
|
scrape_params['agent'] = agent.dict(exclude_none=True)
|
|
1284
|
-
|
|
1324
|
+
if max_concurrency is not None:
|
|
1325
|
+
scrape_params['maxConcurrency'] = max_concurrency
|
|
1326
|
+
|
|
1285
1327
|
# Add any additional kwargs
|
|
1286
1328
|
scrape_params.update(kwargs)
|
|
1287
1329
|
|
|
@@ -1325,11 +1367,12 @@ class FirecrawlApp:
|
|
|
1325
1367
|
skip_tls_verification: Optional[bool] = None,
|
|
1326
1368
|
remove_base64_images: Optional[bool] = None,
|
|
1327
1369
|
block_ads: Optional[bool] = None,
|
|
1328
|
-
proxy: Optional[Literal["basic", "stealth"]] = None,
|
|
1370
|
+
proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
|
|
1329
1371
|
extract: Optional[JsonConfig] = None,
|
|
1330
1372
|
json_options: Optional[JsonConfig] = None,
|
|
1331
1373
|
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
|
|
1332
1374
|
agent: Optional[AgentOptions] = None,
|
|
1375
|
+
max_concurrency: Optional[int] = None,
|
|
1333
1376
|
idempotency_key: Optional[str] = None,
|
|
1334
1377
|
**kwargs
|
|
1335
1378
|
) -> BatchScrapeResponse:
|
|
@@ -1355,6 +1398,7 @@ class FirecrawlApp:
|
|
|
1355
1398
|
json_options (Optional[JsonConfig]): JSON extraction config
|
|
1356
1399
|
actions (Optional[List[Union]]): Actions to perform
|
|
1357
1400
|
agent (Optional[AgentOptions]): Agent configuration
|
|
1401
|
+
max_concurrency (Optional[int]): Maximum number of concurrent scrapes
|
|
1358
1402
|
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
1359
1403
|
**kwargs: Additional parameters to pass to the API
|
|
1360
1404
|
|
|
@@ -1414,7 +1458,9 @@ class FirecrawlApp:
|
|
|
1414
1458
|
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
|
|
1415
1459
|
if agent is not None:
|
|
1416
1460
|
scrape_params['agent'] = agent.dict(exclude_none=True)
|
|
1417
|
-
|
|
1461
|
+
if max_concurrency is not None:
|
|
1462
|
+
scrape_params['maxConcurrency'] = max_concurrency
|
|
1463
|
+
|
|
1418
1464
|
# Add any additional kwargs
|
|
1419
1465
|
scrape_params.update(kwargs)
|
|
1420
1466
|
|
|
@@ -1457,11 +1503,12 @@ class FirecrawlApp:
|
|
|
1457
1503
|
skip_tls_verification: Optional[bool] = None,
|
|
1458
1504
|
remove_base64_images: Optional[bool] = None,
|
|
1459
1505
|
block_ads: Optional[bool] = None,
|
|
1460
|
-
proxy: Optional[Literal["basic", "stealth"]] = None,
|
|
1506
|
+
proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
|
|
1461
1507
|
extract: Optional[JsonConfig] = None,
|
|
1462
1508
|
json_options: Optional[JsonConfig] = None,
|
|
1463
1509
|
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
|
|
1464
1510
|
agent: Optional[AgentOptions] = None,
|
|
1511
|
+
max_concurrency: Optional[int] = None,
|
|
1465
1512
|
idempotency_key: Optional[str] = None,
|
|
1466
1513
|
**kwargs
|
|
1467
1514
|
) -> 'CrawlWatcher':
|
|
@@ -1487,6 +1534,7 @@ class FirecrawlApp:
|
|
|
1487
1534
|
json_options (Optional[JsonConfig]): JSON extraction config
|
|
1488
1535
|
actions (Optional[List[Union]]): Actions to perform
|
|
1489
1536
|
agent (Optional[AgentOptions]): Agent configuration
|
|
1537
|
+
max_concurrency (Optional[int]): Maximum number of concurrent scrapes
|
|
1490
1538
|
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
1491
1539
|
**kwargs: Additional parameters to pass to the API
|
|
1492
1540
|
|
|
@@ -1542,7 +1590,9 @@ class FirecrawlApp:
|
|
|
1542
1590
|
scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
|
|
1543
1591
|
if agent is not None:
|
|
1544
1592
|
scrape_params['agent'] = agent.dict(exclude_none=True)
|
|
1545
|
-
|
|
1593
|
+
if max_concurrency is not None:
|
|
1594
|
+
scrape_params['maxConcurrency'] = max_concurrency
|
|
1595
|
+
|
|
1546
1596
|
# Add any additional kwargs
|
|
1547
1597
|
scrape_params.update(kwargs)
|
|
1548
1598
|
|
|
@@ -2771,7 +2821,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
2771
2821
|
* limit - Maximum pages to crawl
|
|
2772
2822
|
|
|
2773
2823
|
Link Following:
|
|
2774
|
-
* allowBackwardLinks - Follow parent directory links
|
|
2824
|
+
* allowBackwardLinks - DEPRECATED: Use crawlEntireDomain instead
|
|
2825
|
+
* crawlEntireDomain - Follow parent directory links
|
|
2775
2826
|
* allowExternalLinks - Follow external domain links
|
|
2776
2827
|
* ignoreSitemap - Skip sitemap.xml processing
|
|
2777
2828
|
|
|
@@ -2852,7 +2903,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
2852
2903
|
skip_tls_verification: Optional[bool] = None,
|
|
2853
2904
|
remove_base64_images: Optional[bool] = None,
|
|
2854
2905
|
block_ads: Optional[bool] = None,
|
|
2855
|
-
proxy: Optional[Literal["basic", "stealth"]] = None,
|
|
2906
|
+
proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
|
|
2856
2907
|
extract: Optional[JsonConfig] = None,
|
|
2857
2908
|
json_options: Optional[JsonConfig] = None,
|
|
2858
2909
|
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
|
|
@@ -2873,7 +2924,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
2873
2924
|
skip_tls_verification (Optional[bool]): Skip TLS verification
|
|
2874
2925
|
remove_base64_images (Optional[bool]): Remove base64 images
|
|
2875
2926
|
block_ads (Optional[bool]): Block ads
|
|
2876
|
-
proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
|
|
2927
|
+
proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth)
|
|
2877
2928
|
extract (Optional[JsonConfig]): Content extraction settings
|
|
2878
2929
|
json_options (Optional[JsonConfig]): JSON extraction settings
|
|
2879
2930
|
actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
|
|
@@ -2981,7 +3032,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
2981
3032
|
skip_tls_verification: Optional[bool] = None,
|
|
2982
3033
|
remove_base64_images: Optional[bool] = None,
|
|
2983
3034
|
block_ads: Optional[bool] = None,
|
|
2984
|
-
proxy: Optional[Literal["basic", "stealth"]] = None,
|
|
3035
|
+
proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
|
|
2985
3036
|
extract: Optional[JsonConfig] = None,
|
|
2986
3037
|
json_options: Optional[JsonConfig] = None,
|
|
2987
3038
|
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
|
|
@@ -3120,7 +3171,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3120
3171
|
skip_tls_verification: Optional[bool] = None,
|
|
3121
3172
|
remove_base64_images: Optional[bool] = None,
|
|
3122
3173
|
block_ads: Optional[bool] = None,
|
|
3123
|
-
proxy: Optional[Literal["basic", "stealth"]] = None,
|
|
3174
|
+
proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
|
|
3124
3175
|
extract: Optional[JsonConfig] = None,
|
|
3125
3176
|
json_options: Optional[JsonConfig] = None,
|
|
3126
3177
|
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
|
|
@@ -3250,6 +3301,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3250
3301
|
max_discovery_depth: Optional[int] = None,
|
|
3251
3302
|
limit: Optional[int] = None,
|
|
3252
3303
|
allow_backward_links: Optional[bool] = None,
|
|
3304
|
+
crawl_entire_domain: Optional[bool] = None,
|
|
3253
3305
|
allow_external_links: Optional[bool] = None,
|
|
3254
3306
|
ignore_sitemap: Optional[bool] = None,
|
|
3255
3307
|
scrape_options: Optional[ScrapeOptions] = None,
|
|
@@ -3272,7 +3324,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3272
3324
|
max_depth (Optional[int]): Maximum crawl depth
|
|
3273
3325
|
max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
|
|
3274
3326
|
limit (Optional[int]): Maximum pages to crawl
|
|
3275
|
-
allow_backward_links (Optional[bool]): Follow parent directory links
|
|
3327
|
+
allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
|
|
3328
|
+
crawl_entire_domain (Optional[bool]): Follow parent directory links
|
|
3276
3329
|
allow_external_links (Optional[bool]): Follow external domain links
|
|
3277
3330
|
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
3278
3331
|
scrape_options (Optional[ScrapeOptions]): Page scraping configuration
|
|
@@ -3310,7 +3363,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3310
3363
|
crawl_params['maxDiscoveryDepth'] = max_discovery_depth
|
|
3311
3364
|
if limit is not None:
|
|
3312
3365
|
crawl_params['limit'] = limit
|
|
3313
|
-
if allow_backward_links is not None:
|
|
3366
|
+
if crawl_entire_domain is not None:
|
|
3367
|
+
crawl_params['crawlEntireDomain'] = crawl_entire_domain
|
|
3368
|
+
elif allow_backward_links is not None:
|
|
3314
3369
|
crawl_params['allowBackwardLinks'] = allow_backward_links
|
|
3315
3370
|
if allow_external_links is not None:
|
|
3316
3371
|
crawl_params['allowExternalLinks'] = allow_external_links
|
|
@@ -3362,6 +3417,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3362
3417
|
max_discovery_depth: Optional[int] = None,
|
|
3363
3418
|
limit: Optional[int] = None,
|
|
3364
3419
|
allow_backward_links: Optional[bool] = None,
|
|
3420
|
+
crawl_entire_domain: Optional[bool] = None,
|
|
3365
3421
|
allow_external_links: Optional[bool] = None,
|
|
3366
3422
|
ignore_sitemap: Optional[bool] = None,
|
|
3367
3423
|
scrape_options: Optional[ScrapeOptions] = None,
|
|
@@ -3384,7 +3440,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3384
3440
|
max_depth (Optional[int]): Maximum crawl depth
|
|
3385
3441
|
max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
|
|
3386
3442
|
limit (Optional[int]): Maximum pages to crawl
|
|
3387
|
-
allow_backward_links (Optional[bool]): Follow parent directory links
|
|
3443
|
+
allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
|
|
3444
|
+
crawl_entire_domain (Optional[bool]): Follow parent directory links
|
|
3388
3445
|
allow_external_links (Optional[bool]): Follow external domain links
|
|
3389
3446
|
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
|
3390
3447
|
scrape_options (Optional[ScrapeOptions]): Page scraping configuration
|
|
@@ -3418,7 +3475,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3418
3475
|
crawl_params['maxDiscoveryDepth'] = max_discovery_depth
|
|
3419
3476
|
if limit is not None:
|
|
3420
3477
|
crawl_params['limit'] = limit
|
|
3421
|
-
if allow_backward_links is not None:
|
|
3478
|
+
if crawl_entire_domain is not None:
|
|
3479
|
+
crawl_params['crawlEntireDomain'] = crawl_entire_domain
|
|
3480
|
+
elif allow_backward_links is not None:
|
|
3422
3481
|
crawl_params['allowBackwardLinks'] = allow_backward_links
|
|
3423
3482
|
if allow_external_links is not None:
|
|
3424
3483
|
crawl_params['allowExternalLinks'] = allow_external_links
|
|
@@ -1,12 +1,12 @@
|
|
|
1
|
-
firecrawl/__init__.py,sha256=
|
|
2
|
-
firecrawl/firecrawl.py,sha256=
|
|
1
|
+
firecrawl/__init__.py,sha256=nWGTmoKRj6qHs3mzjKE4d3giXsTeeXIO-Ujw0S0oy7k,2612
|
|
2
|
+
firecrawl/firecrawl.py,sha256=ICCfDvhpsV3OT5kwwuiS2_6tiq9kmCca4Elum7mKhxg,193573
|
|
3
3
|
firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
4
|
firecrawl/__tests__/e2e_withAuth/test.py,sha256=-Fq2vPcMo0iQi4dwsUkkCd931ybDaTxMBnZbRfGdDcA,7931
|
|
5
5
|
firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
6
|
firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=DcCw-cohtnL-t9XPekUtRoQrgg3UCWu8Ikqudf9ory8,19880
|
|
7
7
|
tests/test_change_tracking.py,sha256=_IJ5ShLcoj2fHDBaw-nE4I4lHdmDB617ocK_XMHhXps,4177
|
|
8
|
-
firecrawl-2.7.0.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
|
|
9
|
-
firecrawl-2.
|
|
10
|
-
firecrawl-2.7.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
|
|
11
|
-
firecrawl-2.7.0.dist-info/top_level.txt,sha256=8T3jOaSN5mtLghO-R3MQ8KO290gIX8hmfxQmglBPdLE,16
|
|
12
|
-
firecrawl-2.7.0.dist-info/RECORD,,
|
|
8
|
+
firecrawl-2.9.0.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
|
|
9
|
+
firecrawl-2.9.0.dist-info/METADATA,sha256=7V6RGueUF-gnebxMeXVW6Lpc22vcRyU8Fe6xa58Ep7Q,7165
|
|
10
|
+
firecrawl-2.9.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
|
|
11
|
+
firecrawl-2.9.0.dist-info/top_level.txt,sha256=8T3jOaSN5mtLghO-R3MQ8KO290gIX8hmfxQmglBPdLE,16
|
|
12
|
+
firecrawl-2.9.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|