firecrawl 2.7.1__py3-none-any.whl → 2.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of firecrawl might be problematic.
- firecrawl/__init__.py +1 -1
- firecrawl/firecrawl.py +75 -16
- {firecrawl-2.7.1.dist-info → firecrawl-2.9.0.dist-info}/METADATA +1 -1
- firecrawl-2.9.0.dist-info/RECORD +12 -0
- {firecrawl-2.7.1.dist-info → firecrawl-2.9.0.dist-info}/top_level.txt +0 -2
- build/lib/build/lib/firecrawl/__init__.py +0 -79
- build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
- build/lib/build/lib/firecrawl/firecrawl.py +0 -4467
- build/lib/build/lib/tests/test_change_tracking.py +0 -98
- build/lib/firecrawl/__init__.py +0 -79
- build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
- build/lib/firecrawl/firecrawl.py +0 -4467
- build/lib/tests/test_change_tracking.py +0 -98
- firecrawl-2.7.1.dist-info/RECORD +0 -26
- {firecrawl-2.7.1.dist-info → firecrawl-2.9.0.dist-info}/LICENSE +0 -0
- {firecrawl-2.7.1.dist-info → firecrawl-2.9.0.dist-info}/WHEEL +0 -0
firecrawl/__init__.py
CHANGED
@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions  # noqa
 
-__version__ = "2.7.1"
+__version__ = "2.9.0"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
firecrawl/firecrawl.py
CHANGED
@@ -140,6 +140,7 @@ class ChangeTrackingOptions(pydantic.BaseModel):
     modes: Optional[List[Literal["git-diff", "json"]]] = None
     schema: Optional[Any] = None
     prompt: Optional[str] = None
+    tag: Optional[str] = None
 
 class ScrapeOptions(pydantic.BaseModel):
     """Parameters for scraping operations."""
@@ -157,6 +158,8 @@ class ScrapeOptions(pydantic.BaseModel):
     blockAds: Optional[bool] = None
     proxy: Optional[Literal["basic", "stealth", "auto"]] = None
     changeTrackingOptions: Optional[ChangeTrackingOptions] = None
+    maxAge: Optional[int] = None
+    storeInCache: Optional[bool] = None
 
 class WaitAction(pydantic.BaseModel):
     """Wait action to perform during scraping."""
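Taken together, these model changes expose 2.9.0's new cache controls (maxAge, storeInCache) and a label for change-tracking runs (tag). A minimal sketch of how the updated models compose; values are illustrative, and formats is an existing ScrapeOptions field, not part of this diff:

from firecrawl import ScrapeOptions, ChangeTrackingOptions

opts = ScrapeOptions(
    formats=["markdown", "changeTracking"],
    changeTrackingOptions=ChangeTrackingOptions(
        modes=["git-diff"],
        tag="pricing-watch",  # new in 2.9.0: label this change-tracking run
    ),
    maxAge=3600000,           # new in 2.9.0: accept a cached copy no older than this
    storeInCache=True,        # new in 2.9.0: write the fresh result back to the cache
)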
@@ -260,6 +263,7 @@ class CrawlParams(pydantic.BaseModel):
     ignoreQueryParameters: Optional[bool] = None
     regexOnFullURL: Optional[bool] = None
     delay: Optional[int] = None  # Delay in seconds between scrapes
+    maxConcurrency: Optional[int] = None
 
 class CrawlResponse(pydantic.BaseModel):
     """Response from crawling operations."""
@@ -292,6 +296,7 @@ class MapParams(pydantic.BaseModel):
     sitemapOnly: Optional[bool] = None
     limit: Optional[int] = None
     timeout: Optional[int] = None
+    useIndex: Optional[bool] = None
 
 class MapResponse(pydantic.BaseModel):
     """Response from mapping operations."""
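Both fields are mirrored as method keywords later in this diff; on the wire they presumably travel as camelCase JSON fields, matching the crawl_params/map_params assignments shown below. A sketch of the resulting request bodies (URLs and values illustrative):

# Wire-level payloads as the SDK builds them (see the dict assignments
# later in this diff); keys are camelCase on the API side.
crawl_body = {"url": "https://example.com", "limit": 100, "maxConcurrency": 5}
map_body = {"url": "https://example.com", "useIndex": True}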
@@ -464,6 +469,8 @@ class FirecrawlApp:
         json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
         change_tracking_options: Optional[ChangeTrackingOptions] = None,
+        max_age: Optional[int] = None,
+        store_in_cache: Optional[bool] = None,
         **kwargs) -> ScrapeResponse[Any]:
         """
         Scrape and extract content from a URL.
@@ -545,6 +552,10 @@ class FirecrawlApp:
             scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
         if change_tracking_options:
             scrape_params['changeTrackingOptions'] = change_tracking_options if isinstance(change_tracking_options, dict) else change_tracking_options.dict(exclude_none=True)
+        if max_age is not None:
+            scrape_params['maxAge'] = max_age
+        if store_in_cache is not None:
+            scrape_params['storeInCache'] = store_in_cache
 
         scrape_params.update(kwargs)
 
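The handling above maps the two new snake_case keywords onto their camelCase request fields. A usage sketch (URL and values illustrative; assumes FIRECRAWL_API_KEY is exported):

from firecrawl import FirecrawlApp

app = FirecrawlApp()  # picks up FIRECRAWL_API_KEY from the environment
result = app.scrape_url(
    "https://example.com",
    formats=["markdown"],  # existing parameter, not part of this diff
    max_age=3600000,       # reuse a cached copy no older than this
    store_in_cache=True,   # persist this scrape for later cache hits
)
print(result.markdown[:200] if result.markdown else result)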
@@ -676,6 +687,7 @@ class FirecrawlApp:
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,
@@ -684,6 +696,7 @@ class FirecrawlApp:
         ignore_query_parameters: Optional[bool] = None,
         regex_on_full_url: Optional[bool] = None,
         delay: Optional[int] = None,
+        max_concurrency: Optional[int] = None,
         poll_interval: Optional[int] = 2,
         idempotency_key: Optional[str] = None,
         **kwargs
@@ -698,7 +711,8 @@ class FirecrawlApp:
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]):
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration
@@ -707,6 +721,7 @@ class FirecrawlApp:
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
             regex_on_full_url (Optional[bool]): Apply regex to full URLs
             delay (Optional[int]): Delay in seconds between scrapes
+            max_concurrency (Optional[int]): Maximum number of concurrent scrapes
             poll_interval (Optional[int]): Seconds between status checks (default: 2)
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API
@@ -736,7 +751,9 @@ class FirecrawlApp:
             crawl_params['maxDiscoveryDepth'] = max_discovery_depth
         if limit is not None:
             crawl_params['limit'] = limit
-        if allow_backward_links is not None:
+        if crawl_entire_domain is not None:
+            crawl_params['crawlEntireDomain'] = crawl_entire_domain
+        elif allow_backward_links is not None:
             crawl_params['allowBackwardLinks'] = allow_backward_links
         if allow_external_links is not None:
             crawl_params['allowExternalLinks'] = allow_external_links
@@ -754,7 +771,9 @@ class FirecrawlApp:
             crawl_params['regexOnFullURL'] = regex_on_full_url
         if delay is not None:
             crawl_params['delay'] = delay
-
+        if max_concurrency is not None:
+            crawl_params['maxConcurrency'] = max_concurrency
+
         # Add any additional kwargs
         crawl_params.update(kwargs)
 
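Note the precedence encoded above: crawl_entire_domain wins, and the deprecated allow_backward_links is only consulted when the new keyword is absent. A usage sketch (URL and values illustrative):

from firecrawl import FirecrawlApp

app = FirecrawlApp()
status = app.crawl_url(
    "https://docs.example.com",
    limit=50,
    crawl_entire_domain=True,  # preferred spelling of allow_backward_links
    delay=1,                   # seconds between scrapes
    max_concurrency=4,         # new in 2.9.0: cap parallel scrapes for this job
)
print(status.status, f"{status.completed}/{status.total}")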
@@ -787,6 +806,7 @@ class FirecrawlApp:
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,
@@ -808,7 +828,8 @@ class FirecrawlApp:
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]):
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration
@@ -816,6 +837,8 @@ class FirecrawlApp:
             deduplicate_similar_urls (Optional[bool]): Remove similar URLs
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
             regex_on_full_url (Optional[bool]): Apply regex to full URLs
+            delay (Optional[int]): Delay in seconds between scrapes
+            max_concurrency (Optional[int]): Maximum number of concurrent scrapes
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API
 
@@ -845,7 +868,9 @@ class FirecrawlApp:
             crawl_params['maxDiscoveryDepth'] = max_discovery_depth
         if limit is not None:
             crawl_params['limit'] = limit
-        if allow_backward_links is not None:
+        if crawl_entire_domain is not None:
+            crawl_params['crawlEntireDomain'] = crawl_entire_domain
+        elif allow_backward_links is not None:
             crawl_params['allowBackwardLinks'] = allow_backward_links
         if allow_external_links is not None:
             crawl_params['allowExternalLinks'] = allow_external_links
@@ -863,7 +888,9 @@ class FirecrawlApp:
             crawl_params['regexOnFullURL'] = regex_on_full_url
         if delay is not None:
             crawl_params['delay'] = delay
-
+        if max_concurrency is not None:
+            crawl_params['maxConcurrency'] = max_concurrency
+
         # Add any additional kwargs
         crawl_params.update(kwargs)
 
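async_crawl_url takes the same new keywords but returns immediately with a job handle rather than polling. A start-then-poll sketch; check_crawl_status and the .id field are part of the existing SDK, not this diff:

from firecrawl import FirecrawlApp

app = FirecrawlApp()
job = app.async_crawl_url(
    "https://docs.example.com",
    limit=50,
    crawl_entire_domain=True,
    max_concurrency=4,
)
status = app.check_crawl_status(job.id)  # poll until status == "completed"
print(status.status)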
@@ -1032,6 +1059,7 @@ class FirecrawlApp:
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,
@@ -1039,6 +1067,8 @@ class FirecrawlApp:
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,
         regex_on_full_url: Optional[bool] = None,
+        delay: Optional[int] = None,
+        max_concurrency: Optional[int] = None,
         idempotency_key: Optional[str] = None,
         **kwargs
     ) -> 'CrawlWatcher':
@@ -1052,7 +1082,8 @@ class FirecrawlApp:
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]):
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration
@@ -1060,6 +1091,8 @@ class FirecrawlApp:
             deduplicate_similar_urls (Optional[bool]): Remove similar URLs
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
             regex_on_full_url (Optional[bool]): Apply regex to full URLs
+            delay (Optional[int]): Delay in seconds between scrapes
+            max_concurrency (Optional[int]): Maximum number of concurrent scrapes
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API
 
@@ -1084,6 +1117,8 @@ class FirecrawlApp:
             deduplicate_similar_urls=deduplicate_similar_urls,
             ignore_query_parameters=ignore_query_parameters,
             regex_on_full_url=regex_on_full_url,
+            delay=delay,
+            max_concurrency=max_concurrency,
             idempotency_key=idempotency_key,
             **kwargs
         )
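crawl_url_and_watch now threads delay and max_concurrency through to the job it starts, as the forwarded keyword arguments above show. A watcher sketch; the add_event_listener and connect() calls reflect our understanding of the existing CrawlWatcher interface and should be treated as an assumption:

import asyncio
from firecrawl import FirecrawlApp

async def main():
    app = FirecrawlApp()
    watcher = app.crawl_url_and_watch(
        "https://docs.example.com",
        limit=25,
        delay=1,            # forwarded to the crawl job as of 2.9.0
        max_concurrency=2,  # forwarded to the crawl job as of 2.9.0
    )
    # Assumed CrawlWatcher API: subscribe to document events, then connect.
    watcher.add_event_listener("document", lambda doc: print("page received"))
    await watcher.connect()

asyncio.run(main())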
@@ -1102,6 +1137,7 @@ class FirecrawlApp:
         sitemap_only: Optional[bool] = None,
         limit: Optional[int] = None,
         timeout: Optional[int] = None,
+        use_index: Optional[bool] = None,
         **kwargs) -> MapResponse:
         """
         Map and discover links from a URL.
@@ -1144,7 +1180,9 @@ class FirecrawlApp:
             map_params['limit'] = limit
         if timeout is not None:
             map_params['timeout'] = timeout
-
+        if use_index is not None:
+            map_params['useIndex'] = use_index
+
         # Add any additional kwargs
         map_params.update(kwargs)
 
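A map_url sketch with the new toggle (URL illustrative; the comment states our reading of useIndex, which the diff itself does not document):

from firecrawl import FirecrawlApp

app = FirecrawlApp()
result = app.map_url(
    "https://example.com",
    limit=500,
    use_index=True,  # assumption: let the API answer from its link index
)
print(result.links[:10] if result.links else result)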
@@ -1197,6 +1235,7 @@ class FirecrawlApp:
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
         agent: Optional[AgentOptions] = None,
         poll_interval: Optional[int] = 2,
+        max_concurrency: Optional[int] = None,
         idempotency_key: Optional[str] = None,
         **kwargs
     ) -> BatchScrapeStatusResponse:
@@ -1222,6 +1261,7 @@ class FirecrawlApp:
             json_options (Optional[JsonConfig]): JSON extraction config
             actions (Optional[List[Union]]): Actions to perform
             agent (Optional[AgentOptions]): Agent configuration
+            max_concurrency (Optional[int]): Maximum number of concurrent scrapes
             poll_interval (Optional[int]): Seconds between status checks (default: 2)
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API
@@ -1281,7 +1321,9 @@ class FirecrawlApp:
             scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
         if agent is not None:
             scrape_params['agent'] = agent.dict(exclude_none=True)
-
+        if max_concurrency is not None:
+            scrape_params['maxConcurrency'] = max_concurrency
+
         # Add any additional kwargs
         scrape_params.update(kwargs)
 
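For batch scraping the cap applies across the whole batch job. A sketch (URLs illustrative; formats is an existing parameter):

from firecrawl import FirecrawlApp

app = FirecrawlApp()
job = app.batch_scrape_urls(
    ["https://example.com/a", "https://example.com/b"],
    formats=["markdown"],
    max_concurrency=2,  # scrape at most two URLs in parallel
)
print(job.status)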
@@ -1330,6 +1372,7 @@ class FirecrawlApp:
         json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
         agent: Optional[AgentOptions] = None,
+        max_concurrency: Optional[int] = None,
         idempotency_key: Optional[str] = None,
         **kwargs
     ) -> BatchScrapeResponse:
@@ -1355,6 +1398,7 @@ class FirecrawlApp:
             json_options (Optional[JsonConfig]): JSON extraction config
             actions (Optional[List[Union]]): Actions to perform
             agent (Optional[AgentOptions]): Agent configuration
+            max_concurrency (Optional[int]): Maximum number of concurrent scrapes
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API
 
@@ -1414,7 +1458,9 @@ class FirecrawlApp:
             scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
         if agent is not None:
             scrape_params['agent'] = agent.dict(exclude_none=True)
-
+        if max_concurrency is not None:
+            scrape_params['maxConcurrency'] = max_concurrency
+
         # Add any additional kwargs
         scrape_params.update(kwargs)
 
@@ -1462,6 +1508,7 @@ class FirecrawlApp:
         json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
         agent: Optional[AgentOptions] = None,
+        max_concurrency: Optional[int] = None,
         idempotency_key: Optional[str] = None,
         **kwargs
     ) -> 'CrawlWatcher':
@@ -1487,6 +1534,7 @@ class FirecrawlApp:
             json_options (Optional[JsonConfig]): JSON extraction config
             actions (Optional[List[Union]]): Actions to perform
             agent (Optional[AgentOptions]): Agent configuration
+            max_concurrency (Optional[int]): Maximum number of concurrent scrapes
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API
 
@@ -1542,7 +1590,9 @@ class FirecrawlApp:
             scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
         if agent is not None:
             scrape_params['agent'] = agent.dict(exclude_none=True)
-
+        if max_concurrency is not None:
+            scrape_params['maxConcurrency'] = max_concurrency
+
         # Add any additional kwargs
         scrape_params.update(kwargs)
 
@@ -2771,7 +2821,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
                 * limit - Maximum pages to crawl
 
             Link Following:
-                * allowBackwardLinks -
+                * allowBackwardLinks - DEPRECATED: Use crawlEntireDomain instead
+                * crawlEntireDomain - Follow parent directory links
                 * allowExternalLinks - Follow external domain links
                 * ignoreSitemap - Skip sitemap.xml processing
 
@@ -3250,6 +3301,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,
@@ -3272,7 +3324,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]):
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration
@@ -3310,7 +3363,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
             crawl_params['maxDiscoveryDepth'] = max_discovery_depth
         if limit is not None:
             crawl_params['limit'] = limit
-        if allow_backward_links is not None:
+        if crawl_entire_domain is not None:
+            crawl_params['crawlEntireDomain'] = crawl_entire_domain
+        elif allow_backward_links is not None:
             crawl_params['allowBackwardLinks'] = allow_backward_links
         if allow_external_links is not None:
             crawl_params['allowExternalLinks'] = allow_external_links
@@ -3362,6 +3417,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,
@@ -3384,7 +3440,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]):
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration
@@ -3418,7 +3475,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
             crawl_params['maxDiscoveryDepth'] = max_discovery_depth
         if limit is not None:
             crawl_params['limit'] = limit
-        if allow_backward_links is not None:
+        if crawl_entire_domain is not None:
+            crawl_params['crawlEntireDomain'] = crawl_entire_domain
+        elif allow_backward_links is not None:
             crawl_params['allowBackwardLinks'] = allow_backward_links
         if allow_external_links is not None:
             crawl_params['allowExternalLinks'] = allow_external_links
firecrawl-2.9.0.dist-info/RECORD
ADDED
@@ -0,0 +1,12 @@
+firecrawl/__init__.py,sha256=nWGTmoKRj6qHs3mzjKE4d3giXsTeeXIO-Ujw0S0oy7k,2612
+firecrawl/firecrawl.py,sha256=ICCfDvhpsV3OT5kwwuiS2_6tiq9kmCca4Elum7mKhxg,193573
+firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+firecrawl/__tests__/e2e_withAuth/test.py,sha256=-Fq2vPcMo0iQi4dwsUkkCd931ybDaTxMBnZbRfGdDcA,7931
+firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=DcCw-cohtnL-t9XPekUtRoQrgg3UCWu8Ikqudf9ory8,19880
+tests/test_change_tracking.py,sha256=_IJ5ShLcoj2fHDBaw-nE4I4lHdmDB617ocK_XMHhXps,4177
+firecrawl-2.9.0.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
+firecrawl-2.9.0.dist-info/METADATA,sha256=7V6RGueUF-gnebxMeXVW6Lpc22vcRyU8Fe6xa58Ep7Q,7165
+firecrawl-2.9.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
+firecrawl-2.9.0.dist-info/top_level.txt,sha256=8T3jOaSN5mtLghO-R3MQ8KO290gIX8hmfxQmglBPdLE,16
+firecrawl-2.9.0.dist-info/RECORD,,
build/lib/build/lib/firecrawl/__init__.py
DELETED
@@ -1,79 +0,0 @@
-"""
-This is the Firecrawl package.
-
-This package provides a Python SDK for interacting with the Firecrawl API.
-It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
-and check the status of these jobs.
-
-For more information visit https://github.com/firecrawl/
-"""
-
-import logging
-import os
-
-from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions  # noqa
-
-__version__ = "2.7.1"
-
-# Define the logger for the Firecrawl project
-logger: logging.Logger = logging.getLogger("firecrawl")
-
-
-def _configure_logger() -> None:
-    """
-    Configure the firecrawl logger for console output.
-
-    The function attaches a handler for console output with a specific format and date
-    format to the firecrawl logger.
-    """
-    try:
-        # Create the formatter
-        formatter = logging.Formatter(
-            "[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
-            datefmt="%Y-%m-%d %H:%M:%S",
-        )
-
-        # Create the console handler and set the formatter
-        console_handler = logging.StreamHandler()
-        console_handler.setFormatter(formatter)
-
-        # Add the console handler to the firecrawl logger
-        logger.addHandler(console_handler)
-    except Exception as e:
-        logger.error("Failed to configure logging: %s", e)
-
-
-def setup_logging() -> None:
-    """Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable."""
-    # Check if the firecrawl logger already has a handler
-    if logger.hasHandlers():
-        return  # To prevent duplicate logging
-
-    # Check if the FIRECRAWL_LOGGING_LEVEL environment variable is set
-    if not (env := os.getenv("FIRECRAWL_LOGGING_LEVEL", "").upper()):
-        # Attach a no-op handler to prevent warnings about no handlers
-        logger.addHandler(logging.NullHandler())
-        return
-
-    # Attach the console handler to the firecrawl logger
-    _configure_logger()
-
-    # Set the logging level based on the FIRECRAWL_LOGGING_LEVEL environment variable
-    if env == "DEBUG":
-        logger.setLevel(logging.DEBUG)
-    elif env == "INFO":
-        logger.setLevel(logging.INFO)
-    elif env == "WARNING":
-        logger.setLevel(logging.WARNING)
-    elif env == "ERROR":
-        logger.setLevel(logging.ERROR)
-    elif env == "CRITICAL":
-        logger.setLevel(logging.CRITICAL)
-    else:
-        logger.setLevel(logging.INFO)
-        logger.warning("Unknown logging level: %s, defaulting to INFO", env)
-
-
-# Initialize logging configuration when the module is imported
-setup_logging()
-logger.debug("Debugging logger setup")
File without changes
build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py
DELETED
@@ -1,170 +0,0 @@
-import importlib.util
-import pytest
-import time
-import os
-from uuid import uuid4
-from dotenv import load_dotenv
-
-load_dotenv()
-
-API_URL = "http://127.0.0.1:3002"
-ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"
-TEST_API_KEY = os.getenv('TEST_API_KEY')
-
-print(f"ABSOLUTE_FIRECRAWL_PATH: {ABSOLUTE_FIRECRAWL_PATH}")
-
-spec = importlib.util.spec_from_file_location("FirecrawlApp", ABSOLUTE_FIRECRAWL_PATH)
-firecrawl = importlib.util.module_from_spec(spec)
-spec.loader.exec_module(firecrawl)
-FirecrawlApp = firecrawl.FirecrawlApp
-
-def test_no_api_key():
-    with pytest.raises(Exception) as excinfo:
-        invalid_app = FirecrawlApp(api_url=API_URL, version='v0')
-    assert "No API key provided" in str(excinfo.value)
-
-def test_scrape_url_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
-    with pytest.raises(Exception) as excinfo:
-        invalid_app.scrape_url('https://firecrawl.dev')
-    assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
-
-# def test_blocklisted_url():
-#     blocklisted_url = "https://facebook.com/fake-test"
-#     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-#     with pytest.raises(Exception) as excinfo:
-#         app.scrape_url(blocklisted_url)
-#     assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
-
-def test_successful_response_with_valid_preview_token():
-    app = FirecrawlApp(api_url=API_URL, api_key=os.getenv('PREVIEW_TOKEN'), version='v0')
-    response = app.scrape_url('https://roastmywebsite.ai')
-    assert response is not None
-    assert 'content' in response
-    assert "_Roast_" in response['content']
-
-def test_scrape_url_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url('https://roastmywebsite.ai')
-    print(response)
-
-    assert response is not None
-    assert 'content' in response
-    assert 'markdown' in response
-    assert 'metadata' in response
-    assert 'html' not in response
-    assert "_Roast_" in response['content']
-
-def test_successful_response_with_valid_api_key_and_include_html():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url('https://roastmywebsite.ai', {'pageOptions': {'includeHtml': True}})
-    assert response is not None
-    assert 'content' in response
-    assert 'markdown' in response
-    assert 'html' in response
-    assert 'metadata' in response
-    assert "_Roast_" in response['content']
-    assert "_Roast_" in response['markdown']
-    assert "<h1" in response['html']
-
-def test_successful_response_for_valid_scrape_with_pdf_file():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf')
-    assert response is not None
-    assert 'content' in response
-    assert 'metadata' in response
-    assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
-
-def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001')
-    time.sleep(6)  # wait for 6 seconds
-    assert response is not None
-    assert 'content' in response
-    assert 'metadata' in response
-    assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
-
-def test_crawl_url_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
-    with pytest.raises(Exception) as excinfo:
-        invalid_app.crawl_url('https://firecrawl.dev')
-    assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
-
-# def test_should_return_error_for_blocklisted_url():
-#     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-#     blocklisted_url = "https://twitter.com/fake-test"
-#     with pytest.raises(Exception) as excinfo:
-#         app.crawl_url(blocklisted_url)
-#     assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
-
-def test_crawl_url_wait_for_completion_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True)
-    assert response is not None
-    assert len(response) > 0
-    assert 'content' in response[0]
-    assert "_Roast_" in response[0]['content']
-
-def test_crawl_url_with_idempotency_key_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    uniqueIdempotencyKey = str(uuid4())
-    response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
-    assert response is not None
-    assert len(response) > 0
-    assert 'content' in response[0]
-    assert "_Roast_" in response[0]['content']
-
-    with pytest.raises(Exception) as excinfo:
-        app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
-    assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value)
-
-def test_check_crawl_status_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False)
-    assert response is not None
-    assert 'jobId' in response
-
-    time.sleep(30)  # wait for 30 seconds
-    status_response = app.check_crawl_status(response['jobId'])
-    assert status_response is not None
-    assert 'status' in status_response
-    assert status_response['status'] == 'completed'
-    assert 'data' in status_response
-    assert len(status_response['data']) > 0
-
-def test_search_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.search("test query")
-    assert response is not None
-    assert 'content' in response[0]
-    assert len(response) > 2
-
-def test_search_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
-    with pytest.raises(Exception) as excinfo:
-        invalid_app.search("test query")
-    assert "Unexpected error during search: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
-
-def test_llm_extraction():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url("https://firecrawl.dev", {
-        'extractorOptions': {
-            'mode': 'llm-extraction',
-            'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
-            'extractionSchema': {
-                'type': 'object',
-                'properties': {
-                    'company_mission': {'type': 'string'},
-                    'supports_sso': {'type': 'boolean'},
-                    'is_open_source': {'type': 'boolean'}
-                },
-                'required': ['company_mission', 'supports_sso', 'is_open_source']
-            }
-        }
-    })
-    assert response is not None
-    assert 'llm_extraction' in response
-    llm_extraction = response['llm_extraction']
-    assert 'company_mission' in llm_extraction
-    assert isinstance(llm_extraction['supports_sso'], bool)
-    assert isinstance(llm_extraction['is_open_source'], bool)
File without changes