firecrawl 2.13.0.tar.gz → 2.15.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of firecrawl might be problematic.
- {firecrawl-2.13.0 → firecrawl-2.15.0}/PKG-INFO +1 -1
- {firecrawl-2.13.0 → firecrawl-2.15.0}/firecrawl/__init__.py +1 -1
- {firecrawl-2.13.0 → firecrawl-2.15.0}/firecrawl/firecrawl.py +57 -8
- {firecrawl-2.13.0 → firecrawl-2.15.0}/firecrawl.egg-info/PKG-INFO +1 -1
- {firecrawl-2.13.0 → firecrawl-2.15.0}/LICENSE +0 -0
- {firecrawl-2.13.0 → firecrawl-2.15.0}/README.md +0 -0
- {firecrawl-2.13.0 → firecrawl-2.15.0}/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- {firecrawl-2.13.0 → firecrawl-2.15.0}/firecrawl/__tests__/e2e_withAuth/test.py +0 -0
- {firecrawl-2.13.0 → firecrawl-2.15.0}/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- {firecrawl-2.13.0 → firecrawl-2.15.0}/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -0
- {firecrawl-2.13.0 → firecrawl-2.15.0}/firecrawl.egg-info/SOURCES.txt +0 -0
- {firecrawl-2.13.0 → firecrawl-2.15.0}/firecrawl.egg-info/dependency_links.txt +0 -0
- {firecrawl-2.13.0 → firecrawl-2.15.0}/firecrawl.egg-info/requires.txt +0 -0
- {firecrawl-2.13.0 → firecrawl-2.15.0}/firecrawl.egg-info/top_level.txt +0 -0
- {firecrawl-2.13.0 → firecrawl-2.15.0}/pyproject.toml +0 -0
- {firecrawl-2.13.0 → firecrawl-2.15.0}/setup.cfg +0 -0
- {firecrawl-2.13.0 → firecrawl-2.15.0}/setup.py +0 -0
- {firecrawl-2.13.0 → firecrawl-2.15.0}/tests/test_change_tracking.py +0 -0
firecrawl/__init__.py

```diff
@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
 
-__version__ = "2.13.0"
+__version__ = "2.15.0"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
```
firecrawl/firecrawl.py

```diff
@@ -482,6 +482,7 @@ class FirecrawlApp:
         change_tracking_options: Optional[ChangeTrackingOptions] = None,
         max_age: Optional[int] = None,
         store_in_cache: Optional[bool] = None,
+        zero_data_retention: Optional[bool] = None,
         **kwargs) -> ScrapeResponse[Any]:
         """
         Scrape and extract content from a URL.
@@ -504,6 +505,7 @@ class FirecrawlApp:
             json_options (Optional[JsonConfig]): JSON extraction settings
             actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]]): Actions to perform
             change_tracking_options (Optional[ChangeTrackingOptions]): Change tracking settings
+            zero_data_retention (Optional[bool]): Whether to delete data after scrape is done
 
 
         Returns:
@@ -569,6 +571,8 @@ class FirecrawlApp:
             scrape_params['maxAge'] = max_age
         if store_in_cache is not None:
             scrape_params['storeInCache'] = store_in_cache
+        if zero_data_retention is not None:
+            scrape_params['zeroDataRetention'] = zero_data_retention
 
         scrape_params.update(kwargs)
 
```
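The new `zero_data_retention` flag on the scrape path is forwarded to the API as `zeroDataRetention` only when it is explicitly set. A minimal usage sketch (the API key and URL are placeholders, and honoring the flag is assumed to depend on your account's data-retention settings):

```python
from firecrawl import FirecrawlApp

# Hypothetical credentials and target URL, for illustration only.
app = FirecrawlApp(api_key="fc-YOUR-API-KEY")

# zero_data_retention=True is serialized as zeroDataRetention in the request
# body; leaving it as None (the default) omits the field entirely.
result = app.scrape_url(
    "https://example.com",
    formats=["markdown"],
    zero_data_retention=True,
)
print(result)
```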
```diff
@@ -663,12 +667,16 @@ class FirecrawlApp:
 
         # Add any additional kwargs
         search_params.update(kwargs)
+        _integration = search_params.get('integration')
 
         # Create final params object
         final_params = SearchParams(query=query, **search_params)
         params_dict = final_params.dict(exclude_none=True)
         params_dict['origin'] = f"python-sdk@{version}"
 
+        if _integration:
+            params_dict['integration'] = _integration
+
         # Make request
         response = requests.post(
             f"{self.api_url}/v1/search",
```
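Because `integration` is not a field on `SearchParams`, the SDK now captures it from the extra kwargs before validation and re-attaches it to the request body afterwards. A sketch of how a caller might pass it (the integration value below is a made-up identifier):

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")

# "integration" is accepted as an extra kwarg; it is pulled out before
# SearchParams validation and added back into the JSON payload for /v1/search.
results = app.search(
    "firecrawl python sdk",
    limit=5,
    integration="my-integration",  # hypothetical identifier
)
print(results)
```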
```diff
@@ -711,6 +719,7 @@ class FirecrawlApp:
         delay: Optional[int] = None,
         allow_subdomains: Optional[bool] = None,
         max_concurrency: Optional[int] = None,
+        zero_data_retention: Optional[bool] = None,
         poll_interval: Optional[int] = 2,
         idempotency_key: Optional[str] = None,
         **kwargs
@@ -737,6 +746,7 @@ class FirecrawlApp:
             delay (Optional[int]): Delay in seconds between scrapes
             allow_subdomains (Optional[bool]): Follow subdomains
             max_concurrency (Optional[int]): Maximum number of concurrent scrapes
+            zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
             poll_interval (Optional[int]): Seconds between status checks (default: 2)
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API
@@ -790,9 +800,11 @@ class FirecrawlApp:
             crawl_params['allowSubdomains'] = allow_subdomains
         if max_concurrency is not None:
             crawl_params['maxConcurrency'] = max_concurrency
-
+        if zero_data_retention is not None:
+            crawl_params['zeroDataRetention'] = zero_data_retention
         # Add any additional kwargs
         crawl_params.update(kwargs)
+        _integration = crawl_params.get('integration')
 
         # Create final params object
         final_params = CrawlParams(**crawl_params)
@@ -800,6 +812,9 @@ class FirecrawlApp:
         params_dict['url'] = url
         params_dict['origin'] = f"python-sdk@{version}"
 
+        if _integration:
+            params_dict['integration'] = _integration
+
         # Make request
         headers = self._prepare_headers(idempotency_key)
         response = self._post_request(f'{self.api_url}/v1/crawl', params_dict, headers)
```
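`crawl_url` gets the same treatment: `zero_data_retention` becomes `zeroDataRetention` in the crawl request, and an `integration` kwarg survives `CrawlParams` validation by being captured first and re-injected afterwards. A hedged usage sketch (URL and identifier are placeholders):

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")

# Per the docstring added in this release, zero_data_retention asks the
# service to delete crawl data after 24 hours; integration is optional
# metadata passed through to the API.
crawl_result = app.crawl_url(
    "https://example.com",
    limit=10,
    zero_data_retention=True,
    integration="my-integration",  # hypothetical identifier
)
print(crawl_result)
```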
```diff
@@ -834,6 +849,7 @@ class FirecrawlApp:
         delay: Optional[int] = None,
         allow_subdomains: Optional[bool] = None,
         max_concurrency: Optional[int] = None,
+        zero_data_retention: Optional[bool] = None,
         idempotency_key: Optional[str] = None,
         **kwargs
     ) -> CrawlResponse:
@@ -859,6 +875,7 @@ class FirecrawlApp:
             delay (Optional[int]): Delay in seconds between scrapes
             allow_subdomains (Optional[bool]): Follow subdomains
             max_concurrency (Optional[int]): Maximum number of concurrent scrapes
+            zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API
 
@@ -912,7 +929,8 @@ class FirecrawlApp:
             crawl_params['allowSubdomains'] = allow_subdomains
         if max_concurrency is not None:
             crawl_params['maxConcurrency'] = max_concurrency
-
+        if zero_data_retention is not None:
+            crawl_params['zeroDataRetention'] = zero_data_retention
         # Add any additional kwargs
         crawl_params.update(kwargs)
 
@@ -1092,6 +1110,7 @@ class FirecrawlApp:
         delay: Optional[int] = None,
         allow_subdomains: Optional[bool] = None,
         max_concurrency: Optional[int] = None,
+        zero_data_retention: Optional[bool] = None,
         idempotency_key: Optional[str] = None,
         **kwargs
     ) -> 'CrawlWatcher':
@@ -1117,6 +1136,7 @@ class FirecrawlApp:
             delay (Optional[int]): Delay in seconds between scrapes
             allow_subdomains (Optional[bool]): Follow subdomains
             max_concurrency (Optional[int]): Maximum number of concurrent scrapes
+            zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API
 
@@ -1144,6 +1164,7 @@ class FirecrawlApp:
             delay=delay,
             allow_subdomains=allow_subdomains,
             max_concurrency=max_concurrency,
+            zero_data_retention=zero_data_retention,
             idempotency_key=idempotency_key,
             **kwargs
         )
```
```diff
@@ -1210,6 +1231,7 @@ class FirecrawlApp:
 
         # Add any additional kwargs
         map_params.update(kwargs)
+        _integration = map_params.get('integration')
 
         # Create final params object
         final_params = MapParams(**map_params)
@@ -1217,6 +1239,9 @@ class FirecrawlApp:
         params_dict['url'] = url
         params_dict['origin'] = f"python-sdk@{version}"
 
+        if _integration:
+            params_dict['integration'] = _integration
+
         # Make request
         response = requests.post(
             f"{self.api_url}/v1/map",
```
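`map_url` follows the same pattern as search and crawl: the `integration` kwarg is captured before `MapParams` validation and re-injected into the payload sent to `/v1/map`. A short sketch (URL and identifier are placeholders):

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")

# The integration kwarg is removed before MapParams validation and then
# re-added to the request body, so it reaches the /v1/map endpoint.
links = app.map_url(
    "https://example.com",
    limit=100,
    integration="my-integration",  # hypothetical identifier
)
print(links)
```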
```diff
@@ -1261,6 +1286,7 @@ class FirecrawlApp:
         agent: Optional[AgentOptions] = None,
         poll_interval: Optional[int] = 2,
         max_concurrency: Optional[int] = None,
+        zero_data_retention: Optional[bool] = None,
         idempotency_key: Optional[str] = None,
         **kwargs
     ) -> BatchScrapeStatusResponse:
@@ -1348,6 +1374,8 @@ class FirecrawlApp:
             scrape_params['agent'] = agent.dict(exclude_none=True)
         if max_concurrency is not None:
             scrape_params['maxConcurrency'] = max_concurrency
+        if zero_data_retention is not None:
+            scrape_params['zeroDataRetention'] = zero_data_retention
 
         # Add any additional kwargs
         scrape_params.update(kwargs)
```
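Batch scraping picks up the flag as well: when set, it is added to the shared scrape options sent with the batch request. A minimal sketch (the URLs are placeholders):

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")

# zero_data_retention applies to every URL in the batch; as elsewhere, it is
# only serialized (as zeroDataRetention) when explicitly set.
batch = app.batch_scrape_urls(
    ["https://example.com", "https://example.org"],
    formats=["markdown"],
    zero_data_retention=True,
)
print(batch)
```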
```diff
@@ -1399,6 +1427,7 @@ class FirecrawlApp:
         agent: Optional[AgentOptions] = None,
         max_concurrency: Optional[int] = None,
         idempotency_key: Optional[str] = None,
+        zero_data_retention: Optional[bool] = None,
         **kwargs
     ) -> BatchScrapeResponse:
         """
@@ -1424,6 +1453,7 @@ class FirecrawlApp:
             actions (Optional[List[Union]]): Actions to perform
             agent (Optional[AgentOptions]): Agent configuration
             max_concurrency (Optional[int]): Maximum number of concurrent scrapes
+            zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API
 
@@ -1485,6 +1515,8 @@ class FirecrawlApp:
             scrape_params['agent'] = agent.dict(exclude_none=True)
         if max_concurrency is not None:
             scrape_params['maxConcurrency'] = max_concurrency
+        if zero_data_retention is not None:
+            scrape_params['zeroDataRetention'] = zero_data_retention
 
         # Add any additional kwargs
         scrape_params.update(kwargs)
@@ -1534,6 +1566,7 @@ class FirecrawlApp:
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
         agent: Optional[AgentOptions] = None,
         max_concurrency: Optional[int] = None,
+        zero_data_retention: Optional[bool] = None,
         idempotency_key: Optional[str] = None,
         **kwargs
     ) -> 'CrawlWatcher':
@@ -1560,6 +1593,7 @@ class FirecrawlApp:
             actions (Optional[List[Union]]): Actions to perform
             agent (Optional[AgentOptions]): Agent configuration
             max_concurrency (Optional[int]): Maximum number of concurrent scrapes
+            zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API
 
@@ -1617,6 +1651,8 @@ class FirecrawlApp:
             scrape_params['agent'] = agent.dict(exclude_none=True)
         if max_concurrency is not None:
             scrape_params['maxConcurrency'] = max_concurrency
+        if zero_data_retention is not None:
+            scrape_params['zeroDataRetention'] = zero_data_retention
 
         # Add any additional kwargs
         scrape_params.update(kwargs)
```
```diff
@@ -1749,7 +1785,8 @@ class FirecrawlApp:
         allow_external_links: Optional[bool] = False,
         enable_web_search: Optional[bool] = False,
         show_sources: Optional[bool] = False,
-        agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
+        agent: Optional[Dict[str, Any]] = None,
+        **kwargs) -> ExtractResponse[Any]:
         """
         Extract structured information from URLs.
 
@@ -1762,6 +1799,7 @@ class FirecrawlApp:
             enable_web_search (Optional[bool]): Enable web search
             show_sources (Optional[bool]): Include source URLs
             agent (Optional[Dict[str, Any]]): Agent configuration
+            **kwargs: Additional parameters to pass to the API
 
         Returns:
             ExtractResponse[Any] with:
@@ -1772,6 +1810,9 @@ class FirecrawlApp:
         Raises:
             ValueError: If prompt/schema missing or extraction fails
         """
+        # Validate any additional kwargs
+        self._validate_kwargs(kwargs, "extract")
+
         headers = self._prepare_headers()
 
         if not prompt and not schema:
@@ -1801,6 +1842,9 @@ class FirecrawlApp:
         if agent:
             request_data['agent'] = agent
 
+        # Add any additional kwargs
+        request_data.update(kwargs)
+
         try:
             # Send the initial extract request
             response = self._post_request(
```
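`extract` now accepts `**kwargs`, validates them via `self._validate_kwargs(kwargs, "extract")`, and merges them into the request body, which is how `integration` can be passed here too. A sketch, assuming `urls` remains the first argument as in earlier 2.x releases (prompt and identifier are illustrative):

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")

# Extra kwargs are validated against the "extract" allowlist and then merged
# into request_data before the POST; unknown keys would be rejected.
extracted = app.extract(
    ["https://example.com"],
    prompt="List the product names mentioned on the page.",
    integration="my-integration",  # hypothetical identifier
)
print(extracted)
```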
```diff
@@ -2549,12 +2593,13 @@ class FirecrawlApp:
         method_params = {
             "scrape_url": {"formats", "include_tags", "exclude_tags", "only_main_content", "wait_for",
                            "timeout", "location", "mobile", "skip_tls_verification", "remove_base64_images",
-                           "block_ads", "proxy", "extract", "json_options", "actions", "change_tracking_options"},
-            "search": {"limit", "tbs", "filter", "lang", "country", "location", "timeout", "scrape_options"},
+                           "block_ads", "proxy", "extract", "json_options", "actions", "change_tracking_options", "integration"},
+            "search": {"limit", "tbs", "filter", "lang", "country", "location", "timeout", "scrape_options", "integration"},
             "crawl_url": {"include_paths", "exclude_paths", "max_depth", "max_discovery_depth", "limit",
                           "allow_backward_links", "allow_external_links", "ignore_sitemap", "scrape_options",
-                          "webhook", "deduplicate_similar_urls", "ignore_query_parameters", "regex_on_full_url"},
-            "map_url": {"search", "ignore_sitemap", "include_subdomains", "sitemap_only", "limit", "timeout"},
+                          "webhook", "deduplicate_similar_urls", "ignore_query_parameters", "regex_on_full_url", "integration"},
+            "map_url": {"search", "ignore_sitemap", "include_subdomains", "sitemap_only", "limit", "timeout", "integration"},
+            "extract": {"prompt", "schema", "system_prompt", "allow_external_links", "enable_web_search", "show_sources", "agent", "integration"},
             "batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
                                   "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
                                   "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
```
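The `method_params` table is the allowlist that `_validate_kwargs` checks extra keyword arguments against; this release adds "integration" to several entries and introduces an "extract" entry. Below is a minimal sketch of how such an allowlist check could work, included purely as an illustration and not as the SDK's actual `_validate_kwargs` body:

```python
from typing import Any, Dict, Set

# Illustrative subset of the allowlist shown in the diff above.
METHOD_PARAMS: Dict[str, Set[str]] = {
    "search": {"limit", "tbs", "filter", "lang", "country", "location",
               "timeout", "scrape_options", "integration"},
    "extract": {"prompt", "schema", "system_prompt", "allow_external_links",
                "enable_web_search", "show_sources", "agent", "integration"},
}

def validate_kwargs(kwargs: Dict[str, Any], method: str) -> None:
    """Reject keyword arguments that are not allowlisted for a method."""
    allowed = METHOD_PARAMS.get(method, set())
    unknown = set(kwargs) - allowed
    if unknown:
        raise ValueError(f"Unsupported parameters for {method}: {sorted(unknown)}")

# Example: this would raise, because 'foo' is not in the 'search' allowlist.
# validate_kwargs({"limit": 5, "foo": 1}, "search")
```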
```diff
@@ -3204,6 +3249,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
         agent: Optional[AgentOptions] = None,
+        zero_data_retention: Optional[bool] = None,
         idempotency_key: Optional[str] = None,
         **kwargs
     ) -> BatchScrapeResponse:
@@ -3229,6 +3275,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             json_options (Optional[JsonConfig]): JSON extraction config
             actions (Optional[List[Union]]): Actions to perform
             agent (Optional[AgentOptions]): Agent configuration
+            zero_data_retention (Optional[bool]): Whether to delete data after 24 hours
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API
 
@@ -3288,7 +3335,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
             scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
         if agent is not None:
             scrape_params['agent'] = agent.dict(exclude_none=True)
-
+        if zero_data_retention is not None:
+            scrape_params['zeroDataRetention'] = zero_data_retention
+
 
         # Add any additional kwargs
         scrape_params.update(kwargs)
```
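`AsyncFirecrawlApp` (which subclasses `FirecrawlApp`) gains the same `zero_data_retention` passthrough on its batch scrape path. A hedged sketch, assuming the async client exposes `batch_scrape_urls` as a coroutine with the same parameters as the sync client:

```python
import asyncio

from firecrawl import AsyncFirecrawlApp

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    # As in the sync client, zero_data_retention is serialized as
    # zeroDataRetention only when explicitly set.
    batch = await app.batch_scrape_urls(
        ["https://example.com", "https://example.org"],
        formats=["markdown"],
        zero_data_retention=True,
    )
    print(batch)

asyncio.run(main())
```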
|