firecrawl 2.7.1__py3-none-any.whl → 2.10.0__py3-none-any.whl
This diff shows the contents of the two publicly released package versions as they appear in their public registries. It is provided for informational purposes only.
Potentially problematic release: this version of firecrawl might be problematic.
- firecrawl/__init__.py +1 -1
- firecrawl/__tests__/v1/e2e_withAuth/test.py +25 -0
- firecrawl/firecrawl.py +82 -16
- {firecrawl-2.7.1.dist-info → firecrawl-2.10.0.dist-info}/LICENSE +0 -0
- {firecrawl-2.7.1.dist-info → firecrawl-2.10.0.dist-info}/METADATA +1 -1
- firecrawl-2.10.0.dist-info/RECORD +12 -0
- {firecrawl-2.7.1.dist-info → firecrawl-2.10.0.dist-info}/top_level.txt +0 -2
- build/lib/build/lib/firecrawl/__init__.py +0 -79
- build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
- build/lib/build/lib/firecrawl/firecrawl.py +0 -4467
- build/lib/build/lib/tests/test_change_tracking.py +0 -98
- build/lib/firecrawl/__init__.py +0 -79
- build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
- build/lib/firecrawl/firecrawl.py +0 -4467
- build/lib/tests/test_change_tracking.py +0 -98
- firecrawl-2.7.1.dist-info/RECORD +0 -26
- {firecrawl-2.7.1.dist-info → firecrawl-2.10.0.dist-info}/WHEEL +0 -0
firecrawl/__init__.py
CHANGED
@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions  # noqa
 
-__version__ = "2.7.1"
+__version__ = "2.10.0"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")

firecrawl/__tests__/v1/e2e_withAuth/test.py
CHANGED

@@ -437,4 +437,29 @@ def test_search_with_invalid_params():
         app.search("test query", {"invalid_param": "value"})
     assert "ValidationError" in str(e.value)
 
+# def test_scrape_url_with_parse_pdf_true():
+#     if TEST_API_KEY:
+#         app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+#         response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf', parse_pdf=True)
+#         assert response is not None
+#         assert 'markdown' in response
+#         assert len(response['markdown']) > 100
+
+# def test_scrape_url_with_parse_pdf_false():
+#     if TEST_API_KEY:
+#         app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+#         response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf', parse_pdf=False)
+#         assert response is not None
+#         assert 'markdown' in response
+#         assert 'h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm' in response['markdown']
+
+# def test_scrape_options_with_parse_pdf():
+#     if TEST_API_KEY:
+#         from firecrawl.firecrawl import ScrapeOptions
+#         app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+#         scrape_options = ScrapeOptions(parsePDF=False, formats=['markdown'])
+#         response = app.search("firecrawl", limit=1, scrape_options=scrape_options)
+#         assert response is not None
+#         assert 'data' in response
+
 
firecrawl/firecrawl.py
CHANGED

@@ -140,6 +140,7 @@ class ChangeTrackingOptions(pydantic.BaseModel):
     modes: Optional[List[Literal["git-diff", "json"]]] = None
     schema: Optional[Any] = None
     prompt: Optional[str] = None
+    tag: Optional[str] = None
 
 class ScrapeOptions(pydantic.BaseModel):
     """Parameters for scraping operations."""

@@ -157,6 +158,9 @@ class ScrapeOptions(pydantic.BaseModel):
     blockAds: Optional[bool] = None
     proxy: Optional[Literal["basic", "stealth", "auto"]] = None
     changeTrackingOptions: Optional[ChangeTrackingOptions] = None
+    maxAge: Optional[int] = None
+    storeInCache: Optional[bool] = None
+    parsePDF: Optional[bool] = None
 
 class WaitAction(pydantic.BaseModel):
     """Wait action to perform during scraping."""

@@ -260,6 +264,7 @@ class CrawlParams(pydantic.BaseModel):
     ignoreQueryParameters: Optional[bool] = None
     regexOnFullURL: Optional[bool] = None
     delay: Optional[int] = None  # Delay in seconds between scrapes
+    maxConcurrency: Optional[int] = None
 
 class CrawlResponse(pydantic.BaseModel):
     """Response from crawling operations."""

@@ -292,6 +297,7 @@ class MapParams(pydantic.BaseModel):
     sitemapOnly: Optional[bool] = None
     limit: Optional[int] = None
    timeout: Optional[int] = None
+    useIndex: Optional[bool] = None
 
 class MapResponse(pydantic.BaseModel):
     """Response from mapping operations."""
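The four hunks above only add optional pydantic fields, so existing callers keep working unchanged. As orientation, here is a minimal sketch of how the new ScrapeOptions and ChangeTrackingOptions fields might be populated; the concrete values, the assumption that maxAge is expressed in milliseconds, and the meaning given to tag are illustrative only and not stated in this diff:

from firecrawl.firecrawl import ScrapeOptions, ChangeTrackingOptions

# Illustrative values for the fields introduced in 2.10.0.
options = ScrapeOptions(
    formats=["markdown"],
    parsePDF=False,        # new: control PDF parsing per request
    maxAge=3600000,        # new: accept cached results up to this age (unit assumed to be milliseconds)
    storeInCache=True,     # new: allow the result to be written back to the cache
    changeTrackingOptions=ChangeTrackingOptions(
        modes=["git-diff"],
        tag="docs-watch",  # new: free-form label for the change-tracking run (semantics assumed)
    ),
)
print(options.dict(exclude_none=True))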
@@ -460,10 +466,13 @@ class FirecrawlApp:
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
         proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
+        parse_pdf: Optional[bool] = None,
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
         change_tracking_options: Optional[ChangeTrackingOptions] = None,
+        max_age: Optional[int] = None,
+        store_in_cache: Optional[bool] = None,
         **kwargs) -> ScrapeResponse[Any]:
         """
         Scrape and extract content from a URL.

@@ -531,6 +540,8 @@ class FirecrawlApp:
             scrape_params['blockAds'] = block_ads
         if proxy:
             scrape_params['proxy'] = proxy
+        if parse_pdf is not None:
+            scrape_params['parsePDF'] = parse_pdf
         if extract is not None:
             extract = self._ensure_schema_dict(extract)
             if isinstance(extract, dict) and "schema" in extract:

@@ -545,6 +556,10 @@ class FirecrawlApp:
             scrape_params['actions'] = [action if isinstance(action, dict) else action.dict(exclude_none=True) for action in actions]
         if change_tracking_options:
             scrape_params['changeTrackingOptions'] = change_tracking_options if isinstance(change_tracking_options, dict) else change_tracking_options.dict(exclude_none=True)
+        if max_age is not None:
+            scrape_params['maxAge'] = max_age
+        if store_in_cache is not None:
+            scrape_params['storeInCache'] = store_in_cache
 
         scrape_params.update(kwargs)
 
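Taken together, the scrape_url changes expose the new cache and PDF controls as snake_case keyword arguments that are forwarded to parsePDF, maxAge and storeInCache in the request body. A hedged usage sketch; the API key is a placeholder, the PDF URL is the one used in the package's own tests, and the millisecond unit for max_age is an assumption, not something stated in this diff:

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

response = app.scrape_url(
    "https://arxiv.org/pdf/astro-ph/9301001.pdf",
    parse_pdf=True,       # forwarded as parsePDF
    max_age=3600000,      # forwarded as maxAge (unit assumed to be milliseconds)
    store_in_cache=True,  # forwarded as storeInCache
)
print(response)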
@@ -676,6 +691,7 @@ class FirecrawlApp:
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,

@@ -684,6 +700,7 @@ class FirecrawlApp:
         ignore_query_parameters: Optional[bool] = None,
         regex_on_full_url: Optional[bool] = None,
         delay: Optional[int] = None,
+        max_concurrency: Optional[int] = None,
         poll_interval: Optional[int] = 2,
         idempotency_key: Optional[str] = None,
         **kwargs

@@ -698,7 +715,8 @@ class FirecrawlApp:
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]):
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration

@@ -707,6 +725,7 @@ class FirecrawlApp:
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
             regex_on_full_url (Optional[bool]): Apply regex to full URLs
             delay (Optional[int]): Delay in seconds between scrapes
+            max_concurrency (Optional[int]): Maximum number of concurrent scrapes
             poll_interval (Optional[int]): Seconds between status checks (default: 2)
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API

@@ -736,7 +755,9 @@ class FirecrawlApp:
             crawl_params['maxDiscoveryDepth'] = max_discovery_depth
         if limit is not None:
             crawl_params['limit'] = limit
-        if allow_backward_links is not None:
+        if crawl_entire_domain is not None:
+            crawl_params['crawlEntireDomain'] = crawl_entire_domain
+        elif allow_backward_links is not None:
             crawl_params['allowBackwardLinks'] = allow_backward_links
         if allow_external_links is not None:
             crawl_params['allowExternalLinks'] = allow_external_links

@@ -754,7 +775,9 @@ class FirecrawlApp:
             crawl_params['regexOnFullURL'] = regex_on_full_url
         if delay is not None:
             crawl_params['delay'] = delay
-
+        if max_concurrency is not None:
+            crawl_params['maxConcurrency'] = max_concurrency
+
         # Add any additional kwargs
         crawl_params.update(kwargs)
 
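Note the precedence encoded in the if/elif above: when crawl_entire_domain is given it wins, and the deprecated allow_backward_links is only consulted otherwise. A sketch of the updated crawl call, assuming these hunks belong to crawl_url and using a placeholder key and URL:

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

crawl_status = app.crawl_url(
    "https://example.com",      # placeholder URL
    limit=50,
    crawl_entire_domain=True,   # sent as crawlEntireDomain; supersedes allow_backward_links
    max_concurrency=5,          # sent as maxConcurrency to cap parallel scrapes for this job
    poll_interval=2,
)
print(crawl_status)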
@@ -787,6 +810,7 @@ class FirecrawlApp:
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,

@@ -808,7 +832,8 @@ class FirecrawlApp:
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]):
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration

@@ -816,6 +841,8 @@ class FirecrawlApp:
             deduplicate_similar_urls (Optional[bool]): Remove similar URLs
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
             regex_on_full_url (Optional[bool]): Apply regex to full URLs
+            delay (Optional[int]): Delay in seconds between scrapes
+            max_concurrency (Optional[int]): Maximum number of concurrent scrapes
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API
 

@@ -845,7 +872,9 @@ class FirecrawlApp:
             crawl_params['maxDiscoveryDepth'] = max_discovery_depth
         if limit is not None:
             crawl_params['limit'] = limit
-        if allow_backward_links is not None:
+        if crawl_entire_domain is not None:
+            crawl_params['crawlEntireDomain'] = crawl_entire_domain
+        elif allow_backward_links is not None:
             crawl_params['allowBackwardLinks'] = allow_backward_links
         if allow_external_links is not None:
             crawl_params['allowExternalLinks'] = allow_external_links

@@ -863,7 +892,9 @@ class FirecrawlApp:
             crawl_params['regexOnFullURL'] = regex_on_full_url
         if delay is not None:
             crawl_params['delay'] = delay
-
+        if max_concurrency is not None:
+            crawl_params['maxConcurrency'] = max_concurrency
+
         # Add any additional kwargs
         crawl_params.update(kwargs)
 
@@ -1032,6 +1063,7 @@ class FirecrawlApp:
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,

@@ -1039,6 +1071,8 @@ class FirecrawlApp:
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,
         regex_on_full_url: Optional[bool] = None,
+        delay: Optional[int] = None,
+        max_concurrency: Optional[int] = None,
         idempotency_key: Optional[str] = None,
         **kwargs
     ) -> 'CrawlWatcher':

@@ -1052,7 +1086,8 @@ class FirecrawlApp:
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]):
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration

@@ -1060,6 +1095,8 @@ class FirecrawlApp:
             deduplicate_similar_urls (Optional[bool]): Remove similar URLs
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
             regex_on_full_url (Optional[bool]): Apply regex to full URLs
+            delay (Optional[int]): Delay in seconds between scrapes
+            max_concurrency (Optional[int]): Maximum number of concurrent scrapes
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API
 

@@ -1084,6 +1121,8 @@ class FirecrawlApp:
             deduplicate_similar_urls=deduplicate_similar_urls,
             ignore_query_parameters=ignore_query_parameters,
             regex_on_full_url=regex_on_full_url,
+            delay=delay,
+            max_concurrency=max_concurrency,
             idempotency_key=idempotency_key,
             **kwargs
         )
@@ -1102,6 +1141,7 @@ class FirecrawlApp:
         sitemap_only: Optional[bool] = None,
         limit: Optional[int] = None,
         timeout: Optional[int] = None,
+        use_index: Optional[bool] = None,
         **kwargs) -> MapResponse:
         """
         Map and discover links from a URL.

@@ -1144,7 +1184,9 @@ class FirecrawlApp:
             map_params['limit'] = limit
         if timeout is not None:
             map_params['timeout'] = timeout
-
+        if use_index is not None:
+            map_params['useIndex'] = use_index
+
         # Add any additional kwargs
         map_params.update(kwargs)
 
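A short sketch of the mapping call with the new switch, assuming this signature belongs to map_url; the key and URL are placeholders, and reading use_index as "allow results to come from Firecrawl's existing index" is an assumption rather than something stated in the diff:

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

links = app.map_url(
    "https://example.com",  # placeholder URL
    limit=100,
    use_index=True,         # forwarded as the new useIndex map parameter
)
print(links)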
@@ -1197,6 +1239,7 @@ class FirecrawlApp:
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
         agent: Optional[AgentOptions] = None,
         poll_interval: Optional[int] = 2,
+        max_concurrency: Optional[int] = None,
         idempotency_key: Optional[str] = None,
         **kwargs
     ) -> BatchScrapeStatusResponse:

@@ -1222,6 +1265,7 @@ class FirecrawlApp:
             json_options (Optional[JsonConfig]): JSON extraction config
             actions (Optional[List[Union]]): Actions to perform
             agent (Optional[AgentOptions]): Agent configuration
+            max_concurrency (Optional[int]): Maximum number of concurrent scrapes
             poll_interval (Optional[int]): Seconds between status checks (default: 2)
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API

@@ -1281,7 +1325,9 @@ class FirecrawlApp:
             scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
         if agent is not None:
             scrape_params['agent'] = agent.dict(exclude_none=True)
-
+        if max_concurrency is not None:
+            scrape_params['maxConcurrency'] = max_concurrency
+
         # Add any additional kwargs
         scrape_params.update(kwargs)
 
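The same max_concurrency knob is threaded through the batch scraping entry points here and in the hunk groups that follow. A sketch, assuming batch_scrape_urls is the method returning BatchScrapeStatusResponse; the key and URLs are placeholders:

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

batch = app.batch_scrape_urls(
    ["https://example.com/a", "https://example.com/b"],  # placeholder URLs
    max_concurrency=2,   # forwarded as maxConcurrency; limits parallel scrapes within this job
    poll_interval=2,
)
print(batch)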
@@ -1330,6 +1376,7 @@ class FirecrawlApp:
         json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
         agent: Optional[AgentOptions] = None,
+        max_concurrency: Optional[int] = None,
         idempotency_key: Optional[str] = None,
         **kwargs
     ) -> BatchScrapeResponse:

@@ -1355,6 +1402,7 @@ class FirecrawlApp:
             json_options (Optional[JsonConfig]): JSON extraction config
             actions (Optional[List[Union]]): Actions to perform
             agent (Optional[AgentOptions]): Agent configuration
+            max_concurrency (Optional[int]): Maximum number of concurrent scrapes
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API
 

@@ -1414,7 +1462,9 @@ class FirecrawlApp:
             scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
         if agent is not None:
             scrape_params['agent'] = agent.dict(exclude_none=True)
-
+        if max_concurrency is not None:
+            scrape_params['maxConcurrency'] = max_concurrency
+
         # Add any additional kwargs
         scrape_params.update(kwargs)
 

@@ -1462,6 +1512,7 @@ class FirecrawlApp:
         json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
         agent: Optional[AgentOptions] = None,
+        max_concurrency: Optional[int] = None,
         idempotency_key: Optional[str] = None,
         **kwargs
     ) -> 'CrawlWatcher':

@@ -1487,6 +1538,7 @@ class FirecrawlApp:
             json_options (Optional[JsonConfig]): JSON extraction config
             actions (Optional[List[Union]]): Actions to perform
             agent (Optional[AgentOptions]): Agent configuration
+            max_concurrency (Optional[int]): Maximum number of concurrent scrapes
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API
 

@@ -1542,7 +1594,9 @@ class FirecrawlApp:
             scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
         if agent is not None:
             scrape_params['agent'] = agent.dict(exclude_none=True)
-
+        if max_concurrency is not None:
+            scrape_params['maxConcurrency'] = max_concurrency
+
         # Add any additional kwargs
         scrape_params.update(kwargs)
 
@@ -2771,7 +2825,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
                 * limit - Maximum pages to crawl
 
             Link Following:
-                * allowBackwardLinks -
+                * allowBackwardLinks - DEPRECATED: Use crawlEntireDomain instead
+                * crawlEntireDomain - Follow parent directory links
                 * allowExternalLinks - Follow external domain links
                 * ignoreSitemap - Skip sitemap.xml processing
 

@@ -2853,6 +2908,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
         proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
+        parse_pdf: Optional[bool] = None,
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,

@@ -2930,6 +2986,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             scrape_params['blockAds'] = block_ads
         if proxy:
             scrape_params['proxy'] = proxy
+        if parse_pdf is not None:
+            scrape_params['parsePDF'] = parse_pdf
         if extract is not None:
             extract = self._ensure_schema_dict(extract)
             if isinstance(extract, dict) and "schema" in extract:
@@ -3250,6 +3308,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,

@@ -3272,7 +3331,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]):
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration

@@ -3310,7 +3370,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
             crawl_params['maxDiscoveryDepth'] = max_discovery_depth
         if limit is not None:
             crawl_params['limit'] = limit
-        if allow_backward_links is not None:
+        if crawl_entire_domain is not None:
+            crawl_params['crawlEntireDomain'] = crawl_entire_domain
+        elif allow_backward_links is not None:
             crawl_params['allowBackwardLinks'] = allow_backward_links
         if allow_external_links is not None:
             crawl_params['allowExternalLinks'] = allow_external_links

@@ -3362,6 +3424,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,

@@ -3384,7 +3447,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]):
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration

@@ -3418,7 +3482,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
             crawl_params['maxDiscoveryDepth'] = max_discovery_depth
         if limit is not None:
             crawl_params['limit'] = limit
-        if allow_backward_links is not None:
+        if crawl_entire_domain is not None:
+            crawl_params['crawlEntireDomain'] = crawl_entire_domain
+        elif allow_backward_links is not None:
             crawl_params['allowBackwardLinks'] = allow_backward_links
         if allow_external_links is not None:
             crawl_params['allowExternalLinks'] = allow_external_links
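AsyncFirecrawlApp picks up the same additions: parse_pdf on the async scrape path and crawl_entire_domain, with the same precedence over allow_backward_links, on the async crawl paths. A hedged async sketch with placeholder key and URL, assuming the async client mirrors the sync method names:

import asyncio

from firecrawl import AsyncFirecrawlApp

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
    status = await app.crawl_url(
        "https://example.com",     # placeholder URL
        limit=25,
        crawl_entire_domain=True,  # wins over the deprecated allow_backward_links
    )
    print(status)

asyncio.run(main())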
File without changes

firecrawl-2.10.0.dist-info/RECORD
ADDED

@@ -0,0 +1,12 @@
+firecrawl/__init__.py,sha256=qDOTVOIN0WXrkEEWPqy2UfFzbNDbimvD7HOPhXvTkC4,2613
+firecrawl/firecrawl.py,sha256=Bi7n0U94YJicUYnbjKKOmbkrpWh-kSe1ttPpil3rZl4,193869
+firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+firecrawl/__tests__/e2e_withAuth/test.py,sha256=-Fq2vPcMo0iQi4dwsUkkCd931ybDaTxMBnZbRfGdDcA,7931
+firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=k9IsEbdTHL9Cu49M4FpnQDEo2rnG6RqwmZAsK_EVJr4,21069
+tests/test_change_tracking.py,sha256=_IJ5ShLcoj2fHDBaw-nE4I4lHdmDB617ocK_XMHhXps,4177
+firecrawl-2.10.0.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
+firecrawl-2.10.0.dist-info/METADATA,sha256=r0ytUZrMwcrvFMIUB6J7yG7LI8Lz6GszkwFvGMF2nms,7166
+firecrawl-2.10.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
+firecrawl-2.10.0.dist-info/top_level.txt,sha256=8T3jOaSN5mtLghO-R3MQ8KO290gIX8hmfxQmglBPdLE,16
+firecrawl-2.10.0.dist-info/RECORD,,
@@ -1,79 +0,0 @@
-"""
-This is the Firecrawl package.
-
-This package provides a Python SDK for interacting with the Firecrawl API.
-It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
-and check the status of these jobs.
-
-For more information visit https://github.com/firecrawl/
-"""
-
-import logging
-import os
-
-from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions  # noqa
-
-__version__ = "2.7.1"
-
-# Define the logger for the Firecrawl project
-logger: logging.Logger = logging.getLogger("firecrawl")
-
-
-def _configure_logger() -> None:
-    """
-    Configure the firecrawl logger for console output.
-
-    The function attaches a handler for console output with a specific format and date
-    format to the firecrawl logger.
-    """
-    try:
-        # Create the formatter
-        formatter = logging.Formatter(
-            "[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
-            datefmt="%Y-%m-%d %H:%M:%S",
-        )
-
-        # Create the console handler and set the formatter
-        console_handler = logging.StreamHandler()
-        console_handler.setFormatter(formatter)
-
-        # Add the console handler to the firecrawl logger
-        logger.addHandler(console_handler)
-    except Exception as e:
-        logger.error("Failed to configure logging: %s", e)
-
-
-def setup_logging() -> None:
-    """Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable."""
-    # Check if the firecrawl logger already has a handler
-    if logger.hasHandlers():
-        return  # To prevent duplicate logging
-
-    # Check if the FIRECRAWL_LOGGING_LEVEL environment variable is set
-    if not (env := os.getenv("FIRECRAWL_LOGGING_LEVEL", "").upper()):
-        # Attach a no-op handler to prevent warnings about no handlers
-        logger.addHandler(logging.NullHandler())
-        return
-
-    # Attach the console handler to the firecrawl logger
-    _configure_logger()
-
-    # Set the logging level based on the FIRECRAWL_LOGGING_LEVEL environment variable
-    if env == "DEBUG":
-        logger.setLevel(logging.DEBUG)
-    elif env == "INFO":
-        logger.setLevel(logging.INFO)
-    elif env == "WARNING":
-        logger.setLevel(logging.WARNING)
-    elif env == "ERROR":
-        logger.setLevel(logging.ERROR)
-    elif env == "CRITICAL":
-        logger.setLevel(logging.CRITICAL)
-    else:
-        logger.setLevel(logging.INFO)
-        logger.warning("Unknown logging level: %s, defaulting to INFO", env)
-
-
-# Initialize logging configuration when the module is imported
-setup_logging()
-logger.debug("Debugging logger setup")
File without changes