firecrawl-py 2.8.0__py3-none-any.whl → 2.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of firecrawl-py might be problematic.
- firecrawl/__init__.py +1 -1
- firecrawl/__tests__/v1/e2e_withAuth/test.py +25 -0
- firecrawl/firecrawl.py +68 -15
- {firecrawl_py-2.8.0.dist-info → firecrawl_py-2.10.0.dist-info}/LICENSE +0 -0
- {firecrawl_py-2.8.0.dist-info → firecrawl_py-2.10.0.dist-info}/METADATA +1 -1
- firecrawl_py-2.10.0.dist-info/RECORD +12 -0
- {firecrawl_py-2.8.0.dist-info → firecrawl_py-2.10.0.dist-info}/top_level.txt +0 -2
- build/lib/build/lib/build/lib/build/lib/firecrawl/__init__.py +0 -79
- build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
- build/lib/build/lib/build/lib/build/lib/firecrawl/firecrawl.py +0 -4480
- build/lib/build/lib/build/lib/build/lib/tests/test_change_tracking.py +0 -98
- build/lib/build/lib/build/lib/firecrawl/__init__.py +0 -79
- build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
- build/lib/build/lib/build/lib/firecrawl/firecrawl.py +0 -4480
- build/lib/build/lib/build/lib/tests/test_change_tracking.py +0 -98
- build/lib/build/lib/firecrawl/__init__.py +0 -79
- build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
- build/lib/build/lib/firecrawl/firecrawl.py +0 -4480
- build/lib/build/lib/tests/test_change_tracking.py +0 -98
- build/lib/firecrawl/__init__.py +0 -79
- build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
- build/lib/firecrawl/firecrawl.py +0 -4480
- build/lib/tests/test_change_tracking.py +0 -98
- firecrawl_py-2.8.0.dist-info/RECORD +0 -40
- {firecrawl_py-2.8.0.dist-info → firecrawl_py-2.10.0.dist-info}/WHEEL +0 -0
firecrawl/__init__.py
CHANGED

@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
 
-__version__ = "2.8.0"
+__version__ = "2.10.0"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
firecrawl/__tests__/v1/e2e_withAuth/test.py
CHANGED

@@ -437,4 +437,29 @@ def test_search_with_invalid_params():
        app.search("test query", {"invalid_param": "value"})
    assert "ValidationError" in str(e.value)
 
+# def test_scrape_url_with_parse_pdf_true():
+#     if TEST_API_KEY:
+#         app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+#         response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf', parse_pdf=True)
+#         assert response is not None
+#         assert 'markdown' in response
+#         assert len(response['markdown']) > 100
+
+# def test_scrape_url_with_parse_pdf_false():
+#     if TEST_API_KEY:
+#         app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+#         response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf', parse_pdf=False)
+#         assert response is not None
+#         assert 'markdown' in response
+#         assert 'h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm' in response['markdown']
+
+# def test_scrape_options_with_parse_pdf():
+#     if TEST_API_KEY:
+#         from firecrawl.firecrawl import ScrapeOptions
+#         app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+#         scrape_options = ScrapeOptions(parsePDF=False, formats=['markdown'])
+#         response = app.search("firecrawl", limit=1, scrape_options=scrape_options)
+#         assert response is not None
+#         assert 'data' in response
+
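The new tests land commented out, but they document the intended call shapes for the new parse_pdf option. A runnable version of the third test, assuming a valid key in the TEST_API_KEY environment variable:

import os
from firecrawl import FirecrawlApp
from firecrawl.firecrawl import ScrapeOptions

app = FirecrawlApp(api_key=os.getenv('TEST_API_KEY'))

# parsePDF can also be set per-request via ScrapeOptions, e.g. for search results
scrape_options = ScrapeOptions(parsePDF=False, formats=['markdown'])
response = app.search("firecrawl", limit=1, scrape_options=scrape_options)
assert response is not None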
firecrawl/firecrawl.py
CHANGED

@@ -160,6 +160,7 @@ class ScrapeOptions(pydantic.BaseModel):
     changeTrackingOptions: Optional[ChangeTrackingOptions] = None
     maxAge: Optional[int] = None
     storeInCache: Optional[bool] = None
+    parsePDF: Optional[bool] = None
 
 class WaitAction(pydantic.BaseModel):
     """Wait action to perform during scraping."""

@@ -263,6 +264,7 @@ class CrawlParams(pydantic.BaseModel):
     ignoreQueryParameters: Optional[bool] = None
     regexOnFullURL: Optional[bool] = None
     delay: Optional[int] = None # Delay in seconds between scrapes
+    maxConcurrency: Optional[int] = None
 
 class CrawlResponse(pydantic.BaseModel):
     """Response from crawling operations."""

@@ -464,6 +466,7 @@ class FirecrawlApp:
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
         proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
+        parse_pdf: Optional[bool] = None,
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,

@@ -537,6 +540,8 @@ class FirecrawlApp:
             scrape_params['blockAds'] = block_ads
         if proxy:
             scrape_params['proxy'] = proxy
+        if parse_pdf is not None:
+            scrape_params['parsePDF'] = parse_pdf
         if extract is not None:
             extract = self._ensure_schema_dict(extract)
             if isinstance(extract, dict) and "schema" in extract:
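The hunks above wire PDF-parsing control through the synchronous client: a parsePDF field on ScrapeOptions plus a parse_pdf keyword on FirecrawlApp.scrape_url, forwarded to the API as parsePDF. A minimal sketch of the new keyword, assuming a placeholder API key (the URL mirrors the new tests above):

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

# Parse the PDF server-side into markdown
parsed = app.scrape_url(
    'https://arxiv.org/pdf/astro-ph/9301001.pdf',
    parse_pdf=True,
)

# Fetch the same document with PDF parsing disabled
raw = app.scrape_url(
    'https://arxiv.org/pdf/astro-ph/9301001.pdf',
    parse_pdf=False,
)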
@@ -686,6 +691,7 @@ class FirecrawlApp:
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,

@@ -694,6 +700,7 @@ class FirecrawlApp:
         ignore_query_parameters: Optional[bool] = None,
         regex_on_full_url: Optional[bool] = None,
         delay: Optional[int] = None,
+        max_concurrency: Optional[int] = None,
         poll_interval: Optional[int] = 2,
         idempotency_key: Optional[str] = None,
         **kwargs

@@ -708,7 +715,8 @@ class FirecrawlApp:
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration

@@ -717,6 +725,7 @@ class FirecrawlApp:
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
             regex_on_full_url (Optional[bool]): Apply regex to full URLs
             delay (Optional[int]): Delay in seconds between scrapes
+            max_concurrency (Optional[int]): Maximum number of concurrent scrapes
             poll_interval (Optional[int]): Seconds between status checks (default: 2)
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API

@@ -746,7 +755,9 @@ class FirecrawlApp:
             crawl_params['maxDiscoveryDepth'] = max_discovery_depth
         if limit is not None:
             crawl_params['limit'] = limit
-        if allow_backward_links is not None:
+        if crawl_entire_domain is not None:
+            crawl_params['crawlEntireDomain'] = crawl_entire_domain
+        elif allow_backward_links is not None:
             crawl_params['allowBackwardLinks'] = allow_backward_links
         if allow_external_links is not None:
             crawl_params['allowExternalLinks'] = allow_external_links

@@ -764,7 +775,9 @@ class FirecrawlApp:
             crawl_params['regexOnFullURL'] = regex_on_full_url
         if delay is not None:
             crawl_params['delay'] = delay
-
+        if max_concurrency is not None:
+            crawl_params['maxConcurrency'] = max_concurrency
+
         # Add any additional kwargs
         crawl_params.update(kwargs)
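These crawl hunks introduce crawl_entire_domain as the replacement for the deprecated allow_backward_links (when both are given, the new flag takes precedence) and add a max_concurrency cap alongside delay. A hedged sketch of a crawl using the new options, with placeholder key and site:

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

status = app.crawl_url(
    'https://docs.firecrawl.dev',  # placeholder site
    limit=50,
    crawl_entire_domain=True,      # replaces allow_backward_links
    delay=1,                       # seconds between scrapes
    max_concurrency=2,             # sent to the API as maxConcurrency
)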
@@ -797,6 +810,7 @@ class FirecrawlApp:
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,

@@ -818,7 +832,8 @@ class FirecrawlApp:
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration

@@ -826,6 +841,8 @@ class FirecrawlApp:
             deduplicate_similar_urls (Optional[bool]): Remove similar URLs
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
             regex_on_full_url (Optional[bool]): Apply regex to full URLs
+            delay (Optional[int]): Delay in seconds between scrapes
+            max_concurrency (Optional[int]): Maximum number of concurrent scrapes
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API

@@ -855,7 +872,9 @@ class FirecrawlApp:
             crawl_params['maxDiscoveryDepth'] = max_discovery_depth
         if limit is not None:
             crawl_params['limit'] = limit
-        if allow_backward_links is not None:
+        if crawl_entire_domain is not None:
+            crawl_params['crawlEntireDomain'] = crawl_entire_domain
+        elif allow_backward_links is not None:
             crawl_params['allowBackwardLinks'] = allow_backward_links
         if allow_external_links is not None:
             crawl_params['allowExternalLinks'] = allow_external_links

@@ -873,7 +892,9 @@ class FirecrawlApp:
             crawl_params['regexOnFullURL'] = regex_on_full_url
         if delay is not None:
             crawl_params['delay'] = delay
-
+        if max_concurrency is not None:
+            crawl_params['maxConcurrency'] = max_concurrency
+
         # Add any additional kwargs
         crawl_params.update(kwargs)

@@ -1042,6 +1063,7 @@ class FirecrawlApp:
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,

@@ -1049,6 +1071,8 @@ class FirecrawlApp:
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,
         regex_on_full_url: Optional[bool] = None,
+        delay: Optional[int] = None,
+        max_concurrency: Optional[int] = None,
         idempotency_key: Optional[str] = None,
         **kwargs
     ) -> 'CrawlWatcher':

@@ -1062,7 +1086,8 @@ class FirecrawlApp:
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration

@@ -1070,6 +1095,8 @@ class FirecrawlApp:
             deduplicate_similar_urls (Optional[bool]): Remove similar URLs
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
             regex_on_full_url (Optional[bool]): Apply regex to full URLs
+            delay (Optional[int]): Delay in seconds between scrapes
+            max_concurrency (Optional[int]): Maximum number of concurrent scrapes
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API

@@ -1094,6 +1121,8 @@ class FirecrawlApp:
             deduplicate_similar_urls=deduplicate_similar_urls,
             ignore_query_parameters=ignore_query_parameters,
             regex_on_full_url=regex_on_full_url,
+            delay=delay,
+            max_concurrency=max_concurrency,
             idempotency_key=idempotency_key,
             **kwargs
         )

@@ -1210,6 +1239,7 @@ class FirecrawlApp:
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
         agent: Optional[AgentOptions] = None,
         poll_interval: Optional[int] = 2,
+        max_concurrency: Optional[int] = None,
         idempotency_key: Optional[str] = None,
         **kwargs
     ) -> BatchScrapeStatusResponse:

@@ -1235,6 +1265,7 @@ class FirecrawlApp:
             json_options (Optional[JsonConfig]): JSON extraction config
             actions (Optional[List[Union]]): Actions to perform
             agent (Optional[AgentOptions]): Agent configuration
+            max_concurrency (Optional[int]): Maximum number of concurrent scrapes
             poll_interval (Optional[int]): Seconds between status checks (default: 2)
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API

@@ -1294,7 +1325,9 @@ class FirecrawlApp:
             scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
         if agent is not None:
             scrape_params['agent'] = agent.dict(exclude_none=True)
-
+        if max_concurrency is not None:
+            scrape_params['maxConcurrency'] = max_concurrency
+
         # Add any additional kwargs
         scrape_params.update(kwargs)
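batch_scrape_urls gains the same max_concurrency knob; as the hunk above shows, it is forwarded in the request body as maxConcurrency. A sketch, assuming a placeholder key and URL list:

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

job = app.batch_scrape_urls(
    ['https://firecrawl.dev', 'https://docs.firecrawl.dev'],  # placeholder URLs
    max_concurrency=2,  # scrape at most two URLs of this batch at a time
)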
@@ -1343,6 +1376,7 @@ class FirecrawlApp:
         json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
         agent: Optional[AgentOptions] = None,
+        max_concurrency: Optional[int] = None,
         idempotency_key: Optional[str] = None,
         **kwargs
     ) -> BatchScrapeResponse:

@@ -1368,6 +1402,7 @@ class FirecrawlApp:
             json_options (Optional[JsonConfig]): JSON extraction config
             actions (Optional[List[Union]]): Actions to perform
             agent (Optional[AgentOptions]): Agent configuration
+            max_concurrency (Optional[int]): Maximum number of concurrent scrapes
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API

@@ -1427,7 +1462,9 @@ class FirecrawlApp:
             scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
         if agent is not None:
             scrape_params['agent'] = agent.dict(exclude_none=True)
-
+        if max_concurrency is not None:
+            scrape_params['maxConcurrency'] = max_concurrency
+
         # Add any additional kwargs
         scrape_params.update(kwargs)

@@ -1475,6 +1512,7 @@ class FirecrawlApp:
         json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
         agent: Optional[AgentOptions] = None,
+        max_concurrency: Optional[int] = None,
         idempotency_key: Optional[str] = None,
         **kwargs
     ) -> 'CrawlWatcher':

@@ -1500,6 +1538,7 @@ class FirecrawlApp:
             json_options (Optional[JsonConfig]): JSON extraction config
             actions (Optional[List[Union]]): Actions to perform
             agent (Optional[AgentOptions]): Agent configuration
+            max_concurrency (Optional[int]): Maximum number of concurrent scrapes
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API

@@ -1555,7 +1594,9 @@ class FirecrawlApp:
             scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
         if agent is not None:
             scrape_params['agent'] = agent.dict(exclude_none=True)
-
+        if max_concurrency is not None:
+            scrape_params['maxConcurrency'] = max_concurrency
+
         # Add any additional kwargs
         scrape_params.update(kwargs)

@@ -2784,7 +2825,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             * limit - Maximum pages to crawl
 
         Link Following:
-            * allowBackwardLinks - Follow parent directory links
+            * allowBackwardLinks - DEPRECATED: Use crawlEntireDomain instead
+            * crawlEntireDomain - Follow parent directory links
             * allowExternalLinks - Follow external domain links
             * ignoreSitemap - Skip sitemap.xml processing

@@ -2866,6 +2908,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
         proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
+        parse_pdf: Optional[bool] = None,
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,

@@ -2943,6 +2986,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             scrape_params['blockAds'] = block_ads
         if proxy:
             scrape_params['proxy'] = proxy
+        if parse_pdf is not None:
+            scrape_params['parsePDF'] = parse_pdf
         if extract is not None:
             extract = self._ensure_schema_dict(extract)
             if isinstance(extract, dict) and "schema" in extract:

@@ -3263,6 +3308,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,

@@ -3285,7 +3331,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration

@@ -3323,7 +3370,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
             crawl_params['maxDiscoveryDepth'] = max_discovery_depth
         if limit is not None:
             crawl_params['limit'] = limit
-        if allow_backward_links is not None:
+        if crawl_entire_domain is not None:
+            crawl_params['crawlEntireDomain'] = crawl_entire_domain
+        elif allow_backward_links is not None:
             crawl_params['allowBackwardLinks'] = allow_backward_links
         if allow_external_links is not None:
             crawl_params['allowExternalLinks'] = allow_external_links

@@ -3375,6 +3424,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         max_discovery_depth: Optional[int] = None,
         limit: Optional[int] = None,
         allow_backward_links: Optional[bool] = None,
+        crawl_entire_domain: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
         scrape_options: Optional[ScrapeOptions] = None,

@@ -3397,7 +3447,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             max_depth (Optional[int]): Maximum crawl depth
             max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
             limit (Optional[int]): Maximum pages to crawl
-            allow_backward_links (Optional[bool]): Follow parent directory links
+            allow_backward_links (Optional[bool]): DEPRECATED: Use crawl_entire_domain instead
+            crawl_entire_domain (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
             scrape_options (Optional[ScrapeOptions]): Page scraping configuration

@@ -3431,7 +3482,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
             crawl_params['maxDiscoveryDepth'] = max_discovery_depth
         if limit is not None:
             crawl_params['limit'] = limit
-        if allow_backward_links is not None:
+        if crawl_entire_domain is not None:
+            crawl_params['crawlEntireDomain'] = crawl_entire_domain
+        elif allow_backward_links is not None:
             crawl_params['allowBackwardLinks'] = allow_backward_links
         if allow_external_links is not None:
             crawl_params['allowExternalLinks'] = allow_external_links
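The async client mirrors the new parameters, so the same call shapes work under await. A sketch with placeholder values, assuming the async crawl accepts the same keywords as the synchronous one:

import asyncio
from firecrawl import AsyncFirecrawlApp

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
    status = await app.crawl_url(
        'https://docs.firecrawl.dev',  # placeholder site
        limit=25,
        crawl_entire_domain=True,
        max_concurrency=2,  # assumed to match the sync signature
    )
    print(status)

asyncio.run(main())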
firecrawl_py-2.10.0.dist-info/RECORD
ADDED

@@ -0,0 +1,12 @@
+firecrawl/__init__.py,sha256=qDOTVOIN0WXrkEEWPqy2UfFzbNDbimvD7HOPhXvTkC4,2613
+firecrawl/firecrawl.py,sha256=Bi7n0U94YJicUYnbjKKOmbkrpWh-kSe1ttPpil3rZl4,193869
+firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+firecrawl/__tests__/e2e_withAuth/test.py,sha256=-Fq2vPcMo0iQi4dwsUkkCd931ybDaTxMBnZbRfGdDcA,7931
+firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=k9IsEbdTHL9Cu49M4FpnQDEo2rnG6RqwmZAsK_EVJr4,21069
+tests/test_change_tracking.py,sha256=_IJ5ShLcoj2fHDBaw-nE4I4lHdmDB617ocK_XMHhXps,4177
+firecrawl_py-2.10.0.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
+firecrawl_py-2.10.0.dist-info/METADATA,sha256=k_qij9hylsX7bmsCfslPrYl0xaQ1B356E7gqVcqTsa4,7169
+firecrawl_py-2.10.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
+firecrawl_py-2.10.0.dist-info/top_level.txt,sha256=8T3jOaSN5mtLghO-R3MQ8KO290gIX8hmfxQmglBPdLE,16
+firecrawl_py-2.10.0.dist-info/RECORD,,
build/lib/firecrawl/__init__.py
REMOVED
(the same 79-line module is removed from each nested build/lib copy listed above)

@@ -1,79 +0,0 @@
-"""
-This is the Firecrawl package.
-
-This package provides a Python SDK for interacting with the Firecrawl API.
-It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
-and check the status of these jobs.
-
-For more information visit https://github.com/firecrawl/
-"""
-
-import logging
-import os
-
-from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
-
-__version__ = "2.8.0"
-
-# Define the logger for the Firecrawl project
-logger: logging.Logger = logging.getLogger("firecrawl")
-
-
-def _configure_logger() -> None:
-    """
-    Configure the firecrawl logger for console output.
-
-    The function attaches a handler for console output with a specific format and date
-    format to the firecrawl logger.
-    """
-    try:
-        # Create the formatter
-        formatter = logging.Formatter(
-            "[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
-            datefmt="%Y-%m-%d %H:%M:%S",
-        )
-
-        # Create the console handler and set the formatter
-        console_handler = logging.StreamHandler()
-        console_handler.setFormatter(formatter)
-
-        # Add the console handler to the firecrawl logger
-        logger.addHandler(console_handler)
-    except Exception as e:
-        logger.error("Failed to configure logging: %s", e)
-
-
-def setup_logging() -> None:
-    """Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable."""
-    # Check if the firecrawl logger already has a handler
-    if logger.hasHandlers():
-        return # To prevent duplicate logging
-
-    # Check if the FIRECRAWL_LOGGING_LEVEL environment variable is set
-    if not (env := os.getenv("FIRECRAWL_LOGGING_LEVEL", "").upper()):
-        # Attach a no-op handler to prevent warnings about no handlers
-        logger.addHandler(logging.NullHandler())
-        return
-
-    # Attach the console handler to the firecrawl logger
-    _configure_logger()
-
-    # Set the logging level based on the FIRECRAWL_LOGGING_LEVEL environment variable
-    if env == "DEBUG":
-        logger.setLevel(logging.DEBUG)
-    elif env == "INFO":
-        logger.setLevel(logging.INFO)
-    elif env == "WARNING":
-        logger.setLevel(logging.WARNING)
-    elif env == "ERROR":
-        logger.setLevel(logging.ERROR)
-    elif env == "CRITICAL":
-        logger.setLevel(logging.CRITICAL)
-    else:
-        logger.setLevel(logging.INFO)
-        logger.warning("Unknown logging level: %s, defaulting to INFO", env)
-
-
-# Initialize logging configuration when the module is imported
-setup_logging()
-logger.debug("Debugging logger setup")
build/lib/firecrawl/__tests__/e2e_withAuth/test.py
REMOVED
(the same 170-line v0 test module is removed from each nested build/lib copy listed above)

@@ -1,170 +0,0 @@
-import importlib.util
-import pytest
-import time
-import os
-from uuid import uuid4
-from dotenv import load_dotenv
-
-load_dotenv()
-
-API_URL = "http://127.0.0.1:3002"
-ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"
-TEST_API_KEY = os.getenv('TEST_API_KEY')
-
-print(f"ABSOLUTE_FIRECRAWL_PATH: {ABSOLUTE_FIRECRAWL_PATH}")
-
-spec = importlib.util.spec_from_file_location("FirecrawlApp", ABSOLUTE_FIRECRAWL_PATH)
-firecrawl = importlib.util.module_from_spec(spec)
-spec.loader.exec_module(firecrawl)
-FirecrawlApp = firecrawl.FirecrawlApp
-
-def test_no_api_key():
-    with pytest.raises(Exception) as excinfo:
-        invalid_app = FirecrawlApp(api_url=API_URL, version='v0')
-    assert "No API key provided" in str(excinfo.value)
-
-def test_scrape_url_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
-    with pytest.raises(Exception) as excinfo:
-        invalid_app.scrape_url('https://firecrawl.dev')
-    assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
-
-# def test_blocklisted_url():
-#     blocklisted_url = "https://facebook.com/fake-test"
-#     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-#     with pytest.raises(Exception) as excinfo:
-#         app.scrape_url(blocklisted_url)
-#     assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
-
-def test_successful_response_with_valid_preview_token():
-    app = FirecrawlApp(api_url=API_URL, api_key=os.getenv('PREVIEW_TOKEN'), version='v0')
-    response = app.scrape_url('https://roastmywebsite.ai')
-    assert response is not None
-    assert 'content' in response
-    assert "_Roast_" in response['content']
-
-def test_scrape_url_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url('https://roastmywebsite.ai')
-    print(response)
-
-    assert response is not None
-    assert 'content' in response
-    assert 'markdown' in response
-    assert 'metadata' in response
-    assert 'html' not in response
-    assert "_Roast_" in response['content']
-
-def test_successful_response_with_valid_api_key_and_include_html():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url('https://roastmywebsite.ai', {'pageOptions': {'includeHtml': True}})
-    assert response is not None
-    assert 'content' in response
-    assert 'markdown' in response
-    assert 'html' in response
-    assert 'metadata' in response
-    assert "_Roast_" in response['content']
-    assert "_Roast_" in response['markdown']
-    assert "<h1" in response['html']
-
-def test_successful_response_for_valid_scrape_with_pdf_file():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf')
-    assert response is not None
-    assert 'content' in response
-    assert 'metadata' in response
-    assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
-
-def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001')
-    time.sleep(6) # wait for 6 seconds
-    assert response is not None
-    assert 'content' in response
-    assert 'metadata' in response
-    assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
-
-def test_crawl_url_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
-    with pytest.raises(Exception) as excinfo:
-        invalid_app.crawl_url('https://firecrawl.dev')
-    assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
-
-# def test_should_return_error_for_blocklisted_url():
-#     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-#     blocklisted_url = "https://twitter.com/fake-test"
-#     with pytest.raises(Exception) as excinfo:
-#         app.crawl_url(blocklisted_url)
-#     assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
-
-def test_crawl_url_wait_for_completion_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True)
-    assert response is not None
-    assert len(response) > 0
-    assert 'content' in response[0]
-    assert "_Roast_" in response[0]['content']
-
-def test_crawl_url_with_idempotency_key_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    uniqueIdempotencyKey = str(uuid4())
-    response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
-    assert response is not None
-    assert len(response) > 0
-    assert 'content' in response[0]
-    assert "_Roast_" in response[0]['content']
-
-    with pytest.raises(Exception) as excinfo:
-        app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
-    assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value)
-
-def test_check_crawl_status_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False)
-    assert response is not None
-    assert 'jobId' in response
-
-    time.sleep(30) # wait for 30 seconds
-    status_response = app.check_crawl_status(response['jobId'])
-    assert status_response is not None
-    assert 'status' in status_response
-    assert status_response['status'] == 'completed'
-    assert 'data' in status_response
-    assert len(status_response['data']) > 0
-
-def test_search_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.search("test query")
-    assert response is not None
-    assert 'content' in response[0]
-    assert len(response) > 2
-
-def test_search_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
-    with pytest.raises(Exception) as excinfo:
-        invalid_app.search("test query")
-    assert "Unexpected error during search: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
-
-def test_llm_extraction():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url("https://firecrawl.dev", {
-        'extractorOptions': {
-            'mode': 'llm-extraction',
-            'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
-            'extractionSchema': {
-                'type': 'object',
-                'properties': {
-                    'company_mission': {'type': 'string'},
-                    'supports_sso': {'type': 'boolean'},
-                    'is_open_source': {'type': 'boolean'}
-                },
-                'required': ['company_mission', 'supports_sso', 'is_open_source']
-            }
-        }
-    })
-    assert response is not None
-    assert 'llm_extraction' in response
-    llm_extraction = response['llm_extraction']
-    assert 'company_mission' in llm_extraction
-    assert isinstance(llm_extraction['supports_sso'], bool)
-    assert isinstance(llm_extraction['is_open_source'], bool)