firecrawl 2.1.2__py3-none-any.whl → 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of firecrawl might be problematic.
- firecrawl/__init__.py +1 -1
- firecrawl/firecrawl.py +105 -20
- {firecrawl-2.1.2.dist-info → firecrawl-2.2.0.dist-info}/METADATA +1 -1
- firecrawl-2.2.0.dist-info/RECORD +12 -0
- {firecrawl-2.1.2.dist-info → firecrawl-2.2.0.dist-info}/top_level.txt +0 -2
- build/lib/build/lib/firecrawl/__init__.py +0 -79
- build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- build/lib/build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
- build/lib/build/lib/firecrawl/firecrawl.py +0 -4291
- build/lib/build/lib/tests/test_change_tracking.py +0 -98
- build/lib/firecrawl/__init__.py +0 -79
- build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
- build/lib/firecrawl/firecrawl.py +0 -4291
- build/lib/tests/test_change_tracking.py +0 -98
- firecrawl-2.1.2.dist-info/RECORD +0 -26
- {firecrawl-2.1.2.dist-info → firecrawl-2.2.0.dist-info}/LICENSE +0 -0
- {firecrawl-2.1.2.dist-info → firecrawl-2.2.0.dist-info}/WHEEL +0 -0
firecrawl/__init__.py
CHANGED
firecrawl/firecrawl.py
CHANGED
@@ -570,7 +570,6 @@ class FirecrawlApp:
         location: Optional[str] = None,
         timeout: Optional[int] = None,
         scrape_options: Optional[ScrapeOptions] = None,
-        params: Optional[Union[Dict[str, Any], SearchParams]] = None,
         **kwargs) -> SearchResponse:
         """
         Search for content using Firecrawl.
@@ -585,7 +584,6 @@ class FirecrawlApp:
             location (Optional[str]): Geo-targeting
             timeout (Optional[int]): Request timeout in milliseconds
             scrape_options (Optional[ScrapeOptions]): Result scraping configuration
-            params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
             **kwargs: Additional keyword arguments for future compatibility
 
         Returns:
@@ -598,13 +596,11 @@ class FirecrawlApp:
         Raises:
             Exception: If search fails or response cannot be parsed
         """
+        # Validate any additional kwargs
+        self._validate_kwargs(kwargs, "search")
+
         # Build search parameters
         search_params = {}
-        if params:
-            if isinstance(params, dict):
-                search_params.update(params)
-            else:
-                search_params.update(params.dict(exclude_none=True))
 
         # Add individual parameters
         if limit is not None:
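Note: with this change `search()` no longer accepts a `params` dict or `SearchParams` object; options are passed as plain keyword arguments, and anything unrecognized is rejected by `_validate_kwargs` before a request is made. A minimal sketch of the new calling convention (the API key and query are placeholders, not taken from the diff):

    from firecrawl import FirecrawlApp

    app = FirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key

    # 2.2.0 style: search options are plain keyword arguments
    results = app.search("firecrawl sdk changelog", limit=5, timeout=30000)

    # The removed 2.1.x style now lands in **kwargs and fails fast, because
    # "params" is not on the allow-list used by _validate_kwargs(..., "search")
    try:
        app.search("firecrawl sdk changelog", params={"limit": 5})
    except ValueError as err:
        print(err)  # Unsupported parameter(s) for search: params. ...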
@@ -705,6 +701,9 @@ class FirecrawlApp:
         Raises:
             Exception: If crawl fails
         """
+        # Validate any additional kwargs
+        self._validate_kwargs(kwargs, "crawl_url")
+
         crawl_params = {}
 
         # Add individual parameters
@@ -808,6 +807,9 @@ class FirecrawlApp:
         Raises:
             Exception: If crawl initiation fails
         """
+        # Validate any additional kwargs
+        self._validate_kwargs(kwargs, "async_crawl_url")
+
         crawl_params = {}
 
         # Add individual parameters
@@ -1076,7 +1078,7 @@ class FirecrawlApp:
         sitemap_only: Optional[bool] = None,
         limit: Optional[int] = None,
         timeout: Optional[int] = None,
-        …
+        **kwargs) -> MapResponse:
         """
         Map and discover links from a URL.
 
@@ -1088,7 +1090,7 @@ class FirecrawlApp:
             sitemap_only (Optional[bool]): Only use sitemap.xml
             limit (Optional[int]): Maximum URLs to return
             timeout (Optional[int]): Request timeout in milliseconds
-            …
+            **kwargs: Additional parameters to pass to the API
 
         Returns:
             MapResponse: Response containing:
@@ -1099,10 +1101,11 @@ class FirecrawlApp:
         Raises:
             Exception: If mapping fails or response cannot be parsed
         """
+        # Validate any additional kwargs
+        self._validate_kwargs(kwargs, "map_url")
+
         # Build map parameters
         map_params = {}
-        if params:
-            map_params.update(params.dict(exclude_none=True))
 
         # Add individual parameters
         if search is not None:
@@ -1118,6 +1121,9 @@ class FirecrawlApp:
         if timeout is not None:
             map_params['timeout'] = timeout
 
+        # Add any additional kwargs
+        map_params.update(kwargs)
+
         # Create final params object
         final_params = MapParams(**map_params)
         params_dict = final_params.dict(exclude_none=True)
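Note: `map_url()` gets the same treatment plus a passthrough: validated extras are merged into the parameter dict with `map_params.update(kwargs)` before the `MapParams` model is built, so a keyword has to clear both the allow-list and Pydantic validation. A hedged sketch, reusing the `app` object from the first sketch above (URL is a placeholder):

    # Named options and allow-listed kwargs end up in the same MapParams object
    links = app.map_url("https://docs.firecrawl.dev", search="sdk", limit=50)

    # A crawl-only option such as include_paths is not on the map_url allow-list,
    # so it is rejected before MapParams is ever constructed
    try:
        app.map_url("https://docs.firecrawl.dev", include_paths=["/blog/*"])
    except ValueError as err:
        print(err)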
@@ -1205,6 +1211,9 @@ class FirecrawlApp:
         Raises:
             Exception: If batch scrape fails
         """
+        # Validate any additional kwargs
+        self._validate_kwargs(kwargs, "batch_scrape_urls")
+
         scrape_params = {}
 
         # Add individual parameters
@@ -1328,6 +1337,9 @@ class FirecrawlApp:
         Raises:
             Exception: If job initiation fails
         """
+        # Validate any additional kwargs
+        self._validate_kwargs(kwargs, "async_batch_scrape_urls")
+
         scrape_params = {}
 
         # Add individual parameters
@@ -1446,6 +1458,9 @@ class FirecrawlApp:
         Raises:
             Exception: If batch scrape job fails to start
         """
+        # Validate any additional kwargs
+        self._validate_kwargs(kwargs, "batch_scrape_urls_and_watch")
+
         scrape_params = {}
 
         # Add individual parameters
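Note: the three batch-scrape entry points (`batch_scrape_urls`, `async_batch_scrape_urls`, `batch_scrape_urls_and_watch`) only gain the validation call here; their allow-lists, defined further down in this diff, additionally accept "headers" and "agent", which the plain `scrape_url` list does not. A sketch reusing the same `app` object, assuming the list of URLs is still the first positional argument as in earlier releases:

    job = app.batch_scrape_urls(
        ["https://firecrawl.dev", "https://docs.firecrawl.dev"],
        formats=["markdown"],
        headers={"User-Agent": "my-crawler/1.0"},  # allowed for the batch methods
    )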
@@ -2394,6 +2409,56 @@ class FirecrawlApp:
 
         return {'success': False, 'error': 'Internal server error'}
 
+    def _validate_kwargs(self, kwargs: Dict[str, Any], method_name: str) -> None:
+        """
+        Validate additional keyword arguments before they are passed to the API.
+        This provides early validation before the Pydantic model validation.
+
+        Args:
+            kwargs (Dict[str, Any]): Additional keyword arguments to validate
+            method_name (str): Name of the method these kwargs are for
+
+        Raises:
+            ValueError: If kwargs contain invalid or unsupported parameters
+        """
+        if not kwargs:
+            return
+
+        # Known parameter mappings for each method
+        method_params = {
+            "scrape_url": {"formats", "include_tags", "exclude_tags", "only_main_content", "wait_for",
+                           "timeout", "location", "mobile", "skip_tls_verification", "remove_base64_images",
+                           "block_ads", "proxy", "extract", "json_options", "actions"},
+            "search": {"limit", "tbs", "filter", "lang", "country", "location", "timeout", "scrape_options"},
+            "crawl_url": {"include_paths", "exclude_paths", "max_depth", "max_discovery_depth", "limit",
+                          "allow_backward_links", "allow_external_links", "ignore_sitemap", "scrape_options",
+                          "webhook", "deduplicate_similar_urls", "ignore_query_parameters", "regex_on_full_url"},
+            "map_url": {"search", "ignore_sitemap", "include_subdomains", "sitemap_only", "limit", "timeout"},
+            "batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
+                                  "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
+                                  "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
+                                  "actions", "agent"},
+            "async_batch_scrape_urls": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
+                                        "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
+                                        "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
+                                        "actions", "agent"},
+            "batch_scrape_urls_and_watch": {"formats", "headers", "include_tags", "exclude_tags", "only_main_content",
+                                            "wait_for", "timeout", "location", "mobile", "skip_tls_verification",
+                                            "remove_base64_images", "block_ads", "proxy", "extract", "json_options",
+                                            "actions", "agent"}
+        }
+
+        # Get allowed parameters for this method
+        allowed_params = method_params.get(method_name, set())
+
+        # Check for unknown parameters
+        unknown_params = set(kwargs.keys()) - allowed_params
+        if unknown_params:
+            raise ValueError(f"Unsupported parameter(s) for {method_name}: {', '.join(unknown_params)}. Please refer to the API documentation for the correct parameters.")
+
+        # Additional type validation can be added here if needed
+        # For now, we rely on Pydantic models for detailed type validation
+
 class CrawlWatcher:
     """
     A class to watch and handle crawl job events via WebSocket connection.
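Note on the new method above: `_validate_kwargs` is a plain allow-list check. An empty kwargs dict returns immediately, a method name missing from the table yields an empty set (so every extra kwarg is rejected), and otherwise the set difference between supplied keys and allowed keys drives the ValueError. A toy illustration of the same check, not SDK code:

    allowed = {"formats", "block_ads", "proxy", "timeout"}   # abridged subset of the scrape_url entry
    supplied = {"formats": ["markdown"], "blokc_ads": True}  # note the typo
    unknown = set(supplied) - allowed
    print(unknown)  # {'blokc_ads'} -> _validate_kwargs raises ValueError for this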
@@ -2710,7 +2775,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
     async def scrape_url(
         self,
         url: str,
-        …
+        *,
+        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
         include_tags: Optional[List[str]] = None,
         exclude_tags: Optional[List[str]] = None,
         only_main_content: Optional[bool] = None,
@@ -2724,9 +2790,10 @@ class AsyncFirecrawlApp(FirecrawlApp):
         proxy: Optional[Literal["basic", "stealth"]] = None,
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
-        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None
+        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
+        **kwargs) -> ScrapeResponse[Any]:
         """
-        Scrape
+        Scrape a single URL asynchronously.
 
         Args:
             url (str): Target URL to scrape
@@ -2745,17 +2812,26 @@ class AsyncFirecrawlApp(FirecrawlApp):
             extract (Optional[JsonConfig]): Content extraction settings
             json_options (Optional[JsonConfig]): JSON extraction settings
             actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
+            **kwargs: Additional parameters to pass to the API
 
         Returns:
-            …
-            …
-            …
-            …
-            …
+            ScrapeResponse with:
+            * success - Whether scrape was successful
+            * markdown - Markdown content if requested
+            * html - HTML content if requested
+            * rawHtml - Raw HTML content if requested
+            * links - Extracted links if requested
+            * screenshot - Screenshot if requested
+            * extract - Extracted data if requested
+            * json - JSON data if requested
+            * error - Error message if scrape failed
 
         Raises:
-            …
+            Exception: If scraping fails
         """
+        # Validate any additional kwargs
+        self._validate_kwargs(kwargs, "scrape_url")
+
         headers = self._prepare_headers()
 
         # Build scrape parameters
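Note: on the async client, `scrape_url` gains the bare `*` marker, so every option after the URL is keyword-only, and the same kwargs validation runs before the request. A hedged sketch of the 2.2.0 call shape (URL and key are placeholders; the class is imported from the module shown in this diff):

    import asyncio
    from firecrawl.firecrawl import AsyncFirecrawlApp

    async def main():
        app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
        # options after the bare * must be passed by name
        page = await app.scrape_url("https://firecrawl.dev", formats=["markdown", "links"])
        print(page)  # ScrapeResponse carrying the fields listed in the new docstring

    asyncio.run(main())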
@@ -2879,6 +2955,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
         Raises:
             Exception: If batch scrape fails
         """
+        # Validate any additional kwargs
+        self._validate_kwargs(kwargs, "batch_scrape_urls")
+
         scrape_params = {}
 
         # Add individual parameters
@@ -3007,6 +3086,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
         Raises:
             Exception: If job initiation fails
         """
+        # Validate any additional kwargs
+        self._validate_kwargs(kwargs, "async_batch_scrape_urls")
+
         scrape_params = {}
 
         # Add individual parameters
@@ -3126,6 +3208,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
         Raises:
             Exception: If crawl fails
         """
+        # Validate any additional kwargs
+        self._validate_kwargs(kwargs, "crawl_url")
+
         crawl_params = {}
 
         # Add individual parameters
firecrawl-2.2.0.dist-info/RECORD
ADDED
@@ -0,0 +1,12 @@
+firecrawl/__init__.py,sha256=FlLNQdG6xpLs8ppLhPSF-bLx9L3o_A8gmU2UzAACSv8,2570
+firecrawl/firecrawl.py,sha256=wyxYLkEKiW9GO4PKNElewsOJOJzBq_hHIDTl3nd5j94,182693
+firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+firecrawl/__tests__/e2e_withAuth/test.py,sha256=-Fq2vPcMo0iQi4dwsUkkCd931ybDaTxMBnZbRfGdDcA,7931
+firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=DcCw-cohtnL-t9XPekUtRoQrgg3UCWu8Ikqudf9ory8,19880
+tests/test_change_tracking.py,sha256=_IJ5ShLcoj2fHDBaw-nE4I4lHdmDB617ocK_XMHhXps,4177
+firecrawl-2.2.0.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
+firecrawl-2.2.0.dist-info/METADATA,sha256=b4f32c9eyvFDwj11eOUf0KKy1fCsZTKfJFN-eRH_ohA,10583
+firecrawl-2.2.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
+firecrawl-2.2.0.dist-info/top_level.txt,sha256=8T3jOaSN5mtLghO-R3MQ8KO290gIX8hmfxQmglBPdLE,16
+firecrawl-2.2.0.dist-info/RECORD,,
@@ -1,79 +0,0 @@
-"""
-This is the Firecrawl package.
-
-This package provides a Python SDK for interacting with the Firecrawl API.
-It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
-and check the status of these jobs.
-
-For more information visit https://github.com/firecrawl/
-"""
-
-import logging
-import os
-
-from .firecrawl import FirecrawlApp, JsonConfig, ScrapeOptions # noqa
-
-__version__ = "2.1.2"
-
-# Define the logger for the Firecrawl project
-logger: logging.Logger = logging.getLogger("firecrawl")
-
-
-def _configure_logger() -> None:
-    """
-    Configure the firecrawl logger for console output.
-
-    The function attaches a handler for console output with a specific format and date
-    format to the firecrawl logger.
-    """
-    try:
-        # Create the formatter
-        formatter = logging.Formatter(
-            "[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
-            datefmt="%Y-%m-%d %H:%M:%S",
-        )
-
-        # Create the console handler and set the formatter
-        console_handler = logging.StreamHandler()
-        console_handler.setFormatter(formatter)
-
-        # Add the console handler to the firecrawl logger
-        logger.addHandler(console_handler)
-    except Exception as e:
-        logger.error("Failed to configure logging: %s", e)
-
-
-def setup_logging() -> None:
-    """Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable."""
-    # Check if the firecrawl logger already has a handler
-    if logger.hasHandlers():
-        return # To prevent duplicate logging
-
-    # Check if the FIRECRAWL_LOGGING_LEVEL environment variable is set
-    if not (env := os.getenv("FIRECRAWL_LOGGING_LEVEL", "").upper()):
-        # Attach a no-op handler to prevent warnings about no handlers
-        logger.addHandler(logging.NullHandler())
-        return
-
-    # Attach the console handler to the firecrawl logger
-    _configure_logger()
-
-    # Set the logging level based on the FIRECRAWL_LOGGING_LEVEL environment variable
-    if env == "DEBUG":
-        logger.setLevel(logging.DEBUG)
-    elif env == "INFO":
-        logger.setLevel(logging.INFO)
-    elif env == "WARNING":
-        logger.setLevel(logging.WARNING)
-    elif env == "ERROR":
-        logger.setLevel(logging.ERROR)
-    elif env == "CRITICAL":
-        logger.setLevel(logging.CRITICAL)
-    else:
-        logger.setLevel(logging.INFO)
-        logger.warning("Unknown logging level: %s, defaulting to INFO", env)
-
-
-# Initialize logging configuration when the module is imported
-setup_logging()
-logger.debug("Debugging logger setup")

File without changes
@@ -1,170 +0,0 @@
-import importlib.util
-import pytest
-import time
-import os
-from uuid import uuid4
-from dotenv import load_dotenv
-
-load_dotenv()
-
-API_URL = "http://127.0.0.1:3002"
-ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"
-TEST_API_KEY = os.getenv('TEST_API_KEY')
-
-print(f"ABSOLUTE_FIRECRAWL_PATH: {ABSOLUTE_FIRECRAWL_PATH}")
-
-spec = importlib.util.spec_from_file_location("FirecrawlApp", ABSOLUTE_FIRECRAWL_PATH)
-firecrawl = importlib.util.module_from_spec(spec)
-spec.loader.exec_module(firecrawl)
-FirecrawlApp = firecrawl.FirecrawlApp
-
-def test_no_api_key():
-    with pytest.raises(Exception) as excinfo:
-        invalid_app = FirecrawlApp(api_url=API_URL, version='v0')
-    assert "No API key provided" in str(excinfo.value)
-
-def test_scrape_url_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
-    with pytest.raises(Exception) as excinfo:
-        invalid_app.scrape_url('https://firecrawl.dev')
-    assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
-
-# def test_blocklisted_url():
-#     blocklisted_url = "https://facebook.com/fake-test"
-#     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-#     with pytest.raises(Exception) as excinfo:
-#         app.scrape_url(blocklisted_url)
-#     assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
-
-def test_successful_response_with_valid_preview_token():
-    app = FirecrawlApp(api_url=API_URL, api_key=os.getenv('PREVIEW_TOKEN'), version='v0')
-    response = app.scrape_url('https://roastmywebsite.ai')
-    assert response is not None
-    assert 'content' in response
-    assert "_Roast_" in response['content']
-
-def test_scrape_url_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url('https://roastmywebsite.ai')
-    print(response)
-
-    assert response is not None
-    assert 'content' in response
-    assert 'markdown' in response
-    assert 'metadata' in response
-    assert 'html' not in response
-    assert "_Roast_" in response['content']
-
-def test_successful_response_with_valid_api_key_and_include_html():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url('https://roastmywebsite.ai', {'pageOptions': {'includeHtml': True}})
-    assert response is not None
-    assert 'content' in response
-    assert 'markdown' in response
-    assert 'html' in response
-    assert 'metadata' in response
-    assert "_Roast_" in response['content']
-    assert "_Roast_" in response['markdown']
-    assert "<h1" in response['html']
-
-def test_successful_response_for_valid_scrape_with_pdf_file():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf')
-    assert response is not None
-    assert 'content' in response
-    assert 'metadata' in response
-    assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
-
-def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001')
-    time.sleep(6) # wait for 6 seconds
-    assert response is not None
-    assert 'content' in response
-    assert 'metadata' in response
-    assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
-
-def test_crawl_url_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
-    with pytest.raises(Exception) as excinfo:
-        invalid_app.crawl_url('https://firecrawl.dev')
-    assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
-
-# def test_should_return_error_for_blocklisted_url():
-#     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-#     blocklisted_url = "https://twitter.com/fake-test"
-#     with pytest.raises(Exception) as excinfo:
-#         app.crawl_url(blocklisted_url)
-#     assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
-
-def test_crawl_url_wait_for_completion_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True)
-    assert response is not None
-    assert len(response) > 0
-    assert 'content' in response[0]
-    assert "_Roast_" in response[0]['content']
-
-def test_crawl_url_with_idempotency_key_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    uniqueIdempotencyKey = str(uuid4())
-    response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
-    assert response is not None
-    assert len(response) > 0
-    assert 'content' in response[0]
-    assert "_Roast_" in response[0]['content']
-
-    with pytest.raises(Exception) as excinfo:
-        app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
-    assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value)
-
-def test_check_crawl_status_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False)
-    assert response is not None
-    assert 'jobId' in response
-
-    time.sleep(30) # wait for 30 seconds
-    status_response = app.check_crawl_status(response['jobId'])
-    assert status_response is not None
-    assert 'status' in status_response
-    assert status_response['status'] == 'completed'
-    assert 'data' in status_response
-    assert len(status_response['data']) > 0
-
-def test_search_e2e():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.search("test query")
-    assert response is not None
-    assert 'content' in response[0]
-    assert len(response) > 2
-
-def test_search_invalid_api_key():
-    invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
-    with pytest.raises(Exception) as excinfo:
-        invalid_app.search("test query")
-    assert "Unexpected error during search: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
-
-def test_llm_extraction():
-    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url("https://firecrawl.dev", {
-        'extractorOptions': {
-            'mode': 'llm-extraction',
-            'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
-            'extractionSchema': {
-                'type': 'object',
-                'properties': {
-                    'company_mission': {'type': 'string'},
-                    'supports_sso': {'type': 'boolean'},
-                    'is_open_source': {'type': 'boolean'}
-                },
-                'required': ['company_mission', 'supports_sso', 'is_open_source']
-            }
-        }
-    })
-    assert response is not None
-    assert 'llm_extraction' in response
-    llm_extraction = response['llm_extraction']
-    assert 'company_mission' in llm_extraction
-    assert isinstance(llm_extraction['supports_sso'], bool)
-    assert isinstance(llm_extraction['is_open_source'], bool)

File without changes