firecrawl-py 2.9.0__py3-none-any.whl → 2.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of firecrawl-py might be problematic. Click here for more details.
- firecrawl/__init__.py +1 -1
- firecrawl/__tests__/v1/e2e_withAuth/test.py +25 -0
- firecrawl/firecrawl.py +25 -10
- {firecrawl_py-2.9.0.dist-info → firecrawl_py-2.11.0.dist-info}/LICENSE +0 -0
- {firecrawl_py-2.9.0.dist-info → firecrawl_py-2.11.0.dist-info}/METADATA +1 -1
- firecrawl_py-2.11.0.dist-info/RECORD +12 -0
- {firecrawl_py-2.9.0.dist-info → firecrawl_py-2.11.0.dist-info}/top_level.txt +0 -2
- build/lib/firecrawl/__init__.py +0 -79
- build/lib/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- build/lib/firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- build/lib/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- build/lib/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -440
- build/lib/firecrawl/firecrawl.py +0 -4526
- build/lib/tests/test_change_tracking.py +0 -98
- firecrawl_py-2.9.0.dist-info/RECORD +0 -19
- {firecrawl_py-2.9.0.dist-info → firecrawl_py-2.11.0.dist-info}/WHEEL +0 -0
firecrawl/__init__.py
CHANGED
|
@@ -13,7 +13,7 @@ import os
|
|
|
13
13
|
|
|
14
14
|
from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
|
|
15
15
|
|
|
16
|
-
__version__ = "2.9.0"
|
|
16
|
+
__version__ = "2.11.0"
|
|
17
17
|
|
|
18
18
|
# Define the logger for the Firecrawl project
|
|
19
19
|
logger: logging.Logger = logging.getLogger("firecrawl")
|
|
firecrawl/__tests__/v1/e2e_withAuth/test.py
CHANGED
|
@@ -437,4 +437,29 @@ def test_search_with_invalid_params():
|
|
|
437
437
|
app.search("test query", {"invalid_param": "value"})
|
|
438
438
|
assert "ValidationError" in str(e.value)
|
|
439
439
|
|
|
440
|
+
# def test_scrape_url_with_parse_pdf_true():
|
|
441
|
+
# if TEST_API_KEY:
|
|
442
|
+
# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
|
443
|
+
# response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf', parse_pdf=True)
|
|
444
|
+
# assert response is not None
|
|
445
|
+
# assert 'markdown' in response
|
|
446
|
+
# assert len(response['markdown']) > 100
|
|
447
|
+
|
|
448
|
+
# def test_scrape_url_with_parse_pdf_false():
|
|
449
|
+
# if TEST_API_KEY:
|
|
450
|
+
# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
|
451
|
+
# response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf', parse_pdf=False)
|
|
452
|
+
# assert response is not None
|
|
453
|
+
# assert 'markdown' in response
|
|
454
|
+
# assert 'h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm' in response['markdown']
|
|
455
|
+
|
|
456
|
+
# def test_scrape_options_with_parse_pdf():
|
|
457
|
+
# if TEST_API_KEY:
|
|
458
|
+
# from firecrawl.firecrawl import ScrapeOptions
|
|
459
|
+
# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
|
460
|
+
# scrape_options = ScrapeOptions(parsePDF=False, formats=['markdown'])
|
|
461
|
+
# response = app.search("firecrawl", limit=1, scrape_options=scrape_options)
|
|
462
|
+
# assert response is not None
|
|
463
|
+
# assert 'data' in response
|
|
464
|
+
|
|
440
465
|
|
firecrawl/firecrawl.py
CHANGED
|
@@ -96,6 +96,7 @@ class AgentOptionsExtract(pydantic.BaseModel):
|
|
|
96
96
|
class ActionsResult(pydantic.BaseModel):
|
|
97
97
|
"""Result of actions performed during scraping."""
|
|
98
98
|
screenshots: List[str]
|
|
99
|
+
pdfs: List[str]
|
|
99
100
|
|
|
100
101
|
class ChangeTrackingData(pydantic.BaseModel):
|
|
101
102
|
"""
|
|
@@ -160,6 +161,7 @@ class ScrapeOptions(pydantic.BaseModel):
|
|
|
160
161
|
changeTrackingOptions: Optional[ChangeTrackingOptions] = None
|
|
161
162
|
maxAge: Optional[int] = None
|
|
162
163
|
storeInCache: Optional[bool] = None
|
|
164
|
+
parsePDF: Optional[bool] = None
|
|
163
165
|
|
|
164
166
|
class WaitAction(pydantic.BaseModel):
|
|
165
167
|
"""Wait action to perform during scraping."""
|
|
@@ -171,6 +173,7 @@ class ScreenshotAction(pydantic.BaseModel):
|
|
|
171
173
|
"""Screenshot action to perform during scraping."""
|
|
172
174
|
type: Literal["screenshot"]
|
|
173
175
|
fullPage: Optional[bool] = None
|
|
176
|
+
quality: Optional[int] = None
|
|
174
177
|
|
|
175
178
|
class ClickAction(pydantic.BaseModel):
|
|
176
179
|
"""Click action to perform during scraping."""
|
|
@@ -202,6 +205,12 @@ class ExecuteJavascriptAction(pydantic.BaseModel):
|
|
|
202
205
|
type: Literal["executeJavascript"]
|
|
203
206
|
script: str
|
|
204
207
|
|
|
208
|
+
class PDFAction(pydantic.BaseModel):
|
|
209
|
+
"""PDF action to perform during scraping."""
|
|
210
|
+
type: Literal["pdf"]
|
|
211
|
+
format: Optional[Literal["A0", "A1", "A2", "A3", "A4", "A5", "A6", "Letter", "Legal", "Tabloid", "Ledger"]] = None
|
|
212
|
+
landscape: Optional[bool] = None
|
|
213
|
+
scale: Optional[float] = None
|
|
205
214
|
|
|
206
215
|
class ExtractAgent(pydantic.BaseModel):
|
|
207
216
|
"""Configuration for the agent in extract operations."""
|
|
@@ -218,7 +227,7 @@ class ScrapeParams(ScrapeOptions):
|
|
|
218
227
|
"""Parameters for scraping operations."""
|
|
219
228
|
extract: Optional[JsonConfig] = None
|
|
220
229
|
jsonOptions: Optional[JsonConfig] = None
|
|
221
|
-
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None
|
|
230
|
+
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None
|
|
222
231
|
agent: Optional[AgentOptions] = None
|
|
223
232
|
webhook: Optional[WebhookConfig] = None
|
|
224
233
|
|
|
@@ -465,9 +474,10 @@ class FirecrawlApp:
|
|
|
465
474
|
remove_base64_images: Optional[bool] = None,
|
|
466
475
|
block_ads: Optional[bool] = None,
|
|
467
476
|
proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
|
|
477
|
+
parse_pdf: Optional[bool] = None,
|
|
468
478
|
extract: Optional[JsonConfig] = None,
|
|
469
479
|
json_options: Optional[JsonConfig] = None,
|
|
470
|
-
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
|
|
480
|
+
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
|
|
471
481
|
change_tracking_options: Optional[ChangeTrackingOptions] = None,
|
|
472
482
|
max_age: Optional[int] = None,
|
|
473
483
|
store_in_cache: Optional[bool] = None,
|
|
@@ -491,7 +501,7 @@ class FirecrawlApp:
|
|
|
491
501
|
proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth)
|
|
492
502
|
extract (Optional[JsonConfig]): Content extraction settings
|
|
493
503
|
json_options (Optional[JsonConfig]): JSON extraction settings
|
|
494
|
-
actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
|
|
504
|
+
actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]]): Actions to perform
|
|
495
505
|
change_tracking_options (Optional[ChangeTrackingOptions]): Change tracking settings
|
|
496
506
|
|
|
497
507
|
|
|
@@ -538,6 +548,8 @@ class FirecrawlApp:
|
|
|
538
548
|
scrape_params['blockAds'] = block_ads
|
|
539
549
|
if proxy:
|
|
540
550
|
scrape_params['proxy'] = proxy
|
|
551
|
+
if parse_pdf is not None:
|
|
552
|
+
scrape_params['parsePDF'] = parse_pdf
|
|
541
553
|
if extract is not None:
|
|
542
554
|
extract = self._ensure_schema_dict(extract)
|
|
543
555
|
if isinstance(extract, dict) and "schema" in extract:
|
|
@@ -1232,7 +1244,7 @@ class FirecrawlApp:
|
|
|
1232
1244
|
proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
|
|
1233
1245
|
extract: Optional[JsonConfig] = None,
|
|
1234
1246
|
json_options: Optional[JsonConfig] = None,
|
|
1235
|
-
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
|
|
1247
|
+
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
|
|
1236
1248
|
agent: Optional[AgentOptions] = None,
|
|
1237
1249
|
poll_interval: Optional[int] = 2,
|
|
1238
1250
|
max_concurrency: Optional[int] = None,
|
|
@@ -1370,7 +1382,7 @@ class FirecrawlApp:
|
|
|
1370
1382
|
proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
|
|
1371
1383
|
extract: Optional[JsonConfig] = None,
|
|
1372
1384
|
json_options: Optional[JsonConfig] = None,
|
|
1373
|
-
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
|
|
1385
|
+
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
|
|
1374
1386
|
agent: Optional[AgentOptions] = None,
|
|
1375
1387
|
max_concurrency: Optional[int] = None,
|
|
1376
1388
|
idempotency_key: Optional[str] = None,
|
|
@@ -1506,7 +1518,7 @@ class FirecrawlApp:
|
|
|
1506
1518
|
proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
|
|
1507
1519
|
extract: Optional[JsonConfig] = None,
|
|
1508
1520
|
json_options: Optional[JsonConfig] = None,
|
|
1509
|
-
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
|
|
1521
|
+
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
|
|
1510
1522
|
agent: Optional[AgentOptions] = None,
|
|
1511
1523
|
max_concurrency: Optional[int] = None,
|
|
1512
1524
|
idempotency_key: Optional[str] = None,
|
|
@@ -2904,9 +2916,10 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
2904
2916
|
remove_base64_images: Optional[bool] = None,
|
|
2905
2917
|
block_ads: Optional[bool] = None,
|
|
2906
2918
|
proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
|
|
2919
|
+
parse_pdf: Optional[bool] = None,
|
|
2907
2920
|
extract: Optional[JsonConfig] = None,
|
|
2908
2921
|
json_options: Optional[JsonConfig] = None,
|
|
2909
|
-
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
|
|
2922
|
+
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
|
|
2910
2923
|
**kwargs) -> ScrapeResponse[Any]:
|
|
2911
2924
|
"""
|
|
2912
2925
|
Scrape a single URL asynchronously.
|
|
@@ -2927,7 +2940,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
2927
2940
|
proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth)
|
|
2928
2941
|
extract (Optional[JsonConfig]): Content extraction settings
|
|
2929
2942
|
json_options (Optional[JsonConfig]): JSON extraction settings
|
|
2930
|
-
actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
|
|
2943
|
+
actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]]): Actions to perform
|
|
2931
2944
|
**kwargs: Additional parameters to pass to the API
|
|
2932
2945
|
|
|
2933
2946
|
Returns:
|
|
@@ -2981,6 +2994,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
2981
2994
|
scrape_params['blockAds'] = block_ads
|
|
2982
2995
|
if proxy:
|
|
2983
2996
|
scrape_params['proxy'] = proxy
|
|
2997
|
+
if parse_pdf is not None:
|
|
2998
|
+
scrape_params['parsePDF'] = parse_pdf
|
|
2984
2999
|
if extract is not None:
|
|
2985
3000
|
extract = self._ensure_schema_dict(extract)
|
|
2986
3001
|
if isinstance(extract, dict) and "schema" in extract:
|
|
@@ -3035,7 +3050,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3035
3050
|
proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
|
|
3036
3051
|
extract: Optional[JsonConfig] = None,
|
|
3037
3052
|
json_options: Optional[JsonConfig] = None,
|
|
3038
|
-
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
|
|
3053
|
+
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
|
|
3039
3054
|
agent: Optional[AgentOptions] = None,
|
|
3040
3055
|
poll_interval: Optional[int] = 2,
|
|
3041
3056
|
idempotency_key: Optional[str] = None,
|
|
@@ -3174,7 +3189,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3174
3189
|
proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
|
|
3175
3190
|
extract: Optional[JsonConfig] = None,
|
|
3176
3191
|
json_options: Optional[JsonConfig] = None,
|
|
3177
|
-
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
|
|
3192
|
+
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
|
|
3178
3193
|
agent: Optional[AgentOptions] = None,
|
|
3179
3194
|
idempotency_key: Optional[str] = None,
|
|
3180
3195
|
**kwargs
|
|
File without changes
|
|
firecrawl_py-2.11.0.dist-info/RECORD
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
firecrawl/__init__.py,sha256=uHnpcSPL_UX_PvPYv7sphq_21zQmzqYILej7FZnMEO4,2613
|
|
2
|
+
firecrawl/firecrawl.py,sha256=2fzg1wKvbH6_KYnEhwPaGv6hYMmW1f9o0mSoCRv0PHw,194334
|
|
3
|
+
firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
firecrawl/__tests__/e2e_withAuth/test.py,sha256=-Fq2vPcMo0iQi4dwsUkkCd931ybDaTxMBnZbRfGdDcA,7931
|
|
5
|
+
firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=k9IsEbdTHL9Cu49M4FpnQDEo2rnG6RqwmZAsK_EVJr4,21069
|
|
7
|
+
tests/test_change_tracking.py,sha256=_IJ5ShLcoj2fHDBaw-nE4I4lHdmDB617ocK_XMHhXps,4177
|
|
8
|
+
firecrawl_py-2.11.0.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
|
|
9
|
+
firecrawl_py-2.11.0.dist-info/METADATA,sha256=dcuZcTRp8mUNKyZYsEgHrxTP1n5EsxXZKiGm067aooQ,7169
|
|
10
|
+
firecrawl_py-2.11.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
|
|
11
|
+
firecrawl_py-2.11.0.dist-info/top_level.txt,sha256=8T3jOaSN5mtLghO-R3MQ8KO290gIX8hmfxQmglBPdLE,16
|
|
12
|
+
firecrawl_py-2.11.0.dist-info/RECORD,,
|
build/lib/firecrawl/__init__.py
DELETED
|
@@ -1,79 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
This is the Firecrawl package.
|
|
3
|
-
|
|
4
|
-
This package provides a Python SDK for interacting with the Firecrawl API.
|
|
5
|
-
It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
|
|
6
|
-
and check the status of these jobs.
|
|
7
|
-
|
|
8
|
-
For more information visit https://github.com/firecrawl/
|
|
9
|
-
"""
|
|
10
|
-
|
|
11
|
-
import logging
|
|
12
|
-
import os
|
|
13
|
-
|
|
14
|
-
from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
|
|
15
|
-
|
|
16
|
-
__version__ = "2.9.0"
|
|
17
|
-
|
|
18
|
-
# Define the logger for the Firecrawl project
|
|
19
|
-
logger: logging.Logger = logging.getLogger("firecrawl")
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def _configure_logger() -> None:
|
|
23
|
-
"""
|
|
24
|
-
Configure the firecrawl logger for console output.
|
|
25
|
-
|
|
26
|
-
The function attaches a handler for console output with a specific format and date
|
|
27
|
-
format to the firecrawl logger.
|
|
28
|
-
"""
|
|
29
|
-
try:
|
|
30
|
-
# Create the formatter
|
|
31
|
-
formatter = logging.Formatter(
|
|
32
|
-
"[%(asctime)s - %(name)s:%(lineno)d - %(levelname)s] %(message)s",
|
|
33
|
-
datefmt="%Y-%m-%d %H:%M:%S",
|
|
34
|
-
)
|
|
35
|
-
|
|
36
|
-
# Create the console handler and set the formatter
|
|
37
|
-
console_handler = logging.StreamHandler()
|
|
38
|
-
console_handler.setFormatter(formatter)
|
|
39
|
-
|
|
40
|
-
# Add the console handler to the firecrawl logger
|
|
41
|
-
logger.addHandler(console_handler)
|
|
42
|
-
except Exception as e:
|
|
43
|
-
logger.error("Failed to configure logging: %s", e)
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
def setup_logging() -> None:
|
|
47
|
-
"""Set up logging based on the FIRECRAWL_LOGGING_LEVEL environment variable."""
|
|
48
|
-
# Check if the firecrawl logger already has a handler
|
|
49
|
-
if logger.hasHandlers():
|
|
50
|
-
return # To prevent duplicate logging
|
|
51
|
-
|
|
52
|
-
# Check if the FIRECRAWL_LOGGING_LEVEL environment variable is set
|
|
53
|
-
if not (env := os.getenv("FIRECRAWL_LOGGING_LEVEL", "").upper()):
|
|
54
|
-
# Attach a no-op handler to prevent warnings about no handlers
|
|
55
|
-
logger.addHandler(logging.NullHandler())
|
|
56
|
-
return
|
|
57
|
-
|
|
58
|
-
# Attach the console handler to the firecrawl logger
|
|
59
|
-
_configure_logger()
|
|
60
|
-
|
|
61
|
-
# Set the logging level based on the FIRECRAWL_LOGGING_LEVEL environment variable
|
|
62
|
-
if env == "DEBUG":
|
|
63
|
-
logger.setLevel(logging.DEBUG)
|
|
64
|
-
elif env == "INFO":
|
|
65
|
-
logger.setLevel(logging.INFO)
|
|
66
|
-
elif env == "WARNING":
|
|
67
|
-
logger.setLevel(logging.WARNING)
|
|
68
|
-
elif env == "ERROR":
|
|
69
|
-
logger.setLevel(logging.ERROR)
|
|
70
|
-
elif env == "CRITICAL":
|
|
71
|
-
logger.setLevel(logging.CRITICAL)
|
|
72
|
-
else:
|
|
73
|
-
logger.setLevel(logging.INFO)
|
|
74
|
-
logger.warning("Unknown logging level: %s, defaulting to INFO", env)
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
# Initialize logging configuration when the module is imported
|
|
78
|
-
setup_logging()
|
|
79
|
-
logger.debug("Debugging logger setup")
|
|
File without changes
|
|
build/lib/firecrawl/__tests__/e2e_withAuth/test.py
DELETED
|
@@ -1,170 +0,0 @@
|
|
|
1
|
-
import importlib.util
|
|
2
|
-
import pytest
|
|
3
|
-
import time
|
|
4
|
-
import os
|
|
5
|
-
from uuid import uuid4
|
|
6
|
-
from dotenv import load_dotenv
|
|
7
|
-
|
|
8
|
-
load_dotenv()
|
|
9
|
-
|
|
10
|
-
API_URL = "http://127.0.0.1:3002"
|
|
11
|
-
ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"
|
|
12
|
-
TEST_API_KEY = os.getenv('TEST_API_KEY')
|
|
13
|
-
|
|
14
|
-
print(f"ABSOLUTE_FIRECRAWL_PATH: {ABSOLUTE_FIRECRAWL_PATH}")
|
|
15
|
-
|
|
16
|
-
spec = importlib.util.spec_from_file_location("FirecrawlApp", ABSOLUTE_FIRECRAWL_PATH)
|
|
17
|
-
firecrawl = importlib.util.module_from_spec(spec)
|
|
18
|
-
spec.loader.exec_module(firecrawl)
|
|
19
|
-
FirecrawlApp = firecrawl.FirecrawlApp
|
|
20
|
-
|
|
21
|
-
def test_no_api_key():
|
|
22
|
-
with pytest.raises(Exception) as excinfo:
|
|
23
|
-
invalid_app = FirecrawlApp(api_url=API_URL, version='v0')
|
|
24
|
-
assert "No API key provided" in str(excinfo.value)
|
|
25
|
-
|
|
26
|
-
def test_scrape_url_invalid_api_key():
|
|
27
|
-
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
|
|
28
|
-
with pytest.raises(Exception) as excinfo:
|
|
29
|
-
invalid_app.scrape_url('https://firecrawl.dev')
|
|
30
|
-
assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
|
|
31
|
-
|
|
32
|
-
# def test_blocklisted_url():
|
|
33
|
-
# blocklisted_url = "https://facebook.com/fake-test"
|
|
34
|
-
# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
|
35
|
-
# with pytest.raises(Exception) as excinfo:
|
|
36
|
-
# app.scrape_url(blocklisted_url)
|
|
37
|
-
# assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
|
|
38
|
-
|
|
39
|
-
def test_successful_response_with_valid_preview_token():
|
|
40
|
-
app = FirecrawlApp(api_url=API_URL, api_key=os.getenv('PREVIEW_TOKEN'), version='v0')
|
|
41
|
-
response = app.scrape_url('https://roastmywebsite.ai')
|
|
42
|
-
assert response is not None
|
|
43
|
-
assert 'content' in response
|
|
44
|
-
assert "_Roast_" in response['content']
|
|
45
|
-
|
|
46
|
-
def test_scrape_url_e2e():
|
|
47
|
-
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
|
48
|
-
response = app.scrape_url('https://roastmywebsite.ai')
|
|
49
|
-
print(response)
|
|
50
|
-
|
|
51
|
-
assert response is not None
|
|
52
|
-
assert 'content' in response
|
|
53
|
-
assert 'markdown' in response
|
|
54
|
-
assert 'metadata' in response
|
|
55
|
-
assert 'html' not in response
|
|
56
|
-
assert "_Roast_" in response['content']
|
|
57
|
-
|
|
58
|
-
def test_successful_response_with_valid_api_key_and_include_html():
|
|
59
|
-
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
|
60
|
-
response = app.scrape_url('https://roastmywebsite.ai', {'pageOptions': {'includeHtml': True}})
|
|
61
|
-
assert response is not None
|
|
62
|
-
assert 'content' in response
|
|
63
|
-
assert 'markdown' in response
|
|
64
|
-
assert 'html' in response
|
|
65
|
-
assert 'metadata' in response
|
|
66
|
-
assert "_Roast_" in response['content']
|
|
67
|
-
assert "_Roast_" in response['markdown']
|
|
68
|
-
assert "<h1" in response['html']
|
|
69
|
-
|
|
70
|
-
def test_successful_response_for_valid_scrape_with_pdf_file():
|
|
71
|
-
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
|
72
|
-
response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf')
|
|
73
|
-
assert response is not None
|
|
74
|
-
assert 'content' in response
|
|
75
|
-
assert 'metadata' in response
|
|
76
|
-
assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
|
|
77
|
-
|
|
78
|
-
def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension():
|
|
79
|
-
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
|
80
|
-
response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001')
|
|
81
|
-
time.sleep(6) # wait for 6 seconds
|
|
82
|
-
assert response is not None
|
|
83
|
-
assert 'content' in response
|
|
84
|
-
assert 'metadata' in response
|
|
85
|
-
assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
|
|
86
|
-
|
|
87
|
-
def test_crawl_url_invalid_api_key():
|
|
88
|
-
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
|
|
89
|
-
with pytest.raises(Exception) as excinfo:
|
|
90
|
-
invalid_app.crawl_url('https://firecrawl.dev')
|
|
91
|
-
assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
|
|
92
|
-
|
|
93
|
-
# def test_should_return_error_for_blocklisted_url():
|
|
94
|
-
# app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
|
95
|
-
# blocklisted_url = "https://twitter.com/fake-test"
|
|
96
|
-
# with pytest.raises(Exception) as excinfo:
|
|
97
|
-
# app.crawl_url(blocklisted_url)
|
|
98
|
-
# assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
|
|
99
|
-
|
|
100
|
-
def test_crawl_url_wait_for_completion_e2e():
|
|
101
|
-
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
|
102
|
-
response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True)
|
|
103
|
-
assert response is not None
|
|
104
|
-
assert len(response) > 0
|
|
105
|
-
assert 'content' in response[0]
|
|
106
|
-
assert "_Roast_" in response[0]['content']
|
|
107
|
-
|
|
108
|
-
def test_crawl_url_with_idempotency_key_e2e():
|
|
109
|
-
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
|
110
|
-
uniqueIdempotencyKey = str(uuid4())
|
|
111
|
-
response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
|
|
112
|
-
assert response is not None
|
|
113
|
-
assert len(response) > 0
|
|
114
|
-
assert 'content' in response[0]
|
|
115
|
-
assert "_Roast_" in response[0]['content']
|
|
116
|
-
|
|
117
|
-
with pytest.raises(Exception) as excinfo:
|
|
118
|
-
app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
|
|
119
|
-
assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value)
|
|
120
|
-
|
|
121
|
-
def test_check_crawl_status_e2e():
|
|
122
|
-
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
|
123
|
-
response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False)
|
|
124
|
-
assert response is not None
|
|
125
|
-
assert 'jobId' in response
|
|
126
|
-
|
|
127
|
-
time.sleep(30) # wait for 30 seconds
|
|
128
|
-
status_response = app.check_crawl_status(response['jobId'])
|
|
129
|
-
assert status_response is not None
|
|
130
|
-
assert 'status' in status_response
|
|
131
|
-
assert status_response['status'] == 'completed'
|
|
132
|
-
assert 'data' in status_response
|
|
133
|
-
assert len(status_response['data']) > 0
|
|
134
|
-
|
|
135
|
-
def test_search_e2e():
|
|
136
|
-
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
|
137
|
-
response = app.search("test query")
|
|
138
|
-
assert response is not None
|
|
139
|
-
assert 'content' in response[0]
|
|
140
|
-
assert len(response) > 2
|
|
141
|
-
|
|
142
|
-
def test_search_invalid_api_key():
|
|
143
|
-
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
|
|
144
|
-
with pytest.raises(Exception) as excinfo:
|
|
145
|
-
invalid_app.search("test query")
|
|
146
|
-
assert "Unexpected error during search: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
|
|
147
|
-
|
|
148
|
-
def test_llm_extraction():
|
|
149
|
-
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
|
150
|
-
response = app.scrape_url("https://firecrawl.dev", {
|
|
151
|
-
'extractorOptions': {
|
|
152
|
-
'mode': 'llm-extraction',
|
|
153
|
-
'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
|
|
154
|
-
'extractionSchema': {
|
|
155
|
-
'type': 'object',
|
|
156
|
-
'properties': {
|
|
157
|
-
'company_mission': {'type': 'string'},
|
|
158
|
-
'supports_sso': {'type': 'boolean'},
|
|
159
|
-
'is_open_source': {'type': 'boolean'}
|
|
160
|
-
},
|
|
161
|
-
'required': ['company_mission', 'supports_sso', 'is_open_source']
|
|
162
|
-
}
|
|
163
|
-
}
|
|
164
|
-
})
|
|
165
|
-
assert response is not None
|
|
166
|
-
assert 'llm_extraction' in response
|
|
167
|
-
llm_extraction = response['llm_extraction']
|
|
168
|
-
assert 'company_mission' in llm_extraction
|
|
169
|
-
assert isinstance(llm_extraction['supports_sso'], bool)
|
|
170
|
-
assert isinstance(llm_extraction['is_open_source'], bool)
|
|
File without changes
|