firecrawl 2.9.0.tar.gz → 2.10.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of firecrawl has been flagged as possibly problematic by the registry diff service.
- {firecrawl-2.9.0 → firecrawl-2.10.0}/LICENSE +0 -0
- {firecrawl-2.9.0 → firecrawl-2.10.0}/PKG-INFO +1 -1
- {firecrawl-2.9.0 → firecrawl-2.10.0}/README.md +0 -0
- {firecrawl-2.9.0 → firecrawl-2.10.0}/firecrawl/__init__.py +1 -1
- {firecrawl-2.9.0 → firecrawl-2.10.0}/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- {firecrawl-2.9.0 → firecrawl-2.10.0}/firecrawl/__tests__/e2e_withAuth/test.py +0 -0
- {firecrawl-2.9.0 → firecrawl-2.10.0}/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- {firecrawl-2.9.0 → firecrawl-2.10.0}/firecrawl/__tests__/v1/e2e_withAuth/test.py +25 -0
- {firecrawl-2.9.0 → firecrawl-2.10.0}/firecrawl/firecrawl.py +7 -0
- {firecrawl-2.9.0 → firecrawl-2.10.0}/firecrawl.egg-info/PKG-INFO +1 -1
- {firecrawl-2.9.0 → firecrawl-2.10.0}/tests/test_change_tracking.py +0 -0
- {firecrawl-2.9.0 → firecrawl-2.10.0}/firecrawl.egg-info/SOURCES.txt +0 -0
- {firecrawl-2.9.0 → firecrawl-2.10.0}/firecrawl.egg-info/dependency_links.txt +0 -0
- {firecrawl-2.9.0 → firecrawl-2.10.0}/firecrawl.egg-info/requires.txt +0 -0
- {firecrawl-2.9.0 → firecrawl-2.10.0}/firecrawl.egg-info/top_level.txt +0 -0
- {firecrawl-2.9.0 → firecrawl-2.10.0}/pyproject.toml +0 -0
- {firecrawl-2.9.0 → firecrawl-2.10.0}/setup.cfg +0 -0
- {firecrawl-2.9.0 → firecrawl-2.10.0}/setup.py +0 -0
firecrawl/__init__.py

@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
 
-__version__ = "2.9.0"
+__version__ = "2.10.0"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
firecrawl/__tests__/v1/e2e_withAuth/test.py

@@ -437,4 +437,29 @@ def test_search_with_invalid_params():
         app.search("test query", {"invalid_param": "value"})
     assert "ValidationError" in str(e.value)
 
+# def test_scrape_url_with_parse_pdf_true():
+#     if TEST_API_KEY:
+#         app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+#         response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf', parse_pdf=True)
+#         assert response is not None
+#         assert 'markdown' in response
+#         assert len(response['markdown']) > 100
+
+# def test_scrape_url_with_parse_pdf_false():
+#     if TEST_API_KEY:
+#         app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+#         response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf', parse_pdf=False)
+#         assert response is not None
+#         assert 'markdown' in response
+#         assert 'h7uKu14adDL6yGfnGf2qycY5uq8kC3OKCWkPxm' in response['markdown']
+
+# def test_scrape_options_with_parse_pdf():
+#     if TEST_API_KEY:
+#         from firecrawl.firecrawl import ScrapeOptions
+#         app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+#         scrape_options = ScrapeOptions(parsePDF=False, formats=['markdown'])
+#         response = app.search("firecrawl", limit=1, scrape_options=scrape_options)
+#         assert response is not None
+#         assert 'data' in response
+
 
firecrawl/firecrawl.py

@@ -160,6 +160,7 @@ class ScrapeOptions(pydantic.BaseModel):
     changeTrackingOptions: Optional[ChangeTrackingOptions] = None
     maxAge: Optional[int] = None
     storeInCache: Optional[bool] = None
+    parsePDF: Optional[bool] = None
 
 class WaitAction(pydantic.BaseModel):
     """Wait action to perform during scraping."""
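The new parsePDF field mirrors the commented-out test_scrape_options_with_parse_pdf above. A minimal sketch of how it would be passed through search(), assuming a valid API key; the key below is a placeholder, and the return shape is not asserted here since the tests ship commented out:

from firecrawl import FirecrawlApp, ScrapeOptions

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key, not part of this release

# parsePDF=False asks the service to skip PDF-to-markdown conversion for scraped results.
options = ScrapeOptions(parsePDF=False, formats=["markdown"])
results = app.search("firecrawl", limit=1, scrape_options=options)
print(results)

The remaining hunks in firecrawl/firecrawl.py thread the same flag through the synchronous and asynchronous scrape_url methods.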
@@ -465,6 +466,7 @@ class FirecrawlApp:
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
         proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
+        parse_pdf: Optional[bool] = None,
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,

@@ -538,6 +540,8 @@ class FirecrawlApp:
             scrape_params['blockAds'] = block_ads
         if proxy:
             scrape_params['proxy'] = proxy
+        if parse_pdf is not None:
+            scrape_params['parsePDF'] = parse_pdf
         if extract is not None:
             extract = self._ensure_schema_dict(extract)
             if isinstance(extract, dict) and "schema" in extract:
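The two hunks above add a matching parse_pdf keyword to FirecrawlApp.scrape_url and forward it to the request body as parsePDF only when it is explicitly set: both True and False are sent, while leaving it unset keeps the server default. A minimal synchronous sketch, reusing the arXiv PDF URL from the commented-out tests and a placeholder key:

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

# parse_pdf=True requests PDF parsing; the SDK maps it to 'parsePDF' in the payload.
doc = app.scrape_url("https://arxiv.org/pdf/astro-ph/9301001.pdf", parse_pdf=True)
print(doc)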
@@ -2904,6 +2908,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
         proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
+        parse_pdf: Optional[bool] = None,
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,

@@ -2981,6 +2986,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             scrape_params['blockAds'] = block_ads
         if proxy:
             scrape_params['proxy'] = proxy
+        if parse_pdf is not None:
+            scrape_params['parsePDF'] = parse_pdf
         if extract is not None:
             extract = self._ensure_schema_dict(extract)
             if isinstance(extract, dict) and "schema" in extract:
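AsyncFirecrawlApp receives the identical parameter and mapping. A sketch of the asynchronous equivalent, assuming scrape_url is awaited on the async client as its class name suggests (same placeholder key and test URL):

import asyncio

from firecrawl import AsyncFirecrawlApp

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
    # Same mapping as the sync client: parse_pdf -> 'parsePDF' in the request body.
    doc = await app.scrape_url("https://arxiv.org/pdf/astro-ph/9301001.pdf", parse_pdf=False)
    print(doc)

asyncio.run(main())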