firecrawl 2.14.0__tar.gz → 2.16.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of firecrawl might be problematic. Click here for more details.
- {firecrawl-2.14.0 → firecrawl-2.16.0}/PKG-INFO +1 -1
- {firecrawl-2.14.0 → firecrawl-2.16.0}/firecrawl/__init__.py +1 -1
- {firecrawl-2.14.0 → firecrawl-2.16.0}/firecrawl/firecrawl.py +13 -8
- {firecrawl-2.14.0 → firecrawl-2.16.0}/firecrawl.egg-info/PKG-INFO +1 -1
- {firecrawl-2.14.0 → firecrawl-2.16.0}/setup.py +1 -2
- {firecrawl-2.14.0 → firecrawl-2.16.0}/LICENSE +0 -0
- {firecrawl-2.14.0 → firecrawl-2.16.0}/README.md +0 -0
- {firecrawl-2.14.0 → firecrawl-2.16.0}/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- {firecrawl-2.14.0 → firecrawl-2.16.0}/firecrawl/__tests__/e2e_withAuth/test.py +0 -0
- {firecrawl-2.14.0 → firecrawl-2.16.0}/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- {firecrawl-2.14.0 → firecrawl-2.16.0}/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -0
- {firecrawl-2.14.0 → firecrawl-2.16.0}/firecrawl.egg-info/SOURCES.txt +0 -0
- {firecrawl-2.14.0 → firecrawl-2.16.0}/firecrawl.egg-info/dependency_links.txt +0 -0
- {firecrawl-2.14.0 → firecrawl-2.16.0}/firecrawl.egg-info/requires.txt +0 -0
- {firecrawl-2.14.0 → firecrawl-2.16.0}/firecrawl.egg-info/top_level.txt +0 -0
- {firecrawl-2.14.0 → firecrawl-2.16.0}/pyproject.toml +0 -0
- {firecrawl-2.14.0 → firecrawl-2.16.0}/setup.cfg +0 -0
- {firecrawl-2.14.0 → firecrawl-2.16.0}/tests/test_change_tracking.py +0 -0
|
@@ -13,7 +13,7 @@ import os
|
|
|
13
13
|
|
|
14
14
|
from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
|
|
15
15
|
|
|
16
|
-
__version__ = "2.14.0"
|
|
16
|
+
__version__ = "2.16.0"
|
|
17
17
|
|
|
18
18
|
# Define the logger for the Firecrawl project
|
|
19
19
|
logger: logging.Logger = logging.getLogger("firecrawl")
|
|
@@ -23,8 +23,6 @@ import websockets
|
|
|
23
23
|
import aiohttp
|
|
24
24
|
import asyncio
|
|
25
25
|
from pydantic import Field
|
|
26
|
-
import ssl
|
|
27
|
-
import certifi
|
|
28
26
|
|
|
29
27
|
# Suppress Pydantic warnings about attribute shadowing
|
|
30
28
|
warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDocument\" shadows an attribute in parent \"BaseModel\"")
|
|
@@ -466,6 +464,7 @@ class FirecrawlApp:
|
|
|
466
464
|
url: str,
|
|
467
465
|
*,
|
|
468
466
|
formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
|
|
467
|
+
headers: Optional[Dict[str, str]] = None,
|
|
469
468
|
include_tags: Optional[List[str]] = None,
|
|
470
469
|
exclude_tags: Optional[List[str]] = None,
|
|
471
470
|
only_main_content: Optional[bool] = None,
|
|
@@ -492,6 +491,7 @@ class FirecrawlApp:
|
|
|
492
491
|
Args:
|
|
493
492
|
url (str): Target URL to scrape
|
|
494
493
|
formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
|
|
494
|
+
headers (Optional[Dict[str, str]]): Custom HTTP headers
|
|
495
495
|
include_tags (Optional[List[str]]): HTML tags to include
|
|
496
496
|
exclude_tags (Optional[List[str]]): HTML tags to exclude
|
|
497
497
|
only_main_content (Optional[bool]): Extract main content only
|
|
@@ -520,7 +520,7 @@ class FirecrawlApp:
|
|
|
520
520
|
Raises:
|
|
521
521
|
Exception: If scraping fails
|
|
522
522
|
"""
|
|
523
|
-
|
|
523
|
+
_headers = self._prepare_headers()
|
|
524
524
|
|
|
525
525
|
# Build scrape parameters
|
|
526
526
|
scrape_params = {
|
|
@@ -531,6 +531,8 @@ class FirecrawlApp:
|
|
|
531
531
|
# Add optional parameters if provided
|
|
532
532
|
if formats:
|
|
533
533
|
scrape_params['formats'] = formats
|
|
534
|
+
if headers:
|
|
535
|
+
scrape_params['headers'] = headers
|
|
534
536
|
if include_tags:
|
|
535
537
|
scrape_params['includeTags'] = include_tags
|
|
536
538
|
if exclude_tags:
|
|
@@ -586,7 +588,7 @@ class FirecrawlApp:
|
|
|
586
588
|
# Make request
|
|
587
589
|
response = requests.post(
|
|
588
590
|
f'{self.api_url}/v1/scrape',
|
|
589
|
-
headers=
|
|
591
|
+
headers=_headers,
|
|
590
592
|
json=scrape_params,
|
|
591
593
|
timeout=(timeout + 5000 if timeout else None)
|
|
592
594
|
)
|
|
@@ -2769,8 +2771,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
2769
2771
|
aiohttp.ClientError: If the request fails after all retries.
|
|
2770
2772
|
Exception: If max retries are exceeded or other errors occur.
|
|
2771
2773
|
"""
|
|
2772
|
-
|
|
2773
|
-
async with aiohttp.ClientSession(ssl=ssl_context) as session:
|
|
2774
|
+
async with aiohttp.ClientSession() as session:
|
|
2774
2775
|
for attempt in range(retries):
|
|
2775
2776
|
try:
|
|
2776
2777
|
async with session.request(
|
|
@@ -2966,6 +2967,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
2966
2967
|
url: str,
|
|
2967
2968
|
*,
|
|
2968
2969
|
formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
|
|
2970
|
+
headers: Optional[Dict[str, str]] = None,
|
|
2969
2971
|
include_tags: Optional[List[str]] = None,
|
|
2970
2972
|
exclude_tags: Optional[List[str]] = None,
|
|
2971
2973
|
only_main_content: Optional[bool] = None,
|
|
@@ -2988,6 +2990,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
2988
2990
|
Args:
|
|
2989
2991
|
url (str): Target URL to scrape
|
|
2990
2992
|
formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
|
|
2993
|
+
headers (Optional[Dict[str, str]]): Custom HTTP headers
|
|
2991
2994
|
include_tags (Optional[List[str]]): HTML tags to include
|
|
2992
2995
|
exclude_tags (Optional[List[str]]): HTML tags to exclude
|
|
2993
2996
|
only_main_content (Optional[bool]): Extract main content only
|
|
@@ -3022,7 +3025,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3022
3025
|
# Validate any additional kwargs
|
|
3023
3026
|
self._validate_kwargs(kwargs, "scrape_url")
|
|
3024
3027
|
|
|
3025
|
-
|
|
3028
|
+
_headers = self._prepare_headers()
|
|
3026
3029
|
|
|
3027
3030
|
# Build scrape parameters
|
|
3028
3031
|
scrape_params = {
|
|
@@ -3033,6 +3036,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3033
3036
|
# Add optional parameters if provided and not None
|
|
3034
3037
|
if formats:
|
|
3035
3038
|
scrape_params['formats'] = formats
|
|
3039
|
+
if headers:
|
|
3040
|
+
scrape_params['headers'] = headers
|
|
3036
3041
|
if include_tags:
|
|
3037
3042
|
scrape_params['includeTags'] = include_tags
|
|
3038
3043
|
if exclude_tags:
|
|
@@ -3080,7 +3085,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
|
|
3080
3085
|
response = await self._async_post_request(
|
|
3081
3086
|
f'{self.api_url}{endpoint}',
|
|
3082
3087
|
scrape_params,
|
|
3083
|
-
|
|
3088
|
+
_headers
|
|
3084
3089
|
)
|
|
3085
3090
|
|
|
3086
3091
|
if response.get('success') and 'data' in response:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|