firecrawl-2.0.2-py3-none-any.whl → firecrawl-2.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of firecrawl might be problematic.
- firecrawl/__init__.py +2 -2
- firecrawl/firecrawl.py +160 -124
- {firecrawl-2.0.2.dist-info → firecrawl-2.1.1.dist-info}/METADATA +1 -1
- {firecrawl-2.0.2.dist-info → firecrawl-2.1.1.dist-info}/RECORD +7 -7
- {firecrawl-2.0.2.dist-info → firecrawl-2.1.1.dist-info}/LICENSE +0 -0
- {firecrawl-2.0.2.dist-info → firecrawl-2.1.1.dist-info}/WHEEL +0 -0
- {firecrawl-2.0.2.dist-info → firecrawl-2.1.1.dist-info}/top_level.txt +0 -0
firecrawl/__init__.py
CHANGED
@@ -11,9 +11,9 @@ For more information visit https://github.com/firecrawl/
 import logging
 import os
 
-from .firecrawl import FirecrawlApp, JsonConfig # noqa
+from .firecrawl import FirecrawlApp, JsonConfig, ScrapeOptions # noqa
 
-__version__ = "2.0.2"
+__version__ = "2.1.1"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
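With this release ScrapeOptions joins FirecrawlApp and JsonConfig as a root-level export, so the shared scrape-settings model can be imported directly from the package. A minimal sketch of the new import surface, using only fields visible in the firecrawl.py diff below (the values are illustrative placeholders):

from firecrawl import FirecrawlApp, ScrapeOptions

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
# Reusable per-page settings; field names follow the ScrapeOptions model
# added in firecrawl/firecrawl.py below.
options = ScrapeOptions(formats=["markdown", "links"], excludeTags=["nav", "footer"])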
firecrawl/firecrawl.py
CHANGED
@@ -97,6 +97,16 @@ class ActionsResult(pydantic.BaseModel):
     """Result of actions performed during scraping."""
     screenshots: List[str]
 
+class ChangeTrackingData(pydantic.BaseModel):
+    """
+    Data for the change tracking format.
+    """
+    previousScrapeAt: Optional[str] = None
+    changeStatus: str # "new" | "same" | "changed" | "removed"
+    visibility: str # "visible" | "hidden"
+    diff: Optional[Dict[str, Any]] = None
+    json: Optional[Any] = None
+
 class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
     """Document retrieved or processed by Firecrawl."""
     url: Optional[str] = None
@@ -111,6 +121,7 @@ class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
     actions: Optional[ActionsResult] = None
     title: Optional[str] = None # v1 search only
     description: Optional[str] = None # v1 search only
+    changeTracking: Optional[ChangeTrackingData] = None
 
 class LocationConfig(pydantic.BaseModel):
     """Location configuration for scraping."""
@@ -124,9 +135,9 @@ class WebhookConfig(pydantic.BaseModel):
     metadata: Optional[Dict[str, str]] = None
     events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
 
-class
+class ScrapeOptions(pydantic.BaseModel):
     """Parameters for scraping operations."""
-    formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None
+    formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None
     headers: Optional[Dict[str, str]] = None
     includeTags: Optional[List[str]] = None
     excludeTags: Optional[List[str]] = None
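These first hunks wire change tracking through the data model: ChangeTrackingData moves ahead of FirecrawlDocument, documents gain an optional changeTracking field, and "changeTracking" becomes a valid formats entry. A hedged sketch of how a caller might use it, assuming scrape_url accepts the formats keyword shown further down in this diff and returns the document model above (URL and key are placeholders):

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
# Request markdown plus the change-tracking payload for the page.
doc = app.scrape_url("https://example.com", formats=["markdown", "changeTracking"])
if doc.changeTracking is not None:
    # changeStatus is "new" | "same" | "changed" | "removed" per the model above.
    print(doc.changeTracking.changeStatus, doc.changeTracking.previousScrapeAt)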
@@ -193,7 +204,7 @@ class JsonConfig(pydantic.BaseModel):
     systemPrompt: Optional[str] = None
     agent: Optional[ExtractAgent] = None
 
-class ScrapeParams(
+class ScrapeParams(ScrapeOptions):
     """Parameters for scraping operations."""
     extract: Optional[JsonConfig] = None
     jsonOptions: Optional[JsonConfig] = None
@@ -235,7 +246,7 @@ class CrawlParams(pydantic.BaseModel):
     allowBackwardLinks: Optional[bool] = None
     allowExternalLinks: Optional[bool] = None
     ignoreSitemap: Optional[bool] = None
-    scrapeOptions: Optional[
+    scrapeOptions: Optional[ScrapeOptions] = None
     webhook: Optional[Union[str, WebhookConfig]] = None
     deduplicateSimilarURLs: Optional[bool] = None
     ignoreQueryParameters: Optional[bool] = None
@@ -289,7 +300,7 @@ class ExtractParams(pydantic.BaseModel):
     includeSubdomains: Optional[bool] = None
     origin: Optional[str] = None
     showSources: Optional[bool] = None
-    scrapeOptions: Optional[
+    scrapeOptions: Optional[ScrapeOptions] = None
 
 class ExtractResponse(pydantic.BaseModel, Generic[T]):
     """Response from extract operations."""
@@ -309,7 +320,7 @@ class SearchParams(pydantic.BaseModel):
     location: Optional[str] = None
     origin: Optional[str] = "api"
     timeout: Optional[int] = 60000
-    scrapeOptions: Optional[
+    scrapeOptions: Optional[ScrapeOptions] = None
 
 class SearchResponse(pydantic.BaseModel):
     """Response from search operations."""
@@ -377,16 +388,6 @@ class GenerateLLMsTextStatusResponse(pydantic.BaseModel):
     status: Literal["processing", "completed", "failed"]
     error: Optional[str] = None
     expiresAt: str
-
-class ChangeTrackingData(pydantic.BaseModel):
-    """
-    Data for the change tracking format.
-    """
-    previousScrapeAt: Optional[str] = None
-    changeStatus: str # "new" | "same" | "changed" | "removed"
-    visibility: str # "visible" | "hidden"
-    diff: Optional[Dict[str, Any]] = None
-    json: Optional[Any] = None
 
 class SearchResponse(pydantic.BaseModel):
     """
@@ -442,7 +443,7 @@ class FirecrawlApp:
         self,
         url: str,
         *,
-        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
+        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
         include_tags: Optional[List[str]] = None,
         exclude_tags: Optional[List[str]] = None,
         only_main_content: Optional[bool] = None,
@@ -568,7 +569,7 @@ class FirecrawlApp:
         country: Optional[str] = None,
         location: Optional[str] = None,
         timeout: Optional[int] = None,
-        scrape_options: Optional[
+        scrape_options: Optional[ScrapeOptions] = None,
         params: Optional[Union[Dict[str, Any], SearchParams]] = None,
         **kwargs) -> SearchResponse:
         """
@@ -583,7 +584,7 @@ class FirecrawlApp:
             country (Optional[str]): Country code (default: "us")
             location (Optional[str]): Geo-targeting
             timeout (Optional[int]): Request timeout in milliseconds
-            scrape_options (Optional[
+            scrape_options (Optional[ScrapeOptions]): Result scraping configuration
             params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
             **kwargs: Additional keyword arguments for future compatibility
 
@@ -664,7 +665,7 @@ class FirecrawlApp:
         allow_backward_links: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
-        scrape_options: Optional[
+        scrape_options: Optional[ScrapeOptions] = None,
         webhook: Optional[Union[str, WebhookConfig]] = None,
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,
@@ -686,7 +687,7 @@ class FirecrawlApp:
             allow_backward_links (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
-            scrape_options (Optional[
+            scrape_options (Optional[ScrapeOptions]): Page scraping configuration
             webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
             deduplicate_similar_urls (Optional[bool]): Remove similar URLs
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
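The crawl and search entry points above now take scrape_options as a ScrapeOptions instance, which the docstrings describe as the per-page scraping configuration. A short sketch under that assumption; reading status, completed and total follows the CrawlStatusResponse construction that appears later in this diff:

from firecrawl import FirecrawlApp, ScrapeOptions

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
# The same per-page settings are applied to every page fetched during the crawl.
status = app.crawl_url(
    "https://example.com",
    scrape_options=ScrapeOptions(formats=["markdown"]),
)
print(status.status, status.completed, status.total)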
@@ -768,7 +769,7 @@ class FirecrawlApp:
         allow_backward_links: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
-        scrape_options: Optional[
+        scrape_options: Optional[ScrapeOptions] = None,
         webhook: Optional[Union[str, WebhookConfig]] = None,
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,
@@ -789,7 +790,7 @@ class FirecrawlApp:
             allow_backward_links (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
-            scrape_options (Optional[
+            scrape_options (Optional[ScrapeOptions]): Page scraping configuration
             webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
             deduplicate_similar_urls (Optional[bool]): Remove similar URLs
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
@@ -1007,7 +1008,7 @@ class FirecrawlApp:
         allow_backward_links: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
-        scrape_options: Optional[
+        scrape_options: Optional[ScrapeOptions] = None,
         webhook: Optional[Union[str, WebhookConfig]] = None,
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,
@@ -1028,7 +1029,7 @@ class FirecrawlApp:
             allow_backward_links (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
-            scrape_options (Optional[
+            scrape_options (Optional[ScrapeOptions]): Page scraping configuration
             webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
             deduplicate_similar_urls (Optional[bool]): Remove similar URLs
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
@@ -1741,7 +1742,7 @@ class FirecrawlApp:
 
     def async_extract(
         self,
-        urls: List[str],
+        urls: Optional[List[str]] = None,
         *,
         prompt: Optional[str] = None,
         schema: Optional[Any] = None,
@@ -1749,8 +1750,7 @@ class FirecrawlApp:
         allow_external_links: Optional[bool] = False,
         enable_web_search: Optional[bool] = False,
         show_sources: Optional[bool] = False,
-        agent: Optional[Dict[str, Any]] = None
-        idempotency_key: Optional[str] = None) -> ExtractResponse[Any]:
+        agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
         """
         Initiate an asynchronous extract job.
 
@@ -1774,7 +1774,7 @@ class FirecrawlApp:
         Raises:
             ValueError: If job initiation fails
         """
-        headers = self._prepare_headers(
+        headers = self._prepare_headers()
 
         schema = schema
         if schema:
@@ -2922,9 +2922,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
             headers
         )
 
-        if response.
+        if response.get('success'):
            try:
-                id = response.
+                id = response.get('id')
            except:
                raise Exception(f'Failed to parse Firecrawl response as JSON.')
            return self._monitor_job_status(id, headers, poll_interval)
@@ -3050,7 +3050,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             headers
         )
 
-        if response.status_code == 200:
+        if response.get('status_code') == 200:
            try:
                return BatchScrapeResponse(**response.json())
            except:
@@ -3059,7 +3059,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             self._handle_error(response, 'start batch scrape job')
 
     async def crawl_url(
-
+        self,
         url: str,
         *,
         include_paths: Optional[List[str]] = None,
@@ -3070,7 +3070,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         allow_backward_links: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
-        scrape_options: Optional[
+        scrape_options: Optional[ScrapeOptions] = None,
         webhook: Optional[Union[str, WebhookConfig]] = None,
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,
@@ -3092,7 +3092,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             allow_backward_links (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
-            scrape_options (Optional[
+            scrape_options (Optional[ScrapeOptions]): Page scraping configuration
             webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
             deduplicate_similar_urls (Optional[bool]): Remove similar URLs
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
@@ -3148,15 +3148,14 @@ class AsyncFirecrawlApp(FirecrawlApp):
         params_dict = final_params.dict(exclude_none=True)
         params_dict['url'] = url
         params_dict['origin'] = f"python-sdk@{version}"
-
         # Make request
         headers = self._prepare_headers(idempotency_key)
         response = await self._async_post_request(
             f'{self.api_url}/v1/crawl', params_dict, headers)
 
-        if response.
+        if response.get('success'):
            try:
-                id = response.
+                id = response.get('id')
            except:
                raise Exception(f'Failed to parse Firecrawl response as JSON.')
            return self._monitor_job_status(id, headers, poll_interval)
@@ -3176,11 +3175,12 @@ class AsyncFirecrawlApp(FirecrawlApp):
         allow_backward_links: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
-        scrape_options: Optional[
+        scrape_options: Optional[ScrapeOptions] = None,
         webhook: Optional[Union[str, WebhookConfig]] = None,
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,
         regex_on_full_url: Optional[bool] = None,
+        poll_interval: Optional[int] = 2,
         idempotency_key: Optional[str] = None,
         **kwargs
     ) -> CrawlResponse:
@@ -3197,7 +3197,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             allow_backward_links (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
-            scrape_options (Optional[
+            scrape_options (Optional[ScrapeOptions]): Page scraping configuration
             webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
             deduplicate_similar_urls (Optional[bool]): Remove similar URLs
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
@@ -3262,9 +3262,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
             headers
         )
 
-        if response.
+        if response.get('success'):
            try:
-                return CrawlResponse(**response
+                return CrawlResponse(**response)
            except:
                raise Exception(f'Failed to parse Firecrawl response as JSON.')
        else:
@@ -3303,7 +3303,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             headers
         )
 
-        if status_data
+        if status_data.get('status') == 'completed':
            if 'data' in status_data:
                data = status_data['data']
                while 'next' in status_data:
@@ -3317,26 +3317,24 @@ class AsyncFirecrawlApp(FirecrawlApp):
                     data.extend(next_data.get('data', []))
                     status_data = next_data
                 status_data['data'] = data
-
-            response =
+            # Create CrawlStatusResponse object from status data
+            response = CrawlStatusResponse(
+                status=status_data.get('status'),
+                total=status_data.get('total'),
+                completed=status_data.get('completed'),
+                creditsUsed=status_data.get('creditsUsed'),
+                expiresAt=status_data.get('expiresAt'),
+                data=status_data.get('data'),
+                success=False if 'error' in status_data else True
+            )
 
             if 'error' in status_data:
-                response
+                response.error = status_data.get('error')
 
             if 'next' in status_data:
-                response
+                response.next = status_data.get('next')
 
-            return
-                'success': False if 'error' in status_data else True,
-                **response
-            }
+            return response
 
     async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) -> CrawlStatusResponse:
         """
@@ -3359,7 +3357,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             headers
         )
 
-        if status_data
+        if status_data.get('status') == 'completed':
            if 'data' in status_data:
                data = status_data['data']
                while 'next' in status_data:
@@ -3376,15 +3374,22 @@ class AsyncFirecrawlApp(FirecrawlApp):
                 return status_data
             else:
                 raise Exception('Job completed but no data was returned')
-        elif status_data
+        elif status_data.get('status') in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
             await asyncio.sleep(max(poll_interval, 2))
         else:
             raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}')
 
     async def map_url(
+        self,
+        url: str,
+        *,
+        search: Optional[str] = None,
+        ignore_sitemap: Optional[bool] = None,
+        include_subdomains: Optional[bool] = None,
+        sitemap_only: Optional[bool] = None,
+        limit: Optional[int] = None,
+        timeout: Optional[int] = None,
+        params: Optional[MapParams] = None) -> MapResponse:
         """
         Asynchronously map and discover links from a URL.
 
@@ -3409,21 +3414,40 @@ class AsyncFirecrawlApp(FirecrawlApp):
         Raises:
             Exception: If mapping fails
         """
-
-        json_data = {'url': url}
+        map_params = {}
         if params:
-
-            json_data['origin'] = f"python-sdk@{version}"
+            map_params.update(params.dict(exclude_none=True))
 
+        # Add individual parameters
+        if search is not None:
+            map_params['search'] = search
+        if ignore_sitemap is not None:
+            map_params['ignoreSitemap'] = ignore_sitemap
+        if include_subdomains is not None:
+            map_params['includeSubdomains'] = include_subdomains
+        if sitemap_only is not None:
+            map_params['sitemapOnly'] = sitemap_only
+        if limit is not None:
+            map_params['limit'] = limit
+        if timeout is not None:
+            map_params['timeout'] = timeout
+
+        # Create final params object
+        final_params = MapParams(**map_params)
+        params_dict = final_params.dict(exclude_none=True)
+        params_dict['url'] = url
+        params_dict['origin'] = f"python-sdk@{version}"
+
+        # Make request
         endpoint = f'/v1/map'
         response = await self._async_post_request(
             f'{self.api_url}{endpoint}',
-
-            headers
+            params_dict,
+            headers={"Authorization": f"Bearer {self.api_key}"}
         )
 
         if response.get('success') and 'links' in response:
-            return response
+            return MapResponse(**response)
         elif 'error' in response:
             raise Exception(f'Failed to map URL. Error: {response["error"]}')
         else:
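The async map_url is rebuilt around explicit keyword-only arguments (search, ignore_sitemap, include_subdomains, sitemap_only, limit, timeout) that are merged into a MapParams object before the request, and the response dict is now wrapped in MapResponse. A sketch under those assumptions; AsyncFirecrawlApp is imported from the firecrawl.firecrawl module here because the package root shown earlier only re-exports FirecrawlApp, JsonConfig and ScrapeOptions:

import asyncio

from firecrawl.firecrawl import AsyncFirecrawlApp

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
    # Keyword arguments mirror the new signature; the values are illustrative.
    result = await app.map_url("https://example.com", search="docs", limit=50)
    # The response is built from a payload containing a 'links' list per the hunk above.
    print(result.links)

asyncio.run(main())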
@@ -3431,27 +3455,28 @@ class AsyncFirecrawlApp(FirecrawlApp):
 
     async def extract(
         self,
-        urls: List[str],
-
+        urls: Optional[List[str]] = None,
+        *,
+        prompt: Optional[str] = None,
+        schema: Optional[Any] = None,
+        system_prompt: Optional[str] = None,
+        allow_external_links: Optional[bool] = False,
+        enable_web_search: Optional[bool] = False,
+        show_sources: Optional[bool] = False,
+        agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
+
         """
         Asynchronously extract structured information from URLs.
 
         Args:
-            urls (List[str]): URLs to extract from
-            * allowExternalLinks - Follow external links
-            * enableWebSearch - Enable web search
-            * includeSubdomains - Include subdomains
-            * showSources - Include source URLs
-
-            Scraping Options:
-            * scrapeOptions - Page scraping config
+            urls (Optional[List[str]]): URLs to extract from
+            prompt (Optional[str]): Custom extraction prompt
+            schema (Optional[Any]): JSON schema/Pydantic model
+            system_prompt (Optional[str]): System context
+            allow_external_links (Optional[bool]): Follow external links
+            enable_web_search (Optional[bool]): Enable web search
+            show_sources (Optional[bool]): Include source URLs
+            agent (Optional[Dict[str, Any]]): Agent configuration
 
         Returns:
             ExtractResponse with:
@@ -3464,29 +3489,35 @@ class AsyncFirecrawlApp(FirecrawlApp):
         """
         headers = self._prepare_headers()
 
-        if not
+        if not prompt and not schema:
             raise ValueError("Either prompt or schema is required")
 
-
+        if not urls and not prompt:
+            raise ValueError("Either urls or prompt is required")
+
         if schema:
             if hasattr(schema, 'model_json_schema'):
+                # Convert Pydantic model to JSON schema
                 schema = schema.model_json_schema()
+            # Otherwise assume it's already a JSON schema dict
 
         request_data = {
-            'urls': urls,
-            'allowExternalLinks':
-            'enableWebSearch':
-            'showSources':
+            'urls': urls or [],
+            'allowExternalLinks': allow_external_links,
+            'enableWebSearch': enable_web_search,
+            'showSources': show_sources,
             'schema': schema,
-            'origin': f'python-sdk@{
+            'origin': f'python-sdk@{get_version()}'
         }
 
-        if
+        # Only add prompt and systemPrompt if they exist
+        if prompt:
+            request_data['prompt'] = prompt
+        if system_prompt:
+            request_data['systemPrompt'] = system_prompt
+
+        if agent:
+            request_data['agent'] = agent
 
         response = await self._async_post_request(
             f'{self.api_url}/v1/extract',
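The async extract method follows the same pattern as async_extract earlier in the diff: urls becomes optional, the extraction options are keyword-only parameters, and a Pydantic model passed as schema is converted with model_json_schema(). A hedged sketch (placeholders throughout; reading result.data assumes the ExtractResponse model defined near the top of the file carries the extracted payload):

import asyncio
from typing import List

import pydantic

from firecrawl.firecrawl import AsyncFirecrawlApp

class PricingInfo(pydantic.BaseModel):
    plan_names: List[str]
    cheapest_price: str

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
    result = await app.extract(
        urls=["https://example.com/pricing"],  # may be omitted when a prompt is given
        prompt="List the plan names and the cheapest price.",
        schema=PricingInfo,  # converted via model_json_schema() per the hunk above
    )
    print(result.data)

asyncio.run(main())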
@@ -3506,7 +3537,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         )
 
         if status_data['status'] == 'completed':
-            return status_data
+            return ExtractResponse(**status_data)
         elif status_data['status'] in ['failed', 'cancelled']:
             raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
 
@@ -3562,14 +3593,14 @@ class AsyncFirecrawlApp(FirecrawlApp):
                     status_data = next_data
                 status_data['data'] = data
 
-            response =
+            response = BatchScrapeStatusResponse(
+                status=status_data.get('status'),
+                total=status_data.get('total'),
+                completed=status_data.get('completed'),
+                creditsUsed=status_data.get('creditsUsed'),
+                expiresAt=status_data.get('expiresAt'),
+                data=status_data.get('data')
+            )
 
             if 'error' in status_data:
                 response['error'] = status_data['error']
@@ -3689,8 +3720,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         allow_external_links: Optional[bool] = False,
         enable_web_search: Optional[bool] = False,
         show_sources: Optional[bool] = False,
-        agent: Optional[Dict[str, Any]] = None
-        idempotency_key: Optional[str] = None) -> ExtractResponse[Any]:
+        agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
         """
         Initiate an asynchronous extraction job without waiting for completion.
 
@@ -3714,7 +3744,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         Raises:
             ValueError: If job initiation fails
         """
-        headers = self._prepare_headers(
+        headers = self._prepare_headers()
 
         if not prompt and not schema:
             raise ValueError("Either prompt or schema is required")
@@ -3726,14 +3756,14 @@ class AsyncFirecrawlApp(FirecrawlApp):
             if hasattr(schema, 'model_json_schema'):
                 schema = schema.model_json_schema()
 
-        request_data =
+        request_data = ExtractResponse(
+            urls=urls or [],
+            allowExternalLinks=allow_external_links,
+            enableWebSearch=enable_web_search,
+            showSources=show_sources,
+            schema=schema,
+            origin=f'python-sdk@{version}'
+        )
 
         if prompt:
             request_data['prompt'] = prompt
@@ -3810,7 +3840,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 
             await asyncio.sleep(2)
 
-        return
+        return GenerateLLMsTextStatusResponse(success=False, error='LLMs.txt generation job terminated unexpectedly')
 
     async def async_generate_llms_text(
         self,
@@ -3845,6 +3875,12 @@ class AsyncFirecrawlApp(FirecrawlApp):
         if experimental_stream is not None:
             params['__experimental_stream'] = experimental_stream
 
+        params = GenerateLLMsTextParams(
+            maxUrls=max_urls,
+            showFullText=show_full_text,
+            __experimental_stream=experimental_stream
+        )
+
         headers = self._prepare_headers()
         json_data = {'url': url, **params.dict(exclude_none=True)}
         json_data['origin'] = f"python-sdk@{version}"
@@ -3981,7 +4017,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 
             await asyncio.sleep(2)
 
-        return
+        return DeepResearchStatusResponse(success=False, error='Deep research job terminated unexpectedly')
 
     async def async_deep_research(
         self,
@@ -4088,7 +4124,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         country: Optional[str] = None,
         location: Optional[str] = None,
         timeout: Optional[int] = None,
-        scrape_options: Optional[
+        scrape_options: Optional[ScrapeOptions] = None,
         params: Optional[Union[Dict[str, Any], SearchParams]] = None,
         **kwargs) -> SearchResponse:
         """
@@ -4103,7 +4139,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             country (Optional[str]): Country code (default: "us")
             location (Optional[str]): Geo-targeting
             timeout (Optional[int]): Request timeout in milliseconds
-            scrape_options (Optional[
+            scrape_options (Optional[ScrapeOptions]): Result scraping configuration
             params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
             **kwargs: Additional keyword arguments for future compatibility
 
{firecrawl-2.0.2.dist-info → firecrawl-2.1.1.dist-info}/RECORD
CHANGED
@@ -1,12 +1,12 @@
-firecrawl/__init__.py,sha256=
-firecrawl/firecrawl.py,sha256=
+firecrawl/__init__.py,sha256=NU9Qcom12t48ym3ovFMpCYI4-uH-Ac1jnddqSUzxEIE,2570
+firecrawl/firecrawl.py,sha256=bXjJKt2UAdszpoCspBOPen_2lz5ysmVWP5vDMZUbyUo,177726
 firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 firecrawl/__tests__/e2e_withAuth/test.py,sha256=-Fq2vPcMo0iQi4dwsUkkCd931ybDaTxMBnZbRfGdDcA,7931
 firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=DcCw-cohtnL-t9XPekUtRoQrgg3UCWu8Ikqudf9ory8,19880
 tests/test_change_tracking.py,sha256=_IJ5ShLcoj2fHDBaw-nE4I4lHdmDB617ocK_XMHhXps,4177
-firecrawl-2.
-firecrawl-2.
-firecrawl-2.
-firecrawl-2.
-firecrawl-2.
+firecrawl-2.1.1.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
+firecrawl-2.1.1.dist-info/METADATA,sha256=5_5qIPtR-xSv8jAkZLqBP1i-xefxucWl3rZo2OfPsLo,10583
+firecrawl-2.1.1.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
+firecrawl-2.1.1.dist-info/top_level.txt,sha256=8T3jOaSN5mtLghO-R3MQ8KO290gIX8hmfxQmglBPdLE,16
+firecrawl-2.1.1.dist-info/RECORD,,
{firecrawl-2.0.2.dist-info → firecrawl-2.1.1.dist-info}/LICENSE
File without changes
{firecrawl-2.0.2.dist-info → firecrawl-2.1.1.dist-info}/WHEEL
File without changes
{firecrawl-2.0.2.dist-info → firecrawl-2.1.1.dist-info}/top_level.txt
File without changes