firecrawl 2.0.2__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of firecrawl might be problematic.
- firecrawl/__init__.py +2 -2
- firecrawl/firecrawl.py +161 -124
- {firecrawl-2.0.2.dist-info → firecrawl-2.1.0.dist-info}/METADATA +1 -1
- {firecrawl-2.0.2.dist-info → firecrawl-2.1.0.dist-info}/RECORD +7 -7
- {firecrawl-2.0.2.dist-info → firecrawl-2.1.0.dist-info}/LICENSE +0 -0
- {firecrawl-2.0.2.dist-info → firecrawl-2.1.0.dist-info}/WHEEL +0 -0
- {firecrawl-2.0.2.dist-info → firecrawl-2.1.0.dist-info}/top_level.txt +0 -0
firecrawl/__init__.py
CHANGED
@@ -11,9 +11,9 @@ For more information visit https://github.com/firecrawl/
 import logging
 import os
 
-from .firecrawl import FirecrawlApp, JsonConfig # noqa
+from .firecrawl import FirecrawlApp, JsonConfig, ScrapeOptions # noqa
 
-__version__ = "2.0.2"
+__version__ = "2.1.0"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
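The practical effect of this hunk is that ScrapeOptions is now re-exported from the package root alongside the existing symbols. A minimal sketch of the new import path:

    # New in 2.1.0: ScrapeOptions can be imported directly from the package
    from firecrawl import FirecrawlApp, JsonConfig, ScrapeOptions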
firecrawl/firecrawl.py
CHANGED
@@ -97,6 +97,16 @@ class ActionsResult(pydantic.BaseModel):
     """Result of actions performed during scraping."""
     screenshots: List[str]
 
+class ChangeTrackingData(pydantic.BaseModel):
+    """
+    Data for the change tracking format.
+    """
+    previousScrapeAt: Optional[str] = None
+    changeStatus: str # "new" | "same" | "changed" | "removed"
+    visibility: str # "visible" | "hidden"
+    diff: Optional[Dict[str, Any]] = None
+    json: Optional[Any] = None
+
 class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
     """Document retrieved or processed by Firecrawl."""
     url: Optional[str] = None
@@ -111,6 +121,7 @@ class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
     actions: Optional[ActionsResult] = None
     title: Optional[str] = None # v1 search only
     description: Optional[str] = None # v1 search only
+    changeTracking: Optional[ChangeTrackingData] = None
 
 class LocationConfig(pydantic.BaseModel):
     """Location configuration for scraping."""
@@ -124,9 +135,9 @@ class WebhookConfig(pydantic.BaseModel):
     metadata: Optional[Dict[str, str]] = None
     events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
 
-class
+class ScrapeOptions(pydantic.BaseModel):
     """Parameters for scraping operations."""
-    formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None
+    formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None
     headers: Optional[Dict[str, str]] = None
     includeTags: Optional[List[str]] = None
     excludeTags: Optional[List[str]] = None
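ChangeTrackingData is moved up so it can be referenced by FirecrawlDocument, and ScrapeOptions becomes the shared options model, with "changeTracking" newly accepted in formats. A small sketch using only fields visible in this hunk (formats, excludeTags):

    opts = ScrapeOptions(
        formats=["markdown", "changeTracking"],  # "changeTracking" is newly accepted in 2.1.0
        excludeTags=["nav", "footer"],
    )

The later hunks switch the crawl, extract and search parameter models over to this same type for their scrapeOptions fields.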
@@ -193,7 +204,7 @@ class JsonConfig(pydantic.BaseModel):
     systemPrompt: Optional[str] = None
     agent: Optional[ExtractAgent] = None
 
-class ScrapeParams(
+class ScrapeParams(ScrapeOptions):
     """Parameters for scraping operations."""
     extract: Optional[JsonConfig] = None
     jsonOptions: Optional[JsonConfig] = None
@@ -235,7 +246,7 @@ class CrawlParams(pydantic.BaseModel):
     allowBackwardLinks: Optional[bool] = None
     allowExternalLinks: Optional[bool] = None
     ignoreSitemap: Optional[bool] = None
-    scrapeOptions: Optional[
+    scrapeOptions: Optional[ScrapeOptions] = None
     webhook: Optional[Union[str, WebhookConfig]] = None
     deduplicateSimilarURLs: Optional[bool] = None
     ignoreQueryParameters: Optional[bool] = None
@@ -289,7 +300,7 @@ class ExtractParams(pydantic.BaseModel):
     includeSubdomains: Optional[bool] = None
     origin: Optional[str] = None
     showSources: Optional[bool] = None
-    scrapeOptions: Optional[
+    scrapeOptions: Optional[ScrapeOptions] = None
 
 class ExtractResponse(pydantic.BaseModel, Generic[T]):
     """Response from extract operations."""
@@ -309,7 +320,7 @@ class SearchParams(pydantic.BaseModel):
     location: Optional[str] = None
     origin: Optional[str] = "api"
     timeout: Optional[int] = 60000
-    scrapeOptions: Optional[
+    scrapeOptions: Optional[ScrapeOptions] = None
 
 class SearchResponse(pydantic.BaseModel):
     """Response from search operations."""
@@ -377,16 +388,6 @@ class GenerateLLMsTextStatusResponse(pydantic.BaseModel):
     status: Literal["processing", "completed", "failed"]
     error: Optional[str] = None
     expiresAt: str
-
-class ChangeTrackingData(pydantic.BaseModel):
-    """
-    Data for the change tracking format.
-    """
-    previousScrapeAt: Optional[str] = None
-    changeStatus: str # "new" | "same" | "changed" | "removed"
-    visibility: str # "visible" | "hidden"
-    diff: Optional[Dict[str, Any]] = None
-    json: Optional[Any] = None
 
 class SearchResponse(pydantic.BaseModel):
     """
@@ -442,7 +443,7 @@ class FirecrawlApp:
         self,
         url: str,
         *,
-        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
+        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
        include_tags: Optional[List[str]] = None,
        exclude_tags: Optional[List[str]] = None,
        only_main_content: Optional[bool] = None,
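With "changeTracking" added to the formats literal, a scrape can request the new format and read the ChangeTrackingData attached to the returned document. A hedged sketch; the API key and URL are placeholders, and it assumes scrape_url still returns a FirecrawlDocument as in 2.0.x:

    from firecrawl import FirecrawlApp

    app = FirecrawlApp(api_key="fc-YOUR-KEY")
    doc = app.scrape_url("https://example.com", formats=["markdown", "changeTracking"])
    if doc.changeTracking:
        # Fields defined on the ChangeTrackingData model added earlier in this diff
        print(doc.changeTracking.changeStatus)      # "new" | "same" | "changed" | "removed"
        print(doc.changeTracking.previousScrapeAt)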
@@ -568,7 +569,7 @@ class FirecrawlApp:
         country: Optional[str] = None,
         location: Optional[str] = None,
         timeout: Optional[int] = None,
-        scrape_options: Optional[
+        scrape_options: Optional[ScrapeOptions] = None,
         params: Optional[Union[Dict[str, Any], SearchParams]] = None,
         **kwargs) -> SearchResponse:
         """
@@ -583,7 +584,7 @@ class FirecrawlApp:
             country (Optional[str]): Country code (default: "us")
             location (Optional[str]): Geo-targeting
             timeout (Optional[int]): Request timeout in milliseconds
-            scrape_options (Optional[
+            scrape_options (Optional[ScrapeOptions]): Result scraping configuration
             params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
             **kwargs: Additional keyword arguments for future compatibility
 
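search() now types scrape_options as ScrapeOptions. A sketch, assuming the query is still passed positionally as in 2.0.x:

    from firecrawl import FirecrawlApp, ScrapeOptions

    app = FirecrawlApp(api_key="fc-YOUR-KEY")
    results = app.search(
        "firecrawl change tracking",
        scrape_options=ScrapeOptions(formats=["markdown"]),
    )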
@@ -664,7 +665,7 @@ class FirecrawlApp:
         allow_backward_links: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
-        scrape_options: Optional[
+        scrape_options: Optional[ScrapeOptions] = None,
         webhook: Optional[Union[str, WebhookConfig]] = None,
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,
@@ -686,7 +687,7 @@ class FirecrawlApp:
             allow_backward_links (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
-            scrape_options (Optional[
+            scrape_options (Optional[ScrapeOptions]): Page scraping configuration
             webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
             deduplicate_similar_urls (Optional[bool]): Remove similar URLs
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
@@ -768,7 +769,7 @@ class FirecrawlApp:
         allow_backward_links: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
-        scrape_options: Optional[
+        scrape_options: Optional[ScrapeOptions] = None,
         webhook: Optional[Union[str, WebhookConfig]] = None,
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,
@@ -789,7 +790,7 @@ class FirecrawlApp:
             allow_backward_links (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
-            scrape_options (Optional[
+            scrape_options (Optional[ScrapeOptions]): Page scraping configuration
             webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
             deduplicate_similar_urls (Optional[bool]): Remove similar URLs
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
@@ -1007,7 +1008,7 @@ class FirecrawlApp:
         allow_backward_links: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
-        scrape_options: Optional[
+        scrape_options: Optional[ScrapeOptions] = None,
         webhook: Optional[Union[str, WebhookConfig]] = None,
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,
@@ -1028,7 +1029,7 @@ class FirecrawlApp:
             allow_backward_links (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
-            scrape_options (Optional[
+            scrape_options (Optional[ScrapeOptions]): Page scraping configuration
             webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
             deduplicate_similar_urls (Optional[bool]): Remove similar URLs
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
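The crawl entry points get the same treatment: scrape_options is now a typed ScrapeOptions. A sketch using only keyword arguments visible in these hunks:

    from firecrawl import FirecrawlApp, ScrapeOptions

    app = FirecrawlApp(api_key="fc-YOUR-KEY")
    crawl = app.crawl_url(
        "https://example.com",
        ignore_sitemap=True,
        scrape_options=ScrapeOptions(formats=["markdown", "changeTracking"]),
    )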
@@ -1741,7 +1742,7 @@ class FirecrawlApp:
 
     def async_extract(
             self,
-            urls: List[str],
+            urls: Optional[List[str]] = None,
             *,
             prompt: Optional[str] = None,
             schema: Optional[Any] = None,
@@ -1749,8 +1750,7 @@ class FirecrawlApp:
             allow_external_links: Optional[bool] = False,
             enable_web_search: Optional[bool] = False,
             show_sources: Optional[bool] = False,
-            agent: Optional[Dict[str, Any]] = None
-            idempotency_key: Optional[str] = None) -> ExtractResponse[Any]:
+            agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
         """
         Initiate an asynchronous extract job.
 
@@ -1774,7 +1774,7 @@ class FirecrawlApp:
         Raises:
             ValueError: If job initiation fails
         """
-        headers = self._prepare_headers(
+        headers = self._prepare_headers()
 
         schema = schema
         if schema:
@@ -2922,9 +2922,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
             headers
         )
 
-        if response.
+        if response.get('success'):
             try:
-                id = response.
+                id = response.get('id')
             except:
                 raise Exception(f'Failed to parse Firecrawl response as JSON.')
             return self._monitor_job_status(id, headers, poll_interval)
@@ -3050,7 +3050,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             headers
         )
 
-        if response.status_code == 200:
+        if response.get('status_code') == 200:
             try:
                 return BatchScrapeResponse(**response.json())
             except:
@@ -3059,7 +3059,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         self._handle_error(response, 'start batch scrape job')
 
     async def crawl_url(
-
+        self,
         url: str,
         *,
         include_paths: Optional[List[str]] = None,
@@ -3070,7 +3070,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         allow_backward_links: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
-        scrape_options: Optional[
+        scrape_options: Optional[ScrapeOptions] = None,
         webhook: Optional[Union[str, WebhookConfig]] = None,
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,
@@ -3092,7 +3092,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             allow_backward_links (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
-            scrape_options (Optional[
+            scrape_options (Optional[ScrapeOptions]): Page scraping configuration
             webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
             deduplicate_similar_urls (Optional[bool]): Remove similar URLs
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
@@ -3148,15 +3148,15 @@ class AsyncFirecrawlApp(FirecrawlApp):
         params_dict = final_params.dict(exclude_none=True)
         params_dict['url'] = url
         params_dict['origin'] = f"python-sdk@{version}"
-
         # Make request
         headers = self._prepare_headers(idempotency_key)
         response = await self._async_post_request(
             f'{self.api_url}/v1/crawl', params_dict, headers)
 
-
+        print(response)
+        if response.get('success'):
             try:
-                id = response.
+                id = response.get('id')
             except:
                 raise Exception(f'Failed to parse Firecrawl response as JSON.')
             return self._monitor_job_status(id, headers, poll_interval)
@@ -3176,11 +3176,12 @@ class AsyncFirecrawlApp(FirecrawlApp):
         allow_backward_links: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
-        scrape_options: Optional[
+        scrape_options: Optional[ScrapeOptions] = None,
         webhook: Optional[Union[str, WebhookConfig]] = None,
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,
         regex_on_full_url: Optional[bool] = None,
+        poll_interval: Optional[int] = 2,
         idempotency_key: Optional[str] = None,
         **kwargs
     ) -> CrawlResponse:
@@ -3197,7 +3198,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             allow_backward_links (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
-            scrape_options (Optional[
+            scrape_options (Optional[ScrapeOptions]): Page scraping configuration
             webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
             deduplicate_similar_urls (Optional[bool]): Remove similar URLs
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
@@ -3262,9 +3263,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
             headers
         )
 
-        if response.
+        if response.get('success'):
             try:
-                return CrawlResponse(**response
+                return CrawlResponse(**response)
             except:
                 raise Exception(f'Failed to parse Firecrawl response as JSON.')
         else:
@@ -3303,7 +3304,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             headers
         )
 
-        if status_data
+        if status_data.get('status') == 'completed':
             if 'data' in status_data:
                 data = status_data['data']
                 while 'next' in status_data:
@@ -3317,26 +3318,24 @@ class AsyncFirecrawlApp(FirecrawlApp):
                     data.extend(next_data.get('data', []))
                     status_data = next_data
                 status_data['data'] = data
-
-            response =
-
-
-
-
-
-
-
+            # Create CrawlStatusResponse object from status data
+            response = CrawlStatusResponse(
+                status=status_data.get('status'),
+                total=status_data.get('total'),
+                completed=status_data.get('completed'),
+                creditsUsed=status_data.get('creditsUsed'),
+                expiresAt=status_data.get('expiresAt'),
+                data=status_data.get('data'),
+                success=False if 'error' in status_data else True
+            )
 
             if 'error' in status_data:
-                response
+                response.error = status_data.get('error')
 
             if 'next' in status_data:
-                response
+                response.next = status_data.get('next')
 
-            return
-                'success': False if 'error' in status_data else True,
-                **response
-            }
+            return response
@@ -3359,7 +3358,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             headers
         )
 
-        if status_data
+        if status_data.get('status') == 'completed':
             if 'data' in status_data:
                 data = status_data['data']
                 while 'next' in status_data:
@@ -3376,15 +3375,22 @@ class AsyncFirecrawlApp(FirecrawlApp):
                 return status_data
             else:
                 raise Exception('Job completed but no data was returned')
-        elif status_data
+        elif status_data.get('status') in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
             await asyncio.sleep(max(poll_interval, 2))
         else:
             raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}')
 
     async def map_url(
-
-
-
+        self,
+        url: str,
+        *,
+        search: Optional[str] = None,
+        ignore_sitemap: Optional[bool] = None,
+        include_subdomains: Optional[bool] = None,
+        sitemap_only: Optional[bool] = None,
+        limit: Optional[int] = None,
+        timeout: Optional[int] = None,
+        params: Optional[MapParams] = None) -> MapResponse:
         """
         Asynchronously map and discover links from a URL.
 
@@ -3409,21 +3415,40 @@ class AsyncFirecrawlApp(FirecrawlApp):
         Raises:
             Exception: If mapping fails
         """
-
-        json_data = {'url': url}
+        map_params = {}
         if params:
-
-            json_data['origin'] = f"python-sdk@{version}"
+            map_params.update(params.dict(exclude_none=True))
 
+        # Add individual parameters
+        if search is not None:
+            map_params['search'] = search
+        if ignore_sitemap is not None:
+            map_params['ignoreSitemap'] = ignore_sitemap
+        if include_subdomains is not None:
+            map_params['includeSubdomains'] = include_subdomains
+        if sitemap_only is not None:
+            map_params['sitemapOnly'] = sitemap_only
+        if limit is not None:
+            map_params['limit'] = limit
+        if timeout is not None:
+            map_params['timeout'] = timeout
+
+        # Create final params object
+        final_params = MapParams(**map_params)
+        params_dict = final_params.dict(exclude_none=True)
+        params_dict['url'] = url
+        params_dict['origin'] = f"python-sdk@{version}"
+
+        # Make request
         endpoint = f'/v1/map'
         response = await self._async_post_request(
             f'{self.api_url}{endpoint}',
-
-            headers
+            params_dict,
+            headers={"Authorization": f"Bearer {self.api_key}"}
         )
 
         if response.get('success') and 'links' in response:
-            return response
+            return MapResponse(**response)
         elif 'error' in response:
             raise Exception(f'Failed to map URL. Error: {response["error"]}')
         else:
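The async map_url now takes explicit keyword arguments and returns a MapResponse model instead of the raw dict. A sketch; AsyncFirecrawlApp is imported from the module itself since __init__.py does not re-export it, and the links attribute is an assumption based on the 'links' key checked above:

    import asyncio
    from firecrawl.firecrawl import AsyncFirecrawlApp

    async def main():
        app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")
        result = await app.map_url(
            "https://example.com",
            search="docs",
            include_subdomains=True,
            limit=100,
        )
        print(result.links)  # assumption: MapResponse exposes the returned links

    asyncio.run(main())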
@@ -3431,27 +3456,28 @@ class AsyncFirecrawlApp(FirecrawlApp):
 
     async def extract(
         self,
-        urls: List[str],
-
+        urls: Optional[List[str]] = None,
+        *,
+        prompt: Optional[str] = None,
+        schema: Optional[Any] = None,
+        system_prompt: Optional[str] = None,
+        allow_external_links: Optional[bool] = False,
+        enable_web_search: Optional[bool] = False,
+        show_sources: Optional[bool] = False,
+        agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
+
         """
         Asynchronously extract structured information from URLs.
 
         Args:
-            urls (List[str]): URLs to extract from
-
-
-
-
-
-
-
-            * allowExternalLinks - Follow external links
-            * enableWebSearch - Enable web search
-            * includeSubdomains - Include subdomains
-            * showSources - Include source URLs
-
-            Scraping Options:
-            * scrapeOptions - Page scraping config
+            urls (Optional[List[str]]): URLs to extract from
+            prompt (Optional[str]): Custom extraction prompt
+            schema (Optional[Any]): JSON schema/Pydantic model
+            system_prompt (Optional[str]): System context
+            allow_external_links (Optional[bool]): Follow external links
+            enable_web_search (Optional[bool]): Enable web search
+            show_sources (Optional[bool]): Include source URLs
+            agent (Optional[Dict[str, Any]]): Agent configuration
 
         Returns:
             ExtractResponse with:
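The async extract signature makes urls optional and its options keyword-only, so a prompt-only call is now accepted (the following hunk adds the matching validation). A sketch; result.data is an assumption about the ExtractResponse payload field:

    import asyncio
    from firecrawl.firecrawl import AsyncFirecrawlApp

    async def main():
        app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")
        result = await app.extract(
            prompt="List the plan names on the Firecrawl pricing page",  # urls omitted on purpose
        )
        print(result.data)  # assumption: extracted payload is exposed as `data`

    asyncio.run(main())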
@@ -3464,29 +3490,35 @@ class AsyncFirecrawlApp(FirecrawlApp):
         """
         headers = self._prepare_headers()
 
-        if not
+        if not prompt and not schema:
             raise ValueError("Either prompt or schema is required")
 
-
+        if not urls and not prompt:
+            raise ValueError("Either urls or prompt is required")
+
         if schema:
             if hasattr(schema, 'model_json_schema'):
+                # Convert Pydantic model to JSON schema
                 schema = schema.model_json_schema()
+            # Otherwise assume it's already a JSON schema dict
 
         request_data = {
-            'urls': urls,
-            'allowExternalLinks':
-            'enableWebSearch':
-            'showSources':
+            'urls': urls or [],
+            'allowExternalLinks': allow_external_links,
+            'enableWebSearch': enable_web_search,
+            'showSources': show_sources,
             'schema': schema,
-            'origin': f'python-sdk@{
+            'origin': f'python-sdk@{get_version()}'
         }
 
-        if
-
-
-
-
-
+        # Only add prompt and systemPrompt if they exist
+        if prompt:
+            request_data['prompt'] = prompt
+        if system_prompt:
+            request_data['systemPrompt'] = system_prompt
+
+        if agent:
+            request_data['agent'] = agent
 
         response = await self._async_post_request(
             f'{self.api_url}/v1/extract',
@@ -3506,7 +3538,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         )
 
         if status_data['status'] == 'completed':
-            return status_data
+            return ExtractResponse(**status_data)
         elif status_data['status'] in ['failed', 'cancelled']:
             raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
 
@@ -3562,14 +3594,14 @@ class AsyncFirecrawlApp(FirecrawlApp):
             status_data = next_data
             status_data['data'] = data
 
-            response =
-
-
-
-
-
-
-
+            response = BatchScrapeStatusResponse(
+                status=status_data.get('status'),
+                total=status_data.get('total'),
+                completed=status_data.get('completed'),
+                creditsUsed=status_data.get('creditsUsed'),
+                expiresAt=status_data.get('expiresAt'),
+                data=status_data.get('data')
+            )
 
         if 'error' in status_data:
             response['error'] = status_data['error']
@@ -3689,8 +3721,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         allow_external_links: Optional[bool] = False,
         enable_web_search: Optional[bool] = False,
         show_sources: Optional[bool] = False,
-        agent: Optional[Dict[str, Any]] = None
-        idempotency_key: Optional[str] = None) -> ExtractResponse[Any]:
+        agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
         """
         Initiate an asynchronous extraction job without waiting for completion.
 
@@ -3714,7 +3745,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         Raises:
             ValueError: If job initiation fails
         """
-        headers = self._prepare_headers(
+        headers = self._prepare_headers()
 
         if not prompt and not schema:
             raise ValueError("Either prompt or schema is required")
@@ -3726,14 +3757,14 @@ class AsyncFirecrawlApp(FirecrawlApp):
         if hasattr(schema, 'model_json_schema'):
             schema = schema.model_json_schema()
 
-        request_data =
-
-
-
-
-
-
-
+        request_data = ExtractResponse(
+            urls=urls or [],
+            allowExternalLinks=allow_external_links,
+            enableWebSearch=enable_web_search,
+            showSources=show_sources,
+            schema=schema,
+            origin=f'python-sdk@{version}'
+        )
 
         if prompt:
             request_data['prompt'] = prompt
@@ -3810,7 +3841,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 
             await asyncio.sleep(2)
 
-        return
+        return GenerateLLMsTextStatusResponse(success=False, error='LLMs.txt generation job terminated unexpectedly')
 
     async def async_generate_llms_text(
         self,
@@ -3845,6 +3876,12 @@ class AsyncFirecrawlApp(FirecrawlApp):
         if experimental_stream is not None:
             params['__experimental_stream'] = experimental_stream
 
+        params = GenerateLLMsTextParams(
+            maxUrls=max_urls,
+            showFullText=show_full_text,
+            __experimental_stream=experimental_stream
+        )
+
         headers = self._prepare_headers()
         json_data = {'url': url, **params.dict(exclude_none=True)}
         json_data['origin'] = f"python-sdk@{version}"
@@ -3981,7 +4018,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 
             await asyncio.sleep(2)
 
-        return
+        return DeepResearchStatusResponse(success=False, error='Deep research job terminated unexpectedly')
 
     async def async_deep_research(
         self,
@@ -4088,7 +4125,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         country: Optional[str] = None,
         location: Optional[str] = None,
         timeout: Optional[int] = None,
-        scrape_options: Optional[
+        scrape_options: Optional[ScrapeOptions] = None,
         params: Optional[Union[Dict[str, Any], SearchParams]] = None,
         **kwargs) -> SearchResponse:
         """
@@ -4103,7 +4140,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             country (Optional[str]): Country code (default: "us")
             location (Optional[str]): Geo-targeting
             timeout (Optional[int]): Request timeout in milliseconds
-            scrape_options (Optional[
+            scrape_options (Optional[ScrapeOptions]): Result scraping configuration
             params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
             **kwargs: Additional keyword arguments for future compatibility
 
{firecrawl-2.0.2.dist-info → firecrawl-2.1.0.dist-info}/RECORD
CHANGED
@@ -1,12 +1,12 @@
-firecrawl/__init__.py,sha256=
-firecrawl/firecrawl.py,sha256=
+firecrawl/__init__.py,sha256=iHizMdAIoTmkymj1pSBrh7ktCGYU3kZ1kXZgntQPm3g,2570
+firecrawl/firecrawl.py,sha256=O-wyUWL9VnfRhZWgVAnmwpwIe0M3MPz9ek95KfYcHPQ,177750
 firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 firecrawl/__tests__/e2e_withAuth/test.py,sha256=-Fq2vPcMo0iQi4dwsUkkCd931ybDaTxMBnZbRfGdDcA,7931
 firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=DcCw-cohtnL-t9XPekUtRoQrgg3UCWu8Ikqudf9ory8,19880
 tests/test_change_tracking.py,sha256=_IJ5ShLcoj2fHDBaw-nE4I4lHdmDB617ocK_XMHhXps,4177
-firecrawl-2.0.2.dist-info/LICENSE,sha256=
-firecrawl-2.0.2.dist-info/METADATA,sha256=
-firecrawl-2.0.2.dist-info/WHEEL,sha256=
-firecrawl-2.0.2.dist-info/top_level.txt,sha256=
-firecrawl-2.0.2.dist-info/RECORD,,
+firecrawl-2.1.0.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
+firecrawl-2.1.0.dist-info/METADATA,sha256=l-XNBUPSE1sFvGZ1wBvesKC7fRlEIGI0DTfY7BNPAWI,10583
+firecrawl-2.1.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
+firecrawl-2.1.0.dist-info/top_level.txt,sha256=8T3jOaSN5mtLghO-R3MQ8KO290gIX8hmfxQmglBPdLE,16
+firecrawl-2.1.0.dist-info/RECORD,,
{firecrawl-2.0.2.dist-info → firecrawl-2.1.0.dist-info}/LICENSE
File without changes
{firecrawl-2.0.2.dist-info → firecrawl-2.1.0.dist-info}/WHEEL
File without changes
{firecrawl-2.0.2.dist-info → firecrawl-2.1.0.dist-info}/top_level.txt
File without changes