firecrawl 2.0.1__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Potentially problematic release: this version of firecrawl might be problematic.
- firecrawl/__init__.py +2 -2
- firecrawl/firecrawl.py +193 -157
- {firecrawl-2.0.1.dist-info → firecrawl-2.1.0.dist-info}/METADATA +1 -1
- {firecrawl-2.0.1.dist-info → firecrawl-2.1.0.dist-info}/RECORD +7 -7
- {firecrawl-2.0.1.dist-info → firecrawl-2.1.0.dist-info}/LICENSE +0 -0
- {firecrawl-2.0.1.dist-info → firecrawl-2.1.0.dist-info}/WHEEL +0 -0
- {firecrawl-2.0.1.dist-info → firecrawl-2.1.0.dist-info}/top_level.txt +0 -0
firecrawl/__init__.py
CHANGED

@@ -11,9 +11,9 @@ For more information visit https://github.com/firecrawl/
 import logging
 import os
 
-from .firecrawl import FirecrawlApp,
+from .firecrawl import FirecrawlApp, JsonConfig, ScrapeOptions # noqa
 
-__version__ = "2.0.1"
+__version__ = "2.1.0"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
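In short, 2.1.0 re-exports the renamed configuration models at the package root and bumps the version string. A minimal sketch of the new import surface (the API key is a placeholder; the two new exports are exercised in the sketches further down):

    # Sketch only: the new top-level exports from this hunk.
    import firecrawl
    from firecrawl import FirecrawlApp, JsonConfig, ScrapeOptions

    assert firecrawl.__version__ == "2.1.0"
    app = FirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key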
firecrawl/firecrawl.py
CHANGED

@@ -27,7 +27,7 @@ from pydantic import Field
 # Suppress Pydantic warnings about attribute shadowing
 warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDocument\" shadows an attribute in parent \"BaseModel\"")
 warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"")
-warnings.filterwarnings("ignore", message="Field name \"schema\" in \"
+warnings.filterwarnings("ignore", message="Field name \"schema\" in \"JsonConfig\" shadows an attribute in parent \"BaseModel\"")
 warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"")
 
 
@@ -84,7 +84,6 @@ T = TypeVar('T')
 # statusCode: Optional[int] = None
 # error: Optional[str] = None
 
-
 class AgentOptions(pydantic.BaseModel):
     """Configuration for the agent."""
     model: Literal["FIRE-1"] = "FIRE-1"
@@ -98,6 +97,16 @@ class ActionsResult(pydantic.BaseModel):
     """Result of actions performed during scraping."""
     screenshots: List[str]
 
+class ChangeTrackingData(pydantic.BaseModel):
+    """
+    Data for the change tracking format.
+    """
+    previousScrapeAt: Optional[str] = None
+    changeStatus: str # "new" | "same" | "changed" | "removed"
+    visibility: str # "visible" | "hidden"
+    diff: Optional[Dict[str, Any]] = None
+    json: Optional[Any] = None
+
 class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
     """Document retrieved or processed by Firecrawl."""
     url: Optional[str] = None
 
@@ -112,6 +121,7 @@ class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
     actions: Optional[ActionsResult] = None
     title: Optional[str] = None # v1 search only
     description: Optional[str] = None # v1 search only
+    changeTracking: Optional[ChangeTrackingData] = None
 
 class LocationConfig(pydantic.BaseModel):
     """Location configuration for scraping."""
@@ -125,9 +135,9 @@ class WebhookConfig(pydantic.BaseModel):
     metadata: Optional[Dict[str, str]] = None
     events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
 
-class
+class ScrapeOptions(pydantic.BaseModel):
     """Parameters for scraping operations."""
-    formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None
+    formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None
     headers: Optional[Dict[str, str]] = None
     includeTags: Optional[List[str]] = None
     excludeTags: Optional[List[str]] = None
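Together with the ChangeTrackingData model introduced above, this hunk adds a "changeTracking" entry to the formats literal. Continuing the sketch from the __init__.py section, a hedged example of requesting it, assuming the response object exposes the FirecrawlDocument fields shown in this diff:

    # Sketch: request change tracking alongside markdown; the URL is illustrative.
    doc = app.scrape_url(
        "https://example.com",
        formats=["markdown", "changeTracking"],
    )
    if doc.changeTracking is not None:  # Optional[ChangeTrackingData] per this diff
        # changeStatus is one of "new" | "same" | "changed" | "removed"
        print(doc.changeTracking.changeStatus, doc.changeTracking.previousScrapeAt)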
@@ -187,17 +197,17 @@ class ExtractAgent(pydantic.BaseModel):
     """Configuration for the agent in extract operations."""
     model: Literal["FIRE-1"] = "FIRE-1"
 
-class
+class JsonConfig(pydantic.BaseModel):
     """Configuration for extraction."""
     prompt: Optional[str] = None
     schema: Optional[Any] = None
     systemPrompt: Optional[str] = None
     agent: Optional[ExtractAgent] = None
 
-class ScrapeParams(
+class ScrapeParams(ScrapeOptions):
     """Parameters for scraping operations."""
-    extract: Optional[
-    jsonOptions: Optional[
+    extract: Optional[JsonConfig] = None
+    jsonOptions: Optional[JsonConfig] = None
     actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None
     agent: Optional[AgentOptions] = None
 
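Since `extract` and `jsonOptions` are now typed as the renamed JsonConfig, callers construct that model directly. A minimal sketch using a plain JSON schema (the extract paths later in this diff convert Pydantic models via model_json_schema(); a dict is passed through):

    # Sketch: structured JSON extraction with the renamed config model.
    json_cfg = JsonConfig(
        prompt="Extract the page title and author",
        schema={
            "type": "object",
            "properties": {"title": {"type": "string"}, "author": {"type": "string"}},
        },
    )
    doc = app.scrape_url("https://example.com", formats=["json"], json_options=json_cfg)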
@@ -236,7 +246,7 @@ class CrawlParams(pydantic.BaseModel):
     allowBackwardLinks: Optional[bool] = None
     allowExternalLinks: Optional[bool] = None
     ignoreSitemap: Optional[bool] = None
-    scrapeOptions: Optional[
+    scrapeOptions: Optional[ScrapeOptions] = None
     webhook: Optional[Union[str, WebhookConfig]] = None
     deduplicateSimilarURLs: Optional[bool] = None
     ignoreQueryParameters: Optional[bool] = None

@@ -290,7 +300,7 @@ class ExtractParams(pydantic.BaseModel):
     includeSubdomains: Optional[bool] = None
     origin: Optional[str] = None
     showSources: Optional[bool] = None
-    scrapeOptions: Optional[
+    scrapeOptions: Optional[ScrapeOptions] = None
 
 class ExtractResponse(pydantic.BaseModel, Generic[T]):
     """Response from extract operations."""

@@ -310,7 +320,7 @@ class SearchParams(pydantic.BaseModel):
     location: Optional[str] = None
     origin: Optional[str] = "api"
     timeout: Optional[int] = 60000
-    scrapeOptions: Optional[
+    scrapeOptions: Optional[ScrapeOptions] = None
 
 class SearchResponse(pydantic.BaseModel):
     """Response from search operations."""
@@ -378,16 +388,6 @@ class GenerateLLMsTextStatusResponse(pydantic.BaseModel):
     status: Literal["processing", "completed", "failed"]
     error: Optional[str] = None
     expiresAt: str
-
-class ChangeTrackingData(pydantic.BaseModel):
-    """
-    Data for the change tracking format.
-    """
-    previousScrapeAt: Optional[str] = None
-    changeStatus: str # "new" | "same" | "changed" | "removed"
-    visibility: str # "visible" | "hidden"
-    diff: Optional[Dict[str, Any]] = None
-    json: Optional[Any] = None
 
 class SearchResponse(pydantic.BaseModel):
     """
@@ -443,7 +443,7 @@ class FirecrawlApp:
         self,
         url: str,
         *,
-        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
+        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
         include_tags: Optional[List[str]] = None,
         exclude_tags: Optional[List[str]] = None,
         only_main_content: Optional[bool] = None,

@@ -455,8 +455,8 @@ class FirecrawlApp:
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
         proxy: Optional[Literal["basic", "stealth"]] = None,
-        extract: Optional[
-        json_options: Optional[
+        extract: Optional[JsonConfig] = None,
+        json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
         **kwargs) -> ScrapeResponse[Any]:
         """

@@ -476,8 +476,8 @@ class FirecrawlApp:
             remove_base64_images (Optional[bool]): Remove base64 images
             block_ads (Optional[bool]): Block ads
             proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
-            extract (Optional[
-            json_options (Optional[
+            extract (Optional[JsonConfig]): Content extraction settings
+            json_options (Optional[JsonConfig]): JSON extraction settings
             actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
 
 
@@ -569,7 +569,7 @@ class FirecrawlApp:
         country: Optional[str] = None,
         location: Optional[str] = None,
         timeout: Optional[int] = None,
-        scrape_options: Optional[
+        scrape_options: Optional[ScrapeOptions] = None,
         params: Optional[Union[Dict[str, Any], SearchParams]] = None,
         **kwargs) -> SearchResponse:
         """

@@ -584,7 +584,7 @@ class FirecrawlApp:
             country (Optional[str]): Country code (default: "us")
             location (Optional[str]): Geo-targeting
             timeout (Optional[int]): Request timeout in milliseconds
-            scrape_options (Optional[
+            scrape_options (Optional[ScrapeOptions]): Result scraping configuration
             params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
             **kwargs: Additional keyword arguments for future compatibility
 
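The search path gets the same treatment: its `scrape_options` is now typed as ScrapeOptions. A sketch of searching and scraping each hit as markdown, assuming this is the v1 search() entry point implied by its SearchResponse return type:

    # Sketch: search with per-result scraping; parameter names per the signature above.
    results = app.search(
        "firecrawl change tracking",
        scrape_options=ScrapeOptions(formats=["markdown"]),
    )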
@@ -665,7 +665,7 @@ class FirecrawlApp:
         allow_backward_links: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
-        scrape_options: Optional[
+        scrape_options: Optional[ScrapeOptions] = None,
         webhook: Optional[Union[str, WebhookConfig]] = None,
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,

@@ -687,7 +687,7 @@ class FirecrawlApp:
             allow_backward_links (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
-            scrape_options (Optional[
+            scrape_options (Optional[ScrapeOptions]): Page scraping configuration
             webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
             deduplicate_similar_urls (Optional[bool]): Remove similar URLs
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
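The crawl entry points follow the same pattern, so a crawl can now carry a typed per-page scrape configuration. A sketch, assuming this is the sync crawl_url() (the async counterpart is shown by name later in this diff):

    # Sketch: crawl with typed page-level options, including the new format.
    crawl_status = app.crawl_url(
        "https://example.com",
        scrape_options=ScrapeOptions(formats=["markdown", "changeTracking"]),
    )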
@@ -769,7 +769,7 @@ class FirecrawlApp:
         allow_backward_links: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
-        scrape_options: Optional[
+        scrape_options: Optional[ScrapeOptions] = None,
         webhook: Optional[Union[str, WebhookConfig]] = None,
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,

@@ -790,7 +790,7 @@ class FirecrawlApp:
             allow_backward_links (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
-            scrape_options (Optional[
+            scrape_options (Optional[ScrapeOptions]): Page scraping configuration
             webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
             deduplicate_similar_urls (Optional[bool]): Remove similar URLs
             ignore_query_parameters (Optional[bool]): Ignore URL parameters

@@ -1008,7 +1008,7 @@ class FirecrawlApp:
         allow_backward_links: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
-        scrape_options: Optional[
+        scrape_options: Optional[ScrapeOptions] = None,
         webhook: Optional[Union[str, WebhookConfig]] = None,
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,

@@ -1029,7 +1029,7 @@ class FirecrawlApp:
             allow_backward_links (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
-            scrape_options (Optional[
+            scrape_options (Optional[ScrapeOptions]): Page scraping configuration
             webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
             deduplicate_similar_urls (Optional[bool]): Remove similar URLs
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
@@ -1162,8 +1162,8 @@ class FirecrawlApp:
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
         proxy: Optional[Literal["basic", "stealth"]] = None,
-        extract: Optional[
-        json_options: Optional[
+        extract: Optional[JsonConfig] = None,
+        json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
         agent: Optional[AgentOptions] = None,
         poll_interval: Optional[int] = 2,

@@ -1188,8 +1188,8 @@ class FirecrawlApp:
             remove_base64_images (Optional[bool]): Remove base64 encoded images
             block_ads (Optional[bool]): Block advertisements
             proxy (Optional[Literal]): Proxy type to use
-            extract (Optional[
-            json_options (Optional[
+            extract (Optional[JsonConfig]): Content extraction config
+            json_options (Optional[JsonConfig]): JSON extraction config
             actions (Optional[List[Union]]): Actions to perform
             agent (Optional[AgentOptions]): Agent configuration
             poll_interval (Optional[int]): Seconds between status checks (default: 2)

@@ -1286,8 +1286,8 @@ class FirecrawlApp:
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
         proxy: Optional[Literal["basic", "stealth"]] = None,
-        extract: Optional[
-        json_options: Optional[
+        extract: Optional[JsonConfig] = None,
+        json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
         agent: Optional[AgentOptions] = None,
         idempotency_key: Optional[str] = None,

@@ -1311,8 +1311,8 @@ class FirecrawlApp:
             remove_base64_images (Optional[bool]): Remove base64 encoded images
             block_ads (Optional[bool]): Block advertisements
             proxy (Optional[Literal]): Proxy type to use
-            extract (Optional[
-            json_options (Optional[
+            extract (Optional[JsonConfig]): Content extraction config
+            json_options (Optional[JsonConfig]): JSON extraction config
             actions (Optional[List[Union]]): Actions to perform
             agent (Optional[AgentOptions]): Agent configuration
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests

@@ -1408,8 +1408,8 @@ class FirecrawlApp:
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
         proxy: Optional[Literal["basic", "stealth"]] = None,
-        extract: Optional[
-        json_options: Optional[
+        extract: Optional[JsonConfig] = None,
+        json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
         agent: Optional[AgentOptions] = None,
         idempotency_key: Optional[str] = None,

@@ -1433,8 +1433,8 @@ class FirecrawlApp:
             remove_base64_images (Optional[bool]): Remove base64 encoded images
             block_ads (Optional[bool]): Block advertisements
             proxy (Optional[Literal]): Proxy type to use
-            extract (Optional[
-            json_options (Optional[
+            extract (Optional[JsonConfig]): Content extraction config
+            json_options (Optional[JsonConfig]): JSON extraction config
             actions (Optional[List[Union]]): Actions to perform
             agent (Optional[AgentOptions]): Agent configuration
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
@@ -1742,7 +1742,7 @@ class FirecrawlApp:
 
     def async_extract(
         self,
-        urls: List[str],
+        urls: Optional[List[str]] = None,
         *,
         prompt: Optional[str] = None,
         schema: Optional[Any] = None,

@@ -1750,8 +1750,7 @@ class FirecrawlApp:
         allow_external_links: Optional[bool] = False,
         enable_web_search: Optional[bool] = False,
         show_sources: Optional[bool] = False,
-        agent: Optional[Dict[str, Any]] = None
-        idempotency_key: Optional[str] = None) -> ExtractResponse[Any]:
+        agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
         """
         Initiate an asynchronous extract job.
 

@@ -1775,7 +1774,7 @@ class FirecrawlApp:
         Raises:
             ValueError: If job initiation fails
         """
-        headers = self._prepare_headers(
+        headers = self._prepare_headers()
 
         schema = schema
         if schema:
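With urls now optional here (and the idempotency_key parameter dropped from this signature), a prompt-only job becomes expressible; the async extract body later in this diff enforces that at least one of urls or prompt is present. A hedged sketch:

    # Sketch: start an extract job by prompt alone; the field names on the
    # returned ExtractResponse (e.g. a job id to poll) are assumptions.
    job = app.async_extract(
        prompt="Find the pricing page for example.com and list the plan names",
        schema={"type": "object",
                "properties": {"plans": {"type": "array", "items": {"type": "string"}}}},
    )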
@@ -2707,8 +2706,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
         proxy: Optional[Literal["basic", "stealth"]] = None,
-        extract: Optional[
-        json_options: Optional[
+        extract: Optional[JsonConfig] = None,
+        json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None) -> ScrapeResponse[Any]:
         """
         Scrape and extract content from a URL asynchronously.

@@ -2727,8 +2726,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             remove_base64_images (Optional[bool]): Remove base64 images
             block_ads (Optional[bool]): Block ads
             proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
-            extract (Optional[
-            json_options (Optional[
+            extract (Optional[JsonConfig]): Content extraction settings
+            json_options (Optional[JsonConfig]): JSON extraction settings
             actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
 
         Returns:

@@ -2821,8 +2820,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
         proxy: Optional[Literal["basic", "stealth"]] = None,
-        extract: Optional[
-        json_options: Optional[
+        extract: Optional[JsonConfig] = None,
+        json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
         agent: Optional[AgentOptions] = None,
         poll_interval: Optional[int] = 2,

@@ -2847,8 +2846,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             remove_base64_images (Optional[bool]): Remove base64 encoded images
             block_ads (Optional[bool]): Block advertisements
             proxy (Optional[Literal]): Proxy type to use
-            extract (Optional[
-            json_options (Optional[
+            extract (Optional[JsonConfig]): Content extraction config
+            json_options (Optional[JsonConfig]): JSON extraction config
             actions (Optional[List[Union]]): Actions to perform
             agent (Optional[AgentOptions]): Agent configuration
             poll_interval (Optional[int]): Seconds between status checks (default: 2)

@@ -2923,9 +2922,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
             headers
         )
 
-        if response.
+        if response.get('success'):
             try:
-                id = response.
+                id = response.get('id')
             except:
                 raise Exception(f'Failed to parse Firecrawl response as JSON.')
             return self._monitor_job_status(id, headers, poll_interval)

@@ -2950,8 +2949,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
         proxy: Optional[Literal["basic", "stealth"]] = None,
-        extract: Optional[
-        json_options: Optional[
+        extract: Optional[JsonConfig] = None,
+        json_options: Optional[JsonConfig] = None,
         actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
         agent: Optional[AgentOptions] = None,
         idempotency_key: Optional[str] = None,

@@ -2975,8 +2974,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             remove_base64_images (Optional[bool]): Remove base64 encoded images
             block_ads (Optional[bool]): Block advertisements
             proxy (Optional[Literal]): Proxy type to use
-            extract (Optional[
-            json_options (Optional[
+            extract (Optional[JsonConfig]): Content extraction config
+            json_options (Optional[JsonConfig]): JSON extraction config
             actions (Optional[List[Union]]): Actions to perform
             agent (Optional[AgentOptions]): Agent configuration
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
@@ -3051,7 +3050,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             headers
         )
 
-        if response.status_code == 200:
+        if response.get('status_code') == 200:
             try:
                 return BatchScrapeResponse(**response.json())
             except:

@@ -3060,7 +3059,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             self._handle_error(response, 'start batch scrape job')
 
     async def crawl_url(
-
+        self,
         url: str,
         *,
         include_paths: Optional[List[str]] = None,

@@ -3071,7 +3070,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         allow_backward_links: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
-        scrape_options: Optional[
+        scrape_options: Optional[ScrapeOptions] = None,
         webhook: Optional[Union[str, WebhookConfig]] = None,
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,

@@ -3093,7 +3092,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             allow_backward_links (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
-            scrape_options (Optional[
+            scrape_options (Optional[ScrapeOptions]): Page scraping configuration
             webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
             deduplicate_similar_urls (Optional[bool]): Remove similar URLs
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
@@ -3149,15 +3148,15 @@ class AsyncFirecrawlApp(FirecrawlApp):
         params_dict = final_params.dict(exclude_none=True)
         params_dict['url'] = url
         params_dict['origin'] = f"python-sdk@{version}"
-
         # Make request
         headers = self._prepare_headers(idempotency_key)
         response = await self._async_post_request(
             f'{self.api_url}/v1/crawl', params_dict, headers)
 
-
+        print(response)
+        if response.get('success'):
             try:
-                id = response.
+                id = response.get('id')
             except:
                 raise Exception(f'Failed to parse Firecrawl response as JSON.')
             return self._monitor_job_status(id, headers, poll_interval)
@@ -3177,11 +3176,12 @@ class AsyncFirecrawlApp(FirecrawlApp):
         allow_backward_links: Optional[bool] = None,
         allow_external_links: Optional[bool] = None,
         ignore_sitemap: Optional[bool] = None,
-        scrape_options: Optional[
+        scrape_options: Optional[ScrapeOptions] = None,
         webhook: Optional[Union[str, WebhookConfig]] = None,
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,
         regex_on_full_url: Optional[bool] = None,
+        poll_interval: Optional[int] = 2,
         idempotency_key: Optional[str] = None,
         **kwargs
     ) -> CrawlResponse:

@@ -3198,7 +3198,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             allow_backward_links (Optional[bool]): Follow parent directory links
             allow_external_links (Optional[bool]): Follow external domain links
             ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
-            scrape_options (Optional[
+            scrape_options (Optional[ScrapeOptions]): Page scraping configuration
             webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
             deduplicate_similar_urls (Optional[bool]): Remove similar URLs
             ignore_query_parameters (Optional[bool]): Ignore URL parameters

@@ -3263,9 +3263,9 @@ class AsyncFirecrawlApp(FirecrawlApp):
             headers
         )
 
-        if response.
+        if response.get('success'):
             try:
-                return CrawlResponse(**response
+                return CrawlResponse(**response)
             except:
                 raise Exception(f'Failed to parse Firecrawl response as JSON.')
         else:

@@ -3304,7 +3304,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             headers
         )
 
-        if status_data
+        if status_data.get('status') == 'completed':
             if 'data' in status_data:
                 data = status_data['data']
                 while 'next' in status_data:
@@ -3318,26 +3318,24 @@ class AsyncFirecrawlApp(FirecrawlApp):
                     data.extend(next_data.get('data', []))
                     status_data = next_data
                 status_data['data'] = data
-
-            response =
-
-
-
-
-
-
+            # Create CrawlStatusResponse object from status data
+            response = CrawlStatusResponse(
+                status=status_data.get('status'),
+                total=status_data.get('total'),
+                completed=status_data.get('completed'),
+                creditsUsed=status_data.get('creditsUsed'),
+                expiresAt=status_data.get('expiresAt'),
+                data=status_data.get('data'),
+                success=False if 'error' in status_data else True
+            )
 
             if 'error' in status_data:
-                response
+                response.error = status_data.get('error')
 
             if 'next' in status_data:
-                response
+                response.next = status_data.get('next')
 
-            return
-            'success': False if 'error' in status_data else True,
-            **response
-            }
+            return response
 
     async def _async_monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int = 2) -> CrawlStatusResponse:
         """
@@ -3360,7 +3358,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             headers
         )
 
-        if status_data
+        if status_data.get('status') == 'completed':
             if 'data' in status_data:
                 data = status_data['data']
                 while 'next' in status_data:
@@ -3377,15 +3375,22 @@ class AsyncFirecrawlApp(FirecrawlApp):
                 return status_data
             else:
                 raise Exception('Job completed but no data was returned')
-        elif status_data
+        elif status_data.get('status') in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
             await asyncio.sleep(max(poll_interval, 2))
         else:
             raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}')
 
     async def map_url(
-
-
-
+        self,
+        url: str,
+        *,
+        search: Optional[str] = None,
+        ignore_sitemap: Optional[bool] = None,
+        include_subdomains: Optional[bool] = None,
+        sitemap_only: Optional[bool] = None,
+        limit: Optional[int] = None,
+        timeout: Optional[int] = None,
+        params: Optional[MapParams] = None) -> MapResponse:
         """
         Asynchronously map and discover links from a URL.
 
@@ -3410,21 +3415,40 @@ class AsyncFirecrawlApp(FirecrawlApp):
         Raises:
             Exception: If mapping fails
         """
-
-        json_data = {'url': url}
+        map_params = {}
         if params:
-
-
+            map_params.update(params.dict(exclude_none=True))
+
+        # Add individual parameters
+        if search is not None:
+            map_params['search'] = search
+        if ignore_sitemap is not None:
+            map_params['ignoreSitemap'] = ignore_sitemap
+        if include_subdomains is not None:
+            map_params['includeSubdomains'] = include_subdomains
+        if sitemap_only is not None:
+            map_params['sitemapOnly'] = sitemap_only
+        if limit is not None:
+            map_params['limit'] = limit
+        if timeout is not None:
+            map_params['timeout'] = timeout
+
+        # Create final params object
+        final_params = MapParams(**map_params)
+        params_dict = final_params.dict(exclude_none=True)
+        params_dict['url'] = url
+        params_dict['origin'] = f"python-sdk@{version}"
 
+        # Make request
         endpoint = f'/v1/map'
         response = await self._async_post_request(
             f'{self.api_url}{endpoint}',
-
-            headers
+            params_dict,
+            headers={"Authorization": f"Bearer {self.api_key}"}
         )
 
         if response.get('success') and 'links' in response:
-            return response
+            return MapResponse(**response)
         elif 'error' in response:
             raise Exception(f'Failed to map URL. Error: {response["error"]}')
         else:
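The async map_url now takes the same keyword-only parameters as the sync client instead of a bare params object, builds a MapParams from them, and returns a typed MapResponse. A sketch (the key is a placeholder):

    # Sketch: keyword-style async mapping per the hunk above.
    import asyncio
    from firecrawl import AsyncFirecrawlApp

    async def main():
        app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")
        mapped = await app.map_url(
            "https://example.com",
            search="docs",
            include_subdomains=True,
            limit=100,
        )
        print(mapped.links)  # field name assumed from the 'links' check above

    asyncio.run(main())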
@@ -3432,27 +3456,28 @@ class AsyncFirecrawlApp(FirecrawlApp):
 
     async def extract(
         self,
-        urls: List[str],
-
+        urls: Optional[List[str]] = None,
+        *,
+        prompt: Optional[str] = None,
+        schema: Optional[Any] = None,
+        system_prompt: Optional[str] = None,
+        allow_external_links: Optional[bool] = False,
+        enable_web_search: Optional[bool] = False,
+        show_sources: Optional[bool] = False,
+        agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
+
         """
         Asynchronously extract structured information from URLs.
 
         Args:
-            urls (List[str]): URLs to extract from
-
-
-
-
-
-
-
-            * allowExternalLinks - Follow external links
-            * enableWebSearch - Enable web search
-            * includeSubdomains - Include subdomains
-            * showSources - Include source URLs
-
-            Scraping Options:
-            * scrapeOptions - Page scraping config
+            urls (Optional[List[str]]): URLs to extract from
+            prompt (Optional[str]): Custom extraction prompt
+            schema (Optional[Any]): JSON schema/Pydantic model
+            system_prompt (Optional[str]): System context
+            allow_external_links (Optional[bool]): Follow external links
+            enable_web_search (Optional[bool]): Enable web search
+            show_sources (Optional[bool]): Include source URLs
+            agent (Optional[Dict[str, Any]]): Agent configuration
 
         Returns:
             ExtractResponse with:
@@ -3465,29 +3490,35 @@ class AsyncFirecrawlApp(FirecrawlApp):
         """
         headers = self._prepare_headers()
 
-        if not
+        if not prompt and not schema:
             raise ValueError("Either prompt or schema is required")
 
-
+        if not urls and not prompt:
+            raise ValueError("Either urls or prompt is required")
+
         if schema:
             if hasattr(schema, 'model_json_schema'):
+                # Convert Pydantic model to JSON schema
                 schema = schema.model_json_schema()
+            # Otherwise assume it's already a JSON schema dict
 
         request_data = {
-            'urls': urls,
-            'allowExternalLinks':
-            'enableWebSearch':
-            'showSources':
+            'urls': urls or [],
+            'allowExternalLinks': allow_external_links,
+            'enableWebSearch': enable_web_search,
+            'showSources': show_sources,
             'schema': schema,
-            'origin': f'python-sdk@{
+            'origin': f'python-sdk@{get_version()}'
         }
 
-        if
-
-
-
-
-
+        # Only add prompt and systemPrompt if they exist
+        if prompt:
+            request_data['prompt'] = prompt
+        if system_prompt:
+            request_data['systemPrompt'] = system_prompt
+
+        if agent:
+            request_data['agent'] = agent
 
         response = await self._async_post_request(
             f'{self.api_url}/v1/extract',
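The async extract now validates its inputs up front (prompt-or-schema, then urls-or-prompt) and accepts a Pydantic model for schema, converting it with model_json_schema(). A sketch of awaiting it end to end; the response field name is an assumption:

    # Sketch: async structured extraction with a Pydantic schema.
    import asyncio
    import pydantic
    from firecrawl import AsyncFirecrawlApp

    class Article(pydantic.BaseModel):
        title: str
        author: str

    async def main():
        app = AsyncFirecrawlApp(api_key="fc-YOUR-KEY")  # placeholder key
        result = await app.extract(
            ["https://example.com/blog"],
            schema=Article,  # converted via model_json_schema() per this hunk
        )
        print(result.data)  # assumed ExtractResponse field

    asyncio.run(main())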
@@ -3507,7 +3538,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         )
 
         if status_data['status'] == 'completed':
-            return status_data
+            return ExtractResponse(**status_data)
         elif status_data['status'] in ['failed', 'cancelled']:
             raise Exception(f'Extract job {status_data["status"]}. Error: {status_data["error"]}')
 

@@ -3563,14 +3594,14 @@ class AsyncFirecrawlApp(FirecrawlApp):
             status_data = next_data
         status_data['data'] = data
 
-        response =
-
-
-
-
-
-
+        response = BatchScrapeStatusResponse(
+            status=status_data.get('status'),
+            total=status_data.get('total'),
+            completed=status_data.get('completed'),
+            creditsUsed=status_data.get('creditsUsed'),
+            expiresAt=status_data.get('expiresAt'),
+            data=status_data.get('data')
+        )
 
         if 'error' in status_data:
             response['error'] = status_data['error']
@@ -3690,8 +3721,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         allow_external_links: Optional[bool] = False,
         enable_web_search: Optional[bool] = False,
         show_sources: Optional[bool] = False,
-        agent: Optional[Dict[str, Any]] = None
-        idempotency_key: Optional[str] = None) -> ExtractResponse[Any]:
+        agent: Optional[Dict[str, Any]] = None) -> ExtractResponse[Any]:
         """
         Initiate an asynchronous extraction job without waiting for completion.
 

@@ -3715,7 +3745,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         Raises:
             ValueError: If job initiation fails
         """
-        headers = self._prepare_headers(
+        headers = self._prepare_headers()
 
         if not prompt and not schema:
             raise ValueError("Either prompt or schema is required")

@@ -3727,14 +3757,14 @@ class AsyncFirecrawlApp(FirecrawlApp):
             if hasattr(schema, 'model_json_schema'):
                 schema = schema.model_json_schema()
 
-        request_data =
-
-
-
-
-
-
+        request_data = ExtractResponse(
+            urls=urls or [],
+            allowExternalLinks=allow_external_links,
+            enableWebSearch=enable_web_search,
+            showSources=show_sources,
+            schema=schema,
+            origin=f'python-sdk@{version}'
+        )
 
         if prompt:
             request_data['prompt'] = prompt
@@ -3811,7 +3841,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 
             await asyncio.sleep(2)
 
-        return
+        return GenerateLLMsTextStatusResponse(success=False, error='LLMs.txt generation job terminated unexpectedly')
 
     async def async_generate_llms_text(
         self,

@@ -3846,6 +3876,12 @@ class AsyncFirecrawlApp(FirecrawlApp):
         if experimental_stream is not None:
             params['__experimental_stream'] = experimental_stream
 
+        params = GenerateLLMsTextParams(
+            maxUrls=max_urls,
+            showFullText=show_full_text,
+            __experimental_stream=experimental_stream
+        )
+
         headers = self._prepare_headers()
         json_data = {'url': url, **params.dict(exclude_none=True)}
         json_data['origin'] = f"python-sdk@{version}"
@@ -3982,7 +4018,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 
         await asyncio.sleep(2)
 
-        return
+        return DeepResearchStatusResponse(success=False, error='Deep research job terminated unexpectedly')
 
     async def async_deep_research(
         self,

@@ -4089,7 +4125,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         country: Optional[str] = None,
         location: Optional[str] = None,
         timeout: Optional[int] = None,
-        scrape_options: Optional[
+        scrape_options: Optional[ScrapeOptions] = None,
         params: Optional[Union[Dict[str, Any], SearchParams]] = None,
         **kwargs) -> SearchResponse:
         """

@@ -4104,7 +4140,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             country (Optional[str]): Country code (default: "us")
             location (Optional[str]): Geo-targeting
             timeout (Optional[int]): Request timeout in milliseconds
-            scrape_options (Optional[
+            scrape_options (Optional[ScrapeOptions]): Result scraping configuration
             params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters
             **kwargs: Additional keyword arguments for future compatibility
 
{firecrawl-2.0.1.dist-info → firecrawl-2.1.0.dist-info}/RECORD
CHANGED

@@ -1,12 +1,12 @@
-firecrawl/__init__.py,sha256=
-firecrawl/firecrawl.py,sha256=
+firecrawl/__init__.py,sha256=iHizMdAIoTmkymj1pSBrh7ktCGYU3kZ1kXZgntQPm3g,2570
+firecrawl/firecrawl.py,sha256=O-wyUWL9VnfRhZWgVAnmwpwIe0M3MPz9ek95KfYcHPQ,177750
 firecrawl/__tests__/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 firecrawl/__tests__/e2e_withAuth/test.py,sha256=-Fq2vPcMo0iQi4dwsUkkCd931ybDaTxMBnZbRfGdDcA,7931
 firecrawl/__tests__/v1/e2e_withAuth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 firecrawl/__tests__/v1/e2e_withAuth/test.py,sha256=DcCw-cohtnL-t9XPekUtRoQrgg3UCWu8Ikqudf9ory8,19880
 tests/test_change_tracking.py,sha256=_IJ5ShLcoj2fHDBaw-nE4I4lHdmDB617ocK_XMHhXps,4177
-firecrawl-2.0.1.dist-info/LICENSE,sha256=
-firecrawl-2.0.1.dist-info/METADATA,sha256=
-firecrawl-2.0.1.dist-info/WHEEL,sha256=
-firecrawl-2.0.1.dist-info/top_level.txt,sha256=
-firecrawl-2.0.1.dist-info/RECORD,,
+firecrawl-2.1.0.dist-info/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
+firecrawl-2.1.0.dist-info/METADATA,sha256=l-XNBUPSE1sFvGZ1wBvesKC7fRlEIGI0DTfY7BNPAWI,10583
+firecrawl-2.1.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
+firecrawl-2.1.0.dist-info/top_level.txt,sha256=8T3jOaSN5mtLghO-R3MQ8KO290gIX8hmfxQmglBPdLE,16
+firecrawl-2.1.0.dist-info/RECORD,,
{firecrawl-2.0.1.dist-info → firecrawl-2.1.0.dist-info}/LICENSE
File without changes

{firecrawl-2.0.1.dist-info → firecrawl-2.1.0.dist-info}/WHEEL
File without changes

{firecrawl-2.0.1.dist-info → firecrawl-2.1.0.dist-info}/top_level.txt
File without changes