firecrawl 2.10.0.tar.gz → 2.12.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of firecrawl has been flagged as potentially problematic.
- {firecrawl-2.10.0 → firecrawl-2.12.0}/PKG-INFO +1 -1
- {firecrawl-2.10.0 → firecrawl-2.12.0}/firecrawl/__init__.py +1 -1
- {firecrawl-2.10.0 → firecrawl-2.12.0}/firecrawl/firecrawl.py +38 -10
- {firecrawl-2.10.0 → firecrawl-2.12.0}/firecrawl.egg-info/PKG-INFO +1 -1
- {firecrawl-2.10.0 → firecrawl-2.12.0}/LICENSE +0 -0
- {firecrawl-2.10.0 → firecrawl-2.12.0}/README.md +0 -0
- {firecrawl-2.10.0 → firecrawl-2.12.0}/firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- {firecrawl-2.10.0 → firecrawl-2.12.0}/firecrawl/__tests__/e2e_withAuth/test.py +0 -0
- {firecrawl-2.10.0 → firecrawl-2.12.0}/firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- {firecrawl-2.10.0 → firecrawl-2.12.0}/firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -0
- {firecrawl-2.10.0 → firecrawl-2.12.0}/firecrawl.egg-info/SOURCES.txt +0 -0
- {firecrawl-2.10.0 → firecrawl-2.12.0}/firecrawl.egg-info/dependency_links.txt +0 -0
- {firecrawl-2.10.0 → firecrawl-2.12.0}/firecrawl.egg-info/requires.txt +0 -0
- {firecrawl-2.10.0 → firecrawl-2.12.0}/firecrawl.egg-info/top_level.txt +0 -0
- {firecrawl-2.10.0 → firecrawl-2.12.0}/pyproject.toml +0 -0
- {firecrawl-2.10.0 → firecrawl-2.12.0}/setup.cfg +0 -0
- {firecrawl-2.10.0 → firecrawl-2.12.0}/setup.py +0 -0
- {firecrawl-2.10.0 → firecrawl-2.12.0}/tests/test_change_tracking.py +0 -0
firecrawl/__init__.py

@@ -13,7 +13,7 @@ import os
 
 from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions  # noqa
 
-__version__ = "2.10.0"
+__version__ = "2.12.0"
 
 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
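Only the version constant changes in `__init__.py`, so confirming which release is installed is one line:

```python
import firecrawl

# Bumped from "2.10.0" to "2.12.0" in this diff.
print(firecrawl.__version__)
```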
firecrawl/firecrawl.py

@@ -96,6 +96,7 @@ class AgentOptionsExtract(pydantic.BaseModel):
 class ActionsResult(pydantic.BaseModel):
     """Result of actions performed during scraping."""
     screenshots: List[str]
+    pdfs: List[str]
 
 class ChangeTrackingData(pydantic.BaseModel):
     """
@@ -172,6 +173,7 @@ class ScreenshotAction(pydantic.BaseModel):
     """Screenshot action to perform during scraping."""
     type: Literal["screenshot"]
     fullPage: Optional[bool] = None
+    quality: Optional[int] = None
 
 class ClickAction(pydantic.BaseModel):
     """Click action to perform during scraping."""
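A minimal sketch of the new `quality` field in use. Everything beyond the `ScreenshotAction` fields shown above is an assumption: the API key and URL are placeholders, and the interpretation of `quality` as an image-compression level (by analogy with browser screenshot APIs) is not stated in the diff.

```python
from firecrawl import FirecrawlApp
from firecrawl.firecrawl import ScreenshotAction

app = FirecrawlApp(api_key="fc-YOUR_API_KEY")  # placeholder key

# `quality` is new in 2.12.0; presumably it trades screenshot fidelity
# for payload size (assumed 0-100 scale, not confirmed by the diff).
result = app.scrape_url(
    "https://example.com",
    actions=[ScreenshotAction(type="screenshot", fullPage=True, quality=80)],
)
```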
@@ -203,6 +205,12 @@ class ExecuteJavascriptAction(pydantic.BaseModel):
     type: Literal["executeJavascript"]
     script: str
 
+class PDFAction(pydantic.BaseModel):
+    """PDF action to perform during scraping."""
+    type: Literal["pdf"]
+    format: Optional[Literal["A0", "A1", "A2", "A3", "A4", "A5", "A6", "Letter", "Legal", "Tabloid", "Ledger"]] = None
+    landscape: Optional[bool] = None
+    scale: Optional[float] = None
 
 class ExtractAgent(pydantic.BaseModel):
     """Configuration for the agent in extract operations."""
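Paired with the new `pdfs` field on `ActionsResult`, this action lets a scrape render the target page to PDF. A hedged sketch: the URL is illustrative, and reading the output from `result.actions.pdfs` assumes the scrape response exposes the `ActionsResult` model under an `actions` attribute, by analogy with the existing `screenshots` field.

```python
from firecrawl import FirecrawlApp
from firecrawl.firecrawl import PDFAction

app = FirecrawlApp(api_key="fc-YOUR_API_KEY")  # placeholder key

# format/landscape/scale are all Optional, so PDFAction(type="pdf") alone
# would also validate; here we ask for landscape A4 at 80% scale.
result = app.scrape_url(
    "https://example.com",
    actions=[PDFAction(type="pdf", format="A4", landscape=True, scale=0.8)],
)

# ActionsResult.pdfs is a List[str]; presumably links to (or payloads of)
# the rendered PDFs, mirroring how `screenshots` is delivered.
if result.actions:
    for pdf in result.actions.pdfs:
        print(pdf)
```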
@@ -219,7 +227,7 @@ class ScrapeParams(ScrapeOptions):
     """Parameters for scraping operations."""
     extract: Optional[JsonConfig] = None
     jsonOptions: Optional[JsonConfig] = None
-    actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None
+    actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None
     agent: Optional[AgentOptions] = None
     webhook: Optional[WebhookConfig] = None
 
@@ -265,6 +273,7 @@ class CrawlParams(pydantic.BaseModel):
     regexOnFullURL: Optional[bool] = None
     delay: Optional[int] = None  # Delay in seconds between scrapes
     maxConcurrency: Optional[int] = None
+    allowSubdomains: Optional[bool] = None
 
 class CrawlResponse(pydantic.BaseModel):
     """Response from crawling operations."""
@@ -469,7 +478,7 @@ class FirecrawlApp:
         parse_pdf: Optional[bool] = None,
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
-        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
+        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
         change_tracking_options: Optional[ChangeTrackingOptions] = None,
         max_age: Optional[int] = None,
         store_in_cache: Optional[bool] = None,
@@ -493,7 +502,7 @@ class FirecrawlApp:
             proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth)
             extract (Optional[JsonConfig]): Content extraction settings
             json_options (Optional[JsonConfig]): JSON extraction settings
-            actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
+            actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]]): Actions to perform
             change_tracking_options (Optional[ChangeTrackingOptions]): Change tracking settings
 
 
@@ -700,6 +709,7 @@ class FirecrawlApp:
         ignore_query_parameters: Optional[bool] = None,
         regex_on_full_url: Optional[bool] = None,
         delay: Optional[int] = None,
+        allow_subdomains: Optional[bool] = None,
         max_concurrency: Optional[int] = None,
         poll_interval: Optional[int] = 2,
         idempotency_key: Optional[str] = None,
@@ -725,6 +735,7 @@ class FirecrawlApp:
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
             regex_on_full_url (Optional[bool]): Apply regex to full URLs
             delay (Optional[int]): Delay in seconds between scrapes
+            allow_subdomains (Optional[bool]): Follow subdomains
             max_concurrency (Optional[int]): Maximum number of concurrent scrapes
             poll_interval (Optional[int]): Seconds between status checks (default: 2)
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
@@ -775,6 +786,8 @@ class FirecrawlApp:
             crawl_params['regexOnFullURL'] = regex_on_full_url
         if delay is not None:
             crawl_params['delay'] = delay
+        if allow_subdomains is not None:
+            crawl_params['allowSubdomains'] = allow_subdomains
         if max_concurrency is not None:
             crawl_params['maxConcurrency'] = max_concurrency
 
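Since the kwarg is simply forwarded as the camelCase `allowSubdomains` field shown above, opting into subdomain crawling is one extra argument to `crawl_url`. A sketch: the API key and URL are placeholders, and `limit` is a pre-existing crawl parameter assumed here, not part of this diff.

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR_API_KEY")  # placeholder key

# With allow_subdomains=True the crawler may follow links onto e.g.
# blog.example.com while crawling example.com. Default behavior is
# unchanged, because the key is only sent when the kwarg is not None.
status = app.crawl_url(
    "https://example.com",
    limit=10,
    allow_subdomains=True,
)
print(status)
```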
@@ -819,6 +832,8 @@ class FirecrawlApp:
         ignore_query_parameters: Optional[bool] = None,
         regex_on_full_url: Optional[bool] = None,
         delay: Optional[int] = None,
+        allow_subdomains: Optional[bool] = None,
+        max_concurrency: Optional[int] = None,
         idempotency_key: Optional[str] = None,
         **kwargs
     ) -> CrawlResponse:
@@ -842,6 +857,7 @@ class FirecrawlApp:
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
             regex_on_full_url (Optional[bool]): Apply regex to full URLs
             delay (Optional[int]): Delay in seconds between scrapes
+            allow_subdomains (Optional[bool]): Follow subdomains
             max_concurrency (Optional[int]): Maximum number of concurrent scrapes
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API
@@ -892,6 +908,8 @@ class FirecrawlApp:
             crawl_params['regexOnFullURL'] = regex_on_full_url
         if delay is not None:
             crawl_params['delay'] = delay
+        if allow_subdomains is not None:
+            crawl_params['allowSubdomains'] = allow_subdomains
         if max_concurrency is not None:
             crawl_params['maxConcurrency'] = max_concurrency
 
@@ -1072,6 +1090,7 @@ class FirecrawlApp:
         ignore_query_parameters: Optional[bool] = None,
         regex_on_full_url: Optional[bool] = None,
         delay: Optional[int] = None,
+        allow_subdomains: Optional[bool] = None,
         max_concurrency: Optional[int] = None,
         idempotency_key: Optional[str] = None,
         **kwargs
@@ -1096,6 +1115,7 @@ class FirecrawlApp:
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
             regex_on_full_url (Optional[bool]): Apply regex to full URLs
             delay (Optional[int]): Delay in seconds between scrapes
+            allow_subdomains (Optional[bool]): Follow subdomains
             max_concurrency (Optional[int]): Maximum number of concurrent scrapes
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API
@@ -1122,6 +1142,7 @@ class FirecrawlApp:
             ignore_query_parameters=ignore_query_parameters,
             regex_on_full_url=regex_on_full_url,
             delay=delay,
+            allow_subdomains=allow_subdomains,
             max_concurrency=max_concurrency,
             idempotency_key=idempotency_key,
             **kwargs
@@ -1236,7 +1257,7 @@ class FirecrawlApp:
         proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
-        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
+        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
         agent: Optional[AgentOptions] = None,
         poll_interval: Optional[int] = 2,
         max_concurrency: Optional[int] = None,
@@ -1374,7 +1395,7 @@ class FirecrawlApp:
         proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
-        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
+        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
         agent: Optional[AgentOptions] = None,
         max_concurrency: Optional[int] = None,
         idempotency_key: Optional[str] = None,
@@ -1510,7 +1531,7 @@ class FirecrawlApp:
         proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
-        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
+        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
         agent: Optional[AgentOptions] = None,
         max_concurrency: Optional[int] = None,
         idempotency_key: Optional[str] = None,
@@ -2911,7 +2932,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         parse_pdf: Optional[bool] = None,
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
-        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
+        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
         **kwargs) -> ScrapeResponse[Any]:
         """
         Scrape a single URL asynchronously.
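`AsyncFirecrawlApp` inherits from `FirecrawlApp` and mirrors its surface, so the widened `actions` union applies to the async path as well. A sketch combining both new action types; the key and URL are placeholders.

```python
import asyncio

from firecrawl import AsyncFirecrawlApp
from firecrawl.firecrawl import PDFAction, ScreenshotAction

async def main() -> None:
    app = AsyncFirecrawlApp(api_key="fc-YOUR_API_KEY")  # placeholder key
    # The async scraper's `actions` union gained PDFAction in this release,
    # so screenshot and PDF actions can be mixed in one request.
    result = await app.scrape_url(
        "https://example.com",
        actions=[
            ScreenshotAction(type="screenshot", quality=60),
            PDFAction(type="pdf", format="Letter"),
        ],
    )
    print(result)

asyncio.run(main())
```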
@@ -2932,7 +2953,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             proxy (Optional[Literal["basic", "stealth", "auto"]]): Proxy type (basic/stealth)
             extract (Optional[JsonConfig]): Content extraction settings
             json_options (Optional[JsonConfig]): JSON extraction settings
-            actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
+            actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]]): Actions to perform
             **kwargs: Additional parameters to pass to the API
 
         Returns:
@@ -3042,7 +3063,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
-        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
+        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
         agent: Optional[AgentOptions] = None,
         poll_interval: Optional[int] = 2,
         idempotency_key: Optional[str] = None,
@@ -3181,7 +3202,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
         extract: Optional[JsonConfig] = None,
         json_options: Optional[JsonConfig] = None,
-        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
+        actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction, PDFAction]]] = None,
         agent: Optional[AgentOptions] = None,
         idempotency_key: Optional[str] = None,
         **kwargs
@@ -3317,6 +3338,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         ignore_query_parameters: Optional[bool] = None,
         regex_on_full_url: Optional[bool] = None,
         delay: Optional[int] = None,
+        allow_subdomains: Optional[bool] = None,
         poll_interval: Optional[int] = 2,
         idempotency_key: Optional[str] = None,
         **kwargs
@@ -3341,6 +3363,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
             regex_on_full_url (Optional[bool]): Apply regex to full URLs
             delay (Optional[int]): Delay in seconds between scrapes
+            allow_subdomains (Optional[bool]): Follow subdomains
             poll_interval (Optional[int]): Seconds between status checks (default: 2)
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API
@@ -3390,6 +3413,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             crawl_params['regexOnFullURL'] = regex_on_full_url
         if delay is not None:
             crawl_params['delay'] = delay
+        if allow_subdomains is not None:
+            crawl_params['allowSubdomains'] = allow_subdomains
 
         # Add any additional kwargs
         crawl_params.update(kwargs)
@@ -3433,6 +3458,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         ignore_query_parameters: Optional[bool] = None,
         regex_on_full_url: Optional[bool] = None,
         delay: Optional[int] = None,
+        allow_subdomains: Optional[bool] = None,
         poll_interval: Optional[int] = 2,
         idempotency_key: Optional[str] = None,
         **kwargs
@@ -3502,6 +3528,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             crawl_params['regexOnFullURL'] = regex_on_full_url
         if delay is not None:
             crawl_params['delay'] = delay
+        if allow_subdomains is not None:
+            crawl_params['allowSubdomains'] = allow_subdomains
 
         # Add any additional kwargs
         crawl_params.update(kwargs)