firecrawl 2.16.5__py3-none-any.whl → 3.0.3__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release.
This version of firecrawl might be problematic.
- firecrawl/__init__.py +27 -19
- firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +79 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +38 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +40 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +137 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +183 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +35 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
- firecrawl/__tests__/e2e/v2/conftest.py +73 -0
- firecrawl/__tests__/e2e/v2/test_async.py +73 -0
- firecrawl/__tests__/e2e/v2/test_batch_scrape.py +105 -0
- firecrawl/__tests__/e2e/v2/test_crawl.py +276 -0
- firecrawl/__tests__/e2e/v2/test_extract.py +54 -0
- firecrawl/__tests__/e2e/v2/test_map.py +60 -0
- firecrawl/__tests__/e2e/v2/test_scrape.py +154 -0
- firecrawl/__tests__/e2e/v2/test_search.py +265 -0
- firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
- firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +61 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +19 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +63 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
- firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
- firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +53 -0
- firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +92 -0
- firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +167 -0
- firecrawl/__tests__/unit/v2/methods/test_search_validation.py +206 -0
- firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
- firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
- firecrawl/__tests__/unit/v2/utils/test_validation.py +290 -0
- firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
- firecrawl/client.py +241 -0
- firecrawl/{firecrawl.py → firecrawl.backup.py} +17 -15
- firecrawl/types.py +157 -0
- firecrawl/v1/__init__.py +14 -0
- firecrawl/v1/client.py +4653 -0
- firecrawl/v2/__init__.py +4 -0
- firecrawl/v2/client.py +802 -0
- firecrawl/v2/client_async.py +250 -0
- firecrawl/v2/methods/aio/__init__.py +1 -0
- firecrawl/v2/methods/aio/batch.py +85 -0
- firecrawl/v2/methods/aio/crawl.py +174 -0
- firecrawl/v2/methods/aio/extract.py +126 -0
- firecrawl/v2/methods/aio/map.py +59 -0
- firecrawl/v2/methods/aio/scrape.py +36 -0
- firecrawl/v2/methods/aio/search.py +58 -0
- firecrawl/v2/methods/aio/usage.py +42 -0
- firecrawl/v2/methods/batch.py +420 -0
- firecrawl/v2/methods/crawl.py +468 -0
- firecrawl/v2/methods/extract.py +131 -0
- firecrawl/v2/methods/map.py +77 -0
- firecrawl/v2/methods/scrape.py +68 -0
- firecrawl/v2/methods/search.py +173 -0
- firecrawl/v2/methods/usage.py +41 -0
- firecrawl/v2/types.py +546 -0
- firecrawl/v2/utils/__init__.py +9 -0
- firecrawl/v2/utils/error_handler.py +107 -0
- firecrawl/v2/utils/get_version.py +15 -0
- firecrawl/v2/utils/http_client.py +153 -0
- firecrawl/v2/utils/http_client_async.py +64 -0
- firecrawl/v2/utils/validation.py +324 -0
- firecrawl/v2/watcher.py +312 -0
- firecrawl/v2/watcher_async.py +245 -0
- {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/LICENSE +0 -0
- {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/METADATA +49 -32
- firecrawl-3.0.3.dist-info/RECORD +78 -0
- tests/test_timeout_conversion.py +117 -0
- firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
- firecrawl/__tests__/e2e_withAuth/test.py +0 -170
- firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
- firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -465
- firecrawl-2.16.5.dist-info/RECORD +0 -12
- {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/WHEEL +0 -0
- {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/top_level.txt +0 -0
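The v1 client is kept intact under firecrawl/v1/ while the new v2 surface (sync and async clients, per-method modules, watchers, shared types, and utils) is added under firecrawl/v2/. As an illustrative sketch only (the module paths come from the listing above; nothing about their contents is assumed), the new layout maps directly to import paths once firecrawl 3.0.3 is installed:

# Each added firecrawl/... file in the listing installs as an importable submodule.
from firecrawl import v1                      # legacy client package (firecrawl/v1/)
from firecrawl.v2 import types as v2_types    # new shared models (firecrawl/v2/types.py)
from firecrawl.v2.utils import error_handler  # HTTP error mapping (firecrawl/v2/utils/error_handler.py)

print(v1.__name__, v2_types.__name__, error_handler.__name__)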
firecrawl/v2/types.py
ADDED
@@ -0,0 +1,546 @@
"""
Type definitions for Firecrawl v2 API.

This module contains clean, modern type definitions for the v2 API.
"""

import warnings
from datetime import datetime
from typing import Any, Dict, Generic, List, Literal, Optional, TypeVar, Union
from pydantic import BaseModel, Field, field_validator

# Suppress pydantic warnings about schema field shadowing
# Tested using schema_field alias="schema" but it doesn't work.
warnings.filterwarnings("ignore", message="Field name \"schema\" in \"Format\" shadows an attribute in parent \"BaseModel\"")
warnings.filterwarnings("ignore", message="Field name \"schema\" in \"JsonFormat\" shadows an attribute in parent \"Format\"")
warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ChangeTrackingFormat\" shadows an attribute in parent \"Format\"")
warnings.filterwarnings("ignore", message="Field name \"json\" in \"ScrapeFormats\" shadows an attribute in parent \"BaseModel\"")
warnings.filterwarnings("ignore", message="Field name \"json\" in \"Document\" shadows an attribute in parent \"BaseModel\"")

T = TypeVar('T')

# Base response types
class BaseResponse(BaseModel, Generic[T]):
    """Base response structure for all API responses."""
    success: bool
    data: Optional[T] = None
    error: Optional[str] = None
    warning: Optional[str] = None

# Document and content types
class DocumentMetadata(BaseModel):
    """Metadata for scraped documents."""
    title: Optional[str] = None
    description: Optional[str] = None
    language: Optional[str] = None
    keywords: Optional[Union[str, List[str]]] = None
    robots: Optional[str] = None
    og_title: Optional[str] = None
    og_description: Optional[str] = None
    og_url: Optional[str] = None
    og_image: Optional[str] = None
    source_url: Optional[str] = None
    status_code: Optional[int] = None
    error: Optional[str] = None

    @staticmethod
    def _coerce_list_to_string(value: Any) -> Any:
        if isinstance(value, list):
            # Prefer first string if semantically a single-valued field, else join
            if len(value) == 1:
                return str(value[0])
            return ', '.join(str(item) for item in value)
        return value

    @staticmethod
    def _coerce_string_to_int(value: Any) -> Any:
        if isinstance(value, str):
            try:
                return int(value)
            except ValueError:
                return value
        return value

    @field_validator('robots', 'og_title', 'og_description', 'og_url', 'og_image', 'language', mode='before')
    @classmethod
    def coerce_lists_to_string_fields(cls, v):
        return cls._coerce_list_to_string(v)

    @field_validator('status_code', mode='before')
    @classmethod
    def coerce_status_code_to_int(cls, v):
        return cls._coerce_string_to_int(v)

class Document(BaseModel):
    """A scraped document."""
    markdown: Optional[str] = None
    html: Optional[str] = None
    raw_html: Optional[str] = None
    json: Optional[Any] = None
    summary: Optional[str] = None
    metadata: Optional[DocumentMetadata] = None
    links: Optional[List[str]] = None
    screenshot: Optional[str] = None
    actions: Optional[Dict[str, Any]] = None
    warning: Optional[str] = None
    change_tracking: Optional[Dict[str, Any]] = None

# Webhook types
class WebhookConfig(BaseModel):
    """Configuration for webhooks."""
    url: str
    headers: Optional[Dict[str, str]] = None
    metadata: Optional[Dict[str, str]] = None
    events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None

class WebhookData(BaseModel):
    """Data sent to webhooks."""
    job_id: str
    status: str
    current: Optional[int] = None
    total: Optional[int] = None
    data: Optional[List[Document]] = None
    error: Optional[str] = None

class Source(BaseModel):
    """Configuration for a search source."""
    type: str

SourceOption = Union[str, Source]

FormatString = Literal[
    # camelCase versions (API format)
    "markdown", "html", "rawHtml", "links", "screenshot", "summary", "changeTracking", "json",
    # snake_case versions (user-friendly)
    "raw_html", "change_tracking"
]

class Viewport(BaseModel):
    """Viewport configuration for screenshots."""
    width: int
    height: int

class Format(BaseModel):
    """Configuration for a format."""
    type: FormatString

class JsonFormat(Format):
    """Configuration for JSON extraction."""
    prompt: Optional[str] = None
    schema: Optional[Any] = None

class ChangeTrackingFormat(Format):
    """Configuration for change tracking."""
    modes: List[Literal["git-diff", "json"]]
    schema: Optional[Dict[str, Any]] = None
    prompt: Optional[str] = None
    tag: Optional[str] = None

class ScreenshotFormat(BaseModel):
    """Configuration for screenshot format."""
    type: Literal["screenshot"] = "screenshot"
    full_page: Optional[bool] = None
    quality: Optional[int] = None
    viewport: Optional[Union[Dict[str, int], Viewport]] = None

FormatOption = Union[Dict[str, Any], FormatString, JsonFormat, ChangeTrackingFormat, ScreenshotFormat, Format]

# Scrape types
class ScrapeFormats(BaseModel):
    """Output formats for scraping."""
    formats: Optional[List[FormatOption]] = None
    markdown: bool = True
    html: bool = False
    raw_html: bool = False
    summary: bool = False
    links: bool = False
    screenshot: bool = False
    change_tracking: bool = False
    json: bool = False

    @field_validator('formats')
    @classmethod
    def validate_formats(cls, v):
        """Validate and normalize formats input."""
        if v is None:
            return v

        normalized_formats = []
        for format_item in v:
            if isinstance(format_item, str):
                normalized_formats.append(Format(type=format_item))
            elif isinstance(format_item, dict):
                # Preserve dicts as-is to avoid dropping custom fields like 'schema'
                normalized_formats.append(format_item)
            elif isinstance(format_item, Format):
                normalized_formats.append(format_item)
            else:
                raise ValueError(f"Invalid format format: {format_item}")

        return normalized_formats

class ScrapeOptions(BaseModel):
    """Options for scraping operations."""
    formats: Optional[Union['ScrapeFormats', List[FormatOption]]] = None
    headers: Optional[Dict[str, str]] = None
    include_tags: Optional[List[str]] = None
    exclude_tags: Optional[List[str]] = None
    only_main_content: Optional[bool] = None
    timeout: Optional[int] = None
    wait_for: Optional[int] = None
    mobile: Optional[bool] = None
    parsers: Optional[List[str]] = None
    actions: Optional[List[Union['WaitAction', 'ScreenshotAction', 'ClickAction', 'WriteAction', 'PressAction', 'ScrollAction', 'ScrapeAction', 'ExecuteJavascriptAction', 'PDFAction']]] = None
    location: Optional['Location'] = None
    skip_tls_verification: Optional[bool] = None
    remove_base64_images: Optional[bool] = None
    fast_mode: Optional[bool] = None
    use_mock: Optional[str] = None
    block_ads: Optional[bool] = None
    proxy: Optional[Literal["basic", "stealth", "auto"]] = None
    max_age: Optional[int] = None
    store_in_cache: Optional[bool] = None

    @field_validator('formats')
    @classmethod
    def validate_formats(cls, v):
        """Validate and normalize formats input."""
        if v is None:
            return v
        if isinstance(v, ScrapeFormats):
            return v
        if isinstance(v, list):
            return v
        raise ValueError(f"Invalid formats type: {type(v)}. Expected ScrapeFormats or List[FormatOption]")

class ScrapeRequest(BaseModel):
    """Request for scraping a single URL."""
    url: str
    options: Optional[ScrapeOptions] = None

class ScrapeData(Document):
    """Scrape results data."""
    pass

class ScrapeResponse(BaseResponse[ScrapeData]):
    """Response for scrape operations."""
    pass

# Crawl types
class CrawlRequest(BaseModel):
    """Request for crawling a website."""
    url: str
    prompt: Optional[str] = None
    exclude_paths: Optional[List[str]] = None
    include_paths: Optional[List[str]] = None
    max_discovery_depth: Optional[int] = None
    sitemap: Literal["skip", "include"] = "include"
    ignore_query_parameters: bool = False
    limit: Optional[int] = None
    crawl_entire_domain: bool = False
    allow_external_links: bool = False
    allow_subdomains: bool = False
    delay: Optional[int] = None
    max_concurrency: Optional[int] = None
    webhook: Optional[Union[str, WebhookConfig]] = None
    scrape_options: Optional[ScrapeOptions] = None
    zero_data_retention: bool = False

class CrawlResponse(BaseModel):
    """Information about a crawl job."""
    id: str
    url: str

class CrawlJob(BaseModel):
    """Crawl job status and progress data."""
    status: Literal["scraping", "completed", "failed"]
    total: int = 0
    completed: int = 0
    credits_used: int = 0
    expires_at: Optional[datetime] = None
    next: Optional[str] = None
    data: List[Document] = []

class SearchDocument(Document):
    """A document from a search operation with URL and description."""
    url: str
    title: Optional[str] = None
    description: Optional[str] = None

class MapDocument(Document):
    """A document from a map operation with URL and description."""
    url: str
    description: Optional[str] = None

# Crawl params types
class CrawlParamsRequest(BaseModel):
    """Request for getting crawl parameters from LLM."""
    url: str
    prompt: str

class CrawlParamsData(BaseModel):
    """Data returned from crawl params endpoint."""
    include_paths: Optional[List[str]] = None
    exclude_paths: Optional[List[str]] = None
    max_discovery_depth: Optional[int] = None
    ignore_sitemap: bool = False
    ignore_query_parameters: bool = False
    limit: Optional[int] = None
    crawl_entire_domain: bool = False
    allow_external_links: bool = False
    allow_subdomains: bool = False
    delay: Optional[int] = None
    max_concurrency: Optional[int] = None
    webhook: Optional[Union[str, WebhookConfig]] = None
    scrape_options: Optional[ScrapeOptions] = None
    zero_data_retention: bool = False
    warning: Optional[str] = None

class CrawlParamsResponse(BaseResponse[CrawlParamsData]):
    """Response from crawl params endpoint."""
    pass

# Batch scrape types
class BatchScrapeRequest(BaseModel):
    """Request for batch scraping multiple URLs (internal helper only)."""
    urls: List[str]
    options: Optional[ScrapeOptions] = None

class BatchScrapeResponse(BaseModel):
    """Response from starting a batch scrape job (mirrors CrawlResponse naming)."""
    id: str
    url: str
    invalid_urls: Optional[List[str]] = None

class BatchScrapeJob(BaseModel):
    """Batch scrape job status and results."""
    status: Literal["scraping", "completed", "failed", "cancelled"]
    completed: int
    total: int
    credits_used: Optional[int] = None
    expires_at: Optional[datetime] = None
    next: Optional[str] = None
    data: List[Document] = []

# Map types
class MapOptions(BaseModel):
    """Options for mapping operations."""
    search: Optional[str] = None
    sitemap: Literal["only", "include", "skip"] = "include"
    include_subdomains: Optional[bool] = None
    limit: Optional[int] = None
    timeout: Optional[int] = None

class MapRequest(BaseModel):
    """Request for mapping a website."""
    url: str
    options: Optional[MapOptions] = None

class MapData(BaseModel):
    """Map results data."""
    links: List['SearchResult']

class MapResponse(BaseResponse[MapData]):
    """Response for map operations."""
    pass

# Extract types
class ExtractResponse(BaseModel):
    """Response for extract operations (start/status/final)."""
    success: Optional[bool] = None
    id: Optional[str] = None
    status: Optional[Literal["processing", "completed", "failed", "cancelled"]] = None
    data: Optional[Any] = None
    error: Optional[str] = None
    warning: Optional[str] = None
    sources: Optional[Dict[str, Any]] = None
    expires_at: Optional[datetime] = None

# Usage/limits types
class ConcurrencyCheck(BaseModel):
    """Current concurrency and limits for the team/API key."""
    concurrency: int
    max_concurrency: int

class CreditUsage(BaseModel):
    """Remaining credits for the team/API key."""
    remaining_credits: int

class TokenUsage(BaseModel):
    """Recent token usage metrics (if available)."""
    remaining_tokens: int

# Action types
class WaitAction(BaseModel):
    """Wait action to perform during scraping."""
    type: Literal["wait"] = "wait"
    milliseconds: Optional[int] = None
    selector: Optional[str] = None

class ScreenshotAction(BaseModel):
    """Screenshot action to perform during scraping."""
    type: Literal["screenshot"] = "screenshot"
    full_page: Optional[bool] = None
    quality: Optional[int] = None
    viewport: Optional[Union[Dict[str, int], Viewport]] = None

class ClickAction(BaseModel):
    """Click action to perform during scraping."""
    type: Literal["click"] = "click"
    selector: str

class WriteAction(BaseModel):
    """Write action to perform during scraping."""
    type: Literal["write"] = "write"
    text: str

class PressAction(BaseModel):
    """Press action to perform during scraping."""
    type: Literal["press"] = "press"
    key: str

class ScrollAction(BaseModel):
    """Scroll action to perform during scraping."""
    type: Literal["scroll"] = "scroll"
    direction: Literal["up", "down"]
    selector: Optional[str] = None

class ScrapeAction(BaseModel):
    """Scrape action to perform during scraping."""
    type: Literal["scrape"] = "scrape"

class ExecuteJavascriptAction(BaseModel):
    """Execute javascript action to perform during scraping."""
    type: Literal["executeJavascript"] = "executeJavascript"
    script: str

class PDFAction(BaseModel):
    """PDF action to perform during scraping."""
    type: Literal["pdf"] = "pdf"
    format: Optional[Literal["A0", "A1", "A2", "A3", "A4", "A5", "A6", "Letter", "Legal", "Tabloid", "Ledger"]] = None
    landscape: Optional[bool] = None
    scale: Optional[float] = None

# Location types
class Location(BaseModel):
    """Location configuration for scraping."""
    country: Optional[str] = None
    languages: Optional[List[str]] = None

class SearchRequest(BaseModel):
    """Request for search operations."""
    query: str
    sources: Optional[List[SourceOption]] = None
    limit: Optional[int] = 5
    tbs: Optional[str] = None
    location: Optional[str] = None
    ignore_invalid_urls: Optional[bool] = None
    timeout: Optional[int] = 60000
    scrape_options: Optional[ScrapeOptions] = None

    @field_validator('sources')
    @classmethod
    def validate_sources(cls, v):
        """Validate and normalize sources input."""
        if v is None:
            return v

        normalized_sources = []
        for source in v:
            if isinstance(source, str):
                normalized_sources.append(Source(type=source))
            elif isinstance(source, dict):
                normalized_sources.append(Source(**source))
            elif isinstance(source, Source):
                normalized_sources.append(source)
            else:
                raise ValueError(f"Invalid source format: {source}")

        return normalized_sources

class LinkResult(BaseModel):
    """A generic link result with optional metadata (used by search and map)."""
    url: str
    title: Optional[str] = None
    description: Optional[str] = None

# Backward-compatible alias for existing tests/usages
SearchResult = LinkResult

class SearchData(BaseModel):
    """Search results grouped by source type."""
    web: Optional[List[Union[LinkResult, SearchDocument]]] = None
    news: Optional[List[Union[LinkResult, SearchDocument]]] = None
    images: Optional[List[Union[LinkResult, SearchDocument]]] = None

class SearchResponse(BaseResponse[SearchData]):
    """Response from search operation."""
    pass

# Error types
class ErrorDetails(BaseModel):
    """Detailed error information."""
    code: Optional[str] = None
    message: str
    details: Optional[Dict[str, Any]] = None

class ErrorResponse(BaseModel):
    """Error response structure."""
    success: bool = False
    error: str
    details: Optional[ErrorDetails] = None

# Job management types
class JobStatus(BaseModel):
    """Generic job status information."""
    id: str
    status: Literal["pending", "scraping", "completed", "failed"]
    current: Optional[int] = None
    total: Optional[int] = None
    created_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    expires_at: Optional[datetime] = None

class CrawlError(BaseModel):
    """A crawl error."""
    id: str
    timestamp: Optional[datetime] = None
    url: str
    code: Optional[str] = None
    error: str

class CrawlErrorsResponse(BaseModel):
    """Response from crawl error monitoring."""
    errors: List[CrawlError]
    robots_blocked: List[str]

class ActiveCrawl(BaseModel):
    """Information about an active crawl job."""
    id: str
    team_id: str
    url: str
    options: Optional[Dict[str, Any]] = None

class ActiveCrawlsResponse(BaseModel):
    """Response from active crawls endpoint."""
    success: bool = True
    crawls: List[ActiveCrawl]

# Configuration types
class ClientConfig(BaseModel):
    """Configuration for the Firecrawl client."""
    api_key: str
    api_url: str = "https://api.firecrawl.dev"
    timeout: Optional[float] = None
    max_retries: int = 3
    backoff_factor: float = 0.5

# Response union types
AnyResponse = Union[
    ScrapeResponse,
    CrawlResponse,
    BatchScrapeResponse,
    MapResponse,
    SearchResponse,
    ErrorResponse,
]
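The listing above is the heart of the v2 API surface: plain pydantic models with a few normalizing validators. A short usage sketch, assuming firecrawl 3.0.3 (pydantic v2) is installed; every class and field name below appears in types.py above, while the URLs, selectors, and query strings are illustrative placeholders:

from firecrawl.v2.types import (
    ClickAction,
    CrawlRequest,
    DocumentMetadata,
    ScrapeOptions,
    SearchRequest,
    Source,
    WaitAction,
    WebhookConfig,
)

# DocumentMetadata coerces list-valued metadata fields to strings and numeric
# strings to ints via the "before" validators defined above.
meta = DocumentMetadata(og_title=["Hello"], language=["en", "en-US"], status_code="200")
assert meta.og_title == "Hello" and meta.language == "en, en-US" and meta.status_code == 200

# ScrapeOptions accepts plain format strings, dict-form formats (e.g. a JSON
# format carrying a schema), and browser actions.
options = ScrapeOptions(
    formats=["markdown", {"type": "json", "schema": {"type": "object"}}],
    only_main_content=True,
    actions=[WaitAction(milliseconds=1000), ClickAction(selector="#load-more")],
)

# CrawlRequest bundles crawl limits, an optional webhook, and per-page scrape options.
crawl = CrawlRequest(
    url="https://example.com",
    include_paths=["/blog/.*"],
    limit=50,
    webhook=WebhookConfig(url="https://example.com/hooks/firecrawl", events=["page", "completed"]),
    scrape_options=options,
)

# SearchRequest.validate_sources normalizes strings and dicts into Source models.
search = SearchRequest(query="firecrawl", sources=["web", {"type": "news"}], limit=10)
assert all(isinstance(s, Source) for s in search.sources)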
firecrawl/v2/utils/__init__.py
ADDED
@@ -0,0 +1,9 @@
"""
Utility modules for v2 API client.
"""

from .http_client import HttpClient
from .error_handler import FirecrawlError, handle_response_error
from .validation import validate_scrape_options, prepare_scrape_options

__all__ = ['HttpClient', 'FirecrawlError', 'handle_response_error', 'validate_scrape_options', 'prepare_scrape_options']
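Note that this __init__ only imports FirecrawlError and handle_response_error from error_handler, so the status-specific exceptions (defined in error_handler.py, shown next) have to be imported from the submodule directly. Illustrative imports only:

# Re-exported by the __init__ above:
from firecrawl.v2.utils import FirecrawlError, handle_response_error

# Not re-exported, so imported from the submodule itself:
from firecrawl.v2.utils.error_handler import BadRequestError, RateLimitError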
firecrawl/v2/utils/error_handler.py
ADDED
@@ -0,0 +1,107 @@
"""
Error handling utilities for v2 API.
"""

import requests
from typing import Dict, Any, Optional


class FirecrawlError(Exception):
    """Base exception for Firecrawl API errors."""

    def __init__(self, message: str, status_code: Optional[int] = None, response: Optional[requests.Response] = None):
        super().__init__(message)
        self.status_code = status_code
        self.response = response


class BadRequestError(FirecrawlError):
    """Raised when the request is invalid (400)."""
    pass


class UnauthorizedError(FirecrawlError):
    """Raised when the request is unauthorized (401)."""
    pass


class PaymentRequiredError(FirecrawlError):
    """Raised when payment is required (402)."""
    pass


class WebsiteNotSupportedError(FirecrawlError):
    """Raised when website is not supported (403)."""
    pass


class RequestTimeoutError(FirecrawlError):
    """Raised when request times out (408)."""
    pass


class RateLimitError(FirecrawlError):
    """Raised when the rate limit is exceeded (429)."""
    pass


class InternalServerError(FirecrawlError):
    """Raised when there's an internal server error (500)."""
    pass


def handle_response_error(response: requests.Response, action: str) -> None:
    """
    Handle API response errors and raise appropriate exceptions.

    Args:
        response: The HTTP response object
        action: Description of the action being performed

    Raises:
        FirecrawlError: Appropriate error based on status code
    """
    try:
        response_json = response.json()
        error_message = response_json.get('error', 'No error message provided.')
        error_details = response_json.get('details', 'No additional error details provided.')
    except:
        # If we can't parse JSON, provide a helpful error message
        try:
            response_text = response.text[:500]  # Limit to first 500 chars
            if response_text.strip():
                error_message = f"Server returned non-JSON response: {response_text}"
                error_details = f"Full response status: {response.status_code}"
            else:
                error_message = f"Server returned empty response with status {response.status_code}"
                error_details = "No additional details available"
        except:
            error_message = f"Server returned unreadable response with status {response.status_code}"
            error_details = "No additional details available"

    # Create appropriate error message
    if response.status_code == 400:
        message = f"Bad Request: Failed to {action}. {error_message} - {error_details}"
        raise BadRequestError(message, response.status_code, response)
    elif response.status_code == 401:
        message = f"Unauthorized: Failed to {action}. {error_message} - {error_details}"
        raise UnauthorizedError(message, response.status_code, response)
    elif response.status_code == 402:
        message = f"Payment Required: Failed to {action}. {error_message} - {error_details}"
        raise PaymentRequiredError(message, response.status_code, response)
    elif response.status_code == 403:
        message = f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
        raise WebsiteNotSupportedError(message, response.status_code, response)
    elif response.status_code == 408:
        message = f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
        raise RequestTimeoutError(message, response.status_code, response)
    elif response.status_code == 429:
        message = f"Rate Limit Exceeded: Failed to {action}. {error_message} - {error_details}"
        raise RateLimitError(message, response.status_code, response)
    elif response.status_code == 500:
        message = f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
        raise InternalServerError(message, response.status_code, response)
    else:
        message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message} - {error_details}"
        raise FirecrawlError(message, response.status_code, response)
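handle_response_error always raises, so callers invoke it only on failed responses and catch whichever subclass they want to handle specially. A hedged sketch of that pattern; only the exception classes and handle_response_error come from the module above, while the endpoint URL, payload shape, and Bearer auth header are illustrative assumptions:

import time
import requests
from firecrawl.v2.utils.error_handler import FirecrawlError, RateLimitError, handle_response_error

def post_with_backoff(endpoint: str, payload: dict, api_key: str, attempts: int = 3) -> dict:
    """Hypothetical helper: map failed HTTP responses onto the SDK's exception hierarchy."""
    response = None
    for attempt in range(attempts):
        response = requests.post(
            endpoint,  # e.g. an https://api.firecrawl.dev route; the exact path is not part of this diff
            json=payload,
            headers={"Authorization": f"Bearer {api_key}"},  # assumed auth scheme
            timeout=60,
        )
        if response.ok:
            return response.json()
        try:
            handle_response_error(response, "scrape URL")  # raises BadRequestError, RateLimitError, ...
        except RateLimitError:
            time.sleep(2 ** attempt)  # 429: back off and retry; other errors propagate to the caller
    raise FirecrawlError(f"Gave up after {attempts} attempts", response.status_code if response else None, response)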
firecrawl/v2/utils/get_version.py
ADDED
@@ -0,0 +1,15 @@
import os
import re
from pathlib import Path

def get_version():
    try:
        package_path = Path(__file__).parents[2]
        version_file = (package_path / "__init__.py").read_text()
        version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
        if version_match:
            return version_match.group(1).strip()
        return "3.x.x"
    except Exception as e:
        print(f"Failed to get version from __init__.py: {e}")
        return "3.x.x"