firecrawl-py 3.2.1__py3-none-any.whl → 3.3.1__py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Potentially problematic release.
This version of firecrawl-py might be problematic.
- build/lib/firecrawl/__init__.py +87 -0
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +79 -0
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +188 -0
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +38 -0
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +40 -0
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +137 -0
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +248 -0
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +35 -0
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
- build/lib/firecrawl/__tests__/e2e/v2/conftest.py +73 -0
- build/lib/firecrawl/__tests__/e2e/v2/test_async.py +73 -0
- build/lib/firecrawl/__tests__/e2e/v2/test_batch_scrape.py +105 -0
- build/lib/firecrawl/__tests__/e2e/v2/test_crawl.py +276 -0
- build/lib/firecrawl/__tests__/e2e/v2/test_extract.py +54 -0
- build/lib/firecrawl/__tests__/e2e/v2/test_map.py +60 -0
- build/lib/firecrawl/__tests__/e2e/v2/test_scrape.py +154 -0
- build/lib/firecrawl/__tests__/e2e/v2/test_search.py +269 -0
- build/lib/firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
- build/lib/firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +61 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +19 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +63 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +53 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +92 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +167 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
- build/lib/firecrawl/__tests__/unit/v2/utils/test_validation.py +290 -0
- build/lib/firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
- build/lib/firecrawl/client.py +242 -0
- build/lib/firecrawl/firecrawl.backup.py +4635 -0
- build/lib/firecrawl/types.py +161 -0
- build/lib/firecrawl/v1/__init__.py +14 -0
- build/lib/firecrawl/v1/client.py +4653 -0
- build/lib/firecrawl/v2/__init__.py +4 -0
- build/lib/firecrawl/v2/client.py +805 -0
- build/lib/firecrawl/v2/client_async.py +250 -0
- build/lib/firecrawl/v2/methods/aio/__init__.py +1 -0
- build/lib/firecrawl/v2/methods/aio/batch.py +85 -0
- build/lib/firecrawl/v2/methods/aio/crawl.py +171 -0
- build/lib/firecrawl/v2/methods/aio/extract.py +126 -0
- build/lib/firecrawl/v2/methods/aio/map.py +59 -0
- build/lib/firecrawl/v2/methods/aio/scrape.py +33 -0
- build/lib/firecrawl/v2/methods/aio/search.py +172 -0
- build/lib/firecrawl/v2/methods/aio/usage.py +42 -0
- build/lib/firecrawl/v2/methods/batch.py +417 -0
- build/lib/firecrawl/v2/methods/crawl.py +469 -0
- build/lib/firecrawl/v2/methods/extract.py +131 -0
- build/lib/firecrawl/v2/methods/map.py +77 -0
- build/lib/firecrawl/v2/methods/scrape.py +64 -0
- build/lib/firecrawl/v2/methods/search.py +197 -0
- build/lib/firecrawl/v2/methods/usage.py +41 -0
- build/lib/firecrawl/v2/types.py +665 -0
- build/lib/firecrawl/v2/utils/__init__.py +9 -0
- build/lib/firecrawl/v2/utils/error_handler.py +107 -0
- build/lib/firecrawl/v2/utils/get_version.py +15 -0
- build/lib/firecrawl/v2/utils/http_client.py +153 -0
- build/lib/firecrawl/v2/utils/http_client_async.py +65 -0
- build/lib/firecrawl/v2/utils/normalize.py +107 -0
- build/lib/firecrawl/v2/utils/validation.py +324 -0
- build/lib/firecrawl/v2/watcher.py +301 -0
- build/lib/firecrawl/v2/watcher_async.py +242 -0
- build/lib/tests/test_change_tracking.py +98 -0
- build/lib/tests/test_timeout_conversion.py +117 -0
- firecrawl/__init__.py +1 -1
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +2 -2
- firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +6 -6
- firecrawl/v2/client.py +3 -0
- firecrawl/v2/methods/search.py +11 -0
- firecrawl/v2/types.py +30 -1
- {firecrawl_py-3.2.1.dist-info/licenses → firecrawl_py-3.3.1.dist-info}/LICENSE +0 -0
- {firecrawl_py-3.2.1.dist-info → firecrawl_py-3.3.1.dist-info}/METADATA +3 -7
- firecrawl_py-3.3.1.dist-info/RECORD +153 -0
- {firecrawl_py-3.2.1.dist-info → firecrawl_py-3.3.1.dist-info}/WHEEL +1 -1
- {firecrawl_py-3.2.1.dist-info → firecrawl_py-3.3.1.dist-info}/top_level.txt +2 -0
- firecrawl_py-3.2.1.dist-info/RECORD +0 -79

build/lib/firecrawl/v2/types.py

```diff
@@ -0,0 +1,665 @@
+"""
+Type definitions for Firecrawl v2 API.
+
+This module contains clean, modern type definitions for the v2 API.
+"""
+
+import warnings
+from datetime import datetime
+from typing import Any, Dict, Generic, List, Literal, Optional, TypeVar, Union
+import logging
+from pydantic import BaseModel, Field, field_validator, ValidationError
+
+# Suppress pydantic warnings about schema field shadowing
+# Tested using schema_field alias="schema" but it doesn't work.
+warnings.filterwarnings("ignore", message="Field name \"schema\" in \"Format\" shadows an attribute in parent \"BaseModel\"")
+warnings.filterwarnings("ignore", message="Field name \"schema\" in \"JsonFormat\" shadows an attribute in parent \"Format\"")
+warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ChangeTrackingFormat\" shadows an attribute in parent \"Format\"")
+warnings.filterwarnings("ignore", message="Field name \"json\" in \"ScrapeFormats\" shadows an attribute in parent \"BaseModel\"")
+warnings.filterwarnings("ignore", message="Field name \"json\" in \"Document\" shadows an attribute in parent \"BaseModel\"")
+
+T = TypeVar('T')
+
+# Module logger
+logger = logging.getLogger("firecrawl")
+
+# Base response types
+class BaseResponse(BaseModel, Generic[T]):
+    """Base response structure for all API responses."""
+    success: bool
+    data: Optional[T] = None
+    error: Optional[str] = None
+    warning: Optional[str] = None
+
+# Document and content types
+class DocumentMetadata(BaseModel):
+    """Metadata for scraped documents (snake_case only; API camelCase normalized in code)."""
+    # Common metadata fields
+    title: Optional[str] = None
+    description: Optional[str] = None
+    url: Optional[str] = None
+    language: Optional[str] = None
+    keywords: Optional[Union[str, List[str]]] = None
+    robots: Optional[str] = None
+
+    # OpenGraph and social metadata
+    og_title: Optional[str] = None
+    og_description: Optional[str] = None
+    og_url: Optional[str] = None
+    og_image: Optional[str] = None
+    og_audio: Optional[str] = None
+    og_determiner: Optional[str] = None
+    og_locale: Optional[str] = None
+    og_locale_alternate: Optional[List[str]] = None
+    og_site_name: Optional[str] = None
+    og_video: Optional[str] = None
+
+    # Dublin Core and other site metadata
+    favicon: Optional[str] = None
+    dc_terms_created: Optional[str] = None
+    dc_date_created: Optional[str] = None
+    dc_date: Optional[str] = None
+    dc_terms_type: Optional[str] = None
+    dc_type: Optional[str] = None
+    dc_terms_audience: Optional[str] = None
+    dc_terms_subject: Optional[str] = None
+    dc_subject: Optional[str] = None
+    dc_description: Optional[str] = None
+    dc_terms_keywords: Optional[str] = None
+
+    modified_time: Optional[str] = None
+    published_time: Optional[str] = None
+    article_tag: Optional[str] = None
+    article_section: Optional[str] = None
+
+    # Response-level metadata
+    source_url: Optional[str] = None
+    status_code: Optional[int] = None
+    scrape_id: Optional[str] = None
+    num_pages: Optional[int] = None
+    content_type: Optional[str] = None
+    proxy_used: Optional[Literal["basic", "stealth"]] = None
+    cache_state: Optional[Literal["hit", "miss"]] = None
+    cached_at: Optional[str] = None
+    credits_used: Optional[int] = None
+
+    # Error information
+    error: Optional[str] = None
+
+    @staticmethod
+    def _coerce_list_to_string(value: Any) -> Any:
+        if isinstance(value, list):
+            # Prefer first string if semantically a single-valued field, else join
+            if len(value) == 1:
+                return str(value[0])
+            return ', '.join(str(item) for item in value)
+        return value
+
+    @staticmethod
+    def _coerce_string_to_int(value: Any) -> Any:
+        if isinstance(value, str):
+            try:
+                return int(value)
+            except ValueError:
+                return value
+        return value
+
+    @field_validator('robots', 'og_title', 'og_description', 'og_url', 'og_image', 'language', mode='before')
+    @classmethod
+    def coerce_lists_to_string_fields(cls, v):
+        return cls._coerce_list_to_string(v)
+
+    @field_validator('status_code', mode='before')
+    @classmethod
+    def coerce_status_code_to_int(cls, v):
+        return cls._coerce_string_to_int(v)
+
+class Document(BaseModel):
+    """A scraped document."""
+    markdown: Optional[str] = None
+    html: Optional[str] = None
+    raw_html: Optional[str] = None
+    json: Optional[Any] = None
+    summary: Optional[str] = None
+    metadata: Optional[DocumentMetadata] = None
+    links: Optional[List[str]] = None
+    screenshot: Optional[str] = None
+    actions: Optional[Dict[str, Any]] = None
+    warning: Optional[str] = None
+    change_tracking: Optional[Dict[str, Any]] = None
+
+    @property
+    def metadata_typed(self) -> DocumentMetadata:
+        """Always returns a DocumentMetadata instance for LSP-friendly access."""
+        md = self.metadata
+        if isinstance(md, DocumentMetadata):
+            return md
+        if isinstance(md, dict):
+            try:
+                return DocumentMetadata(**md)
+            except (ValidationError, TypeError) as exc:
+                logger.debug("Failed to construct DocumentMetadata from dict: %s", exc)
+        return DocumentMetadata()
+
+    @property
+    def metadata_dict(self) -> Dict[str, Any]:
+        """Returns metadata as a plain dict (exclude None)."""
+        md = self.metadata
+        if isinstance(md, DocumentMetadata):
+            return md.model_dump(exclude_none=True)
+        if isinstance(md, dict):
+            return {k: v for k, v in md.items() if v is not None}
+        return {}
+
+# Webhook types
+class WebhookConfig(BaseModel):
+    """Configuration for webhooks."""
+    url: str
+    headers: Optional[Dict[str, str]] = None
+    metadata: Optional[Dict[str, str]] = None
+    events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
+
+class WebhookData(BaseModel):
+    """Data sent to webhooks."""
+    job_id: str
+    status: str
+    current: Optional[int] = None
+    total: Optional[int] = None
+    data: Optional[List[Document]] = None
+    error: Optional[str] = None
+
+class Source(BaseModel):
+    """Configuration for a search source."""
+    type: str
+
+SourceOption = Union[str, Source]
+
+class Category(BaseModel):
+    """Configuration for a search category."""
+    type: str
+
+CategoryOption = Union[str, Category]
+
+FormatString = Literal[
+    # camelCase versions (API format)
+    "markdown", "html", "rawHtml", "links", "screenshot", "summary", "changeTracking", "json",
+    # snake_case versions (user-friendly)
+    "raw_html", "change_tracking"
+]
+
+class Viewport(BaseModel):
+    """Viewport configuration for screenshots."""
+    width: int
+    height: int
+
+class Format(BaseModel):
+    """Configuration for a format."""
+    type: FormatString
+
+class JsonFormat(Format):
+    """Configuration for JSON extraction."""
+    prompt: Optional[str] = None
+    schema: Optional[Any] = None
+
+class ChangeTrackingFormat(Format):
+    """Configuration for change tracking."""
+    modes: List[Literal["git-diff", "json"]]
+    schema: Optional[Dict[str, Any]] = None
+    prompt: Optional[str] = None
+    tag: Optional[str] = None
+
+class ScreenshotFormat(BaseModel):
+    """Configuration for screenshot format."""
+    type: Literal["screenshot"] = "screenshot"
+    full_page: Optional[bool] = None
+    quality: Optional[int] = None
+    viewport: Optional[Union[Dict[str, int], Viewport]] = None
+
+FormatOption = Union[Dict[str, Any], FormatString, JsonFormat, ChangeTrackingFormat, ScreenshotFormat, Format]
+
+# Scrape types
+class ScrapeFormats(BaseModel):
+    """Output formats for scraping."""
+    formats: Optional[List[FormatOption]] = None
+    markdown: bool = True
+    html: bool = False
+    raw_html: bool = False
+    summary: bool = False
+    links: bool = False
+    screenshot: bool = False
+    change_tracking: bool = False
+    json: bool = False
+
+    @field_validator('formats')
+    @classmethod
+    def validate_formats(cls, v):
+        """Validate and normalize formats input."""
+        if v is None:
+            return v
+
+        normalized_formats = []
+        for format_item in v:
+            if isinstance(format_item, str):
+                normalized_formats.append(Format(type=format_item))
+            elif isinstance(format_item, dict):
+                # Preserve dicts as-is to avoid dropping custom fields like 'schema'
+                normalized_formats.append(format_item)
+            elif isinstance(format_item, Format):
+                normalized_formats.append(format_item)
+            else:
+                raise ValueError(f"Invalid format format: {format_item}")
+
+        return normalized_formats
+
+class ScrapeOptions(BaseModel):
+    """Options for scraping operations."""
+    formats: Optional[Union['ScrapeFormats', List[FormatOption]]] = None
+    headers: Optional[Dict[str, str]] = None
+    include_tags: Optional[List[str]] = None
+    exclude_tags: Optional[List[str]] = None
+    only_main_content: Optional[bool] = None
+    timeout: Optional[int] = None
+    wait_for: Optional[int] = None
+    mobile: Optional[bool] = None
+    parsers: Optional[List[str]] = None
+    actions: Optional[List[Union['WaitAction', 'ScreenshotAction', 'ClickAction', 'WriteAction', 'PressAction', 'ScrollAction', 'ScrapeAction', 'ExecuteJavascriptAction', 'PDFAction']]] = None
+    location: Optional['Location'] = None
+    skip_tls_verification: Optional[bool] = None
+    remove_base64_images: Optional[bool] = None
+    fast_mode: Optional[bool] = None
+    use_mock: Optional[str] = None
+    block_ads: Optional[bool] = None
+    proxy: Optional[Literal["basic", "stealth", "auto"]] = None
+    max_age: Optional[int] = None
+    store_in_cache: Optional[bool] = None
+
+    @field_validator('formats')
+    @classmethod
+    def validate_formats(cls, v):
+        """Validate and normalize formats input."""
+        if v is None:
+            return v
+        if isinstance(v, ScrapeFormats):
+            return v
+        if isinstance(v, list):
+            return v
+        raise ValueError(f"Invalid formats type: {type(v)}. Expected ScrapeFormats or List[FormatOption]")
+
+class ScrapeRequest(BaseModel):
+    """Request for scraping a single URL."""
+    url: str
+    options: Optional[ScrapeOptions] = None
+
+class ScrapeData(Document):
+    """Scrape results data."""
+    pass
+
+class ScrapeResponse(BaseResponse[ScrapeData]):
+    """Response for scrape operations."""
+    pass
+
+# Crawl types
+class CrawlRequest(BaseModel):
+    """Request for crawling a website."""
+    url: str
+    prompt: Optional[str] = None
+    exclude_paths: Optional[List[str]] = None
+    include_paths: Optional[List[str]] = None
+    max_discovery_depth: Optional[int] = None
+    sitemap: Literal["skip", "include"] = "include"
+    ignore_query_parameters: bool = False
+    limit: Optional[int] = None
+    crawl_entire_domain: bool = False
+    allow_external_links: bool = False
+    allow_subdomains: bool = False
+    delay: Optional[int] = None
+    max_concurrency: Optional[int] = None
+    webhook: Optional[Union[str, WebhookConfig]] = None
+    scrape_options: Optional[ScrapeOptions] = None
+    zero_data_retention: bool = False
+
+class CrawlResponse(BaseModel):
+    """Information about a crawl job."""
+    id: str
+    url: str
+
+class CrawlJob(BaseModel):
+    """Crawl job status and progress data."""
+    status: Literal["scraping", "completed", "failed"]
+    total: int = 0
+    completed: int = 0
+    credits_used: int = 0
+    expires_at: Optional[datetime] = None
+    next: Optional[str] = None
+    data: List[Document] = []
+
+class SearchResultWeb(BaseModel):
+    """A web search result with URL, title, and description."""
+    url: str
+    title: Optional[str] = None
+    description: Optional[str] = None
+    category: Optional[str] = None
+
+class SearchResultNews(BaseModel):
+    """A news search result with URL, title, snippet, date, image URL, and position."""
+    title: Optional[str] = None
+    url: Optional[str] = None
+    snippet: Optional[str] = None
+    date: Optional[str] = None
+    image_url: Optional[str] = None
+    position: Optional[int] = None
+    category: Optional[str] = None
+
+class SearchResultImages(BaseModel):
+    """An image search result with URL, title, image URL, image width, image height, and position."""
+    title: Optional[str] = None
+    image_url: Optional[str] = None
+    image_width: Optional[int] = None
+    image_height: Optional[int] = None
+    url: Optional[str] = None
+    position: Optional[int] = None
+
+class SearchData(BaseModel):
+    """Search results grouped by source type."""
+    web: Optional[List[Union[SearchResultWeb, Document]]] = None
+    news: Optional[List[Union[SearchResultNews, Document]]] = None
+    images: Optional[List[Union[SearchResultImages, Document]]] = None
+
+class MapDocument(Document):
+    """A document from a map operation with URL and description."""
+    url: str
+    description: Optional[str] = None
+
+# Crawl params types
+class CrawlParamsRequest(BaseModel):
+    """Request for getting crawl parameters from LLM."""
+    url: str
+    prompt: str
+
+class CrawlParamsData(BaseModel):
+    """Data returned from crawl params endpoint."""
+    include_paths: Optional[List[str]] = None
+    exclude_paths: Optional[List[str]] = None
+    max_discovery_depth: Optional[int] = None
+    ignore_sitemap: bool = False
+    ignore_query_parameters: bool = False
+    limit: Optional[int] = None
+    crawl_entire_domain: bool = False
+    allow_external_links: bool = False
+    allow_subdomains: bool = False
+    delay: Optional[int] = None
+    max_concurrency: Optional[int] = None
+    webhook: Optional[Union[str, WebhookConfig]] = None
+    scrape_options: Optional[ScrapeOptions] = None
+    zero_data_retention: bool = False
+    warning: Optional[str] = None
+
+class CrawlParamsResponse(BaseResponse[CrawlParamsData]):
+    """Response from crawl params endpoint."""
+    pass
+
+# Batch scrape types
+class BatchScrapeRequest(BaseModel):
+    """Request for batch scraping multiple URLs (internal helper only)."""
+    urls: List[str]
+    options: Optional[ScrapeOptions] = None
+
+class BatchScrapeResponse(BaseModel):
+    """Response from starting a batch scrape job (mirrors CrawlResponse naming)."""
+    id: str
+    url: str
+    invalid_urls: Optional[List[str]] = None
+
+class BatchScrapeJob(BaseModel):
+    """Batch scrape job status and results."""
+    status: Literal["scraping", "completed", "failed", "cancelled"]
+    completed: int
+    total: int
+    credits_used: Optional[int] = None
+    expires_at: Optional[datetime] = None
+    next: Optional[str] = None
+    data: List[Document] = []
+
+# Map types
+class MapOptions(BaseModel):
+    """Options for mapping operations."""
+    search: Optional[str] = None
+    sitemap: Literal["only", "include", "skip"] = "include"
+    include_subdomains: Optional[bool] = None
+    limit: Optional[int] = None
+    timeout: Optional[int] = None
+
+class MapRequest(BaseModel):
+    """Request for mapping a website."""
+    url: str
+    options: Optional[MapOptions] = None
+
+class MapData(BaseModel):
+    """Map results data."""
+    links: List['SearchResult']
+
+class MapResponse(BaseResponse[MapData]):
+    """Response for map operations."""
+    pass
+
+# Extract types
+class ExtractResponse(BaseModel):
+    """Response for extract operations (start/status/final)."""
+    success: Optional[bool] = None
+    id: Optional[str] = None
+    status: Optional[Literal["processing", "completed", "failed", "cancelled"]] = None
+    data: Optional[Any] = None
+    error: Optional[str] = None
+    warning: Optional[str] = None
+    sources: Optional[Dict[str, Any]] = None
+    expires_at: Optional[datetime] = None
+
+# Usage/limits types
+class ConcurrencyCheck(BaseModel):
+    """Current concurrency and limits for the team/API key."""
+    concurrency: int
+    max_concurrency: int
+
+class CreditUsage(BaseModel):
+    """Remaining credits for the team/API key."""
+    remaining_credits: int
+
+class TokenUsage(BaseModel):
+    """Recent token usage metrics (if available)."""
+    remaining_tokens: int
+
+# Action types
+class WaitAction(BaseModel):
+    """Wait action to perform during scraping."""
+    type: Literal["wait"] = "wait"
+    milliseconds: Optional[int] = None
+    selector: Optional[str] = None
+
+class ScreenshotAction(BaseModel):
+    """Screenshot action to perform during scraping."""
+    type: Literal["screenshot"] = "screenshot"
+    full_page: Optional[bool] = None
+    quality: Optional[int] = None
+    viewport: Optional[Union[Dict[str, int], Viewport]] = None
+
+class ClickAction(BaseModel):
+    """Click action to perform during scraping."""
+    type: Literal["click"] = "click"
+    selector: str
+
+class WriteAction(BaseModel):
+    """Write action to perform during scraping."""
+    type: Literal["write"] = "write"
+    text: str
+
+class PressAction(BaseModel):
+    """Press action to perform during scraping."""
+    type: Literal["press"] = "press"
+    key: str
+
+class ScrollAction(BaseModel):
+    """Scroll action to perform during scraping."""
+    type: Literal["scroll"] = "scroll"
+    direction: Literal["up", "down"]
+    selector: Optional[str] = None
+
+class ScrapeAction(BaseModel):
+    """Scrape action to perform during scraping."""
+    type: Literal["scrape"] = "scrape"
+
+class ExecuteJavascriptAction(BaseModel):
+    """Execute javascript action to perform during scraping."""
+    type: Literal["executeJavascript"] = "executeJavascript"
+    script: str
+
+class PDFAction(BaseModel):
+    """PDF action to perform during scraping."""
+    type: Literal["pdf"] = "pdf"
+    format: Optional[Literal["A0", "A1", "A2", "A3", "A4", "A5", "A6", "Letter", "Legal", "Tabloid", "Ledger"]] = None
+    landscape: Optional[bool] = None
+    scale: Optional[float] = None
+
+# Location types
+class Location(BaseModel):
+    """Location configuration for scraping."""
+    country: Optional[str] = None
+    languages: Optional[List[str]] = None
+
+class SearchRequest(BaseModel):
+    """Request for search operations."""
+    query: str
+    sources: Optional[List[SourceOption]] = None
+    categories: Optional[List[CategoryOption]] = None
+    limit: Optional[int] = 5
+    tbs: Optional[str] = None
+    location: Optional[str] = None
+    ignore_invalid_urls: Optional[bool] = None
+    timeout: Optional[int] = 60000
+    scrape_options: Optional[ScrapeOptions] = None
+
+    @field_validator('sources')
+    @classmethod
+    def validate_sources(cls, v):
+        """Validate and normalize sources input."""
+        if v is None:
+            return v
+
+        normalized_sources = []
+        for source in v:
+            if isinstance(source, str):
+                normalized_sources.append(Source(type=source))
+            elif isinstance(source, dict):
+                normalized_sources.append(Source(**source))
+            elif isinstance(source, Source):
+                normalized_sources.append(source)
+            else:
+                raise ValueError(f"Invalid source format: {source}")
+
+        return normalized_sources
+
+    @field_validator('categories')
+    @classmethod
+    def validate_categories(cls, v):
+        """Validate and normalize categories input."""
+        if v is None:
+            return v
+
+        normalized_categories = []
+        for category in v:
+            if isinstance(category, str):
+                normalized_categories.append(Category(type=category))
+            elif isinstance(category, dict):
+                normalized_categories.append(Category(**category))
+            elif isinstance(category, Category):
+                normalized_categories.append(category)
+            else:
+                raise ValueError(f"Invalid category format: {category}")
+
+        return normalized_categories
+
+class LinkResult(BaseModel):
+    """A generic link result with optional metadata (used by search and map)."""
+    url: str
+    title: Optional[str] = None
+    description: Optional[str] = None
+
+# Backward-compatible alias for existing tests/usages
+SearchResult = LinkResult
+
+class SearchData(BaseModel):
+    """Search results grouped by source type."""
+    web: Optional[List[Union[SearchResultWeb, Document]]] = None
+    news: Optional[List[Union[SearchResultNews, Document]]] = None
+    images: Optional[List[Union[SearchResultImages, Document]]] = None
+
+class SearchResponse(BaseResponse[SearchData]):
+    """Response from search operation."""
+    pass
+
+# Error types
+class ErrorDetails(BaseModel):
+    """Detailed error information."""
+    code: Optional[str] = None
+    message: str
+    details: Optional[Dict[str, Any]] = None
+
+class ErrorResponse(BaseModel):
+    """Error response structure."""
+    success: bool = False
+    error: str
+    details: Optional[ErrorDetails] = None
+
+# Job management types
+class JobStatus(BaseModel):
+    """Generic job status information."""
+    id: str
+    status: Literal["pending", "scraping", "completed", "failed"]
+    current: Optional[int] = None
+    total: Optional[int] = None
+    created_at: Optional[datetime] = None
+    completed_at: Optional[datetime] = None
+    expires_at: Optional[datetime] = None
+
+class CrawlError(BaseModel):
+    """A crawl error."""
+    id: str
+    timestamp: Optional[datetime] = None
+    url: str
+    code: Optional[str] = None
+    error: str
+
+class CrawlErrorsResponse(BaseModel):
+    """Response from crawl error monitoring."""
+    errors: List[CrawlError]
+    robots_blocked: List[str]
+
+class ActiveCrawl(BaseModel):
+    """Information about an active crawl job."""
+    id: str
+    team_id: str
+    url: str
+    options: Optional[Dict[str, Any]] = None
+
+class ActiveCrawlsResponse(BaseModel):
+    """Response from active crawls endpoint."""
+    success: bool = True
+    crawls: List[ActiveCrawl]
+
+# Configuration types
+class ClientConfig(BaseModel):
+    """Configuration for the Firecrawl client."""
+    api_key: str
+    api_url: str = "https://api.firecrawl.dev"
+    timeout: Optional[float] = None
+    max_retries: int = 3
+    backoff_factor: float = 0.5
+
+# Response union types
+AnyResponse = Union[
+    ScrapeResponse,
+    CrawlResponse,
+    BatchScrapeResponse,
+    MapResponse,
+    SearchResponse,
+    ErrorResponse,
+]
```
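
To make the new type surface concrete, here is a small usage sketch (not part of the diff). It assumes this release is installed and that the `build/lib` copy above mirrors the installed `firecrawl.v2.types` module; it only exercises models and validators visible in the hunk, and the `"github"` category value is purely illustrative.

```python
# Hypothetical usage sketch of the v2 models shown in the hunk above.
from firecrawl.v2.types import (
    Category,
    Document,
    ScrapeFormats,
    SearchRequest,
    Source,
)

# Bare format strings are wrapped into Format models by ScrapeFormats.validate_formats.
formats = ScrapeFormats(formats=["markdown", "summary"])
print(formats.formats)  # [Format(type='markdown'), Format(type='summary')]

# SearchRequest normalizes string sources/categories into Source/Category models;
# the "github" value is illustrative only -- these models accept any string.
req = SearchRequest(query="firecrawl", sources=["web", "news"], categories=["github"])
assert req.sources == [Source(type="web"), Source(type="news")]
assert req.categories == [Category(type="github")]

# Document.metadata_typed always yields a DocumentMetadata; string status codes
# are coerced to int by the before-validator on DocumentMetadata.
doc = Document(markdown="# hi", metadata={"title": "Hi", "status_code": "200"})
print(doc.metadata_typed.status_code)  # 200
```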

build/lib/firecrawl/v2/utils/__init__.py

```diff
@@ -0,0 +1,9 @@
+"""
+Utility modules for v2 API client.
+"""
+
+from .http_client import HttpClient
+from .error_handler import FirecrawlError, handle_response_error
+from .validation import validate_scrape_options, prepare_scrape_options
+
+__all__ = ['HttpClient', 'FirecrawlError', 'handle_response_error', 'validate_scrape_options', 'prepare_scrape_options']
```
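
The `__all__` above defines the intended import surface for the v2 utilities. A minimal, hypothetical error-handling sketch follows; only the exported names come from the diff, and the assumption that `FirecrawlError` behaves like a standard `Exception` subclass accepting a plain message is mine, not something the diff shows.

```python
# Hypothetical sketch: only the imported name comes from the __all__ above.
from firecrawl.v2.utils import FirecrawlError

def simulated_sdk_call() -> None:
    # Stand-in for an SDK request; assumes FirecrawlError accepts a plain
    # message like a regular Exception (an assumption, not shown in the diff).
    raise FirecrawlError("simulated request failure")

try:
    simulated_sdk_call()
except FirecrawlError as exc:
    print(f"Firecrawl request failed: {exc}")
```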