firecrawl-4.12.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- firecrawl/__init__.py +87 -0
- firecrawl/__tests__/e2e/v2/aio/conftest.py +62 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +69 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +39 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +41 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +138 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +249 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +42 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
- firecrawl/__tests__/e2e/v2/conftest.py +73 -0
- firecrawl/__tests__/e2e/v2/test_async.py +73 -0
- firecrawl/__tests__/e2e/v2/test_batch_scrape.py +106 -0
- firecrawl/__tests__/e2e/v2/test_crawl.py +278 -0
- firecrawl/__tests__/e2e/v2/test_extract.py +55 -0
- firecrawl/__tests__/e2e/v2/test_map.py +61 -0
- firecrawl/__tests__/e2e/v2/test_scrape.py +191 -0
- firecrawl/__tests__/e2e/v2/test_search.py +270 -0
- firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
- firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
- firecrawl/__tests__/unit/test_recursive_schema_v1.py +1209 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +79 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +20 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +64 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
- firecrawl/__tests__/unit/v2/methods/test_agent.py +367 -0
- firecrawl/__tests__/unit/v2/methods/test_agent_request_preparation.py +226 -0
- firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
- firecrawl/__tests__/unit/v2/methods/test_branding.py +214 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
- firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +54 -0
- firecrawl/__tests__/unit/v2/methods/test_pagination.py +671 -0
- firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +109 -0
- firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +169 -0
- firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
- firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
- firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
- firecrawl/__tests__/unit/v2/utils/test_metadata_extras.py +94 -0
- firecrawl/__tests__/unit/v2/utils/test_metadata_extras_multivalue.py +22 -0
- firecrawl/__tests__/unit/v2/utils/test_recursive_schema.py +1133 -0
- firecrawl/__tests__/unit/v2/utils/test_validation.py +311 -0
- firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
- firecrawl/client.py +281 -0
- firecrawl/firecrawl.backup.py +4635 -0
- firecrawl/types.py +167 -0
- firecrawl/v1/__init__.py +14 -0
- firecrawl/v1/client.py +5164 -0
- firecrawl/v2/__init__.py +4 -0
- firecrawl/v2/client.py +967 -0
- firecrawl/v2/client_async.py +408 -0
- firecrawl/v2/methods/agent.py +144 -0
- firecrawl/v2/methods/aio/__init__.py +1 -0
- firecrawl/v2/methods/aio/agent.py +137 -0
- firecrawl/v2/methods/aio/batch.py +188 -0
- firecrawl/v2/methods/aio/crawl.py +351 -0
- firecrawl/v2/methods/aio/extract.py +133 -0
- firecrawl/v2/methods/aio/map.py +65 -0
- firecrawl/v2/methods/aio/scrape.py +33 -0
- firecrawl/v2/methods/aio/search.py +176 -0
- firecrawl/v2/methods/aio/usage.py +89 -0
- firecrawl/v2/methods/batch.py +499 -0
- firecrawl/v2/methods/crawl.py +592 -0
- firecrawl/v2/methods/extract.py +161 -0
- firecrawl/v2/methods/map.py +83 -0
- firecrawl/v2/methods/scrape.py +64 -0
- firecrawl/v2/methods/search.py +215 -0
- firecrawl/v2/methods/usage.py +84 -0
- firecrawl/v2/types.py +1143 -0
- firecrawl/v2/utils/__init__.py +9 -0
- firecrawl/v2/utils/error_handler.py +107 -0
- firecrawl/v2/utils/get_version.py +15 -0
- firecrawl/v2/utils/http_client.py +178 -0
- firecrawl/v2/utils/http_client_async.py +69 -0
- firecrawl/v2/utils/normalize.py +125 -0
- firecrawl/v2/utils/validation.py +692 -0
- firecrawl/v2/watcher.py +301 -0
- firecrawl/v2/watcher_async.py +243 -0
- firecrawl-4.12.0.dist-info/METADATA +234 -0
- firecrawl-4.12.0.dist-info/RECORD +92 -0
- firecrawl-4.12.0.dist-info/WHEEL +5 -0
- firecrawl-4.12.0.dist-info/licenses/LICENSE +21 -0
- firecrawl-4.12.0.dist-info/top_level.txt +2 -0
- tests/test_agent_integration.py +277 -0
- tests/test_api_key_handling.py +44 -0
- tests/test_change_tracking.py +98 -0
- tests/test_timeout_conversion.py +117 -0
firecrawl/v2/types.py
ADDED
@@ -0,0 +1,1143 @@
"""
Type definitions for Firecrawl v2 API.

This module contains clean, modern type definitions for the v2 API.
"""

import warnings
from datetime import datetime
from typing import Any, Dict, Generic, List, Literal, Optional, TypeVar, Union
import logging
from pydantic import (
    BaseModel,
    Field,
    field_validator,
    ValidationError,
    model_serializer,
    model_validator,
)

# Suppress pydantic warnings about schema field shadowing
# Tested using schema_field alias="schema" but it doesn't work.
warnings.filterwarnings(
    "ignore",
    message='Field name "schema" in "Format" shadows an attribute in parent "BaseModel"',
)
warnings.filterwarnings(
    "ignore",
    message='Field name "schema" in "JsonFormat" shadows an attribute in parent "Format"',
)
warnings.filterwarnings(
    "ignore",
    message='Field name "schema" in "ChangeTrackingFormat" shadows an attribute in parent "Format"',
)
warnings.filterwarnings(
    "ignore",
    message='Field name "json" in "ScrapeFormats" shadows an attribute in parent "BaseModel"',
)
warnings.filterwarnings(
    "ignore",
    message='Field name "json" in "Document" shadows an attribute in parent "BaseModel"',
)

T = TypeVar("T")

# Module logger
logger = logging.getLogger("firecrawl")


# Base response types
class BaseResponse(BaseModel, Generic[T]):
    """Base response structure for all API responses."""

    success: bool
    data: Optional[T] = None
    error: Optional[str] = None
    warning: Optional[str] = None


# Document and content types
class DocumentMetadata(BaseModel):
    """Metadata for scraped documents (snake_case only; API camelCase normalized in code)."""

    model_config = {"extra": "allow"}

    @model_serializer(mode="wrap")
    def _serialize(self, handler):
        out = handler(self)
        extra = getattr(self, "__pydantic_extra__", None)
        if isinstance(extra, dict):
            for k, v in extra.items():
                if v is not None:
                    out[k] = v
        return out

    # Common metadata fields
    title: Optional[str] = None
    description: Optional[str] = None
    url: Optional[str] = None
    language: Optional[str] = None
    keywords: Optional[Union[str, List[str]]] = None
    robots: Optional[str] = None

    # OpenGraph and social metadata
    og_title: Optional[str] = None
    og_description: Optional[str] = None
    og_url: Optional[str] = None
    og_image: Optional[str] = None
    og_audio: Optional[str] = None
    og_determiner: Optional[str] = None
    og_locale: Optional[str] = None
    og_locale_alternate: Optional[List[str]] = None
    og_site_name: Optional[str] = None
    og_video: Optional[str] = None

    # Dublin Core and other site metadata
    favicon: Optional[str] = None
    dc_terms_created: Optional[str] = None
    dc_date_created: Optional[str] = None
    dc_date: Optional[str] = None
    dc_terms_type: Optional[str] = None
    dc_type: Optional[str] = None
    dc_terms_audience: Optional[str] = None
    dc_terms_subject: Optional[str] = None
    dc_subject: Optional[str] = None
    dc_description: Optional[str] = None
    dc_terms_keywords: Optional[str] = None

    modified_time: Optional[str] = None
    published_time: Optional[str] = None
    article_tag: Optional[str] = None
    article_section: Optional[str] = None

    # Response-level metadata
    source_url: Optional[str] = None
    status_code: Optional[int] = None
    scrape_id: Optional[str] = None
    num_pages: Optional[int] = None
    content_type: Optional[str] = None
    proxy_used: Optional[Literal["basic", "stealth"]] = None
    timezone: Optional[str] = None
    cache_state: Optional[Literal["hit", "miss"]] = None
    cached_at: Optional[str] = None
    credits_used: Optional[int] = None
    concurrency_limited: Optional[bool] = None
    concurrency_queue_duration_ms: Optional[int] = None

    # Error information
    error: Optional[str] = None

    @property
    def extras(self) -> Dict[str, Any]:
        """Return unknown metadata keys preserved on the model."""
        extra = getattr(self, "__pydantic_extra__", None)
        return dict(extra) if isinstance(extra, dict) else {}

    @staticmethod
    def _coerce_list_to_string(value: Any) -> Any:
        if isinstance(value, list):
            # Prefer first string if semantically a single-valued field, else join
            if len(value) == 1:
                return str(value[0])
            return ", ".join(str(item) for item in value)
        return value

    @staticmethod
    def _coerce_string_to_int(value: Any) -> Any:
        if isinstance(value, str):
            try:
                return int(value)
            except ValueError:
                return value
        return value

    @model_validator(mode="before")
    @classmethod
    def coerce_lists_for_string_fields(cls, data):
        """Before validation: coerce lists to strings for known single-string fields.
        Preserves unknown-key lists.
        """
        if not isinstance(data, dict):
            return data
        single_str_fields = {
            "title",
            "description",
            "url",
            "language",
            "robots",
            "og_title",
            "og_description",
            "og_url",
            "og_image",
            "og_audio",
            "og_determiner",
            "og_locale",
            "og_site_name",
            "og_video",
            "favicon",
            "dc_terms_created",
            "dc_date_created",
            "dc_date",
            "dc_terms_type",
            "dc_type",
            "dc_terms_audience",
            "dc_terms_subject",
            "dc_subject",
            "dc_description",
            "dc_terms_keywords",
            "modified_time",
            "published_time",
            "article_tag",
            "article_section",
            "source_url",
            "scrape_id",
            "content_type",
            "cached_at",
            "error",
            "timezone",
        }
        for k, v in list(data.items()):
            if isinstance(v, list) and k in single_str_fields:
                data[k] = cls._coerce_list_to_string(v)
            # For ints that might appear as list, take first
            if isinstance(v, list) and k in {
                "status_code",
                "num_pages",
                "credits_used",
            }:
                first = v[0] if v else None
                data[k] = cls._coerce_string_to_int(first)
        return data

    @field_validator(
        "robots",
        "og_title",
        "og_description",
        "og_url",
        "og_image",
        "language",
        mode="before",
    )
    @classmethod
    def coerce_lists_to_string_fields(cls, v):
        return cls._coerce_list_to_string(v)

    @field_validator("status_code", mode="before")
    @classmethod
    def coerce_status_code_to_int(cls, v):
        return cls._coerce_string_to_int(v)


class AgentOptions(BaseModel):
    """Configuration for the agent in extract operations."""

    model: Literal["FIRE-1", "v3-beta"] = "FIRE-1"


class AttributeResult(BaseModel):
    """Result of attribute extraction."""

    selector: str
    attribute: str
    values: List[str]


class BrandingProfile(BaseModel):
    """Branding information extracted from a website."""

    model_config = {"extra": "allow"}

    color_scheme: Optional[Literal["light", "dark"]] = None
    logo: Optional[str] = None
    fonts: Optional[List[Dict[str, Any]]] = None
    colors: Optional[Dict[str, str]] = None
    typography: Optional[Dict[str, Any]] = None
    spacing: Optional[Dict[str, Any]] = None
    components: Optional[Dict[str, Any]] = None
    icons: Optional[Dict[str, str]] = None
    images: Optional[Dict[str, Optional[str]]] = None
    animations: Optional[Dict[str, str]] = None
    layout: Optional[Dict[str, Any]] = None
    tone: Optional[Dict[str, str]] = None
    personality: Optional[Dict[str, Any]] = None


class Document(BaseModel):
    """A scraped document."""

    markdown: Optional[str] = None
    html: Optional[str] = None
    raw_html: Optional[str] = None
    json: Optional[Any] = None
    summary: Optional[str] = None
    metadata: Optional[DocumentMetadata] = None
    links: Optional[List[str]] = None
    images: Optional[List[str]] = None
    screenshot: Optional[str] = None
    actions: Optional[Dict[str, Any]] = None
    warning: Optional[str] = None
    change_tracking: Optional[Dict[str, Any]] = None
    branding: Optional[BrandingProfile] = None

    @property
    def metadata_typed(self) -> DocumentMetadata:
        """Always returns a DocumentMetadata instance for LSP-friendly access."""
        md = self.metadata
        if isinstance(md, DocumentMetadata):
            return md
        if isinstance(md, dict):
            try:
                return DocumentMetadata.model_validate(md)
            except (ValidationError, TypeError) as exc:
                logger.debug("Failed to construct DocumentMetadata from dict: %s", exc)
        return DocumentMetadata()

    @property
    def metadata_dict(self) -> Dict[str, Any]:
        """Returns metadata as a plain dict (exclude None), including extras."""
        md = self.metadata
        if isinstance(md, DocumentMetadata):
            out = md.model_dump(exclude_none=True)
            # Ensure extras are preserved even if model_dump omits them
            extra = getattr(md, "__pydantic_extra__", None)
            if isinstance(extra, dict):
                for k, v in extra.items():
                    if v is not None:
                        out[k] = v
            return out
        if isinstance(md, dict):
            return {k: v for k, v in md.items() if v is not None}
        return {}


# Webhook types
class WebhookConfig(BaseModel):
    """Configuration for webhooks."""

    url: str
    headers: Optional[Dict[str, str]] = None
    metadata: Optional[Dict[str, str]] = None
    events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None


class WebhookData(BaseModel):
    """Data sent to webhooks."""

    job_id: str
    status: str
    current: Optional[int] = None
    total: Optional[int] = None
    data: Optional[List[Document]] = None
    error: Optional[str] = None


class Source(BaseModel):
    """Configuration for a search source."""

    type: str


SourceOption = Union[str, Source]


class Category(BaseModel):
    """Configuration for a search category.

    Supported categories:
    - "github": Filter results to GitHub repositories
    - "research": Filter results to research papers and academic sites
    - "pdf": Filter results to PDF files (adds filetype:pdf to search)
    """

    type: str


CategoryOption = Union[str, Category]

FormatString = Literal[
    # camelCase versions (API format)
    "markdown",
    "html",
    "rawHtml",
    "links",
    "images",
    "screenshot",
    "summary",
    "changeTracking",
    "json",
    "attributes",
    "branding",
    # snake_case versions (user-friendly)
    "raw_html",
    "change_tracking",
]


class Viewport(BaseModel):
    """Viewport configuration for screenshots."""

    width: int
    height: int


class Format(BaseModel):
    """Configuration for a format."""

    type: FormatString


class JsonFormat(Format):
    """Configuration for JSON extraction."""

    prompt: Optional[str] = None
    schema: Optional[Any] = None


class ChangeTrackingFormat(Format):
    """Configuration for change tracking."""

    modes: List[Literal["git-diff", "json"]]
    schema: Optional[Dict[str, Any]] = None
    prompt: Optional[str] = None
    tag: Optional[str] = None


class ScreenshotFormat(BaseModel):
    """Configuration for screenshot format."""

    type: Literal["screenshot"] = "screenshot"
    full_page: Optional[bool] = None
    quality: Optional[int] = None
    viewport: Optional[Union[Dict[str, int], Viewport]] = None


class AttributeSelector(BaseModel):
    """Selector and attribute pair for attribute extraction."""

    selector: str
    attribute: str


class AttributesFormat(Format):
    """Configuration for attribute extraction."""

    type: Literal["attributes"] = "attributes"
    selectors: List[AttributeSelector]


FormatOption = Union[
    Dict[str, Any],
    FormatString,
    JsonFormat,
    ChangeTrackingFormat,
    ScreenshotFormat,
    AttributesFormat,
    Format,
]


# Scrape types
class ScrapeFormats(BaseModel):
    """Output formats for scraping."""

    formats: Optional[List[FormatOption]] = None
    markdown: bool = True
    html: bool = False
    raw_html: bool = False
    summary: bool = False
    links: bool = False
    images: bool = False
    screenshot: bool = False
    change_tracking: bool = False
    json: bool = False

    @field_validator("formats")
    @classmethod
    def validate_formats(cls, v):
        """Validate and normalize formats input."""
        if v is None:
            return v

        normalized_formats = []
        for format_item in v:
            if isinstance(format_item, str):
                normalized_formats.append(Format(type=format_item))
            elif isinstance(format_item, dict):
                # Preserve dicts as-is to avoid dropping custom fields like 'schema'
                normalized_formats.append(format_item)
            elif isinstance(format_item, Format):
                normalized_formats.append(format_item)
            else:
                raise ValueError(f"Invalid format format: {format_item}")

        return normalized_formats


class ScrapeOptions(BaseModel):
    """Options for scraping operations."""

    formats: Optional[Union["ScrapeFormats", List[FormatOption]]] = None
    headers: Optional[Dict[str, str]] = None
    include_tags: Optional[List[str]] = None
    exclude_tags: Optional[List[str]] = None
    only_main_content: Optional[bool] = None
    timeout: Optional[int] = None
    wait_for: Optional[int] = None
    mobile: Optional[bool] = None
    parsers: Optional[Union[List[str], List[Union[str, "PDFParser"]]]] = None
    actions: Optional[
        List[
            Union[
                "WaitAction",
                "ScreenshotAction",
                "ClickAction",
                "WriteAction",
                "PressAction",
                "ScrollAction",
                "ScrapeAction",
                "ExecuteJavascriptAction",
                "PDFAction",
            ]
        ]
    ] = None
    location: Optional["Location"] = None
    skip_tls_verification: Optional[bool] = None
    remove_base64_images: Optional[bool] = None
    fast_mode: Optional[bool] = None
    use_mock: Optional[str] = None
    block_ads: Optional[bool] = None
    proxy: Optional[Literal["basic", "stealth", "auto"]] = None
    max_age: Optional[int] = None
    min_age: Optional[int] = None
    store_in_cache: Optional[bool] = None
    integration: Optional[str] = None

    @field_validator("formats")
    @classmethod
    def validate_formats(cls, v):
        """Validate and normalize formats input."""
        if v is None:
            return v
        if isinstance(v, ScrapeFormats):
            return v
        if isinstance(v, list):
            return v
        raise ValueError(
            f"Invalid formats type: {type(v)}. Expected ScrapeFormats or List[FormatOption]"
        )


class ScrapeRequest(BaseModel):
    """Request for scraping a single URL."""

    url: str
    options: Optional[ScrapeOptions] = None


class ScrapeData(Document):
    """Scrape results data."""

    pass


class ScrapeResponse(BaseResponse[ScrapeData]):
    """Response for scrape operations."""

    pass


# Crawl types
class CrawlRequest(BaseModel):
    """Request for crawling a website."""

    url: str
    prompt: Optional[str] = None
    exclude_paths: Optional[List[str]] = None
    include_paths: Optional[List[str]] = None
    max_discovery_depth: Optional[int] = None
    sitemap: Literal["skip", "include"] = "include"
    ignore_query_parameters: bool = False
    limit: Optional[int] = None
    crawl_entire_domain: bool = False
    allow_external_links: bool = False
    allow_subdomains: bool = False
    delay: Optional[int] = None
    max_concurrency: Optional[int] = None
    webhook: Optional[Union[str, WebhookConfig]] = None
    scrape_options: Optional[ScrapeOptions] = None
    zero_data_retention: bool = False
    integration: Optional[str] = None


class CrawlResponse(BaseModel):
    """Information about a crawl job."""

    id: str
    url: str


class CrawlJob(BaseModel):
    """Crawl job status and progress data."""

    status: Literal["scraping", "completed", "failed", "cancelled"]
    total: int = 0
    completed: int = 0
    credits_used: int = 0
    expires_at: Optional[datetime] = None
    next: Optional[str] = None
    data: List[Document] = []


class CrawlStatusRequest(BaseModel):
    """Request to get crawl job status."""

    job_id: str


class SearchResultWeb(BaseModel):
    """A web search result with URL, title, and description."""

    url: str
    title: Optional[str] = None
    description: Optional[str] = None
    category: Optional[str] = None


class SearchResultNews(BaseModel):
    """A news search result with URL, title, snippet, date, image URL, and position."""

    title: Optional[str] = None
    url: Optional[str] = None
    snippet: Optional[str] = None
    date: Optional[str] = None
    image_url: Optional[str] = None
    position: Optional[int] = None
    category: Optional[str] = None


class SearchResultImages(BaseModel):
    """An image search result with URL, title, image URL, image width, image height, and position."""

    title: Optional[str] = None
    image_url: Optional[str] = None
    image_width: Optional[int] = None
    image_height: Optional[int] = None
    url: Optional[str] = None
    position: Optional[int] = None


class MapDocument(Document):
    """A document from a map operation with URL and description."""

    url: str
    description: Optional[str] = None


# Crawl params types
class CrawlParamsRequest(BaseModel):
    """Request for getting crawl parameters from LLM."""

    url: str
    prompt: str


class CrawlParamsData(BaseModel):
    """Data returned from crawl params endpoint."""

    include_paths: Optional[List[str]] = None
    exclude_paths: Optional[List[str]] = None
    max_discovery_depth: Optional[int] = None
    ignore_sitemap: bool = False
    ignore_query_parameters: bool = False
    limit: Optional[int] = None
    crawl_entire_domain: bool = False
    allow_external_links: bool = False
    allow_subdomains: bool = False
    delay: Optional[int] = None
    max_concurrency: Optional[int] = None
    webhook: Optional[Union[str, WebhookConfig]] = None
    scrape_options: Optional[ScrapeOptions] = None
    zero_data_retention: bool = False
    warning: Optional[str] = None
    integration: Optional[str] = None


class CrawlParamsResponse(BaseResponse[CrawlParamsData]):
    """Response from crawl params endpoint."""

    pass


# Batch scrape types
class BatchScrapeRequest(BaseModel):
    """Request for batch scraping multiple URLs (internal helper only)."""

    urls: List[str]
    options: Optional[ScrapeOptions] = None
    webhook: Optional[Union[str, WebhookConfig]] = None
    append_to_id: Optional[str] = None
    ignore_invalid_urls: Optional[bool] = None
    max_concurrency: Optional[int] = None
    zero_data_retention: Optional[bool] = None
    integration: Optional[str] = None


class BatchScrapeResponse(BaseModel):
    """Response from starting a batch scrape job (mirrors CrawlResponse naming)."""

    id: str
    url: str
    invalid_urls: Optional[List[str]] = None


class BatchScrapeJob(BaseModel):
    """Batch scrape job status and results."""

    status: Literal["scraping", "completed", "failed", "cancelled"]
    completed: int
    total: int
    credits_used: Optional[int] = None
    expires_at: Optional[datetime] = None
    next: Optional[str] = None
    data: List[Document] = []


class BatchScrapeStatusRequest(BaseModel):
    """Request to get batch scrape job status."""

    job_id: str


class BatchScrapeErrorsRequest(BaseModel):
    """Request to get errors for a batch scrape job."""

    job_id: str


# Map types
class MapOptions(BaseModel):
    """Options for mapping operations."""

    search: Optional[str] = None
    sitemap: Literal["only", "include", "skip"] = "include"
    include_subdomains: Optional[bool] = None
    ignore_query_parameters: Optional[bool] = None
    limit: Optional[int] = None
    timeout: Optional[int] = None
    integration: Optional[str] = None
    location: Optional["Location"] = None


class MapRequest(BaseModel):
    """Request for mapping a website."""

    url: str
    options: Optional[MapOptions] = None


class MapData(BaseModel):
    """Map results data."""

    links: List["SearchResult"]


class MapResponse(BaseResponse[MapData]):
    """Response for map operations."""

    pass


# Extract types
class ExtractRequest(BaseModel):
    """Request for extract operations."""

    urls: Optional[List[str]] = None
    prompt: Optional[str] = None
    schema_: Optional[Dict[str, Any]] = Field(default=None, alias="schema")
    system_prompt: Optional[str] = None
    allow_external_links: Optional[bool] = None
    enable_web_search: Optional[bool] = None
    show_sources: Optional[bool] = None
    scrape_options: Optional[ScrapeOptions] = None
    ignore_invalid_urls: Optional[bool] = None
    integration: Optional[str] = None
    agent: Optional[AgentOptions] = None


class ExtractResponse(BaseModel):
    """Response for extract operations (start/status/final)."""

    success: Optional[bool] = None
    id: Optional[str] = None
    status: Optional[Literal["processing", "completed", "failed", "cancelled"]] = None
    data: Optional[Any] = None
    error: Optional[str] = None
    warning: Optional[str] = None
    sources: Optional[Dict[str, Any]] = None
    expires_at: Optional[datetime] = None
    credits_used: Optional[int] = None
    tokens_used: Optional[int] = None


class AgentResponse(BaseModel):
    """Response for agent operations (start/status/final)."""

    success: Optional[bool] = None
    id: Optional[str] = None
    status: Optional[Literal["processing", "completed", "failed"]] = None
    data: Optional[Any] = None
    error: Optional[str] = None
    expires_at: Optional[datetime] = None
    credits_used: Optional[int] = None

# Usage/limits types
class ConcurrencyCheck(BaseModel):
    """Current concurrency and limits for the team/API key."""

    concurrency: int
    max_concurrency: int


class CreditUsage(BaseModel):
    """Remaining credits for the team/API key."""

    remaining_credits: int
    plan_credits: Optional[int] = None
    billing_period_start: Optional[str] = None
    billing_period_end: Optional[str] = None


class TokenUsage(BaseModel):
    """Recent token usage metrics (if available)."""

    remaining_tokens: int
    plan_tokens: Optional[int] = None
    billing_period_start: Optional[str] = None
    billing_period_end: Optional[str] = None


class QueueStatusRequest(BaseModel):
    """Request to retrieve queue status."""

    pass


class QueueStatusResponse(BaseModel):
    """Metrics about the team's scrape queue."""

    jobs_in_queue: int
    active_jobs_in_queue: int
    waiting_jobs_in_queue: int
    max_concurrency: int
    most_recent_success: Optional[datetime] = None


class CreditUsageHistoricalPeriod(BaseModel):
    startDate: Optional[str] = None
    endDate: Optional[str] = None
    apiKey: Optional[str] = None
    creditsUsed: int


class CreditUsageHistoricalResponse(BaseModel):
    success: bool
    periods: List[CreditUsageHistoricalPeriod]


class TokenUsageHistoricalPeriod(BaseModel):
    startDate: Optional[str] = None
    endDate: Optional[str] = None
    apiKey: Optional[str] = None
    tokensUsed: int


class TokenUsageHistoricalResponse(BaseModel):
    success: bool
    periods: List[TokenUsageHistoricalPeriod]


# Action types
class WaitAction(BaseModel):
    """Wait action to perform during scraping."""

    type: Literal["wait"] = "wait"
    milliseconds: Optional[int] = None
    selector: Optional[str] = None


class ScreenshotAction(BaseModel):
    """Screenshot action to perform during scraping."""

    type: Literal["screenshot"] = "screenshot"
    full_page: Optional[bool] = None
    quality: Optional[int] = None
    viewport: Optional[Union[Dict[str, int], Viewport]] = None


class ClickAction(BaseModel):
    """Click action to perform during scraping."""

    type: Literal["click"] = "click"
    selector: str


class WriteAction(BaseModel):
    """Write action to perform during scraping."""

    type: Literal["write"] = "write"
    text: str


class PressAction(BaseModel):
    """Press action to perform during scraping."""

    type: Literal["press"] = "press"
    key: str


class ScrollAction(BaseModel):
    """Scroll action to perform during scraping."""

    type: Literal["scroll"] = "scroll"
    direction: Literal["up", "down"]
    selector: Optional[str] = None


class ScrapeAction(BaseModel):
    """Scrape action to perform during scraping."""

    type: Literal["scrape"] = "scrape"


class ExecuteJavascriptAction(BaseModel):
    """Execute javascript action to perform during scraping."""

    type: Literal["executeJavascript"] = "executeJavascript"
    script: str


class PDFAction(BaseModel):
    """PDF action to perform during scraping."""

    type: Literal["pdf"] = "pdf"
    format: Optional[
        Literal[
            "A0",
            "A1",
            "A2",
            "A3",
            "A4",
            "A5",
            "A6",
            "Letter",
            "Legal",
            "Tabloid",
            "Ledger",
        ]
    ] = None
    landscape: Optional[bool] = None
    scale: Optional[float] = None


class PDFParser(BaseModel):
    """PDF parser configuration with optional page limit."""

    type: Literal["pdf"] = "pdf"
    max_pages: Optional[int] = None


# Location types
class Location(BaseModel):
    """Location configuration for scraping."""

    country: Optional[str] = None
    languages: Optional[List[str]] = None


class SearchRequest(BaseModel):
    """Request for search operations."""

    query: str
    sources: Optional[List[SourceOption]] = None
    categories: Optional[List[CategoryOption]] = None
    limit: Optional[int] = 5
    tbs: Optional[str] = None
    location: Optional[str] = None
    ignore_invalid_urls: Optional[bool] = None
    timeout: Optional[int] = 300000
    scrape_options: Optional[ScrapeOptions] = None
    integration: Optional[str] = None

    @field_validator("sources")
    @classmethod
    def validate_sources(cls, v):
        """Validate and normalize sources input."""
        if v is None:
            return v

        normalized_sources = []
        for source in v:
            if isinstance(source, str):
                normalized_sources.append(Source(type=source))
            elif isinstance(source, dict):
                normalized_sources.append(Source(**source))
            elif isinstance(source, Source):
                normalized_sources.append(source)
            else:
                raise ValueError(f"Invalid source format: {source}")

        return normalized_sources

    @field_validator("categories")
    @classmethod
    def validate_categories(cls, v):
        """Validate and normalize categories input."""
        if v is None:
            return v

        normalized_categories = []
        for category in v:
            if isinstance(category, str):
                normalized_categories.append(Category(type=category))
            elif isinstance(category, dict):
                normalized_categories.append(Category(**category))
            elif isinstance(category, Category):
                normalized_categories.append(category)
            else:
                raise ValueError(f"Invalid category format: {category}")

        return normalized_categories

    # NOTE: parsers validation does not belong on SearchRequest; it is part of ScrapeOptions.


class LinkResult(BaseModel):
    """A generic link result with optional metadata (used by search and map)."""

    url: str
    title: Optional[str] = None
    description: Optional[str] = None


# Backward-compatible alias for existing tests/usages
SearchResult = LinkResult


class SearchData(BaseModel):
    """Search results grouped by source type."""

    web: Optional[List[Union[SearchResultWeb, Document]]] = None
    news: Optional[List[Union[SearchResultNews, Document]]] = None
    images: Optional[List[Union[SearchResultImages, Document]]] = None


class SearchResponse(BaseResponse[SearchData]):
    """Response from search operation."""

    pass


# Error types
class ErrorDetails(BaseModel):
    """Detailed error information."""

    code: Optional[str] = None
    message: str
    details: Optional[Dict[str, Any]] = None


class ErrorResponse(BaseModel):
    """Error response structure."""

    success: bool = False
    error: str
    details: Optional[ErrorDetails] = None


# Job management types
class JobStatus(BaseModel):
    """Generic job status information."""

    id: str
    status: Literal["pending", "scraping", "completed", "failed"]
    current: Optional[int] = None
    total: Optional[int] = None
    created_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    expires_at: Optional[datetime] = None


class CrawlError(BaseModel):
    """A crawl error."""

    id: str
    timestamp: Optional[datetime] = None
    url: str
    code: Optional[str] = None
    error: str


class CrawlErrorsResponse(BaseModel):
    """Response from crawl error monitoring."""

    errors: List[CrawlError]
    robots_blocked: List[str]


class CrawlErrorsRequest(BaseModel):
    """Request for crawl error monitoring."""

    crawl_id: str


class ActiveCrawl(BaseModel):
    """Information about an active crawl job."""

    id: str
    team_id: str
    url: str
    options: Optional[Dict[str, Any]] = None


class ActiveCrawlsResponse(BaseModel):
    """Response from active crawls endpoint."""

    success: bool = True
    crawls: List[ActiveCrawl]


class ActiveCrawlsRequest(BaseModel):
    """Request for listing active crawl jobs."""

    pass


# Configuration types
class ClientConfig(BaseModel):
    """Configuration for the Firecrawl client."""

    api_key: Optional[str] = None
    api_url: str = "https://api.firecrawl.dev"
    timeout: Optional[float] = None
    max_retries: int = 3
    backoff_factor: float = 0.5


class PaginationConfig(BaseModel):
    """Configuration for pagination behavior."""

    auto_paginate: bool = True
    max_pages: Optional[int] = Field(default=None, ge=0)
    max_results: Optional[int] = Field(default=None, ge=0)
    max_wait_time: Optional[int] = Field(default=None, ge=0)  # seconds


# Response union types
AnyResponse = Union[
    ScrapeResponse,
    CrawlResponse,
    BatchScrapeResponse,
    MapResponse,
    SearchResponse,
    ErrorResponse,
]
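
For orientation, a minimal usage sketch against the types above. The import path firecrawl.v2.types follows from the file path in this diff; the example values and the expected results noted in the comments are illustrative assumptions, not part of the released package contents.

# Minimal sketch; values are illustrative only.
from firecrawl.v2.types import Document, DocumentMetadata, Format, ScrapeFormats, ScrapeOptions

# String entries are normalized to Format models; dict entries are preserved as-is
# so custom keys like "schema" are not dropped by the formats validator.
formats = ScrapeFormats(formats=["markdown", {"type": "json", "schema": {"type": "object"}}])
print(formats.formats)  # e.g. [Format(type='markdown'), {'type': 'json', 'schema': {...}}]

# ScrapeOptions accepts either a ScrapeFormats instance or a plain list of format options.
options = ScrapeOptions(formats=["markdown", "links"], only_main_content=True, timeout=30000)

# DocumentMetadata allows unknown keys (extra="allow") and coerces string status codes to int;
# Document.metadata_dict merges declared fields and extras, dropping None values.
doc = Document(
    markdown="# Hello",
    metadata=DocumentMetadata(title="Example", status_code="200", custom_tag="x"),
)
assert doc.metadata_typed.status_code == 200
assert doc.metadata_typed.extras == {"custom_tag": "x"}
assert doc.metadata_dict["title"] == "Example"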