firecrawl-4.12.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. firecrawl/__init__.py +87 -0
  2. firecrawl/__tests__/e2e/v2/aio/conftest.py +62 -0
  3. firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +69 -0
  4. firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
  5. firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +39 -0
  6. firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +41 -0
  7. firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +138 -0
  8. firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +249 -0
  9. firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +42 -0
  10. firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
  11. firecrawl/__tests__/e2e/v2/conftest.py +73 -0
  12. firecrawl/__tests__/e2e/v2/test_async.py +73 -0
  13. firecrawl/__tests__/e2e/v2/test_batch_scrape.py +106 -0
  14. firecrawl/__tests__/e2e/v2/test_crawl.py +278 -0
  15. firecrawl/__tests__/e2e/v2/test_extract.py +55 -0
  16. firecrawl/__tests__/e2e/v2/test_map.py +61 -0
  17. firecrawl/__tests__/e2e/v2/test_scrape.py +191 -0
  18. firecrawl/__tests__/e2e/v2/test_search.py +270 -0
  19. firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
  20. firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
  21. firecrawl/__tests__/unit/test_recursive_schema_v1.py +1209 -0
  22. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
  23. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +79 -0
  24. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
  25. firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +20 -0
  26. firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
  27. firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +64 -0
  28. firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
  29. firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
  30. firecrawl/__tests__/unit/v2/methods/test_agent.py +367 -0
  31. firecrawl/__tests__/unit/v2/methods/test_agent_request_preparation.py +226 -0
  32. firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
  33. firecrawl/__tests__/unit/v2/methods/test_branding.py +214 -0
  34. firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
  35. firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
  36. firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
  37. firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +54 -0
  38. firecrawl/__tests__/unit/v2/methods/test_pagination.py +671 -0
  39. firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +109 -0
  40. firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +169 -0
  41. firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
  42. firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
  43. firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
  44. firecrawl/__tests__/unit/v2/utils/test_metadata_extras.py +94 -0
  45. firecrawl/__tests__/unit/v2/utils/test_metadata_extras_multivalue.py +22 -0
  46. firecrawl/__tests__/unit/v2/utils/test_recursive_schema.py +1133 -0
  47. firecrawl/__tests__/unit/v2/utils/test_validation.py +311 -0
  48. firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
  49. firecrawl/client.py +281 -0
  50. firecrawl/firecrawl.backup.py +4635 -0
  51. firecrawl/types.py +167 -0
  52. firecrawl/v1/__init__.py +14 -0
  53. firecrawl/v1/client.py +5164 -0
  54. firecrawl/v2/__init__.py +4 -0
  55. firecrawl/v2/client.py +967 -0
  56. firecrawl/v2/client_async.py +408 -0
  57. firecrawl/v2/methods/agent.py +144 -0
  58. firecrawl/v2/methods/aio/__init__.py +1 -0
  59. firecrawl/v2/methods/aio/agent.py +137 -0
  60. firecrawl/v2/methods/aio/batch.py +188 -0
  61. firecrawl/v2/methods/aio/crawl.py +351 -0
  62. firecrawl/v2/methods/aio/extract.py +133 -0
  63. firecrawl/v2/methods/aio/map.py +65 -0
  64. firecrawl/v2/methods/aio/scrape.py +33 -0
  65. firecrawl/v2/methods/aio/search.py +176 -0
  66. firecrawl/v2/methods/aio/usage.py +89 -0
  67. firecrawl/v2/methods/batch.py +499 -0
  68. firecrawl/v2/methods/crawl.py +592 -0
  69. firecrawl/v2/methods/extract.py +161 -0
  70. firecrawl/v2/methods/map.py +83 -0
  71. firecrawl/v2/methods/scrape.py +64 -0
  72. firecrawl/v2/methods/search.py +215 -0
  73. firecrawl/v2/methods/usage.py +84 -0
  74. firecrawl/v2/types.py +1143 -0
  75. firecrawl/v2/utils/__init__.py +9 -0
  76. firecrawl/v2/utils/error_handler.py +107 -0
  77. firecrawl/v2/utils/get_version.py +15 -0
  78. firecrawl/v2/utils/http_client.py +178 -0
  79. firecrawl/v2/utils/http_client_async.py +69 -0
  80. firecrawl/v2/utils/normalize.py +125 -0
  81. firecrawl/v2/utils/validation.py +692 -0
  82. firecrawl/v2/watcher.py +301 -0
  83. firecrawl/v2/watcher_async.py +243 -0
  84. firecrawl-4.12.0.dist-info/METADATA +234 -0
  85. firecrawl-4.12.0.dist-info/RECORD +92 -0
  86. firecrawl-4.12.0.dist-info/WHEEL +5 -0
  87. firecrawl-4.12.0.dist-info/licenses/LICENSE +21 -0
  88. firecrawl-4.12.0.dist-info/top_level.txt +2 -0
  89. tests/test_agent_integration.py +277 -0
  90. tests/test_api_key_handling.py +44 -0
  91. tests/test_change_tracking.py +98 -0
  92. tests/test_timeout_conversion.py +117 -0
firecrawl/v2/types.py ADDED
@@ -0,0 +1,1143 @@
"""
Type definitions for Firecrawl v2 API.

This module contains clean, modern type definitions for the v2 API.
"""

import warnings
from datetime import datetime
from typing import Any, Dict, Generic, List, Literal, Optional, TypeVar, Union
import logging
from pydantic import (
    BaseModel,
    Field,
    field_validator,
    ValidationError,
    model_serializer,
    model_validator,
)

# Suppress pydantic warnings about schema field shadowing
# Tested using schema_field alias="schema" but it doesn't work.
warnings.filterwarnings(
    "ignore",
    message='Field name "schema" in "Format" shadows an attribute in parent "BaseModel"',
)
warnings.filterwarnings(
    "ignore",
    message='Field name "schema" in "JsonFormat" shadows an attribute in parent "Format"',
)
warnings.filterwarnings(
    "ignore",
    message='Field name "schema" in "ChangeTrackingFormat" shadows an attribute in parent "Format"',
)
warnings.filterwarnings(
    "ignore",
    message='Field name "json" in "ScrapeFormats" shadows an attribute in parent "BaseModel"',
)
warnings.filterwarnings(
    "ignore",
    message='Field name "json" in "Document" shadows an attribute in parent "BaseModel"',
)

T = TypeVar("T")

# Module logger
logger = logging.getLogger("firecrawl")


# Base response types
class BaseResponse(BaseModel, Generic[T]):
    """Base response structure for all API responses."""

    success: bool
    data: Optional[T] = None
    error: Optional[str] = None
    warning: Optional[str] = None


# Document and content types
class DocumentMetadata(BaseModel):
    """Metadata for scraped documents (snake_case only; API camelCase normalized in code)."""

    model_config = {"extra": "allow"}

    @model_serializer(mode="wrap")
    def _serialize(self, handler):
        out = handler(self)
        extra = getattr(self, "__pydantic_extra__", None)
        if isinstance(extra, dict):
            for k, v in extra.items():
                if v is not None:
                    out[k] = v
        return out

    # Common metadata fields
    title: Optional[str] = None
    description: Optional[str] = None
    url: Optional[str] = None
    language: Optional[str] = None
    keywords: Optional[Union[str, List[str]]] = None
    robots: Optional[str] = None

    # OpenGraph and social metadata
    og_title: Optional[str] = None
    og_description: Optional[str] = None
    og_url: Optional[str] = None
    og_image: Optional[str] = None
    og_audio: Optional[str] = None
    og_determiner: Optional[str] = None
    og_locale: Optional[str] = None
    og_locale_alternate: Optional[List[str]] = None
    og_site_name: Optional[str] = None
    og_video: Optional[str] = None

    # Dublin Core and other site metadata
    favicon: Optional[str] = None
    dc_terms_created: Optional[str] = None
    dc_date_created: Optional[str] = None
    dc_date: Optional[str] = None
    dc_terms_type: Optional[str] = None
    dc_type: Optional[str] = None
    dc_terms_audience: Optional[str] = None
    dc_terms_subject: Optional[str] = None
    dc_subject: Optional[str] = None
    dc_description: Optional[str] = None
    dc_terms_keywords: Optional[str] = None

    modified_time: Optional[str] = None
    published_time: Optional[str] = None
    article_tag: Optional[str] = None
    article_section: Optional[str] = None

    # Response-level metadata
    source_url: Optional[str] = None
    status_code: Optional[int] = None
    scrape_id: Optional[str] = None
    num_pages: Optional[int] = None
    content_type: Optional[str] = None
    proxy_used: Optional[Literal["basic", "stealth"]] = None
    timezone: Optional[str] = None
    cache_state: Optional[Literal["hit", "miss"]] = None
    cached_at: Optional[str] = None
    credits_used: Optional[int] = None
    concurrency_limited: Optional[bool] = None
    concurrency_queue_duration_ms: Optional[int] = None

    # Error information
    error: Optional[str] = None

    @property
    def extras(self) -> Dict[str, Any]:
        """Return unknown metadata keys preserved on the model."""
        extra = getattr(self, "__pydantic_extra__", None)
        return dict(extra) if isinstance(extra, dict) else {}

    @staticmethod
    def _coerce_list_to_string(value: Any) -> Any:
        if isinstance(value, list):
            # Prefer first string if semantically a single-valued field, else join
            if len(value) == 1:
                return str(value[0])
            return ", ".join(str(item) for item in value)
        return value

    @staticmethod
    def _coerce_string_to_int(value: Any) -> Any:
        if isinstance(value, str):
            try:
                return int(value)
            except ValueError:
                return value
        return value

    @model_validator(mode="before")
    @classmethod
    def coerce_lists_for_string_fields(cls, data):
        """Before validation: coerce lists to strings for known single-string fields.
        Preserves unknown-key lists.
        """
        if not isinstance(data, dict):
            return data
        single_str_fields = {
            "title",
            "description",
            "url",
            "language",
            "robots",
            "og_title",
            "og_description",
            "og_url",
            "og_image",
            "og_audio",
            "og_determiner",
            "og_locale",
            "og_site_name",
            "og_video",
            "favicon",
            "dc_terms_created",
            "dc_date_created",
            "dc_date",
            "dc_terms_type",
            "dc_type",
            "dc_terms_audience",
            "dc_terms_subject",
            "dc_subject",
            "dc_description",
            "dc_terms_keywords",
            "modified_time",
            "published_time",
            "article_tag",
            "article_section",
            "source_url",
            "scrape_id",
            "content_type",
            "cached_at",
            "error",
            "timezone",
        }
        for k, v in list(data.items()):
            if isinstance(v, list) and k in single_str_fields:
                data[k] = cls._coerce_list_to_string(v)
            # For ints that might appear as list, take first
            if isinstance(v, list) and k in {
                "status_code",
                "num_pages",
                "credits_used",
            }:
                first = v[0] if v else None
                data[k] = cls._coerce_string_to_int(first)
        return data

    @field_validator(
        "robots",
        "og_title",
        "og_description",
        "og_url",
        "og_image",
        "language",
        mode="before",
    )
    @classmethod
    def coerce_lists_to_string_fields(cls, v):
        return cls._coerce_list_to_string(v)

    @field_validator("status_code", mode="before")
    @classmethod
    def coerce_status_code_to_int(cls, v):
        return cls._coerce_string_to_int(v)

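# Illustrative sketch (not part of the published module): how DocumentMetadata
# normalizes raw metadata. Known single-string fields supplied as lists are joined,
# numeric strings are coerced to ints, and unknown keys are preserved and exposed
# through the .extras property. The input values below are made up.
example_metadata = DocumentMetadata.model_validate(
    {
        "title": ["Home", "Firecrawl"],  # joined to "Home, Firecrawl"
        "status_code": ["200"],          # first element coerced to 200
        "custom_tag": "anything",        # unknown key, kept as an extra
    }
)
assert example_metadata.title == "Home, Firecrawl"
assert example_metadata.status_code == 200
assert example_metadata.extras == {"custom_tag": "anything"}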

class AgentOptions(BaseModel):
    """Configuration for the agent in extract operations."""

    model: Literal["FIRE-1", "v3-beta"] = "FIRE-1"


class AttributeResult(BaseModel):
    """Result of attribute extraction."""

    selector: str
    attribute: str
    values: List[str]


class BrandingProfile(BaseModel):
    """Branding information extracted from a website."""

    model_config = {"extra": "allow"}

    color_scheme: Optional[Literal["light", "dark"]] = None
    logo: Optional[str] = None
    fonts: Optional[List[Dict[str, Any]]] = None
    colors: Optional[Dict[str, str]] = None
    typography: Optional[Dict[str, Any]] = None
    spacing: Optional[Dict[str, Any]] = None
    components: Optional[Dict[str, Any]] = None
    icons: Optional[Dict[str, str]] = None
    images: Optional[Dict[str, Optional[str]]] = None
    animations: Optional[Dict[str, str]] = None
    layout: Optional[Dict[str, Any]] = None
    tone: Optional[Dict[str, str]] = None
    personality: Optional[Dict[str, Any]] = None


class Document(BaseModel):
    """A scraped document."""

    markdown: Optional[str] = None
    html: Optional[str] = None
    raw_html: Optional[str] = None
    json: Optional[Any] = None
    summary: Optional[str] = None
    metadata: Optional[DocumentMetadata] = None
    links: Optional[List[str]] = None
    images: Optional[List[str]] = None
    screenshot: Optional[str] = None
    actions: Optional[Dict[str, Any]] = None
    warning: Optional[str] = None
    change_tracking: Optional[Dict[str, Any]] = None
    branding: Optional[BrandingProfile] = None

    @property
    def metadata_typed(self) -> DocumentMetadata:
        """Always returns a DocumentMetadata instance for LSP-friendly access."""
        md = self.metadata
        if isinstance(md, DocumentMetadata):
            return md
        if isinstance(md, dict):
            try:
                return DocumentMetadata.model_validate(md)
            except (ValidationError, TypeError) as exc:
                logger.debug("Failed to construct DocumentMetadata from dict: %s", exc)
        return DocumentMetadata()

    @property
    def metadata_dict(self) -> Dict[str, Any]:
        """Returns metadata as a plain dict (exclude None), including extras."""
        md = self.metadata
        if isinstance(md, DocumentMetadata):
            out = md.model_dump(exclude_none=True)
            # Ensure extras are preserved even if model_dump omits them
            extra = getattr(md, "__pydantic_extra__", None)
            if isinstance(extra, dict):
                for k, v in extra.items():
                    if v is not None:
                        out[k] = v
            return out
        if isinstance(md, dict):
            return {k: v for k, v in md.items() if v is not None}
        return {}

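# Illustrative sketch (not part of the published module): the two metadata accessors
# on Document. metadata_typed always yields a DocumentMetadata instance, even when
# metadata was provided as a plain dict or is missing, while metadata_dict returns a
# None-free dict that keeps unknown (extra) keys. The values below are made up.
example_document = Document(
    markdown="# Hello",
    metadata={"title": "Hello", "status_code": 200, "custom_tag": "x"},
)
assert example_document.metadata_typed.title == "Hello"
assert example_document.metadata_dict["status_code"] == 200
assert example_document.metadata_dict["custom_tag"] == "x"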
# Webhook types
class WebhookConfig(BaseModel):
    """Configuration for webhooks."""

    url: str
    headers: Optional[Dict[str, str]] = None
    metadata: Optional[Dict[str, str]] = None
    events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None


class WebhookData(BaseModel):
    """Data sent to webhooks."""

    job_id: str
    status: str
    current: Optional[int] = None
    total: Optional[int] = None
    data: Optional[List[Document]] = None
    error: Optional[str] = None


class Source(BaseModel):
    """Configuration for a search source."""

    type: str


SourceOption = Union[str, Source]


class Category(BaseModel):
    """Configuration for a search category.

    Supported categories:
    - "github": Filter results to GitHub repositories
    - "research": Filter results to research papers and academic sites
    - "pdf": Filter results to PDF files (adds filetype:pdf to search)
    """

    type: str


CategoryOption = Union[str, Category]

FormatString = Literal[
    # camelCase versions (API format)
    "markdown",
    "html",
    "rawHtml",
    "links",
    "images",
    "screenshot",
    "summary",
    "changeTracking",
    "json",
    "attributes",
    "branding",
    # snake_case versions (user-friendly)
    "raw_html",
    "change_tracking",
]


class Viewport(BaseModel):
    """Viewport configuration for screenshots."""

    width: int
    height: int


class Format(BaseModel):
    """Configuration for a format."""

    type: FormatString


class JsonFormat(Format):
    """Configuration for JSON extraction."""

    prompt: Optional[str] = None
    schema: Optional[Any] = None


class ChangeTrackingFormat(Format):
    """Configuration for change tracking."""

    modes: List[Literal["git-diff", "json"]]
    schema: Optional[Dict[str, Any]] = None
    prompt: Optional[str] = None
    tag: Optional[str] = None


class ScreenshotFormat(BaseModel):
    """Configuration for screenshot format."""

    type: Literal["screenshot"] = "screenshot"
    full_page: Optional[bool] = None
    quality: Optional[int] = None
    viewport: Optional[Union[Dict[str, int], Viewport]] = None


class AttributeSelector(BaseModel):
    """Selector and attribute pair for attribute extraction."""

    selector: str
    attribute: str


class AttributesFormat(Format):
    """Configuration for attribute extraction."""

    type: Literal["attributes"] = "attributes"
    selectors: List[AttributeSelector]


FormatOption = Union[
    Dict[str, Any],
    FormatString,
    JsonFormat,
    ChangeTrackingFormat,
    ScreenshotFormat,
    AttributesFormat,
    Format,
]

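# Illustrative sketch (not part of the published module): format options can be plain
# FormatString literals, dicts, or the typed classes above. The prompt and schema
# below are made-up values.
example_json_format = JsonFormat(
    type="json",
    prompt="Extract the page title",
    schema={"type": "object", "properties": {"title": {"type": "string"}}},
)
example_screenshot_format = ScreenshotFormat(
    full_page=True,
    viewport=Viewport(width=1280, height=720),
)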
# Scrape types
class ScrapeFormats(BaseModel):
    """Output formats for scraping."""

    formats: Optional[List[FormatOption]] = None
    markdown: bool = True
    html: bool = False
    raw_html: bool = False
    summary: bool = False
    links: bool = False
    images: bool = False
    screenshot: bool = False
    change_tracking: bool = False
    json: bool = False

    @field_validator("formats")
    @classmethod
    def validate_formats(cls, v):
        """Validate and normalize formats input."""
        if v is None:
            return v

        normalized_formats = []
        for format_item in v:
            if isinstance(format_item, str):
                normalized_formats.append(Format(type=format_item))
            elif isinstance(format_item, dict):
                # Preserve dicts as-is to avoid dropping custom fields like 'schema'
                normalized_formats.append(format_item)
            elif isinstance(format_item, Format):
                normalized_formats.append(format_item)
            else:
                raise ValueError(f"Invalid format option: {format_item}")

        return normalized_formats


class ScrapeOptions(BaseModel):
    """Options for scraping operations."""

    formats: Optional[Union["ScrapeFormats", List[FormatOption]]] = None
    headers: Optional[Dict[str, str]] = None
    include_tags: Optional[List[str]] = None
    exclude_tags: Optional[List[str]] = None
    only_main_content: Optional[bool] = None
    timeout: Optional[int] = None
    wait_for: Optional[int] = None
    mobile: Optional[bool] = None
    parsers: Optional[Union[List[str], List[Union[str, "PDFParser"]]]] = None
    actions: Optional[
        List[
            Union[
                "WaitAction",
                "ScreenshotAction",
                "ClickAction",
                "WriteAction",
                "PressAction",
                "ScrollAction",
                "ScrapeAction",
                "ExecuteJavascriptAction",
                "PDFAction",
            ]
        ]
    ] = None
    location: Optional["Location"] = None
    skip_tls_verification: Optional[bool] = None
    remove_base64_images: Optional[bool] = None
    fast_mode: Optional[bool] = None
    use_mock: Optional[str] = None
    block_ads: Optional[bool] = None
    proxy: Optional[Literal["basic", "stealth", "auto"]] = None
    max_age: Optional[int] = None
    min_age: Optional[int] = None
    store_in_cache: Optional[bool] = None
    integration: Optional[str] = None

    @field_validator("formats")
    @classmethod
    def validate_formats(cls, v):
        """Validate and normalize formats input."""
        if v is None:
            return v
        if isinstance(v, ScrapeFormats):
            return v
        if isinstance(v, list):
            return v
        raise ValueError(
            f"Invalid formats type: {type(v)}. Expected ScrapeFormats or List[FormatOption]"
        )


class ScrapeRequest(BaseModel):
    """Request for scraping a single URL."""

    url: str
    options: Optional[ScrapeOptions] = None


class ScrapeData(Document):
    """Scrape results data."""

    pass


class ScrapeResponse(BaseResponse[ScrapeData]):
    """Response for scrape operations."""

    pass

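# Illustrative sketch (not part of the published module): ScrapeFormats normalizes its
# entries; bare strings become Format objects, while dicts pass through untouched so
# keys such as "schema" are not dropped. The values below are made up.
example_scrape_formats = ScrapeFormats(
    formats=["markdown", {"type": "json", "schema": {"type": "object"}}]
)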
# Crawl types
class CrawlRequest(BaseModel):
    """Request for crawling a website."""

    url: str
    prompt: Optional[str] = None
    exclude_paths: Optional[List[str]] = None
    include_paths: Optional[List[str]] = None
    max_discovery_depth: Optional[int] = None
    sitemap: Literal["skip", "include"] = "include"
    ignore_query_parameters: bool = False
    limit: Optional[int] = None
    crawl_entire_domain: bool = False
    allow_external_links: bool = False
    allow_subdomains: bool = False
    delay: Optional[int] = None
    max_concurrency: Optional[int] = None
    webhook: Optional[Union[str, WebhookConfig]] = None
    scrape_options: Optional[ScrapeOptions] = None
    zero_data_retention: bool = False
    integration: Optional[str] = None


class CrawlResponse(BaseModel):
    """Information about a crawl job."""

    id: str
    url: str


class CrawlJob(BaseModel):
    """Crawl job status and progress data."""

    status: Literal["scraping", "completed", "failed", "cancelled"]
    total: int = 0
    completed: int = 0
    credits_used: int = 0
    expires_at: Optional[datetime] = None
    next: Optional[str] = None
    data: List[Document] = []


class CrawlStatusRequest(BaseModel):
    """Request to get crawl job status."""

    job_id: str

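# Illustrative sketch (not part of the published module): a CrawlJob parsed from a
# made-up status payload; counters not present in the payload fall back to the
# defaults declared above (credits_used=0, data=[]).
example_crawl_job = CrawlJob.model_validate(
    {"status": "scraping", "total": 10, "completed": 3}
)
assert example_crawl_job.credits_used == 0
assert example_crawl_job.data == []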
class SearchResultWeb(BaseModel):
    """A web search result with URL, title, and description."""

    url: str
    title: Optional[str] = None
    description: Optional[str] = None
    category: Optional[str] = None


class SearchResultNews(BaseModel):
    """A news search result with URL, title, snippet, date, image URL, and position."""

    title: Optional[str] = None
    url: Optional[str] = None
    snippet: Optional[str] = None
    date: Optional[str] = None
    image_url: Optional[str] = None
    position: Optional[int] = None
    category: Optional[str] = None


class SearchResultImages(BaseModel):
    """An image search result with URL, title, image URL, image width, image height, and position."""

    title: Optional[str] = None
    image_url: Optional[str] = None
    image_width: Optional[int] = None
    image_height: Optional[int] = None
    url: Optional[str] = None
    position: Optional[int] = None


class MapDocument(Document):
    """A document from a map operation with URL and description."""

    url: str
    description: Optional[str] = None


# Crawl params types
class CrawlParamsRequest(BaseModel):
    """Request for getting crawl parameters from LLM."""

    url: str
    prompt: str


class CrawlParamsData(BaseModel):
    """Data returned from crawl params endpoint."""

    include_paths: Optional[List[str]] = None
    exclude_paths: Optional[List[str]] = None
    max_discovery_depth: Optional[int] = None
    ignore_sitemap: bool = False
    ignore_query_parameters: bool = False
    limit: Optional[int] = None
    crawl_entire_domain: bool = False
    allow_external_links: bool = False
    allow_subdomains: bool = False
    delay: Optional[int] = None
    max_concurrency: Optional[int] = None
    webhook: Optional[Union[str, WebhookConfig]] = None
    scrape_options: Optional[ScrapeOptions] = None
    zero_data_retention: bool = False
    warning: Optional[str] = None
    integration: Optional[str] = None


class CrawlParamsResponse(BaseResponse[CrawlParamsData]):
    """Response from crawl params endpoint."""

    pass


# Batch scrape types
class BatchScrapeRequest(BaseModel):
    """Request for batch scraping multiple URLs (internal helper only)."""

    urls: List[str]
    options: Optional[ScrapeOptions] = None
    webhook: Optional[Union[str, WebhookConfig]] = None
    append_to_id: Optional[str] = None
    ignore_invalid_urls: Optional[bool] = None
    max_concurrency: Optional[int] = None
    zero_data_retention: Optional[bool] = None
    integration: Optional[str] = None


class BatchScrapeResponse(BaseModel):
    """Response from starting a batch scrape job (mirrors CrawlResponse naming)."""

    id: str
    url: str
    invalid_urls: Optional[List[str]] = None


class BatchScrapeJob(BaseModel):
    """Batch scrape job status and results."""

    status: Literal["scraping", "completed", "failed", "cancelled"]
    completed: int
    total: int
    credits_used: Optional[int] = None
    expires_at: Optional[datetime] = None
    next: Optional[str] = None
    data: List[Document] = []


class BatchScrapeStatusRequest(BaseModel):
    """Request to get batch scrape job status."""

    job_id: str


class BatchScrapeErrorsRequest(BaseModel):
    """Request to get errors for a batch scrape job."""

    job_id: str


# Map types
class MapOptions(BaseModel):
    """Options for mapping operations."""

    search: Optional[str] = None
    sitemap: Literal["only", "include", "skip"] = "include"
    include_subdomains: Optional[bool] = None
    ignore_query_parameters: Optional[bool] = None
    limit: Optional[int] = None
    timeout: Optional[int] = None
    integration: Optional[str] = None
    location: Optional["Location"] = None


class MapRequest(BaseModel):
    """Request for mapping a website."""

    url: str
    options: Optional[MapOptions] = None


class MapData(BaseModel):
    """Map results data."""

    links: List["SearchResult"]


class MapResponse(BaseResponse[MapData]):
    """Response for map operations."""

    pass


# Extract types
class ExtractRequest(BaseModel):
    """Request for extract operations."""

    urls: Optional[List[str]] = None
    prompt: Optional[str] = None
    schema_: Optional[Dict[str, Any]] = Field(default=None, alias="schema")
    system_prompt: Optional[str] = None
    allow_external_links: Optional[bool] = None
    enable_web_search: Optional[bool] = None
    show_sources: Optional[bool] = None
    scrape_options: Optional[ScrapeOptions] = None
    ignore_invalid_urls: Optional[bool] = None
    integration: Optional[str] = None
    agent: Optional[AgentOptions] = None


class ExtractResponse(BaseModel):
    """Response for extract operations (start/status/final)."""

    success: Optional[bool] = None
    id: Optional[str] = None
    status: Optional[Literal["processing", "completed", "failed", "cancelled"]] = None
    data: Optional[Any] = None
    error: Optional[str] = None
    warning: Optional[str] = None
    sources: Optional[Dict[str, Any]] = None
    expires_at: Optional[datetime] = None
    credits_used: Optional[int] = None
    tokens_used: Optional[int] = None


class AgentResponse(BaseModel):
    """Response for agent operations (start/status/final)."""

    success: Optional[bool] = None
    id: Optional[str] = None
    status: Optional[Literal["processing", "completed", "failed"]] = None
    data: Optional[Any] = None
    error: Optional[str] = None
    expires_at: Optional[datetime] = None
    credits_used: Optional[int] = None

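# Illustrative sketch (not part of the published module): ExtractResponse is used for
# the start, status, and final payloads of an extract job, which is why most fields
# are optional. The payload below is made up.
example_extract_response = ExtractResponse.model_validate(
    {"success": True, "id": "extract-123", "status": "completed", "data": {"title": "Hello"}}
)
assert example_extract_response.status == "completed"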
# Usage/limits types
class ConcurrencyCheck(BaseModel):
    """Current concurrency and limits for the team/API key."""

    concurrency: int
    max_concurrency: int


class CreditUsage(BaseModel):
    """Remaining credits for the team/API key."""

    remaining_credits: int
    plan_credits: Optional[int] = None
    billing_period_start: Optional[str] = None
    billing_period_end: Optional[str] = None


class TokenUsage(BaseModel):
    """Recent token usage metrics (if available)."""

    remaining_tokens: int
    plan_tokens: Optional[int] = None
    billing_period_start: Optional[str] = None
    billing_period_end: Optional[str] = None


class QueueStatusRequest(BaseModel):
    """Request to retrieve queue status."""

    pass


class QueueStatusResponse(BaseModel):
    """Metrics about the team's scrape queue."""

    jobs_in_queue: int
    active_jobs_in_queue: int
    waiting_jobs_in_queue: int
    max_concurrency: int
    most_recent_success: Optional[datetime] = None


class CreditUsageHistoricalPeriod(BaseModel):
    startDate: Optional[str] = None
    endDate: Optional[str] = None
    apiKey: Optional[str] = None
    creditsUsed: int


class CreditUsageHistoricalResponse(BaseModel):
    success: bool
    periods: List[CreditUsageHistoricalPeriod]


class TokenUsageHistoricalPeriod(BaseModel):
    startDate: Optional[str] = None
    endDate: Optional[str] = None
    apiKey: Optional[str] = None
    tokensUsed: int


class TokenUsageHistoricalResponse(BaseModel):
    success: bool
    periods: List[TokenUsageHistoricalPeriod]


# Action types
class WaitAction(BaseModel):
    """Wait action to perform during scraping."""

    type: Literal["wait"] = "wait"
    milliseconds: Optional[int] = None
    selector: Optional[str] = None


class ScreenshotAction(BaseModel):
    """Screenshot action to perform during scraping."""

    type: Literal["screenshot"] = "screenshot"
    full_page: Optional[bool] = None
    quality: Optional[int] = None
    viewport: Optional[Union[Dict[str, int], Viewport]] = None


class ClickAction(BaseModel):
    """Click action to perform during scraping."""

    type: Literal["click"] = "click"
    selector: str


class WriteAction(BaseModel):
    """Write action to perform during scraping."""

    type: Literal["write"] = "write"
    text: str


class PressAction(BaseModel):
    """Press action to perform during scraping."""

    type: Literal["press"] = "press"
    key: str


class ScrollAction(BaseModel):
    """Scroll action to perform during scraping."""

    type: Literal["scroll"] = "scroll"
    direction: Literal["up", "down"]
    selector: Optional[str] = None


class ScrapeAction(BaseModel):
    """Scrape action to perform during scraping."""

    type: Literal["scrape"] = "scrape"


class ExecuteJavascriptAction(BaseModel):
    """Execute javascript action to perform during scraping."""

    type: Literal["executeJavascript"] = "executeJavascript"
    script: str


class PDFAction(BaseModel):
    """PDF action to perform during scraping."""

    type: Literal["pdf"] = "pdf"
    format: Optional[
        Literal[
            "A0",
            "A1",
            "A2",
            "A3",
            "A4",
            "A5",
            "A6",
            "Letter",
            "Legal",
            "Tabloid",
            "Ledger",
        ]
    ] = None
    landscape: Optional[bool] = None
    scale: Optional[float] = None


class PDFParser(BaseModel):
    """PDF parser configuration with optional page limit."""

    type: Literal["pdf"] = "pdf"
    max_pages: Optional[int] = None


# Location types
class Location(BaseModel):
    """Location configuration for scraping."""

    country: Optional[str] = None
    languages: Optional[List[str]] = None


class SearchRequest(BaseModel):
    """Request for search operations."""

    query: str
    sources: Optional[List[SourceOption]] = None
    categories: Optional[List[CategoryOption]] = None
    limit: Optional[int] = 5
    tbs: Optional[str] = None
    location: Optional[str] = None
    ignore_invalid_urls: Optional[bool] = None
    timeout: Optional[int] = 300000
    scrape_options: Optional[ScrapeOptions] = None
    integration: Optional[str] = None

    @field_validator("sources")
    @classmethod
    def validate_sources(cls, v):
        """Validate and normalize sources input."""
        if v is None:
            return v

        normalized_sources = []
        for source in v:
            if isinstance(source, str):
                normalized_sources.append(Source(type=source))
            elif isinstance(source, dict):
                normalized_sources.append(Source(**source))
            elif isinstance(source, Source):
                normalized_sources.append(source)
            else:
                raise ValueError(f"Invalid source format: {source}")

        return normalized_sources

    @field_validator("categories")
    @classmethod
    def validate_categories(cls, v):
        """Validate and normalize categories input."""
        if v is None:
            return v

        normalized_categories = []
        for category in v:
            if isinstance(category, str):
                normalized_categories.append(Category(type=category))
            elif isinstance(category, dict):
                normalized_categories.append(Category(**category))
            elif isinstance(category, Category):
                normalized_categories.append(category)
            else:
                raise ValueError(f"Invalid category format: {category}")

        return normalized_categories

    # NOTE: parsers validation does not belong on SearchRequest; it is part of ScrapeOptions.

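# Illustrative sketch (not part of the published module): SearchRequest accepts sources
# and categories as plain strings, dicts, or typed objects; the validators above
# normalize them to Source/Category instances. The query and values are made up.
example_search_request = SearchRequest(
    query="firecrawl sdk",
    sources=["web", Source(type="news")],
    categories=["github"],
    limit=10,
)
assert all(isinstance(s, Source) for s in example_search_request.sources)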
class LinkResult(BaseModel):
    """A generic link result with optional metadata (used by search and map)."""

    url: str
    title: Optional[str] = None
    description: Optional[str] = None


# Backward-compatible alias for existing tests/usages
SearchResult = LinkResult


class SearchData(BaseModel):
    """Search results grouped by source type."""

    web: Optional[List[Union[SearchResultWeb, Document]]] = None
    news: Optional[List[Union[SearchResultNews, Document]]] = None
    images: Optional[List[Union[SearchResultImages, Document]]] = None


class SearchResponse(BaseResponse[SearchData]):
    """Response from search operation."""

    pass


# Error types
class ErrorDetails(BaseModel):
    """Detailed error information."""

    code: Optional[str] = None
    message: str
    details: Optional[Dict[str, Any]] = None


class ErrorResponse(BaseModel):
    """Error response structure."""

    success: bool = False
    error: str
    details: Optional[ErrorDetails] = None


# Job management types
class JobStatus(BaseModel):
    """Generic job status information."""

    id: str
    status: Literal["pending", "scraping", "completed", "failed"]
    current: Optional[int] = None
    total: Optional[int] = None
    created_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    expires_at: Optional[datetime] = None


class CrawlError(BaseModel):
    """A crawl error."""

    id: str
    timestamp: Optional[datetime] = None
    url: str
    code: Optional[str] = None
    error: str


class CrawlErrorsResponse(BaseModel):
    """Response from crawl error monitoring."""

    errors: List[CrawlError]
    robots_blocked: List[str]


class CrawlErrorsRequest(BaseModel):
    """Request for crawl error monitoring."""

    crawl_id: str


class ActiveCrawl(BaseModel):
    """Information about an active crawl job."""

    id: str
    team_id: str
    url: str
    options: Optional[Dict[str, Any]] = None


class ActiveCrawlsResponse(BaseModel):
    """Response from active crawls endpoint."""

    success: bool = True
    crawls: List[ActiveCrawl]


class ActiveCrawlsRequest(BaseModel):
    """Request for listing active crawl jobs."""

    pass


# Configuration types
class ClientConfig(BaseModel):
    """Configuration for the Firecrawl client."""

    api_key: Optional[str] = None
    api_url: str = "https://api.firecrawl.dev"
    timeout: Optional[float] = None
    max_retries: int = 3
    backoff_factor: float = 0.5


class PaginationConfig(BaseModel):
    """Configuration for pagination behavior."""

    auto_paginate: bool = True
    max_pages: Optional[int] = Field(default=None, ge=0)
    max_results: Optional[int] = Field(default=None, ge=0)
    max_wait_time: Optional[int] = Field(default=None, ge=0)  # seconds


# Response union types
AnyResponse = Union[
    ScrapeResponse,
    CrawlResponse,
    BatchScrapeResponse,
    MapResponse,
    SearchResponse,
    ErrorResponse,
]
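# Illustrative usage sketch (not part of the published module; names and values are
# made up). Request models nest: a CrawlRequest can carry ScrapeOptions, an
# ExtractRequest receives its JSON schema through the "schema" alias, and
# ClientConfig/PaginationConfig hold client-level defaults. Actually sending these
# requests is handled by the client and methods modules listed above.
example_crawl_request = CrawlRequest(
    url="https://example.com",
    limit=25,
    include_paths=["/blog/*"],
    scrape_options=ScrapeOptions(formats=["markdown", "links"], only_main_content=True),
)
example_extract_request = ExtractRequest(
    urls=["https://example.com"],
    prompt="Extract the page title",
    schema={"type": "object", "properties": {"title": {"type": "string"}}},
)
example_client_config = ClientConfig(api_key="your-api-key", timeout=60.0)
example_pagination = PaginationConfig(max_pages=2, max_results=100)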