firecrawl 2.16.5__py3-none-any.whl → 3.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of firecrawl might be problematic.

Files changed (82)
  1. firecrawl/__init__.py +27 -19
  2. firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +79 -0
  3. firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
  4. firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +38 -0
  5. firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +40 -0
  6. firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +137 -0
  7. firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +183 -0
  8. firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +35 -0
  9. firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
  10. firecrawl/__tests__/e2e/v2/conftest.py +73 -0
  11. firecrawl/__tests__/e2e/v2/test_async.py +73 -0
  12. firecrawl/__tests__/e2e/v2/test_batch_scrape.py +105 -0
  13. firecrawl/__tests__/e2e/v2/test_crawl.py +276 -0
  14. firecrawl/__tests__/e2e/v2/test_extract.py +54 -0
  15. firecrawl/__tests__/e2e/v2/test_map.py +60 -0
  16. firecrawl/__tests__/e2e/v2/test_scrape.py +154 -0
  17. firecrawl/__tests__/e2e/v2/test_search.py +265 -0
  18. firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
  19. firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
  20. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
  21. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +61 -0
  22. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
  23. firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +19 -0
  24. firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
  25. firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +63 -0
  26. firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
  27. firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
  28. firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
  29. firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
  30. firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
  31. firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
  32. firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +53 -0
  33. firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +92 -0
  34. firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +167 -0
  35. firecrawl/__tests__/unit/v2/methods/test_search_validation.py +206 -0
  36. firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
  37. firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
  38. firecrawl/__tests__/unit/v2/utils/test_validation.py +290 -0
  39. firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
  40. firecrawl/client.py +241 -0
  41. firecrawl/{firecrawl.py → firecrawl.backup.py} +17 -15
  42. firecrawl/types.py +157 -0
  43. firecrawl/v1/__init__.py +14 -0
  44. firecrawl/v1/client.py +4653 -0
  45. firecrawl/v2/__init__.py +4 -0
  46. firecrawl/v2/client.py +802 -0
  47. firecrawl/v2/client_async.py +250 -0
  48. firecrawl/v2/methods/aio/__init__.py +1 -0
  49. firecrawl/v2/methods/aio/batch.py +85 -0
  50. firecrawl/v2/methods/aio/crawl.py +174 -0
  51. firecrawl/v2/methods/aio/extract.py +126 -0
  52. firecrawl/v2/methods/aio/map.py +59 -0
  53. firecrawl/v2/methods/aio/scrape.py +36 -0
  54. firecrawl/v2/methods/aio/search.py +58 -0
  55. firecrawl/v2/methods/aio/usage.py +42 -0
  56. firecrawl/v2/methods/batch.py +420 -0
  57. firecrawl/v2/methods/crawl.py +468 -0
  58. firecrawl/v2/methods/extract.py +131 -0
  59. firecrawl/v2/methods/map.py +77 -0
  60. firecrawl/v2/methods/scrape.py +68 -0
  61. firecrawl/v2/methods/search.py +173 -0
  62. firecrawl/v2/methods/usage.py +41 -0
  63. firecrawl/v2/types.py +546 -0
  64. firecrawl/v2/utils/__init__.py +9 -0
  65. firecrawl/v2/utils/error_handler.py +107 -0
  66. firecrawl/v2/utils/get_version.py +15 -0
  67. firecrawl/v2/utils/http_client.py +153 -0
  68. firecrawl/v2/utils/http_client_async.py +64 -0
  69. firecrawl/v2/utils/validation.py +324 -0
  70. firecrawl/v2/watcher.py +312 -0
  71. firecrawl/v2/watcher_async.py +245 -0
  72. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/LICENSE +0 -0
  73. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/METADATA +49 -32
  74. firecrawl-3.0.3.dist-info/RECORD +78 -0
  75. tests/test_timeout_conversion.py +117 -0
  76. firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  77. firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  78. firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  79. firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -465
  80. firecrawl-2.16.5.dist-info/RECORD +0 -12
  81. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/WHEEL +0 -0
  82. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/top_level.txt +0 -0
firecrawl/v2/types.py ADDED
@@ -0,0 +1,546 @@
+ """
+ Type definitions for Firecrawl v2 API.
+
+ This module contains clean, modern type definitions for the v2 API.
+ """
+
+ import warnings
+ from datetime import datetime
+ from typing import Any, Dict, Generic, List, Literal, Optional, TypeVar, Union
+ from pydantic import BaseModel, Field, field_validator
+
+ # Suppress pydantic warnings about schema field shadowing
+ # Tested using schema_field alias="schema" but it doesn't work.
+ warnings.filterwarnings("ignore", message="Field name \"schema\" in \"Format\" shadows an attribute in parent \"BaseModel\"")
+ warnings.filterwarnings("ignore", message="Field name \"schema\" in \"JsonFormat\" shadows an attribute in parent \"Format\"")
+ warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ChangeTrackingFormat\" shadows an attribute in parent \"Format\"")
+ warnings.filterwarnings("ignore", message="Field name \"json\" in \"ScrapeFormats\" shadows an attribute in parent \"BaseModel\"")
+ warnings.filterwarnings("ignore", message="Field name \"json\" in \"Document\" shadows an attribute in parent \"BaseModel\"")
+
+ T = TypeVar('T')
+
+ # Base response types
+ class BaseResponse(BaseModel, Generic[T]):
+     """Base response structure for all API responses."""
+     success: bool
+     data: Optional[T] = None
+     error: Optional[str] = None
+     warning: Optional[str] = None
+
+ # Document and content types
+ class DocumentMetadata(BaseModel):
+     """Metadata for scraped documents."""
+     title: Optional[str] = None
+     description: Optional[str] = None
+     language: Optional[str] = None
+     keywords: Optional[Union[str, List[str]]] = None
+     robots: Optional[str] = None
+     og_title: Optional[str] = None
+     og_description: Optional[str] = None
+     og_url: Optional[str] = None
+     og_image: Optional[str] = None
+     source_url: Optional[str] = None
+     status_code: Optional[int] = None
+     error: Optional[str] = None
+
+     @staticmethod
+     def _coerce_list_to_string(value: Any) -> Any:
+         if isinstance(value, list):
+             # Prefer first string if semantically a single-valued field, else join
+             if len(value) == 1:
+                 return str(value[0])
+             return ', '.join(str(item) for item in value)
+         return value
+
+     @staticmethod
+     def _coerce_string_to_int(value: Any) -> Any:
+         if isinstance(value, str):
+             try:
+                 return int(value)
+             except ValueError:
+                 return value
+         return value
+
+     @field_validator('robots', 'og_title', 'og_description', 'og_url', 'og_image', 'language', mode='before')
+     @classmethod
+     def coerce_lists_to_string_fields(cls, v):
+         return cls._coerce_list_to_string(v)
+
+     @field_validator('status_code', mode='before')
+     @classmethod
+     def coerce_status_code_to_int(cls, v):
+         return cls._coerce_string_to_int(v)
+
+ class Document(BaseModel):
+     """A scraped document."""
+     markdown: Optional[str] = None
+     html: Optional[str] = None
+     raw_html: Optional[str] = None
+     json: Optional[Any] = None
+     summary: Optional[str] = None
+     metadata: Optional[DocumentMetadata] = None
+     links: Optional[List[str]] = None
+     screenshot: Optional[str] = None
+     actions: Optional[Dict[str, Any]] = None
+     warning: Optional[str] = None
+     change_tracking: Optional[Dict[str, Any]] = None
+
+ # Webhook types
+ class WebhookConfig(BaseModel):
+     """Configuration for webhooks."""
+     url: str
+     headers: Optional[Dict[str, str]] = None
+     metadata: Optional[Dict[str, str]] = None
+     events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
+
+ class WebhookData(BaseModel):
+     """Data sent to webhooks."""
+     job_id: str
+     status: str
+     current: Optional[int] = None
+     total: Optional[int] = None
+     data: Optional[List[Document]] = None
+     error: Optional[str] = None
+
+ class Source(BaseModel):
+     """Configuration for a search source."""
+     type: str
+
+ SourceOption = Union[str, Source]
+
+ FormatString = Literal[
+     # camelCase versions (API format)
+     "markdown", "html", "rawHtml", "links", "screenshot", "summary", "changeTracking", "json",
+     # snake_case versions (user-friendly)
+     "raw_html", "change_tracking"
+ ]
+
+ class Viewport(BaseModel):
+     """Viewport configuration for screenshots."""
+     width: int
+     height: int
+
+ class Format(BaseModel):
+     """Configuration for a format."""
+     type: FormatString
+
+ class JsonFormat(Format):
+     """Configuration for JSON extraction."""
+     prompt: Optional[str] = None
+     schema: Optional[Any] = None
+
+ class ChangeTrackingFormat(Format):
+     """Configuration for change tracking."""
+     modes: List[Literal["git-diff", "json"]]
+     schema: Optional[Dict[str, Any]] = None
+     prompt: Optional[str] = None
+     tag: Optional[str] = None
+
+ class ScreenshotFormat(BaseModel):
+     """Configuration for screenshot format."""
+     type: Literal["screenshot"] = "screenshot"
+     full_page: Optional[bool] = None
+     quality: Optional[int] = None
+     viewport: Optional[Union[Dict[str, int], Viewport]] = None
+
+ FormatOption = Union[Dict[str, Any], FormatString, JsonFormat, ChangeTrackingFormat, ScreenshotFormat, Format]
+
+ # Scrape types
+ class ScrapeFormats(BaseModel):
+     """Output formats for scraping."""
+     formats: Optional[List[FormatOption]] = None
+     markdown: bool = True
+     html: bool = False
+     raw_html: bool = False
+     summary: bool = False
+     links: bool = False
+     screenshot: bool = False
+     change_tracking: bool = False
+     json: bool = False
+
+     @field_validator('formats')
+     @classmethod
+     def validate_formats(cls, v):
+         """Validate and normalize formats input."""
+         if v is None:
+             return v
+
+         normalized_formats = []
+         for format_item in v:
+             if isinstance(format_item, str):
+                 normalized_formats.append(Format(type=format_item))
+             elif isinstance(format_item, dict):
+                 # Preserve dicts as-is to avoid dropping custom fields like 'schema'
+                 normalized_formats.append(format_item)
+             elif isinstance(format_item, Format):
+                 normalized_formats.append(format_item)
+             else:
+                 raise ValueError(f"Invalid format format: {format_item}")
+
+         return normalized_formats
+
+ class ScrapeOptions(BaseModel):
+     """Options for scraping operations."""
+     formats: Optional[Union['ScrapeFormats', List[FormatOption]]] = None
+     headers: Optional[Dict[str, str]] = None
+     include_tags: Optional[List[str]] = None
+     exclude_tags: Optional[List[str]] = None
+     only_main_content: Optional[bool] = None
+     timeout: Optional[int] = None
+     wait_for: Optional[int] = None
+     mobile: Optional[bool] = None
+     parsers: Optional[List[str]] = None
+     actions: Optional[List[Union['WaitAction', 'ScreenshotAction', 'ClickAction', 'WriteAction', 'PressAction', 'ScrollAction', 'ScrapeAction', 'ExecuteJavascriptAction', 'PDFAction']]] = None
+     location: Optional['Location'] = None
+     skip_tls_verification: Optional[bool] = None
+     remove_base64_images: Optional[bool] = None
+     fast_mode: Optional[bool] = None
+     use_mock: Optional[str] = None
+     block_ads: Optional[bool] = None
+     proxy: Optional[Literal["basic", "stealth", "auto"]] = None
+     max_age: Optional[int] = None
+     store_in_cache: Optional[bool] = None
+
+     @field_validator('formats')
+     @classmethod
+     def validate_formats(cls, v):
+         """Validate and normalize formats input."""
+         if v is None:
+             return v
+         if isinstance(v, ScrapeFormats):
+             return v
+         if isinstance(v, list):
+             return v
+         raise ValueError(f"Invalid formats type: {type(v)}. Expected ScrapeFormats or List[FormatOption]")
+
+ class ScrapeRequest(BaseModel):
+     """Request for scraping a single URL."""
+     url: str
+     options: Optional[ScrapeOptions] = None
+
+ class ScrapeData(Document):
+     """Scrape results data."""
+     pass
+
+ class ScrapeResponse(BaseResponse[ScrapeData]):
+     """Response for scrape operations."""
+     pass
+
+ # Crawl types
+ class CrawlRequest(BaseModel):
+     """Request for crawling a website."""
+     url: str
+     prompt: Optional[str] = None
+     exclude_paths: Optional[List[str]] = None
+     include_paths: Optional[List[str]] = None
+     max_discovery_depth: Optional[int] = None
+     sitemap: Literal["skip", "include"] = "include"
+     ignore_query_parameters: bool = False
+     limit: Optional[int] = None
+     crawl_entire_domain: bool = False
+     allow_external_links: bool = False
+     allow_subdomains: bool = False
+     delay: Optional[int] = None
+     max_concurrency: Optional[int] = None
+     webhook: Optional[Union[str, WebhookConfig]] = None
+     scrape_options: Optional[ScrapeOptions] = None
+     zero_data_retention: bool = False
+
+ class CrawlResponse(BaseModel):
+     """Information about a crawl job."""
+     id: str
+     url: str
+
+ class CrawlJob(BaseModel):
+     """Crawl job status and progress data."""
+     status: Literal["scraping", "completed", "failed"]
+     total: int = 0
+     completed: int = 0
+     credits_used: int = 0
+     expires_at: Optional[datetime] = None
+     next: Optional[str] = None
+     data: List[Document] = []
+
+ class SearchDocument(Document):
+     """A document from a search operation with URL and description."""
+     url: str
+     title: Optional[str] = None
+     description: Optional[str] = None
+
+ class MapDocument(Document):
+     """A document from a map operation with URL and description."""
+     url: str
+     description: Optional[str] = None
+
+ # Crawl params types
+ class CrawlParamsRequest(BaseModel):
+     """Request for getting crawl parameters from LLM."""
+     url: str
+     prompt: str
+
+ class CrawlParamsData(BaseModel):
+     """Data returned from crawl params endpoint."""
+     include_paths: Optional[List[str]] = None
+     exclude_paths: Optional[List[str]] = None
+     max_discovery_depth: Optional[int] = None
+     ignore_sitemap: bool = False
+     ignore_query_parameters: bool = False
+     limit: Optional[int] = None
+     crawl_entire_domain: bool = False
+     allow_external_links: bool = False
+     allow_subdomains: bool = False
+     delay: Optional[int] = None
+     max_concurrency: Optional[int] = None
+     webhook: Optional[Union[str, WebhookConfig]] = None
+     scrape_options: Optional[ScrapeOptions] = None
+     zero_data_retention: bool = False
+     warning: Optional[str] = None
+
+ class CrawlParamsResponse(BaseResponse[CrawlParamsData]):
+     """Response from crawl params endpoint."""
+     pass
+
+ # Batch scrape types
+ class BatchScrapeRequest(BaseModel):
+     """Request for batch scraping multiple URLs (internal helper only)."""
+     urls: List[str]
+     options: Optional[ScrapeOptions] = None
+
+ class BatchScrapeResponse(BaseModel):
+     """Response from starting a batch scrape job (mirrors CrawlResponse naming)."""
+     id: str
+     url: str
+     invalid_urls: Optional[List[str]] = None
+
+ class BatchScrapeJob(BaseModel):
+     """Batch scrape job status and results."""
+     status: Literal["scraping", "completed", "failed", "cancelled"]
+     completed: int
+     total: int
+     credits_used: Optional[int] = None
+     expires_at: Optional[datetime] = None
+     next: Optional[str] = None
+     data: List[Document] = []
+
+ # Map types
+ class MapOptions(BaseModel):
+     """Options for mapping operations."""
+     search: Optional[str] = None
+     sitemap: Literal["only", "include", "skip"] = "include"
+     include_subdomains: Optional[bool] = None
+     limit: Optional[int] = None
+     timeout: Optional[int] = None
+
+ class MapRequest(BaseModel):
+     """Request for mapping a website."""
+     url: str
+     options: Optional[MapOptions] = None
+
+ class MapData(BaseModel):
+     """Map results data."""
+     links: List['SearchResult']
+
+ class MapResponse(BaseResponse[MapData]):
+     """Response for map operations."""
+     pass
+
+ # Extract types
+ class ExtractResponse(BaseModel):
+     """Response for extract operations (start/status/final)."""
+     success: Optional[bool] = None
+     id: Optional[str] = None
+     status: Optional[Literal["processing", "completed", "failed", "cancelled"]] = None
+     data: Optional[Any] = None
+     error: Optional[str] = None
+     warning: Optional[str] = None
+     sources: Optional[Dict[str, Any]] = None
+     expires_at: Optional[datetime] = None
+
+ # Usage/limits types
+ class ConcurrencyCheck(BaseModel):
+     """Current concurrency and limits for the team/API key."""
+     concurrency: int
+     max_concurrency: int
+
+ class CreditUsage(BaseModel):
+     """Remaining credits for the team/API key."""
+     remaining_credits: int
+
+ class TokenUsage(BaseModel):
+     """Recent token usage metrics (if available)."""
+     remaining_tokens: int
+
+ # Action types
+ class WaitAction(BaseModel):
+     """Wait action to perform during scraping."""
+     type: Literal["wait"] = "wait"
+     milliseconds: Optional[int] = None
+     selector: Optional[str] = None
+
+ class ScreenshotAction(BaseModel):
+     """Screenshot action to perform during scraping."""
+     type: Literal["screenshot"] = "screenshot"
+     full_page: Optional[bool] = None
+     quality: Optional[int] = None
+     viewport: Optional[Union[Dict[str, int], Viewport]] = None
+
+ class ClickAction(BaseModel):
+     """Click action to perform during scraping."""
+     type: Literal["click"] = "click"
+     selector: str
+
+ class WriteAction(BaseModel):
+     """Write action to perform during scraping."""
+     type: Literal["write"] = "write"
+     text: str
+
+ class PressAction(BaseModel):
+     """Press action to perform during scraping."""
+     type: Literal["press"] = "press"
+     key: str
+
+ class ScrollAction(BaseModel):
+     """Scroll action to perform during scraping."""
+     type: Literal["scroll"] = "scroll"
+     direction: Literal["up", "down"]
+     selector: Optional[str] = None
+
+ class ScrapeAction(BaseModel):
+     """Scrape action to perform during scraping."""
+     type: Literal["scrape"] = "scrape"
+
+ class ExecuteJavascriptAction(BaseModel):
+     """Execute javascript action to perform during scraping."""
+     type: Literal["executeJavascript"] = "executeJavascript"
+     script: str
+
+ class PDFAction(BaseModel):
+     """PDF action to perform during scraping."""
+     type: Literal["pdf"] = "pdf"
+     format: Optional[Literal["A0", "A1", "A2", "A3", "A4", "A5", "A6", "Letter", "Legal", "Tabloid", "Ledger"]] = None
+     landscape: Optional[bool] = None
+     scale: Optional[float] = None
+
+ # Location types
+ class Location(BaseModel):
+     """Location configuration for scraping."""
+     country: Optional[str] = None
+     languages: Optional[List[str]] = None
+
+ class SearchRequest(BaseModel):
+     """Request for search operations."""
+     query: str
+     sources: Optional[List[SourceOption]] = None
+     limit: Optional[int] = 5
+     tbs: Optional[str] = None
+     location: Optional[str] = None
+     ignore_invalid_urls: Optional[bool] = None
+     timeout: Optional[int] = 60000
+     scrape_options: Optional[ScrapeOptions] = None
+
+     @field_validator('sources')
+     @classmethod
+     def validate_sources(cls, v):
+         """Validate and normalize sources input."""
+         if v is None:
+             return v
+
+         normalized_sources = []
+         for source in v:
+             if isinstance(source, str):
+                 normalized_sources.append(Source(type=source))
+             elif isinstance(source, dict):
+                 normalized_sources.append(Source(**source))
+             elif isinstance(source, Source):
+                 normalized_sources.append(source)
+             else:
+                 raise ValueError(f"Invalid source format: {source}")
+
+         return normalized_sources
+
+ class LinkResult(BaseModel):
+     """A generic link result with optional metadata (used by search and map)."""
+     url: str
+     title: Optional[str] = None
+     description: Optional[str] = None
+
+ # Backward-compatible alias for existing tests/usages
+ SearchResult = LinkResult
+
+ class SearchData(BaseModel):
+     """Search results grouped by source type."""
+     web: Optional[List[Union[LinkResult, SearchDocument]]] = None
+     news: Optional[List[Union[LinkResult, SearchDocument]]] = None
+     images: Optional[List[Union[LinkResult, SearchDocument]]] = None
+
+ class SearchResponse(BaseResponse[SearchData]):
+     """Response from search operation."""
+     pass
+
+ # Error types
+ class ErrorDetails(BaseModel):
+     """Detailed error information."""
+     code: Optional[str] = None
+     message: str
+     details: Optional[Dict[str, Any]] = None
+
+ class ErrorResponse(BaseModel):
+     """Error response structure."""
+     success: bool = False
+     error: str
+     details: Optional[ErrorDetails] = None
+
+ # Job management types
+ class JobStatus(BaseModel):
+     """Generic job status information."""
+     id: str
+     status: Literal["pending", "scraping", "completed", "failed"]
+     current: Optional[int] = None
+     total: Optional[int] = None
+     created_at: Optional[datetime] = None
+     completed_at: Optional[datetime] = None
+     expires_at: Optional[datetime] = None
+
+ class CrawlError(BaseModel):
+     """A crawl error."""
+     id: str
+     timestamp: Optional[datetime] = None
+     url: str
+     code: Optional[str] = None
+     error: str
+
+ class CrawlErrorsResponse(BaseModel):
+     """Response from crawl error monitoring."""
+     errors: List[CrawlError]
+     robots_blocked: List[str]
+
+ class ActiveCrawl(BaseModel):
+     """Information about an active crawl job."""
+     id: str
+     team_id: str
+     url: str
+     options: Optional[Dict[str, Any]] = None
+
+ class ActiveCrawlsResponse(BaseModel):
+     """Response from active crawls endpoint."""
+     success: bool = True
+     crawls: List[ActiveCrawl]
+
+ # Configuration types
+ class ClientConfig(BaseModel):
+     """Configuration for the Firecrawl client."""
+     api_key: str
+     api_url: str = "https://api.firecrawl.dev"
+     timeout: Optional[float] = None
+     max_retries: int = 3
+     backoff_factor: float = 0.5
+
+ # Response union types
+ AnyResponse = Union[
+     ScrapeResponse,
+     CrawlResponse,
+     BatchScrapeResponse,
+     MapResponse,
+     SearchResponse,
+     ErrorResponse,
+ ]
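
To illustrate the normalization performed by the validators above, here is a minimal, hedged sketch that uses only the models defined in this file; the sample values are made up, and it assumes the 3.x wheel is installed.

    # Illustrative sketch using the v2 models above; sample values are invented.
    from firecrawl.v2.types import DocumentMetadata, Format, ScrapeFormats

    # DocumentMetadata coerces list-valued metadata to strings and numeric strings to ints.
    meta = DocumentMetadata(og_title=["Home", "Firecrawl"], status_code="200", language="en")
    assert meta.og_title == "Home, Firecrawl"
    assert meta.status_code == 200

    # ScrapeFormats.validate_formats turns bare strings into Format objects and keeps
    # dicts as-is, so extra keys such as a JSON 'schema' are not dropped.
    formats = ScrapeFormats(formats=["markdown", {"type": "json", "schema": {"type": "object"}}])
    print(isinstance(formats.formats[0], Format), formats.formats[1])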
firecrawl/v2/utils/__init__.py ADDED
@@ -0,0 +1,9 @@
+ """
+ Utility modules for v2 API client.
+ """
+
+ from .http_client import HttpClient
+ from .error_handler import FirecrawlError, handle_response_error
+ from .validation import validate_scrape_options, prepare_scrape_options
+
+ __all__ = ['HttpClient', 'FirecrawlError', 'handle_response_error', 'validate_scrape_options', 'prepare_scrape_options']
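
For reference, the re-exports above let callers import the shared helpers from the utils package root rather than from the defining modules; a minimal sketch, assuming firecrawl 3.x is installed:

    # Names re-exported by firecrawl/v2/utils/__init__.py shown above.
    from firecrawl.v2.utils import FirecrawlError, HttpClient, handle_response_error
    # Specific exception subclasses still come from their defining module.
    from firecrawl.v2.utils.error_handler import RateLimitError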
firecrawl/v2/utils/error_handler.py ADDED
@@ -0,0 +1,107 @@
+ """
+ Error handling utilities for v2 API.
+ """
+
+ import requests
+ from typing import Dict, Any, Optional
+
+
+ class FirecrawlError(Exception):
+     """Base exception for Firecrawl API errors."""
+
+     def __init__(self, message: str, status_code: Optional[int] = None, response: Optional[requests.Response] = None):
+         super().__init__(message)
+         self.status_code = status_code
+         self.response = response
+
+
+ class BadRequestError(FirecrawlError):
+     """Raised when the request is invalid (400)."""
+     pass
+
+
+ class UnauthorizedError(FirecrawlError):
+     """Raised when the request is unauthorized (401)."""
+     pass
+
+
+ class PaymentRequiredError(FirecrawlError):
+     """Raised when payment is required (402)."""
+     pass
+
+
+ class WebsiteNotSupportedError(FirecrawlError):
+     """Raised when website is not supported (403)."""
+     pass
+
+
+ class RequestTimeoutError(FirecrawlError):
+     """Raised when request times out (408)."""
+     pass
+
+
+ class RateLimitError(FirecrawlError):
+     """Raised when the rate limit is exceeded (429)."""
+     pass
+
+
+ class InternalServerError(FirecrawlError):
+     """Raised when there's an internal server error (500)."""
+     pass
+
+
+ def handle_response_error(response: requests.Response, action: str) -> None:
+     """
+     Handle API response errors and raise appropriate exceptions.
+
+     Args:
+         response: The HTTP response object
+         action: Description of the action being performed
+
+     Raises:
+         FirecrawlError: Appropriate error based on status code
+     """
+     try:
+         response_json = response.json()
+         error_message = response_json.get('error', 'No error message provided.')
+         error_details = response_json.get('details', 'No additional error details provided.')
+     except:
+         # If we can't parse JSON, provide a helpful error message
+         try:
+             response_text = response.text[:500]  # Limit to first 500 chars
+             if response_text.strip():
+                 error_message = f"Server returned non-JSON response: {response_text}"
+                 error_details = f"Full response status: {response.status_code}"
+             else:
+                 error_message = f"Server returned empty response with status {response.status_code}"
+                 error_details = "No additional details available"
+         except:
+             error_message = f"Server returned unreadable response with status {response.status_code}"
+             error_details = "No additional details available"
+
+     # Create appropriate error message
+     if response.status_code == 400:
+         message = f"Bad Request: Failed to {action}. {error_message} - {error_details}"
+         raise BadRequestError(message, response.status_code, response)
+     elif response.status_code == 401:
+         message = f"Unauthorized: Failed to {action}. {error_message} - {error_details}"
+         raise UnauthorizedError(message, response.status_code, response)
+     elif response.status_code == 402:
+         message = f"Payment Required: Failed to {action}. {error_message} - {error_details}"
+         raise PaymentRequiredError(message, response.status_code, response)
+     elif response.status_code == 403:
+         message = f"Website Not Supported: Failed to {action}. {error_message} - {error_details}"
+         raise WebsiteNotSupportedError(message, response.status_code, response)
+     elif response.status_code == 408:
+         message = f"Request Timeout: Failed to {action} as the request timed out. {error_message} - {error_details}"
+         raise RequestTimeoutError(message, response.status_code, response)
+     elif response.status_code == 429:
+         message = f"Rate Limit Exceeded: Failed to {action}. {error_message} - {error_details}"
+         raise RateLimitError(message, response.status_code, response)
+     elif response.status_code == 500:
+         message = f"Internal Server Error: Failed to {action}. {error_message} - {error_details}"
+         raise InternalServerError(message, response.status_code, response)
+     else:
+         message = f"Unexpected error during {action}: Status code {response.status_code}. {error_message} - {error_details}"
+         raise FirecrawlError(message, response.status_code, response)
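
As a hedged illustration of the status-code mapping above, the snippet below hand-builds a bare requests.Response purely for demonstration; real responses would come from the HTTP client.

    # Illustration only: fabricate a 429 response to show how handle_response_error
    # maps status codes to the exception classes defined above.
    import json
    import requests

    from firecrawl.v2.utils.error_handler import RateLimitError, handle_response_error

    resp = requests.Response()
    resp.status_code = 429
    resp._content = json.dumps({"error": "Rate limit exceeded", "details": "Retry later"}).encode()

    try:
        handle_response_error(resp, "scrape URL")
    except RateLimitError as exc:
        print(exc)              # Rate Limit Exceeded: Failed to scrape URL. Rate limit exceeded - Retry later
        print(exc.status_code)  # 429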
firecrawl/v2/utils/get_version.py ADDED
@@ -0,0 +1,15 @@
+ import os
+ import re
+ from pathlib import Path
+
+ def get_version():
+     try:
+         package_path = Path(__file__).parents[2]
+         version_file = (package_path / "__init__.py").read_text()
+         version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
+         if version_match:
+             return version_match.group(1).strip()
+         return "3.x.x"
+     except Exception as e:
+         print(f"Failed to get version from __init__.py: {e}")
+         return "3.x.x"
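
Since Path(__file__).parents[2] resolves from firecrawl/v2/utils/get_version.py up to the firecrawl/ package directory, the helper reads the top-level firecrawl/__init__.py. A small sketch of the same regex applied to sample content; the version literal is illustrative and simply matches the wheel version in this diff.

    # Same pattern as get_version(), applied to a sample __init__.py body.
    import re

    sample = '__version__ = "3.0.3"\n'
    match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", sample, re.M)
    print(match.group(1) if match else "3.x.x")  # -> 3.0.3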