firecrawl-4.12.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. firecrawl/__init__.py +87 -0
  2. firecrawl/__tests__/e2e/v2/aio/conftest.py +62 -0
  3. firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +69 -0
  4. firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
  5. firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +39 -0
  6. firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +41 -0
  7. firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +138 -0
  8. firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +249 -0
  9. firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +42 -0
  10. firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
  11. firecrawl/__tests__/e2e/v2/conftest.py +73 -0
  12. firecrawl/__tests__/e2e/v2/test_async.py +73 -0
  13. firecrawl/__tests__/e2e/v2/test_batch_scrape.py +106 -0
  14. firecrawl/__tests__/e2e/v2/test_crawl.py +278 -0
  15. firecrawl/__tests__/e2e/v2/test_extract.py +55 -0
  16. firecrawl/__tests__/e2e/v2/test_map.py +61 -0
  17. firecrawl/__tests__/e2e/v2/test_scrape.py +191 -0
  18. firecrawl/__tests__/e2e/v2/test_search.py +270 -0
  19. firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
  20. firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
  21. firecrawl/__tests__/unit/test_recursive_schema_v1.py +1209 -0
  22. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
  23. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +79 -0
  24. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
  25. firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +20 -0
  26. firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
  27. firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +64 -0
  28. firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
  29. firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
  30. firecrawl/__tests__/unit/v2/methods/test_agent.py +367 -0
  31. firecrawl/__tests__/unit/v2/methods/test_agent_request_preparation.py +226 -0
  32. firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
  33. firecrawl/__tests__/unit/v2/methods/test_branding.py +214 -0
  34. firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
  35. firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
  36. firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
  37. firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +54 -0
  38. firecrawl/__tests__/unit/v2/methods/test_pagination.py +671 -0
  39. firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +109 -0
  40. firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +169 -0
  41. firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
  42. firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
  43. firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
  44. firecrawl/__tests__/unit/v2/utils/test_metadata_extras.py +94 -0
  45. firecrawl/__tests__/unit/v2/utils/test_metadata_extras_multivalue.py +22 -0
  46. firecrawl/__tests__/unit/v2/utils/test_recursive_schema.py +1133 -0
  47. firecrawl/__tests__/unit/v2/utils/test_validation.py +311 -0
  48. firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
  49. firecrawl/client.py +281 -0
  50. firecrawl/firecrawl.backup.py +4635 -0
  51. firecrawl/types.py +167 -0
  52. firecrawl/v1/__init__.py +14 -0
  53. firecrawl/v1/client.py +5164 -0
  54. firecrawl/v2/__init__.py +4 -0
  55. firecrawl/v2/client.py +967 -0
  56. firecrawl/v2/client_async.py +408 -0
  57. firecrawl/v2/methods/agent.py +144 -0
  58. firecrawl/v2/methods/aio/__init__.py +1 -0
  59. firecrawl/v2/methods/aio/agent.py +137 -0
  60. firecrawl/v2/methods/aio/batch.py +188 -0
  61. firecrawl/v2/methods/aio/crawl.py +351 -0
  62. firecrawl/v2/methods/aio/extract.py +133 -0
  63. firecrawl/v2/methods/aio/map.py +65 -0
  64. firecrawl/v2/methods/aio/scrape.py +33 -0
  65. firecrawl/v2/methods/aio/search.py +176 -0
  66. firecrawl/v2/methods/aio/usage.py +89 -0
  67. firecrawl/v2/methods/batch.py +499 -0
  68. firecrawl/v2/methods/crawl.py +592 -0
  69. firecrawl/v2/methods/extract.py +161 -0
  70. firecrawl/v2/methods/map.py +83 -0
  71. firecrawl/v2/methods/scrape.py +64 -0
  72. firecrawl/v2/methods/search.py +215 -0
  73. firecrawl/v2/methods/usage.py +84 -0
  74. firecrawl/v2/types.py +1143 -0
  75. firecrawl/v2/utils/__init__.py +9 -0
  76. firecrawl/v2/utils/error_handler.py +107 -0
  77. firecrawl/v2/utils/get_version.py +15 -0
  78. firecrawl/v2/utils/http_client.py +178 -0
  79. firecrawl/v2/utils/http_client_async.py +69 -0
  80. firecrawl/v2/utils/normalize.py +125 -0
  81. firecrawl/v2/utils/validation.py +692 -0
  82. firecrawl/v2/watcher.py +301 -0
  83. firecrawl/v2/watcher_async.py +243 -0
  84. firecrawl-4.12.0.dist-info/METADATA +234 -0
  85. firecrawl-4.12.0.dist-info/RECORD +92 -0
  86. firecrawl-4.12.0.dist-info/WHEEL +5 -0
  87. firecrawl-4.12.0.dist-info/licenses/LICENSE +21 -0
  88. firecrawl-4.12.0.dist-info/top_level.txt +2 -0
  89. tests/test_agent_integration.py +277 -0
  90. tests/test_api_key_handling.py +44 -0
  91. tests/test_change_tracking.py +98 -0
  92. tests/test_timeout_conversion.py +117 -0
firecrawl/v2/methods/crawl.py
@@ -0,0 +1,592 @@
+ """
+ Crawling functionality for Firecrawl v2 API.
+ """
+
+ import time
+ from typing import Optional, Dict, Any, List
+ from ..types import (
+     CrawlRequest,
+     CrawlJob,
+     CrawlResponse, Document, CrawlParamsRequest, CrawlParamsResponse, CrawlParamsData,
+     WebhookConfig, CrawlErrorsResponse, ActiveCrawlsResponse, ActiveCrawl, PaginationConfig
+ )
+ from ..utils import HttpClient, handle_response_error, validate_scrape_options, prepare_scrape_options
+ from ..utils.normalize import normalize_document_input
+
+
+ def _validate_crawl_request(request: CrawlRequest) -> None:
+     """
+     Validate crawl request parameters.
+
+     Args:
+         request: CrawlRequest to validate
+
+     Raises:
+         ValueError: If request is invalid
+     """
+     if not request.url or not request.url.strip():
+         raise ValueError("URL cannot be empty")
+
+     if request.limit is not None and request.limit <= 0:
+         raise ValueError("Limit must be positive")
+
+     # Validate scrape_options (if provided)
+     if request.scrape_options is not None:
+         validate_scrape_options(request.scrape_options)
+
+
+ def _prepare_crawl_request(request: CrawlRequest) -> dict:
+     """
+     Prepare crawl request for API submission.
+
+     Args:
+         request: CrawlRequest to prepare
+
+     Returns:
+         Dictionary ready for API submission
+     """
+     # Validate request
+     _validate_crawl_request(request)
+
+     # Start with basic data
+     data = {"url": request.url}
+
+     # Add prompt if present
+     if request.prompt:
+         data["prompt"] = request.prompt
+
+     # Handle scrape_options conversion first (before model_dump)
+     if request.scrape_options is not None:
+         scrape_data = prepare_scrape_options(request.scrape_options)
+         if scrape_data:
+             data["scrapeOptions"] = scrape_data
+
+     # Convert request to dict
+     request_data = request.model_dump(exclude_none=True, exclude_unset=True)
+
+     # Remove url, prompt, and scrape_options (already handled)
+     request_data.pop("url", None)
+     request_data.pop("prompt", None)
+     request_data.pop("scrape_options", None)
+
+     # Handle webhook conversion first (before model_dump)
+     if request.webhook is not None:
+         if isinstance(request.webhook, str):
+             data["webhook"] = request.webhook
+         else:
+             # Convert WebhookConfig to dict
+             data["webhook"] = request.webhook.model_dump(exclude_none=True)
+
+     # Convert other snake_case fields to camelCase
+     field_mappings = {
+         "include_paths": "includePaths",
+         "exclude_paths": "excludePaths",
+         "max_discovery_depth": "maxDiscoveryDepth",
+         "sitemap": "sitemap",
+         "ignore_query_parameters": "ignoreQueryParameters",
+         "crawl_entire_domain": "crawlEntireDomain",
+         "allow_external_links": "allowExternalLinks",
+         "allow_subdomains": "allowSubdomains",
+         "delay": "delay",
+         "max_concurrency": "maxConcurrency",
+         "zero_data_retention": "zeroDataRetention"
+     }
+
+     # Apply field mappings
+     for snake_case, camel_case in field_mappings.items():
+         if snake_case in request_data:
+             data[camel_case] = request_data.pop(snake_case)
+
+     # Add any remaining fields that don't need conversion (like limit)
+     data.update(request_data)
+     # Trim integration if present
+     if "integration" in data and isinstance(data["integration"], str):
+         data["integration"] = data["integration"].strip()
+
+     return data
+
+
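The helper above is where snake_case request fields become the camelCase keys the API expects. As a rough illustration (not part of the package diff), a hypothetical request with limit and include_paths set would produce a payload along these lines; the URL and values are placeholders:

    from firecrawl.v2.methods.crawl import _prepare_crawl_request
    from firecrawl.v2.types import CrawlRequest

    # Hypothetical request; the field names are the snake_case keys listed in field_mappings.
    req = CrawlRequest(url="https://example.com", limit=5, include_paths=["/blog/*"])
    payload = _prepare_crawl_request(req)
    # Expected shape: {"url": "https://example.com", "limit": 5, "includePaths": ["/blog/*"]}
    print(payload)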
+ def start_crawl(client: HttpClient, request: CrawlRequest) -> CrawlResponse:
+     """
+     Start a crawl job for a website.
+
+     Args:
+         client: HTTP client instance
+         request: CrawlRequest containing URL and options
+
+     Returns:
+         CrawlResponse with job information
+
+     Raises:
+         ValueError: If request is invalid
+         Exception: If the crawl operation fails to start
+     """
+     request_data = _prepare_crawl_request(request)
+
+     response = client.post("/v2/crawl", request_data)
+
+     if not response.ok:
+         handle_response_error(response, "start crawl")
+
+     response_data = response.json()
+
+     if response_data.get("success"):
+         job_data = {
+             "id": response_data.get("id"),
+             "url": response_data.get("url")
+         }
+
+         return CrawlResponse(**job_data)
+     else:
+         raise Exception(response_data.get("error", "Unknown error occurred"))
+
+
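As a usage sketch only: start_crawl returns immediately with a job id that the status, wait, and cancel helpers later in this module consume. The HttpClient construction below assumes a constructor taking an API key and base URL; the real signature lives in firecrawl/v2/utils/http_client.py, and the key, URL, and limit shown are placeholders:

    from firecrawl.v2.methods.crawl import start_crawl
    from firecrawl.v2.types import CrawlRequest
    from firecrawl.v2.utils import HttpClient

    # Assumed constructor arguments; check firecrawl/v2/utils/http_client.py for the real signature.
    client = HttpClient(api_key="fc-YOUR-KEY", api_url="https://api.firecrawl.dev")

    job = start_crawl(client, CrawlRequest(url="https://example.com", limit=10))
    print(job.id)  # job id consumed by the status, wait, and cancel helpers defined later in this module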
+ def get_crawl_status(
+     client: HttpClient,
+     job_id: str,
+     pagination_config: Optional[PaginationConfig] = None,
+     *,
+     request_timeout: Optional[float] = None,
+ ) -> CrawlJob:
+     """
+     Get the status of a crawl job.
+
+     Args:
+         client: HTTP client instance
+         job_id: ID of the crawl job
+         pagination_config: Optional configuration for pagination behavior
+         request_timeout: Timeout (in seconds) for each individual HTTP request. When auto-pagination
+             is enabled (default) and there are multiple pages of results, this timeout applies to
+             each page request separately, not to the entire operation
+
+     Returns:
+         CrawlJob with current status and data
+
+     Raises:
+         Exception: If the status check fails
+     """
+     # Make the API request
+     response = client.get(f"/v2/crawl/{job_id}", timeout=request_timeout)
+
+     # Handle errors
+     if not response.ok:
+         handle_response_error(response, "get crawl status")
+
+     # Parse response
+     response_data = response.json()
+
+     if response_data.get("success"):
+         # The API returns status fields at the top level, not in a data field
+
+         # Convert documents
+         documents = []
+         data_list = response_data.get("data", [])
+         for doc_data in data_list:
+             if isinstance(doc_data, str):
+                 # Handle case where API returns just URLs - this shouldn't happen for crawl
+                 # but we'll handle it gracefully
+                 continue
+             else:
+                 documents.append(Document(**normalize_document_input(doc_data)))
+
+         # Handle pagination if requested
+         auto_paginate = pagination_config.auto_paginate if pagination_config else True
+         if auto_paginate and response_data.get("next") and not (
+             pagination_config
+             and pagination_config.max_results is not None
+             and len(documents) >= pagination_config.max_results
+         ):
+             documents = _fetch_all_pages(
+                 client,
+                 response_data.get("next"),
+                 documents,
+                 pagination_config,
+                 request_timeout=request_timeout,
+             )
+
+         # Create CrawlJob with current status and data
+         return CrawlJob(
+             status=response_data.get("status"),
+             completed=response_data.get("completed", 0),
+             total=response_data.get("total", 0),
+             credits_used=response_data.get("creditsUsed", 0),
+             expires_at=response_data.get("expiresAt"),
+             next=response_data.get("next", None) if not auto_paginate else None,
+             data=documents
+         )
+     else:
+         raise Exception(response_data.get("error", "Unknown error occurred"))
+
+
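A sketch of how the pagination knobs are driven, under the same assumed client setup as above; auto_paginate, max_pages, max_results, and max_wait_time are exactly the PaginationConfig fields read by _fetch_all_pages below, and the job id is a placeholder:

    from firecrawl.v2.methods.crawl import get_crawl_status
    from firecrawl.v2.types import PaginationConfig
    from firecrawl.v2.utils import HttpClient

    client = HttpClient(api_key="fc-YOUR-KEY", api_url="https://api.firecrawl.dev")  # assumed signature

    status = get_crawl_status(
        client,
        "your-crawl-job-id",             # placeholder id returned by start_crawl
        pagination_config=PaginationConfig(
            auto_paginate=True,          # follow "next" links (the default behavior)
            max_pages=2,                 # stop after two extra pages
            max_results=50,              # or once 50 documents have been collected
            max_wait_time=30,            # or after 30 seconds spent paginating
        ),
        request_timeout=30.0,            # per-request timeout, applied to each page separately
    )
    print(status.status, status.completed, status.total, len(status.data))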
+ def _fetch_all_pages(
+     client: HttpClient,
+     next_url: str,
+     initial_documents: List[Document],
+     pagination_config: Optional[PaginationConfig] = None,
+     *,
+     request_timeout: Optional[float] = None,
+ ) -> List[Document]:
+     """
+     Fetch all pages of crawl results.
+
+     Args:
+         client: HTTP client instance
+         next_url: URL for the next page
+         initial_documents: Documents from the first page
+         pagination_config: Optional configuration for pagination limits
+         request_timeout: Optional timeout (in seconds) for the underlying HTTP request
+
+     Returns:
+         List of all documents from all pages
+     """
+     documents = initial_documents.copy()
+     current_url = next_url
+     page_count = 0
+
+     # Apply pagination limits
+     max_pages = pagination_config.max_pages if pagination_config else None
+     max_results = pagination_config.max_results if pagination_config else None
+     max_wait_time = pagination_config.max_wait_time if pagination_config else None
+
+     start_time = time.monotonic()
+
+     while current_url:
+         # Check pagination limits (treat 0 as a valid limit)
+         if (max_pages is not None) and page_count >= max_pages:
+             break
+
+         if (max_wait_time is not None) and (time.monotonic() - start_time) > max_wait_time:
+             break
+
+         # Fetch next page
+         response = client.get(current_url, timeout=request_timeout)
+
+         if not response.ok:
+             # Log error but continue with what we have
+             import logging
+             logger = logging.getLogger("firecrawl")
+             logger.warning("Failed to fetch next page", extra={"status_code": response.status_code})
+             break
+
+         page_data = response.json()
+
+         if not page_data.get("success"):
+             break
+
+         # Add documents from this page
+         data_list = page_data.get("data", [])
+         for doc_data in data_list:
+             if isinstance(doc_data, str):
+                 continue
+             else:
+                 # Check max_results limit BEFORE adding each document
+                 if max_results is not None and len(documents) >= max_results:
+                     break
+                 documents.append(Document(**normalize_document_input(doc_data)))
+
+         # Check if we hit max_results limit
+         if max_results is not None and len(documents) >= max_results:
+             break
+
+         # Get next URL
+         current_url = page_data.get("next")
+         page_count += 1
+
+     return documents
+
+
+ def cancel_crawl(client: HttpClient, job_id: str) -> bool:
+     """
+     Cancel a running crawl job.
+
+     Args:
+         client: HTTP client instance
+         job_id: ID of the crawl job to cancel
+
+     Returns:
+         bool: True if the crawl was cancelled, False otherwise
+
+     Raises:
+         Exception: If the cancellation fails
+     """
+     response = client.delete(f"/v2/crawl/{job_id}")
+
+     if not response.ok:
+         handle_response_error(response, "cancel crawl")
+
+     response_data = response.json()
+
+     return response_data.get("status") == "cancelled"
+
+ def wait_for_crawl_completion(
+     client: HttpClient,
+     job_id: str,
+     poll_interval: int = 2,
+     timeout: Optional[int] = None,
+     *,
+     request_timeout: Optional[float] = None,
+ ) -> CrawlJob:
+     """
+     Wait for a crawl job to complete, polling for status updates.
+
+     Args:
+         client: HTTP client instance
+         job_id: ID of the crawl job
+         poll_interval: Seconds between status checks
+         timeout: Maximum seconds to wait (None for no timeout)
+         request_timeout: Optional timeout (in seconds) for each status request
+
+     Returns:
+         CrawlJob when job completes
+
+     Raises:
+         Exception: If the job fails
+         TimeoutError: If timeout is reached
+     """
+     start_time = time.monotonic()
+
+     while True:
+         crawl_job = get_crawl_status(
+             client,
+             job_id,
+             request_timeout=request_timeout,
+         )
+
+         # Check if job is complete
+         if crawl_job.status in ["completed", "failed", "cancelled"]:
+             return crawl_job
+
+         # Check timeout
+         if timeout is not None and (time.monotonic() - start_time) > timeout:
+             raise TimeoutError(f"Crawl job {job_id} did not complete within {timeout} seconds")
+
+         # Wait before next poll
+         time.sleep(poll_interval)
+
+
+ def crawl(
+     client: HttpClient,
+     request: CrawlRequest,
+     poll_interval: int = 2,
+     timeout: Optional[int] = None,
+     *,
+     request_timeout: Optional[float] = None,
+ ) -> CrawlJob:
+     """
+     Start a crawl job and wait for it to complete.
+
+     Args:
+         client: HTTP client instance
+         request: CrawlRequest containing URL and options
+         poll_interval: Seconds between status checks
+         timeout: Maximum seconds to wait for the entire crawl job to complete (None for no timeout)
+         request_timeout: Timeout (in seconds) for each individual HTTP request, including pagination
+             requests when fetching results. If there are multiple pages, each page request gets this timeout
+
+     Returns:
+         CrawlJob when job completes
+
+     Raises:
+         ValueError: If request is invalid
+         Exception: If the crawl fails to start or complete
+         TimeoutError: If timeout is reached
+     """
+     # Start the crawl
+     crawl_job = start_crawl(client, request)
+     job_id = crawl_job.id
+
+     # Determine the per-request timeout. If not provided, reuse the overall timeout value.
+     effective_request_timeout = request_timeout if request_timeout is not None else timeout
+
+     # Wait for completion
+     return wait_for_crawl_completion(
+         client,
+         job_id,
+         poll_interval,
+         timeout,
+         request_timeout=effective_request_timeout,
+     )
+
+
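A sketch of the blocking flow, again under the assumed client setup: crawl() starts the job, then polls get_crawl_status until a terminal status ("completed", "failed", or "cancelled") comes back or the overall timeout expires. The URL, key, and timeout values are illustrative:

    from firecrawl.v2.methods.crawl import crawl
    from firecrawl.v2.types import CrawlRequest
    from firecrawl.v2.utils import HttpClient

    client = HttpClient(api_key="fc-YOUR-KEY", api_url="https://api.firecrawl.dev")  # assumed signature

    job = crawl(
        client,
        CrawlRequest(url="https://example.com", limit=10),
        poll_interval=2,       # seconds between status polls
        timeout=300,           # give up after 5 minutes and raise TimeoutError
        request_timeout=30.0,  # per-HTTP-request timeout, reused for pagination requests
    )
    print(job.status, job.credits_used, len(job.data))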
+ def crawl_params_preview(client: HttpClient, request: CrawlParamsRequest) -> CrawlParamsData:
+     """
+     Get crawl parameters from LLM based on URL and prompt.
+
+     Args:
+         client: HTTP client instance
+         request: CrawlParamsRequest containing URL and prompt
+
+     Returns:
+         CrawlParamsData containing suggested crawl options
+
+     Raises:
+         ValueError: If request is invalid
+         Exception: If the operation fails
+     """
+     # Validate request
+     if not request.url or not request.url.strip():
+         raise ValueError("URL cannot be empty")
+
+     if not request.prompt or not request.prompt.strip():
+         raise ValueError("Prompt cannot be empty")
+
+     # Prepare request data
+     request_data = {
+         "url": request.url,
+         "prompt": request.prompt
+     }
+
+     # Make the API request
+     response = client.post("/v2/crawl/params-preview", request_data)
+
+     # Handle errors
+     if not response.ok:
+         handle_response_error(response, "crawl params preview")
+
+     # Parse response
+     response_data = response.json()
+
+     if response_data.get("success"):
+         params_data = response_data.get("data", {})
+
+         # Convert camelCase to snake_case for CrawlParamsData
+         converted_params = {}
+         field_mappings = {
+             "includePaths": "include_paths",
+             "excludePaths": "exclude_paths",
+             "maxDiscoveryDepth": "max_discovery_depth",
+             "sitemap": "sitemap",
+             "ignoreQueryParameters": "ignore_query_parameters",
+             "crawlEntireDomain": "crawl_entire_domain",
+             "allowExternalLinks": "allow_external_links",
+             "allowSubdomains": "allow_subdomains",
+             "maxConcurrency": "max_concurrency",
+             "scrapeOptions": "scrape_options",
+             "zeroDataRetention": "zero_data_retention"
+         }
+
+         # Handle webhook conversion
+         if "webhook" in params_data:
+             webhook_data = params_data["webhook"]
+             if isinstance(webhook_data, dict):
+                 converted_params["webhook"] = WebhookConfig(**webhook_data)
+             else:
+                 converted_params["webhook"] = webhook_data
+
+         for camel_case, snake_case in field_mappings.items():
+             if camel_case in params_data:
+                 if camel_case == "scrapeOptions" and params_data[camel_case] is not None:
+                     # Handle nested scrapeOptions conversion
+                     scrape_opts_data = params_data[camel_case]
+                     converted_scrape_opts = {}
+                     scrape_field_mappings = {
+                         "includeTags": "include_tags",
+                         "excludeTags": "exclude_tags",
+                         "onlyMainContent": "only_main_content",
+                         "waitFor": "wait_for",
+                         "skipTlsVerification": "skip_tls_verification",
+                         "removeBase64Images": "remove_base64_images"
+                     }
+
+                     for scrape_camel, scrape_snake in scrape_field_mappings.items():
+                         if scrape_camel in scrape_opts_data:
+                             converted_scrape_opts[scrape_snake] = scrape_opts_data[scrape_camel]
+
+                     # Handle formats field - if it's a list, convert to ScrapeFormats
+                     if "formats" in scrape_opts_data:
+                         formats_data = scrape_opts_data["formats"]
+                         if isinstance(formats_data, list):
+                             # Convert list to ScrapeFormats object
+                             from ..types import ScrapeFormats
+                             converted_scrape_opts["formats"] = ScrapeFormats(formats=formats_data)
+                         else:
+                             converted_scrape_opts["formats"] = formats_data
+
+                     # Add fields that don't need conversion
+                     for key, value in scrape_opts_data.items():
+                         if key not in scrape_field_mappings and key != "formats":
+                             converted_scrape_opts[key] = value
+
+                     converted_params[snake_case] = converted_scrape_opts
+                 else:
+                     converted_params[snake_case] = params_data[camel_case]
+
+         # Add fields that don't need conversion
+         for key, value in params_data.items():
+             if key not in field_mappings:
+                 converted_params[key] = value
+
+         # Add warning if present
+         if "warning" in response_data:
+             converted_params["warning"] = response_data["warning"]
+
+         return CrawlParamsData(**converted_params)
+     else:
+         raise Exception(response_data.get("error", "Unknown error occurred"))
+
+
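A usage sketch for the params preview, with the same assumed client construction; the attributes printed at the end (include_paths, exclude_paths, warning) are among the snake_case fields produced by the mapping above, and the URL and prompt are placeholders:

    from firecrawl.v2.methods.crawl import crawl_params_preview
    from firecrawl.v2.types import CrawlParamsRequest
    from firecrawl.v2.utils import HttpClient

    client = HttpClient(api_key="fc-YOUR-KEY", api_url="https://api.firecrawl.dev")  # assumed signature

    params = crawl_params_preview(
        client,
        CrawlParamsRequest(url="https://example.com", prompt="Only crawl the documentation pages"),
    )
    print(params.include_paths, params.exclude_paths, params.warning)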
+ def get_crawl_errors(http_client: HttpClient, crawl_id: str) -> CrawlErrorsResponse:
+     """
+     Get errors from a crawl job.
+
+     Args:
+         http_client: HTTP client for making requests
+         crawl_id: The ID of the crawl job
+
+     Returns:
+         CrawlErrorsResponse containing errors and robots blocked URLs
+
+     Raises:
+         Exception: If the request fails
+     """
+     response = http_client.get(f"/v2/crawl/{crawl_id}/errors")
+
+     if not response.ok:
+         handle_response_error(response, "check crawl errors")
+
+     try:
+         body = response.json()
+         payload = body.get("data", body)
+         # Manual key normalization since we avoid Pydantic aliases
+         normalized = {
+             "errors": payload.get("errors", []),
+             "robots_blocked": payload.get("robotsBlocked", payload.get("robots_blocked", [])),
+         }
+         return CrawlErrorsResponse(**normalized)
+     except Exception as e:
+         raise Exception(f"Failed to parse crawl errors response: {e}")
+
+
+ def get_active_crawls(client: HttpClient) -> ActiveCrawlsResponse:
+     """
+     Get a list of currently active crawl jobs.
+
+     Args:
+         client: HTTP client instance
+
+     Returns:
+         ActiveCrawlsResponse containing a list of active crawl jobs
+
+     Raises:
+         Exception: If the request fails
+     """
+     response = client.get("/v2/crawl/active")
+
+     if not response.ok:
+         handle_response_error(response, "get active crawls")
+
+     body = response.json()
+     if not body.get("success"):
+         raise Exception(body.get("error", "Unknown error occurred"))
+
+     crawls_in = body.get("crawls", [])
+     normalized_crawls = []
+     for c in crawls_in:
+         if isinstance(c, dict):
+             normalized_crawls.append({
+                 "id": c.get("id"),
+                 "team_id": c.get("teamId", c.get("team_id")),
+                 "url": c.get("url"),
+                 "options": c.get("options"),
+             })
+     return ActiveCrawlsResponse(success=True, crawls=[ActiveCrawl(**nc) for nc in normalized_crawls])
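Finally, a sketch of the two inspection helpers under the same assumptions; errors, robots_blocked, and the per-crawl id/url fields are the keys normalized above, and the job id is a placeholder:

    from firecrawl.v2.methods.crawl import get_crawl_errors, get_active_crawls
    from firecrawl.v2.utils import HttpClient

    client = HttpClient(api_key="fc-YOUR-KEY", api_url="https://api.firecrawl.dev")  # assumed signature

    report = get_crawl_errors(client, "your-crawl-job-id")   # placeholder job id
    print(report.errors, report.robots_blocked)

    active = get_active_crawls(client)
    for c in active.crawls:
        print(c.id, c.url)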