firecrawl 4.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. firecrawl/__init__.py +87 -0
  2. firecrawl/__tests__/e2e/v2/aio/conftest.py +62 -0
  3. firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +69 -0
  4. firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
  5. firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +39 -0
  6. firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +41 -0
  7. firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +138 -0
  8. firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +249 -0
  9. firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +42 -0
  10. firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
  11. firecrawl/__tests__/e2e/v2/conftest.py +73 -0
  12. firecrawl/__tests__/e2e/v2/test_async.py +73 -0
  13. firecrawl/__tests__/e2e/v2/test_batch_scrape.py +106 -0
  14. firecrawl/__tests__/e2e/v2/test_crawl.py +278 -0
  15. firecrawl/__tests__/e2e/v2/test_extract.py +55 -0
  16. firecrawl/__tests__/e2e/v2/test_map.py +61 -0
  17. firecrawl/__tests__/e2e/v2/test_scrape.py +191 -0
  18. firecrawl/__tests__/e2e/v2/test_search.py +270 -0
  19. firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
  20. firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
  21. firecrawl/__tests__/unit/test_recursive_schema_v1.py +1209 -0
  22. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
  23. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +79 -0
  24. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
  25. firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +20 -0
  26. firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
  27. firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +64 -0
  28. firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
  29. firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
  30. firecrawl/__tests__/unit/v2/methods/test_agent.py +367 -0
  31. firecrawl/__tests__/unit/v2/methods/test_agent_request_preparation.py +226 -0
  32. firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
  33. firecrawl/__tests__/unit/v2/methods/test_branding.py +214 -0
  34. firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
  35. firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
  36. firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
  37. firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +54 -0
  38. firecrawl/__tests__/unit/v2/methods/test_pagination.py +671 -0
  39. firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +109 -0
  40. firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +169 -0
  41. firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
  42. firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
  43. firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
  44. firecrawl/__tests__/unit/v2/utils/test_metadata_extras.py +94 -0
  45. firecrawl/__tests__/unit/v2/utils/test_metadata_extras_multivalue.py +22 -0
  46. firecrawl/__tests__/unit/v2/utils/test_recursive_schema.py +1133 -0
  47. firecrawl/__tests__/unit/v2/utils/test_validation.py +311 -0
  48. firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
  49. firecrawl/client.py +281 -0
  50. firecrawl/firecrawl.backup.py +4635 -0
  51. firecrawl/types.py +167 -0
  52. firecrawl/v1/__init__.py +14 -0
  53. firecrawl/v1/client.py +5164 -0
  54. firecrawl/v2/__init__.py +4 -0
  55. firecrawl/v2/client.py +967 -0
  56. firecrawl/v2/client_async.py +408 -0
  57. firecrawl/v2/methods/agent.py +144 -0
  58. firecrawl/v2/methods/aio/__init__.py +1 -0
  59. firecrawl/v2/methods/aio/agent.py +137 -0
  60. firecrawl/v2/methods/aio/batch.py +188 -0
  61. firecrawl/v2/methods/aio/crawl.py +351 -0
  62. firecrawl/v2/methods/aio/extract.py +133 -0
  63. firecrawl/v2/methods/aio/map.py +65 -0
  64. firecrawl/v2/methods/aio/scrape.py +33 -0
  65. firecrawl/v2/methods/aio/search.py +176 -0
  66. firecrawl/v2/methods/aio/usage.py +89 -0
  67. firecrawl/v2/methods/batch.py +499 -0
  68. firecrawl/v2/methods/crawl.py +592 -0
  69. firecrawl/v2/methods/extract.py +161 -0
  70. firecrawl/v2/methods/map.py +83 -0
  71. firecrawl/v2/methods/scrape.py +64 -0
  72. firecrawl/v2/methods/search.py +215 -0
  73. firecrawl/v2/methods/usage.py +84 -0
  74. firecrawl/v2/types.py +1143 -0
  75. firecrawl/v2/utils/__init__.py +9 -0
  76. firecrawl/v2/utils/error_handler.py +107 -0
  77. firecrawl/v2/utils/get_version.py +15 -0
  78. firecrawl/v2/utils/http_client.py +178 -0
  79. firecrawl/v2/utils/http_client_async.py +69 -0
  80. firecrawl/v2/utils/normalize.py +125 -0
  81. firecrawl/v2/utils/validation.py +692 -0
  82. firecrawl/v2/watcher.py +301 -0
  83. firecrawl/v2/watcher_async.py +243 -0
  84. firecrawl-4.12.0.dist-info/METADATA +234 -0
  85. firecrawl-4.12.0.dist-info/RECORD +92 -0
  86. firecrawl-4.12.0.dist-info/WHEEL +5 -0
  87. firecrawl-4.12.0.dist-info/licenses/LICENSE +21 -0
  88. firecrawl-4.12.0.dist-info/top_level.txt +2 -0
  89. tests/test_agent_integration.py +277 -0
  90. tests/test_api_key_handling.py +44 -0
  91. tests/test_change_tracking.py +98 -0
  92. tests/test_timeout_conversion.py +117 -0
@@ -0,0 +1,692 @@
1
+ """
2
+ Shared validation functions for Firecrawl v2 API.
3
+ """
4
+
5
+ from typing import Optional, Dict, Any, List
6
+ from ..types import ScrapeOptions, ScrapeFormats
7
+
8
+
9
+ def _convert_format_string(format_str: str) -> str:
10
+ """
11
+ Convert format string from snake_case to camelCase.
12
+
13
+ Args:
14
+ format_str: Format string in snake_case
15
+
16
+ Returns:
17
+ Format string in camelCase
18
+ """
19
+ format_mapping = {
20
+ "raw_html": "rawHtml",
21
+ "change_tracking": "changeTracking",
22
+ "screenshot_full_page": "screenshot@fullPage"
23
+ }
24
+ return format_mapping.get(format_str, format_str)
25
+
26
+
27
def normalize_schema_for_openai(schema: Any) -> Any:
    """
    Normalize a schema for OpenAI compatibility by handling recursive references.

    Three normalizations are applied while walking the schema tree:
    ``$defs`` entries are normalized in place, a redundant
    ``additionalProperties: true`` is dropped from objects that already
    declare ``properties``, and ``required`` lists are filtered down to
    fields that actually exist in ``properties``.

    Args:
        schema: Schema to normalize

    Returns:
        Normalized schema
    """
    # Non-dict (or falsy) inputs are returned untouched.
    if not schema or not isinstance(schema, dict):
        return schema

    # Tracks id()s of dicts on the current recursion path to break cycles
    # caused by shared/self-referential structures.
    visited = set()

    def normalize_object(obj: Any) -> Any:
        if not isinstance(obj, dict):
            if isinstance(obj, list):
                return [normalize_object(item) for item in obj]
            return obj

        obj_id = id(obj)
        if obj_id in visited:
            # Already being processed higher up the stack — return as-is.
            return obj
        visited.add(obj_id)

        # Work on a shallow copy so the caller's schema is not mutated.
        normalized = dict(obj)

        # Handle $ref recursion: reference nodes are preserved verbatim.
        if "$ref" in normalized:
            visited.discard(obj_id)
            return normalized

        if "$defs" in normalized:
            defs = normalized.pop("$defs")
            processed_rest = {}

            # Normalize sibling keys, but leave $ref-bearing dicts alone.
            # NOTE(review): list values are intentionally(?) not descended
            # into on this branch, unlike the generic loop below — confirm.
            for key, value in normalized.items():
                if isinstance(value, dict) and "$ref" not in value:
                    processed_rest[key] = normalize_object(value)
                else:
                    processed_rest[key] = value

            # Each definition is normalized independently.
            normalized_defs = {}
            for key, value in defs.items():
                normalized_defs[key] = normalize_object(value)

            result = {**processed_rest, "$defs": normalized_defs}
            visited.discard(obj_id)
            return result

        # An object with declared properties does not need
        # additionalProperties: true — drop it for OpenAI compatibility.
        if (normalized.get("type") == "object" and
            "properties" in normalized and
            normalized.get("additionalProperties") is True):
            del normalized["additionalProperties"]

        # Keep only 'required' entries that name an existing property;
        # delete the key entirely when nothing (or a malformed value) remains.
        if (normalized.get("type") == "object" and
            "required" in normalized and
            "properties" in normalized):
            if (isinstance(normalized["required"], list) and
                isinstance(normalized["properties"], dict)):
                valid_required = [field for field in normalized["required"]
                                  if field in normalized["properties"]]
                if valid_required:
                    normalized["required"] = valid_required
                else:
                    del normalized["required"]
            else:
                del normalized["required"]

        # Recurse into remaining children; $ref-bearing dicts stay verbatim,
        # and list elements are normalized only when they are dicts.
        for key, value in list(normalized.items()):
            if isinstance(value, dict) and "$ref" not in value:
                normalized[key] = normalize_object(value)
            elif isinstance(value, list):
                normalized[key] = [normalize_object(item) if isinstance(item, dict) else item for item in value]

        visited.discard(obj_id)
        return normalized

    return normalize_object(schema)
107
+
108
+
109
def validate_schema_for_openai(schema: Any) -> bool:
    """
    Validate schema for OpenAI compatibility.

    A schema is rejected when any node is a schema-less dictionary: an
    object type with ``additionalProperties: true`` but neither
    ``properties`` nor ``patternProperties``. ``$ref`` nodes are trusted
    and never descended into.

    Args:
        schema: Schema to validate

    Returns:
        True if schema is valid, False otherwise
    """
    if not schema or not isinstance(schema, dict):
        return True

    # id()s of dicts on the current recursion path, to short-circuit cycles.
    seen_ids = set()

    def _is_invalid(node: Any) -> bool:
        if not isinstance(node, dict):
            return False

        node_id = id(node)
        if node_id in seen_ids:
            return False
        seen_ids.add(node_id)

        try:
            # Reference nodes are considered valid as-is.
            if "$ref" in node:
                return False

            schemaless_dict = (
                node.get("type") == "object"
                and "properties" not in node
                and "patternProperties" not in node
                and node.get("additionalProperties") is True
            )
            if schemaless_dict:
                return True

            for child in node.values():
                if isinstance(child, dict):
                    if "$ref" not in child and _is_invalid(child):
                        return True
                elif isinstance(child, list):
                    for element in child:
                        if (isinstance(element, dict)
                                and "$ref" not in element
                                and _is_invalid(element)):
                            return True
            return False
        finally:
            seen_ids.discard(node_id)

    return not _is_invalid(schema)
160
+
161
+
162
# User-facing error raised when a schema contains a "schema-less dictionary"
# (an object with additionalProperties: true and no declared properties),
# which OpenAI's structured output does not accept.
OPENAI_SCHEMA_ERROR_MESSAGE = (
    "Schema contains invalid structure for OpenAI: object type with no 'properties' defined "
    "but 'additionalProperties: true' (schema-less dictionary not supported by OpenAI). "
    "Please define specific properties for your object. Note: Recursive schemas using '$ref' are supported."
)
167
+
168
+
169
+ def _contains_recursive_ref(obj: Any, target_def_name: str, defs: Dict[str, Any], visited: Optional[set] = None) -> bool:
170
+ """
171
+ Check if an object contains a recursive reference to a specific definition.
172
+
173
+ Args:
174
+ obj: Object to check
175
+ target_def_name: Name of the definition to check for recursion
176
+ defs: Dictionary of definitions
177
+ visited: Set of visited object keys to detect cycles
178
+
179
+ Returns:
180
+ True if recursive reference is found, False otherwise
181
+ """
182
+ if not obj or not isinstance(obj, (dict, list)):
183
+ return False
184
+
185
+ if visited is None:
186
+ visited = set()
187
+
188
+ import json
189
+ obj_key = json.dumps(obj, sort_keys=True, default=str)
190
+ if obj_key in visited:
191
+ return False
192
+ visited.add(obj_key)
193
+
194
+ try:
195
+ if isinstance(obj, dict):
196
+ if "$ref" in obj and isinstance(obj["$ref"], str):
197
+ ref_path = obj["$ref"].split("/")
198
+ if len(ref_path) >= 3 and ref_path[0] == "#" and ref_path[1] == "$defs":
199
+ def_name = ref_path[-1]
200
+ if def_name == target_def_name:
201
+ return True
202
+ if def_name in defs:
203
+ return _contains_recursive_ref(defs[def_name], target_def_name, defs, visited)
204
+
205
+ for value in obj.values():
206
+ if _contains_recursive_ref(value, target_def_name, defs, visited):
207
+ return True
208
+
209
+ elif isinstance(obj, list):
210
+ for item in obj:
211
+ if _contains_recursive_ref(item, target_def_name, defs, visited):
212
+ return True
213
+
214
+ finally:
215
+ visited.discard(obj_key)
216
+
217
+ return False
218
+
219
+
220
def _check_for_circular_defs(defs: Dict[str, Any]) -> bool:
    """
    Check if $defs contain circular references.

    A definition is circular when it references itself, directly or
    through a chain of other definitions.

    Args:
        defs: Dictionary of definitions to check

    Returns:
        True if circular references are found, False otherwise
    """
    if not defs:
        return False

    return any(
        _contains_recursive_ref(definition, name, defs)
        for name, definition in defs.items()
    )
238
+
239
+
240
def resolve_refs(obj: Any, defs: Dict[str, Any], visited: Optional[set] = None, depth: int = 0) -> Any:
    """
    Resolve $ref references in a JSON schema object.

    ``#/$defs/...`` references are replaced by (shallow copies of) their
    definitions, recursively, up to a fixed depth cap of 10 to keep
    genuinely recursive schemas from expanding forever. ``$defs`` keys are
    dropped from the output; unresolvable refs are left in place.

    Args:
        obj: Object to resolve references in
        defs: Dictionary of definitions
        visited: Set to track visited objects and prevent infinite recursion
        depth: Current recursion depth

    Returns:
        Object with resolved references
    """
    if not obj or not isinstance(obj, (dict, list)) or depth > 10:
        return obj

    if visited is None:
        visited = set()

    marker = id(obj)
    if marker in visited:
        return obj
    visited.add(marker)

    try:
        if isinstance(obj, list):
            return [resolve_refs(element, defs, visited, depth + 1) for element in obj]

        ref = obj.get("$ref")
        if isinstance(ref, str):
            parts = ref.split("/")
            if (len(parts) >= 3 and parts[0] == "#" and parts[1] == "$defs"
                    and parts[-1] in defs):
                # Inline a copy of the definition and keep resolving inside it.
                return resolve_refs(dict(defs[parts[-1]]), defs, visited, depth + 1)
            # Unknown or external reference — preserve verbatim.
            return obj

        # Plain dict: resolve children, omitting the $defs table itself.
        return {
            key: resolve_refs(value, defs, visited, depth + 1)
            for key, value in obj.items()
            if key != "$defs"
        }
    finally:
        visited.discard(marker)
289
+
290
+
291
def detect_recursive_schema(schema: Any) -> bool:
    """
    Detect if a schema contains recursive references.

    Note: this is a cheap textual heuristic — it flags ANY use of
    ``$ref``, ``$defs`` or ``definitions`` machinery, not only schemas
    that are truly self-referential.

    Args:
        schema: Schema to analyze

    Returns:
        True if schema has recursive patterns, False otherwise
    """
    if not schema or not isinstance(schema, dict):
        return False

    import json

    serialized = json.dumps(schema)
    # Scan the serialized form for reference markers anywhere in the tree.
    for marker in ('"$ref"', "#/$defs/", "#/definitions/"):
        if marker in serialized:
            return True

    # Also treat the mere presence of a definitions table as "recursive".
    return bool(schema.get("$defs") or schema.get("definitions"))
314
+
315
+
316
def select_model_for_schema(schema: Any = None) -> Dict[str, str]:
    """
    Select appropriate model based on schema complexity.

    Schemas using reference machinery get routed to the larger model;
    simple (or absent) schemas use the mini model.

    Args:
        schema: Schema to analyze

    Returns:
        Dict with modelName and reason
    """
    model, reason = "gpt-4o-mini", "no_schema"
    if schema:
        if detect_recursive_schema(schema):
            model, reason = "gpt-4o", "recursive_schema_detected"
        else:
            reason = "simple_schema"
    return {"modelName": model, "reason": reason}
333
+
334
+
335
+ def _normalize_schema(schema: Any) -> Optional[Dict[str, Any]]:
336
+ """
337
+ Normalize a schema object which may be a dict, Pydantic BaseModel subclass,
338
+ or a Pydantic model instance into a plain dict.
339
+ """
340
+ try:
341
+ # Pydantic v2 BaseModel subclass: has "model_json_schema"
342
+ if hasattr(schema, "model_json_schema") and callable(schema.model_json_schema):
343
+ return schema.model_json_schema()
344
+ # Pydantic v2 BaseModel instance: has "model_dump" or "model_json_schema"
345
+ if hasattr(schema, "model_dump") and callable(schema.model_dump):
346
+ # Try to get JSON schema if available on the class
347
+ mjs = getattr(schema.__class__, "model_json_schema", None)
348
+ if callable(mjs):
349
+ return schema.__class__.model_json_schema()
350
+ # Fallback to data shape (not ideal, but better than dropping)
351
+ return schema.model_dump()
352
+ # Pydantic v1 BaseModel subclass: has "schema"
353
+ if hasattr(schema, "schema") and callable(schema.schema):
354
+ return schema.schema()
355
+ # Pydantic v1 BaseModel instance
356
+ if hasattr(schema, "dict") and callable(schema.dict):
357
+ # Prefer class-level schema if present
358
+ sch = getattr(schema.__class__, "schema", None)
359
+ if callable(sch):
360
+ return schema.__class__.schema()
361
+ return schema.dict()
362
+ except Exception:
363
+ pass
364
+ # Already a dict or unsupported type
365
+ return schema if isinstance(schema, dict) else None
366
+
367
+
368
def _validate_json_format(format_obj: Any) -> Dict[str, Any]:
    """
    Validate and prepare json format object.

    Normalizes a Pydantic-or-dict schema, inlines ``$ref``/``$defs`` where
    possible (mirroring the TypeScript SDK), then applies OpenAI-specific
    normalization and validation before returning the prepared format.

    Args:
        format_obj: Format object that should be json type

    Returns:
        Validated json format dict

    Raises:
        ValueError: If json format is missing required fields, or if the
            schema is incompatible with OpenAI structured output
    """
    if not isinstance(format_obj, dict):
        raise ValueError("json format must be an object with 'type', 'prompt', and 'schema' fields")

    if format_obj.get('type') != 'json':
        raise ValueError("json format must have type='json'")

    # prompt is optional in v2; only normalize when present
    # schema is recommended; if provided, normalize Pydantic forms
    schema = format_obj.get('schema')
    normalized = dict(format_obj)
    if schema is not None:
        normalized_schema = _normalize_schema(schema)
        if normalized_schema is not None:
            # Handle schema reference resolution similar to TypeScript implementation
            if isinstance(normalized_schema, dict):
                defs = normalized_schema.get("$defs", {})
                import json
                schema_string = json.dumps(normalized_schema)
                # Textual probe: does the schema use $defs or $ref anywhere?
                has_any_refs = (
                    normalized_schema.get("$defs") or
                    '"$ref"' in schema_string or
                    "#/$defs/" in schema_string
                )

                if has_any_refs:
                    try:
                        resolved_schema = resolve_refs(normalized_schema, defs)
                        resolved_string = json.dumps(resolved_schema)
                        has_remaining_refs = '"$ref"' in resolved_string or "#/$defs/" in resolved_string

                        # Only adopt the resolved form when ALL refs inlined;
                        # a truly recursive schema keeps its original shape.
                        if not has_remaining_refs:
                            normalized_schema = resolved_schema
                            # Remove $defs after successful resolution
                            if isinstance(normalized_schema, dict) and "$defs" in normalized_schema:
                                del normalized_schema["$defs"]
                        # If refs remain, preserve original schema
                    except Exception:
                        # Failed to resolve refs, preserve original schema
                        pass
                else:
                    # No recursive references detected, resolve refs anyway
                    try:
                        normalized_schema = resolve_refs(normalized_schema, defs)
                        if isinstance(normalized_schema, dict) and "$defs" in normalized_schema:
                            del normalized_schema["$defs"]
                    except Exception:
                        pass

            # Apply OpenAI normalization and validation
            openai_normalized_schema = normalize_schema_for_openai(normalized_schema)
            if not validate_schema_for_openai(openai_normalized_schema):
                raise ValueError(OPENAI_SCHEMA_ERROR_MESSAGE)

            normalized['schema'] = openai_normalized_schema
    return normalized
436
+
437
+
438
def validate_scrape_options(options: Optional[ScrapeOptions]) -> Optional[ScrapeOptions]:
    """
    Validate and normalize scrape options.

    Args:
        options: Scraping options to validate

    Returns:
        Validated options or None

    Raises:
        ValueError: If options are invalid
    """
    if options is None:
        return None

    # Zero or negative timeouts are meaningless for a scrape request.
    timeout = options.timeout
    if timeout is not None and timeout <= 0:
        raise ValueError("Timeout must be positive")

    # wait_for of 0 is allowed (no extra wait), negatives are not.
    wait_for = options.wait_for
    if wait_for is not None and wait_for < 0:
        raise ValueError("wait_for must be non-negative")

    return options
463
+
464
+
465
def prepare_scrape_options(options: Optional[ScrapeOptions]) -> Optional[Dict[str, Any]]:
    """
    Prepare ScrapeOptions for API submission with manual snake_case to camelCase conversion.

    Validates the options, fills in server defaults for unset flags, renames
    snake_case fields to their camelCase wire names, and deeply converts the
    composite fields (formats, actions, parsers, location) into plain dicts.

    Args:
        options: ScrapeOptions to prepare

    Returns:
        Dictionary ready for API submission or None if options is None

    Raises:
        ValueError: If options fail validation, or a bare "json" format
            string is supplied instead of a full json format object
    """
    if options is None:
        return None

    # Validate options first
    validated_options = validate_scrape_options(options)
    if validated_options is None:
        return None

    # Apply default values for None fields
    default_values = {
        "only_main_content": True,
        "mobile": False,
        "skip_tls_verification": True,
        "remove_base64_images": True,
        "fast_mode": False,
        "block_ads": True,
        "max_age": 14400000,
        "store_in_cache": True
    }

    # Convert to dict and handle manual snake_case to camelCase conversion
    options_data = validated_options.model_dump(exclude_none=True)

    # Apply defaults for None fields
    # (exclude_none=True above drops unset fields, so absence == unset here)
    for field, default_value in default_values.items():
        if field not in options_data:
            options_data[field] = default_value

    scrape_data = {}

    # Manual field mapping for snake_case to camelCase conversion
    field_mappings = {
        "include_tags": "includeTags",
        "exclude_tags": "excludeTags",
        "only_main_content": "onlyMainContent",
        "wait_for": "waitFor",
        "skip_tls_verification": "skipTlsVerification",
        "remove_base64_images": "removeBase64Images",
        "fast_mode": "fastMode",
        "use_mock": "useMock",
        "block_ads": "blockAds",
        "store_in_cache": "storeInCache",
        "max_age": "maxAge"
    }

    # Apply field mappings (pop so the loop below only sees leftovers)
    for snake_case, camel_case in field_mappings.items():
        if snake_case in options_data:
            scrape_data[camel_case] = options_data.pop(snake_case)

    # Handle special cases
    for key, value in options_data.items():
        if value is not None:
            if key == "integration":
                # Blank/whitespace-only integration strings become None.
                scrape_data["integration"] = (str(value).strip() or None)
                continue
            if key == "formats":
                # Handle formats conversion
                converted_formats: List[Any] = []

                # Prefer using original object to detect ScrapeFormats vs list
                original_formats = getattr(options, 'formats', None)

                if isinstance(original_formats, ScrapeFormats):
                    # Include explicit list first
                    if original_formats.formats:
                        for fmt in original_formats.formats:
                            if isinstance(fmt, str):
                                if fmt == "json":
                                    raise ValueError("json format must be an object with 'type', 'prompt', and 'schema' fields")
                                converted_formats.append(_convert_format_string(fmt))
                            elif isinstance(fmt, dict):
                                fmt_type = _convert_format_string(fmt.get('type')) if fmt.get('type') else None
                                if fmt_type == 'json':
                                    validated_json = _validate_json_format({**fmt, 'type': 'json'})
                                    converted_formats.append(validated_json)
                                elif fmt_type == 'screenshot':
                                    # Normalize screenshot options
                                    normalized = {**fmt, 'type': 'screenshot'}
                                    if 'full_page' in normalized:
                                        normalized['fullPage'] = normalized.pop('full_page')
                                    # Normalize viewport if it's a model instance
                                    vp = normalized.get('viewport')
                                    if hasattr(vp, 'model_dump'):
                                        normalized['viewport'] = vp.model_dump(exclude_none=True)
                                    converted_formats.append(normalized)
                                else:
                                    # Other dict formats: just camelCase the type.
                                    if 'type' in fmt:
                                        fmt['type'] = fmt_type or fmt['type']
                                    converted_formats.append(fmt)
                            elif hasattr(fmt, 'type'):
                                # Pydantic-style format objects.
                                if fmt.type == 'json':
                                    converted_formats.append(_validate_json_format(fmt.model_dump()))
                                else:
                                    converted_formats.append(_convert_format_string(fmt.type))
                            else:
                                converted_formats.append(fmt)

                    # Add booleans from ScrapeFormats
                    if original_formats.markdown:
                        converted_formats.append("markdown")
                    if original_formats.html:
                        converted_formats.append("html")
                    if original_formats.raw_html:
                        converted_formats.append("rawHtml")
                    if original_formats.summary:
                        converted_formats.append("summary")
                    if original_formats.links:
                        converted_formats.append("links")
                    if original_formats.screenshot:
                        converted_formats.append("screenshot")
                    if original_formats.change_tracking:
                        converted_formats.append("changeTracking")
                    # Note: We intentionally do not auto-include 'json' when boolean is set,
                    # because JSON requires an object with schema/prompt. The caller must
                    # supply the full json format object explicitly.
                elif isinstance(original_formats, list):
                    # Plain-list variant: same per-item rules as above.
                    for fmt in original_formats:
                        if isinstance(fmt, str):
                            if fmt == "json":
                                raise ValueError("json format must be an object with 'type', 'prompt', and 'schema' fields")
                            converted_formats.append(_convert_format_string(fmt))
                        elif isinstance(fmt, dict):
                            fmt_type = _convert_format_string(fmt.get('type')) if fmt.get('type') else None
                            if fmt_type == 'json':
                                validated_json = _validate_json_format({**fmt, 'type': 'json'})
                                converted_formats.append(validated_json)
                            elif fmt_type == 'screenshot':
                                normalized = {**fmt, 'type': 'screenshot'}
                                if 'full_page' in normalized:
                                    normalized['fullPage'] = normalized.pop('full_page')
                                vp = normalized.get('viewport')
                                if hasattr(vp, 'model_dump'):
                                    normalized['viewport'] = vp.model_dump(exclude_none=True)
                                converted_formats.append(normalized)
                            else:
                                if 'type' in fmt:
                                    fmt['type'] = fmt_type or fmt['type']
                                converted_formats.append(fmt)
                        elif hasattr(fmt, 'type'):
                            if fmt.type == 'json':
                                converted_formats.append(_validate_json_format(fmt.model_dump()))
                            elif fmt.type == 'screenshot':
                                # Object form: pick off only the fields the API knows.
                                normalized = {'type': 'screenshot'}
                                if getattr(fmt, 'full_page', None) is not None:
                                    normalized['fullPage'] = fmt.full_page
                                if getattr(fmt, 'quality', None) is not None:
                                    normalized['quality'] = fmt.quality
                                vp = getattr(fmt, 'viewport', None)
                                if vp is not None:
                                    normalized['viewport'] = vp.model_dump(exclude_none=True) if hasattr(vp, 'model_dump') else vp
                                converted_formats.append(normalized)
                            else:
                                converted_formats.append(_convert_format_string(fmt.type))
                        else:
                            converted_formats.append(fmt)
                else:
                    # Fallback: try to iterate over value if it's a list-like
                    try:
                        for fmt in value:
                            converted_formats.append(fmt)
                    except TypeError:
                        pass

                if converted_formats:
                    scrape_data["formats"] = converted_formats
            elif key == "actions":
                # Handle actions conversion
                converted_actions = []
                for action in value:
                    if isinstance(action, dict):
                        # Convert action dict
                        converted_action = {}
                        for action_key, action_value in action.items():
                            if action_key == "full_page":
                                converted_action["fullPage"] = action_value
                            else:
                                converted_action[action_key] = action_value
                        converted_actions.append(converted_action)
                    else:
                        # Handle action objects
                        action_data = action.model_dump(exclude_none=True)
                        converted_action = {}
                        for action_key, action_value in action_data.items():
                            if action_key == "full_page":
                                converted_action["fullPage"] = action_value
                            else:
                                converted_action[action_key] = action_value
                        converted_actions.append(converted_action)
                scrape_data["actions"] = converted_actions
            elif key == "parsers":
                # Parsers may be strings, dicts, or model objects.
                converted_parsers = []
                for parser in value:
                    if isinstance(parser, str):
                        converted_parsers.append(parser)
                    elif isinstance(parser, dict):
                        parser_data = dict(parser)
                        if "max_pages" in parser_data:
                            parser_data["maxPages"] = parser_data.pop("max_pages")
                        converted_parsers.append(parser_data)
                    else:
                        parser_data = parser.model_dump(exclude_none=True)
                        # Convert snake_case to camelCase for API
                        if "max_pages" in parser_data:
                            parser_data["maxPages"] = parser_data.pop("max_pages")
                        converted_parsers.append(parser_data)
                scrape_data["parsers"] = converted_parsers
            elif key == "location":
                # Handle location conversion
                if isinstance(value, dict):
                    scrape_data["location"] = value
                else:
                    scrape_data["location"] = value.model_dump(exclude_none=True)
            else:
                # For fields that don't need conversion, use as-is
                scrape_data[key] = value

    return scrape_data