firecrawl-4.12.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- firecrawl/__init__.py +87 -0
- firecrawl/__tests__/e2e/v2/aio/conftest.py +62 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +69 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +39 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +41 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +138 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +249 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +42 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
- firecrawl/__tests__/e2e/v2/conftest.py +73 -0
- firecrawl/__tests__/e2e/v2/test_async.py +73 -0
- firecrawl/__tests__/e2e/v2/test_batch_scrape.py +106 -0
- firecrawl/__tests__/e2e/v2/test_crawl.py +278 -0
- firecrawl/__tests__/e2e/v2/test_extract.py +55 -0
- firecrawl/__tests__/e2e/v2/test_map.py +61 -0
- firecrawl/__tests__/e2e/v2/test_scrape.py +191 -0
- firecrawl/__tests__/e2e/v2/test_search.py +270 -0
- firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
- firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
- firecrawl/__tests__/unit/test_recursive_schema_v1.py +1209 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +79 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +20 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +64 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
- firecrawl/__tests__/unit/v2/methods/test_agent.py +367 -0
- firecrawl/__tests__/unit/v2/methods/test_agent_request_preparation.py +226 -0
- firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
- firecrawl/__tests__/unit/v2/methods/test_branding.py +214 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
- firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +54 -0
- firecrawl/__tests__/unit/v2/methods/test_pagination.py +671 -0
- firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +109 -0
- firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +169 -0
- firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
- firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
- firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
- firecrawl/__tests__/unit/v2/utils/test_metadata_extras.py +94 -0
- firecrawl/__tests__/unit/v2/utils/test_metadata_extras_multivalue.py +22 -0
- firecrawl/__tests__/unit/v2/utils/test_recursive_schema.py +1133 -0
- firecrawl/__tests__/unit/v2/utils/test_validation.py +311 -0
- firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
- firecrawl/client.py +281 -0
- firecrawl/firecrawl.backup.py +4635 -0
- firecrawl/types.py +167 -0
- firecrawl/v1/__init__.py +14 -0
- firecrawl/v1/client.py +5164 -0
- firecrawl/v2/__init__.py +4 -0
- firecrawl/v2/client.py +967 -0
- firecrawl/v2/client_async.py +408 -0
- firecrawl/v2/methods/agent.py +144 -0
- firecrawl/v2/methods/aio/__init__.py +1 -0
- firecrawl/v2/methods/aio/agent.py +137 -0
- firecrawl/v2/methods/aio/batch.py +188 -0
- firecrawl/v2/methods/aio/crawl.py +351 -0
- firecrawl/v2/methods/aio/extract.py +133 -0
- firecrawl/v2/methods/aio/map.py +65 -0
- firecrawl/v2/methods/aio/scrape.py +33 -0
- firecrawl/v2/methods/aio/search.py +176 -0
- firecrawl/v2/methods/aio/usage.py +89 -0
- firecrawl/v2/methods/batch.py +499 -0
- firecrawl/v2/methods/crawl.py +592 -0
- firecrawl/v2/methods/extract.py +161 -0
- firecrawl/v2/methods/map.py +83 -0
- firecrawl/v2/methods/scrape.py +64 -0
- firecrawl/v2/methods/search.py +215 -0
- firecrawl/v2/methods/usage.py +84 -0
- firecrawl/v2/types.py +1143 -0
- firecrawl/v2/utils/__init__.py +9 -0
- firecrawl/v2/utils/error_handler.py +107 -0
- firecrawl/v2/utils/get_version.py +15 -0
- firecrawl/v2/utils/http_client.py +178 -0
- firecrawl/v2/utils/http_client_async.py +69 -0
- firecrawl/v2/utils/normalize.py +125 -0
- firecrawl/v2/utils/validation.py +692 -0
- firecrawl/v2/watcher.py +301 -0
- firecrawl/v2/watcher_async.py +243 -0
- firecrawl-4.12.0.dist-info/METADATA +234 -0
- firecrawl-4.12.0.dist-info/RECORD +92 -0
- firecrawl-4.12.0.dist-info/WHEEL +5 -0
- firecrawl-4.12.0.dist-info/licenses/LICENSE +21 -0
- firecrawl-4.12.0.dist-info/top_level.txt +2 -0
- tests/test_agent_integration.py +277 -0
- tests/test_api_key_handling.py +44 -0
- tests/test_change_tracking.py +98 -0
- tests/test_timeout_conversion.py +117 -0
firecrawl/v2/utils/validation.py

@@ -0,0 +1,692 @@
"""
Shared validation functions for Firecrawl v2 API.
"""

from typing import Optional, Dict, Any, List
from ..types import ScrapeOptions, ScrapeFormats


def _convert_format_string(format_str: str) -> str:
    """
    Convert format string from snake_case to camelCase.

    Args:
        format_str: Format string in snake_case

    Returns:
        Format string in camelCase
    """
    format_mapping = {
        "raw_html": "rawHtml",
        "change_tracking": "changeTracking",
        "screenshot_full_page": "screenshot@fullPage"
    }
    return format_mapping.get(format_str, format_str)


def normalize_schema_for_openai(schema: Any) -> Any:
    """
    Normalize a schema for OpenAI compatibility by handling recursive references.

    Args:
        schema: Schema to normalize

    Returns:
        Normalized schema
    """
    if not schema or not isinstance(schema, dict):
        return schema

    visited = set()

    def normalize_object(obj: Any) -> Any:
        if not isinstance(obj, dict):
            if isinstance(obj, list):
                return [normalize_object(item) for item in obj]
            return obj

        obj_id = id(obj)
        if obj_id in visited:
            return obj
        visited.add(obj_id)

        normalized = dict(obj)

        # Handle $ref recursion
        if "$ref" in normalized:
            visited.discard(obj_id)
            return normalized

        if "$defs" in normalized:
            defs = normalized.pop("$defs")
            processed_rest = {}

            for key, value in normalized.items():
                if isinstance(value, dict) and "$ref" not in value:
                    processed_rest[key] = normalize_object(value)
                else:
                    processed_rest[key] = value

            normalized_defs = {}
            for key, value in defs.items():
                normalized_defs[key] = normalize_object(value)

            result = {**processed_rest, "$defs": normalized_defs}
            visited.discard(obj_id)
            return result

        if (normalized.get("type") == "object" and
            "properties" in normalized and
            normalized.get("additionalProperties") is True):
            del normalized["additionalProperties"]

        if (normalized.get("type") == "object" and
            "required" in normalized and
            "properties" in normalized):
            if (isinstance(normalized["required"], list) and
                isinstance(normalized["properties"], dict)):
                valid_required = [field for field in normalized["required"]
                                  if field in normalized["properties"]]
                if valid_required:
                    normalized["required"] = valid_required
                else:
                    del normalized["required"]
            else:
                del normalized["required"]

        for key, value in list(normalized.items()):
            if isinstance(value, dict) and "$ref" not in value:
                normalized[key] = normalize_object(value)
            elif isinstance(value, list):
                normalized[key] = [normalize_object(item) if isinstance(item, dict) else item for item in value]

        visited.discard(obj_id)
        return normalized

    return normalize_object(schema)


def validate_schema_for_openai(schema: Any) -> bool:
    """
    Validate schema for OpenAI compatibility.

    Args:
        schema: Schema to validate

    Returns:
        True if schema is valid, False otherwise
    """
    if not schema or not isinstance(schema, dict):
        return True

    visited = set()

    def has_invalid_structure(obj: Any) -> bool:
        if not isinstance(obj, dict):
            return False

        obj_id = id(obj)
        if obj_id in visited:
            return False
        visited.add(obj_id)

        if "$ref" in obj:
            visited.discard(obj_id)
            return False

        if (obj.get("type") == "object" and
            "properties" not in obj and
            "patternProperties" not in obj and
            obj.get("additionalProperties") is True):
            visited.discard(obj_id)
            return True

        for value in obj.values():
            if isinstance(value, dict) and "$ref" not in value:
                if has_invalid_structure(value):
                    visited.discard(obj_id)
                    return True
            elif isinstance(value, list):
                for item in value:
                    if isinstance(item, dict) and "$ref" not in item:
                        if has_invalid_structure(item):
                            visited.discard(obj_id)
                            return True

        visited.discard(obj_id)
        return False

    return not has_invalid_structure(schema)


OPENAI_SCHEMA_ERROR_MESSAGE = (
    "Schema contains invalid structure for OpenAI: object type with no 'properties' defined "
    "but 'additionalProperties: true' (schema-less dictionary not supported by OpenAI). "
    "Please define specific properties for your object. Note: Recursive schemas using '$ref' are supported."
)


def _contains_recursive_ref(obj: Any, target_def_name: str, defs: Dict[str, Any], visited: Optional[set] = None) -> bool:
    """
    Check if an object contains a recursive reference to a specific definition.

    Args:
        obj: Object to check
        target_def_name: Name of the definition to check for recursion
        defs: Dictionary of definitions
        visited: Set of visited object keys to detect cycles

    Returns:
        True if recursive reference is found, False otherwise
    """
    if not obj or not isinstance(obj, (dict, list)):
        return False

    if visited is None:
        visited = set()

    import json
    obj_key = json.dumps(obj, sort_keys=True, default=str)
    if obj_key in visited:
        return False
    visited.add(obj_key)

    try:
        if isinstance(obj, dict):
            if "$ref" in obj and isinstance(obj["$ref"], str):
                ref_path = obj["$ref"].split("/")
                if len(ref_path) >= 3 and ref_path[0] == "#" and ref_path[1] == "$defs":
                    def_name = ref_path[-1]
                    if def_name == target_def_name:
                        return True
                    if def_name in defs:
                        return _contains_recursive_ref(defs[def_name], target_def_name, defs, visited)

            for value in obj.values():
                if _contains_recursive_ref(value, target_def_name, defs, visited):
                    return True

        elif isinstance(obj, list):
            for item in obj:
                if _contains_recursive_ref(item, target_def_name, defs, visited):
                    return True

    finally:
        visited.discard(obj_key)

    return False


def _check_for_circular_defs(defs: Dict[str, Any]) -> bool:
    """
    Check if $defs contain circular references.

    Args:
        defs: Dictionary of definitions to check

    Returns:
        True if circular references are found, False otherwise
    """
    if not defs:
        return False

    for def_name, def_value in defs.items():
        if _contains_recursive_ref(def_value, def_name, defs):
            return True

    return False


def resolve_refs(obj: Any, defs: Dict[str, Any], visited: Optional[set] = None, depth: int = 0) -> Any:
    """
    Resolve $ref references in a JSON schema object.

    Args:
        obj: Object to resolve references in
        defs: Dictionary of definitions
        visited: Set to track visited objects and prevent infinite recursion
        depth: Current recursion depth

    Returns:
        Object with resolved references
    """
    if not obj or not isinstance(obj, (dict, list)) or depth > 10:
        return obj

    if visited is None:
        visited = set()

    obj_id = id(obj)
    if obj_id in visited:
        return obj

    visited.add(obj_id)

    try:
        if isinstance(obj, dict):
            if "$ref" in obj and isinstance(obj["$ref"], str):
                ref_path = obj["$ref"].split("/")
                if len(ref_path) >= 3 and ref_path[0] == "#" and ref_path[1] == "$defs":
                    def_name = ref_path[-1]
                    if def_name in defs:
                        return resolve_refs(dict(defs[def_name]), defs, visited, depth + 1)
                return obj

            resolved = {}
            for key, value in obj.items():
                if key == "$defs":
                    continue
                resolved[key] = resolve_refs(value, defs, visited, depth + 1)
            return resolved

        elif isinstance(obj, list):
            return [resolve_refs(item, defs, visited, depth + 1) for item in obj]

    finally:
        visited.discard(obj_id)

    return obj


def detect_recursive_schema(schema: Any) -> bool:
    """
    Detect if a schema contains recursive references.

    Args:
        schema: Schema to analyze

    Returns:
        True if schema has recursive patterns, False otherwise
    """
    if not schema or not isinstance(schema, dict):
        return False

    import json
    schema_string = json.dumps(schema)
    has_refs = (
        '"$ref"' in schema_string or
        "#/$defs/" in schema_string or
        "#/definitions/" in schema_string
    )
    has_defs = bool(schema.get("$defs") or schema.get("definitions"))

    return has_refs or has_defs


def select_model_for_schema(schema: Any = None) -> Dict[str, str]:
    """
    Select appropriate model based on schema complexity.

    Args:
        schema: Schema to analyze

    Returns:
        Dict with modelName and reason
    """
    if not schema:
        return {"modelName": "gpt-4o-mini", "reason": "no_schema"}

    if detect_recursive_schema(schema):
        return {"modelName": "gpt-4o", "reason": "recursive_schema_detected"}

    return {"modelName": "gpt-4o-mini", "reason": "simple_schema"}


def _normalize_schema(schema: Any) -> Optional[Dict[str, Any]]:
    """
    Normalize a schema object which may be a dict, Pydantic BaseModel subclass,
    or a Pydantic model instance into a plain dict.
    """
    try:
        # Pydantic v2 BaseModel subclass: has "model_json_schema"
        if hasattr(schema, "model_json_schema") and callable(schema.model_json_schema):
            return schema.model_json_schema()
        # Pydantic v2 BaseModel instance: has "model_dump" or "model_json_schema"
        if hasattr(schema, "model_dump") and callable(schema.model_dump):
            # Try to get JSON schema if available on the class
            mjs = getattr(schema.__class__, "model_json_schema", None)
            if callable(mjs):
                return schema.__class__.model_json_schema()
            # Fallback to data shape (not ideal, but better than dropping)
            return schema.model_dump()
        # Pydantic v1 BaseModel subclass: has "schema"
        if hasattr(schema, "schema") and callable(schema.schema):
            return schema.schema()
        # Pydantic v1 BaseModel instance
        if hasattr(schema, "dict") and callable(schema.dict):
            # Prefer class-level schema if present
            sch = getattr(schema.__class__, "schema", None)
            if callable(sch):
                return schema.__class__.schema()
            return schema.dict()
    except Exception:
        pass
    # Already a dict or unsupported type
    return schema if isinstance(schema, dict) else None


def _validate_json_format(format_obj: Any) -> Dict[str, Any]:
    """
    Validate and prepare json format object.

    Args:
        format_obj: Format object that should be json type

    Returns:
        Validated json format dict

    Raises:
        ValueError: If json format is missing required fields
    """
    if not isinstance(format_obj, dict):
        raise ValueError("json format must be an object with 'type', 'prompt', and 'schema' fields")

    if format_obj.get('type') != 'json':
        raise ValueError("json format must have type='json'")

    # prompt is optional in v2; only normalize when present
    # schema is recommended; if provided, normalize Pydantic forms
    schema = format_obj.get('schema')
    normalized = dict(format_obj)
    if schema is not None:
        normalized_schema = _normalize_schema(schema)
        if normalized_schema is not None:
            # Handle schema reference resolution similar to TypeScript implementation
            if isinstance(normalized_schema, dict):
                defs = normalized_schema.get("$defs", {})
                import json
                schema_string = json.dumps(normalized_schema)
                has_any_refs = (
                    normalized_schema.get("$defs") or
                    '"$ref"' in schema_string or
                    "#/$defs/" in schema_string
                )

                if has_any_refs:
                    try:
                        resolved_schema = resolve_refs(normalized_schema, defs)
                        resolved_string = json.dumps(resolved_schema)
                        has_remaining_refs = '"$ref"' in resolved_string or "#/$defs/" in resolved_string

                        if not has_remaining_refs:
                            normalized_schema = resolved_schema
                            # Remove $defs after successful resolution
                            if isinstance(normalized_schema, dict) and "$defs" in normalized_schema:
                                del normalized_schema["$defs"]
                        # If refs remain, preserve original schema
                    except Exception:
                        # Failed to resolve refs, preserve original schema
                        pass
                else:
                    # No recursive references detected, resolve refs anyway
                    try:
                        normalized_schema = resolve_refs(normalized_schema, defs)
                        if isinstance(normalized_schema, dict) and "$defs" in normalized_schema:
                            del normalized_schema["$defs"]
                    except Exception:
                        pass

            # Apply OpenAI normalization and validation
            openai_normalized_schema = normalize_schema_for_openai(normalized_schema)
            if not validate_schema_for_openai(openai_normalized_schema):
                raise ValueError(OPENAI_SCHEMA_ERROR_MESSAGE)

            normalized['schema'] = openai_normalized_schema
    return normalized


def validate_scrape_options(options: Optional[ScrapeOptions]) -> Optional[ScrapeOptions]:
    """
    Validate and normalize scrape options.

    Args:
        options: Scraping options to validate

    Returns:
        Validated options or None

    Raises:
        ValueError: If options are invalid
    """
    if options is None:
        return None

    # Validate timeout
    if options.timeout is not None and options.timeout <= 0:
        raise ValueError("Timeout must be positive")

    # Validate wait_for
    if options.wait_for is not None and options.wait_for < 0:
        raise ValueError("wait_for must be non-negative")

    return options


def prepare_scrape_options(options: Optional[ScrapeOptions]) -> Optional[Dict[str, Any]]:
    """
    Prepare ScrapeOptions for API submission with manual snake_case to camelCase conversion.

    Args:
        options: ScrapeOptions to prepare

    Returns:
        Dictionary ready for API submission or None if options is None
    """
    if options is None:
        return None

    # Validate options first
    validated_options = validate_scrape_options(options)
    if validated_options is None:
        return None

    # Apply default values for None fields
    default_values = {
        "only_main_content": True,
        "mobile": False,
        "skip_tls_verification": True,
        "remove_base64_images": True,
        "fast_mode": False,
        "block_ads": True,
        "max_age": 14400000,
        "store_in_cache": True
    }

    # Convert to dict and handle manual snake_case to camelCase conversion
    options_data = validated_options.model_dump(exclude_none=True)

    # Apply defaults for None fields
    for field, default_value in default_values.items():
        if field not in options_data:
            options_data[field] = default_value

    scrape_data = {}

    # Manual field mapping for snake_case to camelCase conversion
    field_mappings = {
        "include_tags": "includeTags",
        "exclude_tags": "excludeTags",
        "only_main_content": "onlyMainContent",
        "wait_for": "waitFor",
        "skip_tls_verification": "skipTlsVerification",
        "remove_base64_images": "removeBase64Images",
        "fast_mode": "fastMode",
        "use_mock": "useMock",
        "block_ads": "blockAds",
        "store_in_cache": "storeInCache",
        "max_age": "maxAge"
    }

    # Apply field mappings
    for snake_case, camel_case in field_mappings.items():
        if snake_case in options_data:
            scrape_data[camel_case] = options_data.pop(snake_case)

    # Handle special cases
    for key, value in options_data.items():
        if value is not None:
            if key == "integration":
                scrape_data["integration"] = (str(value).strip() or None)
                continue
            if key == "formats":
                # Handle formats conversion
                converted_formats: List[Any] = []

                # Prefer using original object to detect ScrapeFormats vs list
                original_formats = getattr(options, 'formats', None)

                if isinstance(original_formats, ScrapeFormats):
                    # Include explicit list first
                    if original_formats.formats:
                        for fmt in original_formats.formats:
                            if isinstance(fmt, str):
                                if fmt == "json":
                                    raise ValueError("json format must be an object with 'type', 'prompt', and 'schema' fields")
                                converted_formats.append(_convert_format_string(fmt))
                            elif isinstance(fmt, dict):
                                fmt_type = _convert_format_string(fmt.get('type')) if fmt.get('type') else None
                                if fmt_type == 'json':
                                    validated_json = _validate_json_format({**fmt, 'type': 'json'})
                                    converted_formats.append(validated_json)
                                elif fmt_type == 'screenshot':
                                    # Normalize screenshot options
                                    normalized = {**fmt, 'type': 'screenshot'}
                                    if 'full_page' in normalized:
                                        normalized['fullPage'] = normalized.pop('full_page')
                                    # Normalize viewport if it's a model instance
                                    vp = normalized.get('viewport')
                                    if hasattr(vp, 'model_dump'):
                                        normalized['viewport'] = vp.model_dump(exclude_none=True)
                                    converted_formats.append(normalized)
                                else:
                                    if 'type' in fmt:
                                        fmt['type'] = fmt_type or fmt['type']
                                    converted_formats.append(fmt)
                            elif hasattr(fmt, 'type'):
                                if fmt.type == 'json':
                                    converted_formats.append(_validate_json_format(fmt.model_dump()))
                                else:
                                    converted_formats.append(_convert_format_string(fmt.type))
                            else:
                                converted_formats.append(fmt)

                    # Add booleans from ScrapeFormats
                    if original_formats.markdown:
                        converted_formats.append("markdown")
                    if original_formats.html:
                        converted_formats.append("html")
                    if original_formats.raw_html:
                        converted_formats.append("rawHtml")
                    if original_formats.summary:
                        converted_formats.append("summary")
                    if original_formats.links:
                        converted_formats.append("links")
                    if original_formats.screenshot:
                        converted_formats.append("screenshot")
                    if original_formats.change_tracking:
                        converted_formats.append("changeTracking")
                    # Note: We intentionally do not auto-include 'json' when boolean is set,
                    # because JSON requires an object with schema/prompt. The caller must
                    # supply the full json format object explicitly.
                elif isinstance(original_formats, list):
                    for fmt in original_formats:
                        if isinstance(fmt, str):
                            if fmt == "json":
                                raise ValueError("json format must be an object with 'type', 'prompt', and 'schema' fields")
                            converted_formats.append(_convert_format_string(fmt))
                        elif isinstance(fmt, dict):
                            fmt_type = _convert_format_string(fmt.get('type')) if fmt.get('type') else None
                            if fmt_type == 'json':
                                validated_json = _validate_json_format({**fmt, 'type': 'json'})
                                converted_formats.append(validated_json)
                            elif fmt_type == 'screenshot':
                                normalized = {**fmt, 'type': 'screenshot'}
                                if 'full_page' in normalized:
                                    normalized['fullPage'] = normalized.pop('full_page')
                                vp = normalized.get('viewport')
                                if hasattr(vp, 'model_dump'):
                                    normalized['viewport'] = vp.model_dump(exclude_none=True)
                                converted_formats.append(normalized)
                            else:
                                if 'type' in fmt:
                                    fmt['type'] = fmt_type or fmt['type']
                                converted_formats.append(fmt)
                        elif hasattr(fmt, 'type'):
                            if fmt.type == 'json':
                                converted_formats.append(_validate_json_format(fmt.model_dump()))
                            elif fmt.type == 'screenshot':
                                normalized = {'type': 'screenshot'}
                                if getattr(fmt, 'full_page', None) is not None:
                                    normalized['fullPage'] = fmt.full_page
                                if getattr(fmt, 'quality', None) is not None:
                                    normalized['quality'] = fmt.quality
                                vp = getattr(fmt, 'viewport', None)
                                if vp is not None:
                                    normalized['viewport'] = vp.model_dump(exclude_none=True) if hasattr(vp, 'model_dump') else vp
                                converted_formats.append(normalized)
                            else:
                                converted_formats.append(_convert_format_string(fmt.type))
                        else:
                            converted_formats.append(fmt)
                else:
                    # Fallback: try to iterate over value if it's a list-like
                    try:
                        for fmt in value:
                            converted_formats.append(fmt)
                    except TypeError:
                        pass

                if converted_formats:
                    scrape_data["formats"] = converted_formats
            elif key == "actions":
                # Handle actions conversion
                converted_actions = []
                for action in value:
                    if isinstance(action, dict):
                        # Convert action dict
                        converted_action = {}
                        for action_key, action_value in action.items():
                            if action_key == "full_page":
                                converted_action["fullPage"] = action_value
                            else:
                                converted_action[action_key] = action_value
                        converted_actions.append(converted_action)
                    else:
                        # Handle action objects
                        action_data = action.model_dump(exclude_none=True)
                        converted_action = {}
                        for action_key, action_value in action_data.items():
                            if action_key == "full_page":
                                converted_action["fullPage"] = action_value
                            else:
                                converted_action[action_key] = action_value
                        converted_actions.append(converted_action)
                scrape_data["actions"] = converted_actions
            elif key == "parsers":
                converted_parsers = []
                for parser in value:
                    if isinstance(parser, str):
                        converted_parsers.append(parser)
                    elif isinstance(parser, dict):
                        parser_data = dict(parser)
                        if "max_pages" in parser_data:
                            parser_data["maxPages"] = parser_data.pop("max_pages")
                        converted_parsers.append(parser_data)
                    else:
                        parser_data = parser.model_dump(exclude_none=True)
                        # Convert snake_case to camelCase for API
                        if "max_pages" in parser_data:
                            parser_data["maxPages"] = parser_data.pop("max_pages")
                        converted_parsers.append(parser_data)
                scrape_data["parsers"] = converted_parsers
            elif key == "location":
                # Handle location conversion
                if isinstance(value, dict):
                    scrape_data["location"] = value
                else:
                    scrape_data["location"] = value.model_dump(exclude_none=True)
            else:
                # For fields that don't need conversion, use as-is
                scrape_data[key] = value

    return scrape_data
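As a quick orientation to the schema helpers in this file, here is a minimal usage sketch. It assumes the module is imported under the path shown in the file list (firecrawl/v2/utils/validation.py); the schemas are made-up inputs for illustration, not taken from the package's own tests.

from firecrawl.v2.utils.validation import (
    normalize_schema_for_openai,
    validate_schema_for_openai,
    resolve_refs,
    select_model_for_schema,
)

# An object schema whose "required" list names a missing property and which
# allows arbitrary extra keys. normalize_schema_for_openai() drops the
# additionalProperties: true flag (the object already defines "properties")
# and prunes "required" down to fields that actually exist.
schema = {
    "type": "object",
    "properties": {"name": {"type": "string"}},
    "required": ["name", "nickname"],  # "nickname" is not defined above
    "additionalProperties": True,
}
print(normalize_schema_for_openai(schema))
# {'type': 'object', 'properties': {'name': {'type': 'string'}}, 'required': ['name']}

# A schema-less dictionary (object with additionalProperties but no
# properties or patternProperties) is the one shape the validator rejects.
print(validate_schema_for_openai({"type": "object", "additionalProperties": True}))
# False

# resolve_refs() inlines non-recursive "#/$defs/..." references (up to depth 10)
# and skips the "$defs" key itself while walking the schema.
tree = {
    "type": "object",
    "properties": {"child": {"$ref": "#/$defs/Leaf"}},
    "$defs": {"Leaf": {"type": "object", "properties": {"value": {"type": "string"}}}},
}
print(resolve_refs(tree, tree["$defs"]))
# the "child" property is replaced by a copy of the Leaf definition; "$defs" is dropped

# Any "$ref"/"$defs" usage routes model selection to the larger model.
print(select_model_for_schema(tree))
# {'modelName': 'gpt-4o', 'reason': 'recursive_schema_detected'}

Note that validate_schema_for_openai only flags the schema-less-dictionary shape; recursive "$ref" schemas are explicitly allowed (see OPENAI_SCHEMA_ERROR_MESSAGE), which is why select_model_for_schema routes them to the larger model instead of rejecting them.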