retab 0.0.42-py3-none-any.whl → 0.0.44-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- retab/__init__.py +2 -1
- retab/client.py +26 -51
- retab/generate_types.py +180 -0
- retab/resources/consensus/client.py +1 -1
- retab/resources/consensus/responses.py +1 -1
- retab/resources/deployments/__init__.py +3 -0
- retab/resources/deployments/automations/__init__.py +9 -0
- retab/resources/deployments/automations/client.py +244 -0
- retab/resources/deployments/automations/endpoints.py +290 -0
- retab/resources/deployments/automations/links.py +303 -0
- retab/resources/deployments/automations/logs.py +222 -0
- retab/resources/deployments/automations/mailboxes.py +423 -0
- retab/resources/deployments/automations/outlook.py +377 -0
- retab/resources/deployments/automations/tests.py +161 -0
- retab/resources/deployments/client.py +148 -0
- retab/resources/documents/client.py +94 -68
- retab/resources/documents/extractions.py +55 -46
- retab/resources/evaluations/__init__.py +2 -2
- retab/resources/evaluations/client.py +61 -77
- retab/resources/evaluations/documents.py +48 -37
- retab/resources/evaluations/iterations.py +58 -40
- retab/resources/jsonlUtils.py +3 -4
- retab/resources/processors/automations/endpoints.py +49 -39
- retab/resources/processors/automations/links.py +52 -43
- retab/resources/processors/automations/mailboxes.py +74 -59
- retab/resources/processors/automations/outlook.py +104 -82
- retab/resources/processors/client.py +35 -30
- retab/resources/projects/__init__.py +3 -0
- retab/resources/projects/client.py +285 -0
- retab/resources/projects/documents.py +244 -0
- retab/resources/projects/iterations.py +470 -0
- retab/resources/usage.py +2 -0
- retab/types/ai_models.py +2 -1
- retab/types/deprecated_evals.py +195 -0
- retab/types/evaluations/__init__.py +5 -2
- retab/types/evaluations/iterations.py +9 -43
- retab/types/evaluations/model.py +19 -24
- retab/types/extractions.py +1 -0
- retab/types/jobs/base.py +1 -1
- retab/types/jobs/evaluation.py +1 -1
- retab/types/logs.py +5 -6
- retab/types/mime.py +1 -10
- retab/types/projects/__init__.py +34 -0
- retab/types/projects/documents.py +30 -0
- retab/types/projects/iterations.py +78 -0
- retab/types/projects/model.py +68 -0
- retab/types/schemas/enhance.py +22 -5
- retab/types/schemas/evaluate.py +2 -2
- retab/types/schemas/object.py +27 -25
- retab/types/standards.py +2 -2
- retab/utils/__init__.py +3 -0
- retab/utils/ai_models.py +127 -12
- retab/utils/hashing.py +24 -0
- retab/utils/json_schema.py +1 -26
- retab/utils/mime.py +0 -17
- retab/utils/usage/usage.py +0 -1
- {retab-0.0.42.dist-info → retab-0.0.44.dist-info}/METADATA +4 -6
- {retab-0.0.42.dist-info → retab-0.0.44.dist-info}/RECORD +60 -55
- retab/_utils/__init__.py +0 -0
- retab/_utils/_model_cards/anthropic.yaml +0 -59
- retab/_utils/_model_cards/auto.yaml +0 -43
- retab/_utils/_model_cards/gemini.yaml +0 -117
- retab/_utils/_model_cards/openai.yaml +0 -301
- retab/_utils/_model_cards/xai.yaml +0 -28
- retab/_utils/ai_models.py +0 -138
- retab/_utils/benchmarking.py +0 -484
- retab/_utils/chat.py +0 -327
- retab/_utils/display.py +0 -440
- retab/_utils/json_schema.py +0 -2156
- retab/_utils/mime.py +0 -165
- retab/_utils/responses.py +0 -169
- retab/_utils/stream_context_managers.py +0 -52
- retab/_utils/usage/__init__.py +0 -0
- retab/_utils/usage/usage.py +0 -301
- {retab-0.0.42.dist-info → retab-0.0.44.dist-info}/WHEEL +0 -0
- {retab-0.0.42.dist-info → retab-0.0.44.dist-info}/top_level.txt +0 -0
retab/_utils/json_schema.py
DELETED
@@ -1,2156 +0,0 @@
import copy
import datetime
import json
import re
import types
from collections import defaultdict
from pathlib import Path
from typing import Annotated, Any, Callable, Literal, MutableMapping, MutableSequence, Optional, Tuple, Type, Union, cast, get_args, get_origin

import phonenumbers
import pycountry
import stdnum.eu.vat  # type: ignore
from email_validator import validate_email
from pydantic import BaseModel, BeforeValidator, Field, create_model
from pydantic.config import ConfigDict

from ..types.schemas.layout import Column, FieldItem, Layout, RefObject, Row, RowList
from .mime import generate_blake2b_hash_from_string

# **** Validation Functions ****

# 1) Special Objects


def generate_schema_data_id(json_schema: dict[str, Any]) -> str:
    """Generate a SHA1 hash ID for schema data, ignoring prompt/description/default fields.

    Args:
        json_schema: The JSON schema to generate an ID for

    Returns:
        str: A SHA1 hash string with "sch_data_id_" prefix
    """
    return "sch_data_id_" + generate_blake2b_hash_from_string(
        json.dumps(
            clean_schema(
                copy.deepcopy(json_schema),
                remove_custom_fields=True,
                fields_to_remove=["description", "default", "title", "required", "examples", "deprecated", "readOnly", "writeOnly"],
            ),
            sort_keys=True,
        ).strip()
    )


def generate_schema_id(json_schema: dict[str, Any]) -> str:
    """Generate a SHA1 hash ID for the complete schema.

    Args:
        json_schema: The JSON schema to generate an ID for

    Returns:
        str: A SHA1 hash string with "sch_id_" prefix
    """
    return "sch_id_" + generate_blake2b_hash_from_string(json.dumps(json_schema, sort_keys=True).strip())


def validate_currency(currency_code: Any) -> Optional[str]:
    """
    Return the valid currency code (ISO 4217) or None if invalid.
    """
    if currency_code is None:
        return None
    currency_code = str(currency_code).strip()  # convert to str and trim
    if not currency_code:
        return None
    try:
        if pycountry.currencies.lookup(currency_code):
            return currency_code
    except LookupError:
        pass
    return None


def validate_country_code(v: Any) -> Optional[str]:
    """
    Return the valid country code (ISO 3166) or None if invalid.
    """
    if v is None:
        return None
    v_str = str(v).strip()
    if not v_str:
        return None
    try:
        if pycountry.countries.lookup(v_str):
            return v_str
    except LookupError:
        pass
    return None


def validate_email_regex(v: Any) -> Optional[str]:
    """
    Return the string if it matches a basic email pattern, else None.
    """
    if v is None:
        return None
    v_str = str(v).strip()
    if not v_str:
        return None
    pattern = r"^[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}$"
    if re.match(pattern, v_str):
        return v_str.lower()
    return None


def validate_vat_number(v: Any) -> Optional[str]:
    """
    Return the VAT number if valid (EU format) else None.
    """
    if v is None:
        return None
    v_str = str(v).strip()
    if not v_str:
        return None
    try:
        if stdnum.eu.vat.is_valid(v_str):
            return stdnum.eu.vat.validate(v_str)
    except Exception:
        pass
    return None


def validate_phone_number(v: Any) -> Optional[str]:
    """
    Return E.164 phone number format if valid, else None.
    """
    if v is None:
        return None
    v_str = str(v).strip()
    if not v_str:
        return None
    try:
        phone_number = phonenumbers.parse(v_str, "FR")  # Default region: FR
        if phonenumbers.is_valid_number(phone_number):
            return phonenumbers.format_number(phone_number, phonenumbers.PhoneNumberFormat.E164)
    except phonenumbers.NumberParseException:
        pass
    return None


def validate_email_address(v: Any) -> Optional[str]:
    """
    Return the normalized email address if valid, else None.
    """
    if v is None:
        return None
    v_str = str(v).strip()
    if not v_str:
        return None
    try:
        return validate_email(v_str).normalized
    except Exception:
        return None


def validate_frenchpostcode(v: Any) -> Optional[str]:
    """
    Return a 5-digit postcode if valid, else None.
    """
    if v is None:
        return None
    v_str = str(v).strip()
    if not v_str:
        return None
    # Zero-pad to 5 digits
    try:
        v_str = v_str.zfill(5)
        # Optionally check numeric
        if not v_str.isdigit():
            return None
        return v_str
    except Exception:
        return None


def validate_packing_type(v: Any) -> Optional[str]:
    """
    Return the packing type if in the known set, else None.
    """
    if v is None:
        return None
    v_str = str(v).strip().lower()
    # We'll store the valid set in lower for easy comparison
    valid_packing_types = {"box", "pallet", "container", "bag", "drum", "other"}
    if v_str in valid_packing_types:
        return v_str
    return None


def validate_un_code(v: Any) -> Optional[int]:
    """
    Return an integer UN code in range [0..3481], else None.
    """
    if v is None:
        return None
    v_str = str(v).strip()
    if not v_str:
        return None
    try:
        val = int(float(v_str))  # handle numeric strings
        if 0 <= val <= 3481:
            return val
    except Exception:
        pass
    return None


def validate_adr_tunnel_code(v: Any) -> Optional[str]:
    """
    Return a valid ADR tunnel code from a known set, else None.
    """
    if v is None:
        return None
    v_str = str(v).strip().upper()  # unify for set comparison
    valid_codes = {"B", "B1000C", "B/D", "B/E", "C", "C5000D", "C/D", "C/E", "D", "D/E", "E", "-"}
    return v_str if v_str in valid_codes else None


def validate_un_packing_group(v: Any) -> Optional[str]:
    """
    Return a valid UN packing group (I, II, or III), else None.
    """
    if v is None:
        return None
    v_str = str(v).strip().upper()
    valid_groups = {"I", "II", "III"}
    return v_str if v_str in valid_groups else None
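# [Illustrative sketch — not part of the original module; behavior assumed
#  from the validators above, which normalize rather than raise.]
#
#     validate_currency("eur")           # -> "eur"   (pycountry lookup succeeds)
#     validate_currency("not-a-code")    # -> None
#     validate_un_packing_group(" ii ")  # -> "II"
#     validate_frenchpostcode(1200)      # -> "01200" (zero-padded to 5 digits)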


# 2) General Objects


def validate_integer(v: Any) -> Optional[int]:
    """
    Return an integer if parseable, else None.
    """
    if v is None:
        return None
    v_str = str(v).strip()
    if not v_str:
        return None
    try:
        return int(float(v_str))
    except Exception:
        return None


def validate_float(v: Any) -> Optional[float]:
    """
    Return a float if parseable, else None.
    """
    if v is None:
        return None
    v_str = str(v).strip()
    if not v_str:
        return None
    try:
        return float(v_str)
    except Exception:
        return None


def validate_date(v: Union[str, datetime.date, None]) -> Optional[str]:
    """
    Return date in ISO format (YYYY-MM-DD) if valid, else None.
    """
    if v is None:
        return None

    # If it's already a date object
    if isinstance(v, datetime.date):
        return v.isoformat()

    # If it's a string
    v_str = str(v).strip()
    if not v_str:
        return None

    # Try ISO or a close variant
    try:
        return datetime.date.fromisoformat(v_str).isoformat()
    except ValueError:
        # Fallback to strptime
        try:
            return datetime.datetime.strptime(v_str, "%Y-%m-%d").date().isoformat()
        except ValueError:
            return None


def validate_time(v: Union[str, datetime.time, None]) -> Optional[str]:
    """
    Return time in ISO format (HH:MM[:SS]) if valid, else None.
    """
    if v is None:
        return None

    # If it's already a time object
    if isinstance(v, datetime.time):
        return v.isoformat()

    v_str = str(v).strip()
    if not v_str:
        return None

    # Try multiple formats
    time_formats = ["%H:%M:%S", "%H:%M", "%I:%M %p", "%I:%M:%S %p"]
    for fmt in time_formats:
        try:
            parsed = datetime.datetime.strptime(v_str, fmt).time()
            return parsed.isoformat()
        except ValueError:
            continue
    return None


def validate_bool(v: Any) -> bool:
    """
    Convert to bool if matches known true/false strings or actual bool.
    Otherwise return False.
    """
    if v is None:
        return False

    if isinstance(v, bool):
        return v

    try:
        v_str = str(v).strip().lower()
        true_values = {"true", "t", "yes", "y", "1"}
        false_values = {"false", "f", "no", "n", "0"}
        if v_str in true_values:
            return True
        elif v_str in false_values:
            return False
    except Exception:
        pass

    return False


def validate_strold(v: Any) -> Optional[str]:
    """
    Return a stripped string unless it's empty or a known 'null' placeholder, else None.
    """
    if v is None:
        return None
    v_str = str(v).strip()
    # Treat these placeholders (and empty) as invalid
    if v_str.lower() in {"null", "none", "nan", ""}:
        return None
    return v_str


def validate_str(v: Any) -> Optional[str]:
    """
    Return a stripped string unless it's invalid (e.g., placeholders like 'null'), else None.
    Does NOT convert empty strings to None—leaves them as-is.
    """
    if v is None:
        return None
    v_str = str(v).strip()
    if v_str.lower() in {"null", "none", "nan"}:  # Only treat explicit placeholders as None
        return None
    return v_str  # Keep empty strings intact


def notnan(x: Any) -> bool:
    """
    Return False if x is None, 'null', 'nan', or x != x (NaN check).
    True otherwise.
    """
    if x is None:
        return False
    x_str = str(x).lower().strip()
    if x_str in {"null", "nan"}:
        return False
    # Check for actual float NaN (x != x)
    return not (x != x)


def merge_descriptions(outer_schema: dict[str, Any], inner_schema: dict[str, Any]) -> dict[str, Any]:
    """
    Merge descriptions from outer and inner schemas, giving preference to outer.
    Also merges X-ReasoningPrompt similarly.
    """
    merged = copy.deepcopy(inner_schema)

    # Outer description preferred if present
    if outer_schema.get("description", "").strip():
        merged["description"] = outer_schema["description"]

    # Outer reasoning preferred if present
    if outer_schema.get("X-ReasoningPrompt", "").strip():
        merged["X-ReasoningPrompt"] = outer_schema["X-ReasoningPrompt"]
    elif inner_schema.get("X-ReasoningPrompt", "").strip():
        merged["X-ReasoningPrompt"] = inner_schema["X-ReasoningPrompt"]

    if not merged.get("X-ReasoningPrompt", "").strip():
        # delete it
        merged.pop("X-ReasoningPrompt", None)

    # Outer LLM Description preferred if present
    if outer_schema.get("X-FieldPrompt", "").strip():
        merged["X-FieldPrompt"] = outer_schema["X-FieldPrompt"]
    elif inner_schema.get("X-FieldPrompt", "").strip():
        merged["X-FieldPrompt"] = inner_schema["X-FieldPrompt"]

    if not merged.get("X-FieldPrompt", "").strip():
        # delete it
        merged.pop("X-FieldPrompt", None)

    # System-Prompt
    if not merged.get("X-SystemPrompt", "").strip():
        # delete it
        merged.pop("X-SystemPrompt", None)

    return merged


def has_cyclic_refs(schema: dict[str, Any]) -> bool:
    """Check if the JSON Schema contains cyclic references.

    The function recursively traverses all nested objects and arrays in the schema.
    It follows any "$ref" that points to a definition (i.e. "#/$defs/<name>")
    and uses DFS with a current-path stack to detect cycles.
    """
    definitions = schema.get("$defs", {})
    if not definitions:
        return False

    # Memoize results for each definition to avoid repeated work.
    memo: dict[str, bool] = {}

    def dfs(def_name: str, stack: set[str]) -> bool:
        """Perform DFS on a definition (by name) using 'stack' to detect cycles."""
        if def_name in stack:
            return True
        if def_name in memo:
            return memo[def_name]

        # Add to current path and traverse the definition.
        stack.add(def_name)
        node = definitions.get(def_name)
        if node is None:
            # No such definition, so nothing to do.
            stack.remove(def_name)
            memo[def_name] = False
            return False

        result = traverse(node, stack)
        stack.remove(def_name)
        memo[def_name] = result
        return result

    def traverse(node: Any, stack: set[str]) -> bool:
        """Recursively traverse an arbitrary JSON Schema node."""
        if isinstance(node, dict):
            # If we see a "$ref", try to follow it.
            if "$ref" in node:
                ref = node["$ref"]
                if ref.startswith("#/$defs/"):
                    target = ref[len("#/$defs/") :]
                    if dfs(target, stack):
                        return True
            # Recursively check all values in the dictionary.
            for key, value in node.items():
                # Skip "$ref" as it has already been processed.
                if key == "$ref":
                    continue
                if traverse(value, stack):
                    return True
        elif isinstance(node, list):
            for item in node:
                if traverse(item, stack):
                    return True
        return False

    # Start DFS on each top-level definition.
    for def_name in definitions:
        if dfs(def_name, set()):
            return True

    return False


def expand_refs(schema: dict[str, Any], definitions: dict[str, dict[str, Any]] | None = None) -> dict[str, Any]:
    """
    Recursively resolve $ref in the given schema.
    For each $ref, fetch the target schema, merge descriptions, and resolve further.
    """
    if not isinstance(schema, dict):
        return schema

    # First, we will verify if this schema is expandable, we do this by checking if there are cyclic $refs (infinite loop)
    # If there are, we will return the schema as is

    if has_cyclic_refs(schema):
        print("Cyclic refs found, keeping it as is")
        return schema

    if definitions is None:
        definitions = schema.pop("$defs", {})

    assert isinstance(definitions, dict)

    if "allOf" in schema:
        # Some schemas (notably the one converted from a pydantic model) have allOf. We only accept one element in allOf
        if len(schema["allOf"]) != 1:
            raise ValueError(f"Property schema must have a single element in 'allOf'. Found: {schema['allOf']}")
        schema.update(schema.pop("allOf", [{}])[0])

    if "$ref" in schema:
        ref: str = schema["$ref"]
        if ref.startswith("#/$defs/"):
            def_name = ref.removeprefix("#/$defs/")
            if def_name not in definitions:
                raise ValueError(f"Reference {ref} not found in definitions.")
            target = definitions[def_name]
            merged = merge_descriptions(schema, target)
            merged.pop("$ref", None)
            return expand_refs(merged, definitions)
        else:
            raise ValueError(f"Unsupported reference format: {ref}")

    result: dict[str, Any] = {}
    for annotation, subschema in schema.items():
        if annotation in ["properties", "$defs"]:
            if isinstance(subschema, dict):
                new_dict = {}
                for pk, pv in subschema.items():
                    new_dict[pk] = expand_refs(pv, definitions)
                result[annotation] = new_dict
            else:
                result[annotation] = subschema
        elif annotation == "items":
            if isinstance(subschema, list):
                result[annotation] = [expand_refs(item, definitions) for item in subschema]
            else:
                result[annotation] = expand_refs(subschema, definitions)
        else:
            if isinstance(subschema, dict):
                result[annotation] = expand_refs(subschema, definitions)
            elif isinstance(subschema, list):
                new_list = []
                for item in subschema:
                    if isinstance(item, dict):
                        new_list.append(expand_refs(item, definitions))
                    else:
                        new_list.append(item)
                result[annotation] = new_list
            else:
                result[annotation] = subschema

    return result
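# [Illustrative sketch — not part of the original module; assumes the
#  expand_refs/merge_descriptions behavior above: the $ref is inlined and the
#  outer description wins over the referenced definition's.]
#
#     schema = {
#         "$defs": {"Addr": {"type": "object", "properties": {"city": {"type": "string"}}}},
#         "type": "object",
#         "properties": {"home": {"$ref": "#/$defs/Addr", "description": "outer"}},
#     }
#     expand_refs(schema)
#     # -> {"type": "object",
#     #     "properties": {"home": {"type": "object", "description": "outer",
#     #                             "properties": {"city": {"type": "string"}}}}}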


def json_schema_to_typescript_interface(
    schema: dict[str, Any],
    name: str = "RootInterface",
    definitions: Optional[dict[str, dict[str, Any]]] = None,
    processed_refs: Optional[dict[str, str]] = None,
    indent: int = 2,
    add_field_description: bool = False,
) -> str:
    """
    Convert a JSON Schema to a TypeScript interface.

    :param schema: The JSON schema as a dict.
    :param name: Name of the interface to generate.
    :param definitions: A dictionary of named schemas that can be referenced by $ref.
    :param processed_refs: A dict to keep track of processed $refs to avoid recursion.
    :param indent: Number of spaces for indentation in the output.
    :param add_field_description: If True, include field descriptions as comments.
    :return: A string containing the TypeScript interface.
    """
    if definitions is None:
        # Extract definitions from $defs if present
        definitions = schema.get("$defs", {})

    if processed_refs is None:
        processed_refs = {}

    # If we have a top-level object schema
    if schema.get("type") == "object" or "properties" in schema:
        interface_lines = [f"interface {name} {{"]
        indentation = " " * indent
        properties = schema.get("properties", {})
        required_fields = set(schema.get("required", []))

        for prop_name, prop_schema in properties.items():
            is_optional = prop_name not in required_fields
            field_ts = schema_to_ts_type(prop_schema, definitions or {}, processed_refs, indent, indent, add_field_description=add_field_description)
            optional_flag = "?" if is_optional else ""
            line = ""
            if add_field_description and "description" in prop_schema:
                desc = prop_schema["description"].replace("\n", f"\n{indentation}// ")
                line = f"{indentation}// {desc}\n"
            line += f"{indentation}{prop_name}{optional_flag}: {field_ts};"
            interface_lines.append(line)

        interface_lines.append("}")
        return "\n".join(interface_lines)
    else:
        # Otherwise, produce a type alias if it's not an object
        ts_type = schema_to_ts_type(schema, definitions or {}, processed_refs, indent, indent, add_field_description=add_field_description)
        return f"type {name} = {ts_type};"
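# [Illustrative sketch — not part of the original module; assumes the
#  conversion above, where non-required fields get a "?" and integers map to
#  TypeScript's number.]
#
#     json_schema_to_typescript_interface(
#         {"type": "object",
#          "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
#          "required": ["name"]},
#         name="Person",
#     )
#     # interface Person {
#     #   name: string;
#     #   age?: number;
#     # }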


def schema_to_ts_type(
    schema: dict[str, Any], definitions: dict[str, dict[str, Any]], processed_refs: dict[str, str], indent: int, increment: int, add_field_description: bool = False
) -> str:
    """
    Convert a JSON schema snippet to a TypeScript type (string).
    Handles objects, arrays, primitives, enums, oneOf/anyOf/allOf, and $ref.
    """

    # Handle $ref upfront
    if "$ref" in schema:
        ref = schema["$ref"]
        if ref in processed_refs:
            return processed_refs[ref]
        resolved = resolve_ref(ref, definitions)
        if resolved is None:
            return "any"
        processed_refs[ref] = ""  # to avoid recursion
        ts_type = schema_to_ts_type(resolved, definitions, processed_refs, indent, increment, add_field_description=add_field_description)
        processed_refs[ref] = ts_type
        return ts_type

    # Handle allOf, oneOf, anyOf
    if "allOf" in schema:
        # allOf means intersection of all subschemas
        subtypes = [schema_to_ts_type(s, definitions, processed_refs, indent, increment, add_field_description) for s in schema["allOf"]]
        return "(" + " & ".join(subtypes) + ")"

    if "oneOf" in schema:
        # oneOf means a union type
        subtypes = [schema_to_ts_type(s, definitions, processed_refs, indent, increment, add_field_description) for s in schema["oneOf"]]
        return "(" + " | ".join(subtypes) + ")"

    if "anyOf" in schema:
        # anyOf means a union type
        subtypes = [
            schema_to_ts_type(s, definitions, processed_refs, indent, increment, add_field_description)
            for s in schema["anyOf"]
            # Remove "null" from subtypes if it's present
            # if not (isinstance(s, dict) and s.get("type") == "null")
        ]
        if len(subtypes) == 1:
            return subtypes[0]

        return "(" + " | ".join(subtypes) + ")"

    # Handle enums
    if "enum" in schema:
        # Create a union of literal types
        enum_values = schema["enum"]
        ts_literals = []
        for val in enum_values:
            if isinstance(val, str):
                ts_literals.append(f'"{val}"')
            elif val is None:
                ts_literals.append("null")
            else:
                ts_literals.append(str(val).lower() if isinstance(val, bool) else str(val))
        return " | ".join(ts_literals)

    # Handle type
    schema_type = schema.get("type")
    if schema_type == "object" or "properties" in schema:
        # Inline object
        return inline_object(schema, definitions, processed_refs, indent, increment, add_field_description)
    elif schema_type == "array":
        # items define the type of array elements
        items_schema = schema.get("items", {})
        item_type = schema_to_ts_type(items_schema, definitions, processed_refs, indent + increment, increment, add_field_description)
        return f"Array<{item_type}>"
    else:
        # Primitive types or missing type
        if isinstance(schema_type, list):
            # union of multiple primitive types
            primitive_types = [primitive_type_to_ts(t) for t in schema_type]
            return "(" + " | ".join(primitive_types) + ")"
        else:
            # single primitive
            return primitive_type_to_ts(schema_type)


def inline_object(schema: dict[str, Any], definitions: dict[str, dict[str, Any]], processed_refs: dict[str, str], indent: int, increment: int, add_field_description: bool) -> str:
    """
    Inline an object type from a JSON schema into a TypeScript type.
    """
    properties = schema.get("properties", {})
    required_fields = set(schema.get("required", []))
    lines = ["{"]
    field_indentation = " " * (indent + increment)
    for prop_name, prop_schema in properties.items():
        is_optional = prop_name not in required_fields
        ts_type = schema_to_ts_type(prop_schema, definitions, processed_refs, indent + increment, increment, add_field_description)
        optional_flag = "?" if is_optional else ""
        line = ""
        if add_field_description and "description" in prop_schema:
            desc = prop_schema["description"].replace("\n", f"\n{field_indentation}// ")
            line = f"{field_indentation}// {desc}\n"
        line += f"{field_indentation}{prop_name}{optional_flag}: {ts_type};"
        lines.append(line)
    lines.append(" " * indent + "}")
    return "\n".join(lines)


def primitive_type_to_ts(t: Union[str, None]) -> str:
    """
    Convert a primitive JSON schema type to a TypeScript type.
    """
    if t == "string":
        return "string"
    elif t in ("integer", "number"):
        return "number"
    elif t == "boolean":
        return "boolean"
    elif t == "null":
        return "null"
    elif t is None:
        # no specific type given
        return "any"
    else:
        # fallback
        return "any"


def resolve_ref(ref: str, definitions: dict[str, dict[str, Any]]) -> Optional[dict[str, Any]]:
    """
    Resolve a $ref against the given definitions.
    The schema uses $defs. Ref format: "#/$defs/SomeDefinition"
    """
    if ref.startswith("#/$defs/"):
        key = ref[len("#/$defs/") :]
        return definitions.get(key)
    # No known resolution strategy
    return None


def json_schema_to_strict_openai_schema(obj: Union[dict[str, Any], list[Any]]) -> Union[dict[str, Any], list[Any]]:
    # Gets a json supported by GPT Structured Output from a pydantic Basemodel

    if isinstance(obj, dict):
        new_obj: dict[str, Any] = copy.deepcopy(obj)

        # Remove some not-supported fields
        for key in ["default", "format", "X-FieldTranslation", "X-EnumTranslation"]:
            new_obj.pop(key, None)

        # Handle integer type
        if "type" in new_obj:
            if new_obj["type"] == "integer":
                new_obj["type"] = "number"
            elif isinstance(new_obj["type"], list):
                new_obj["type"] = ["number" if t == "integer" else t for t in new_obj["type"]]

        # Handle allOf
        if "allOf" in new_obj:
            subschemas = new_obj.pop("allOf")
            merged: dict[str, Any] = {}
            for subschema in subschemas:
                if "$ref" in subschema:
                    merged.update({"$ref": subschema["$ref"]})
                else:
                    merged.update(json_schema_to_strict_openai_schema(subschema))
            new_obj.update(merged)

        # Handle anyOf
        if "anyOf" in new_obj:
            new_obj["anyOf"] = [json_schema_to_strict_openai_schema(subschema) for subschema in new_obj["anyOf"]]

        # Handle enum (force type to string)
        if "enum" in new_obj:
            new_obj["enum"] = [str(e) for e in new_obj["enum"]]
            new_obj["type"] = "string"

        # Handle object type
        if new_obj.get("type") == "object" and "properties" in new_obj and isinstance(new_obj["properties"], dict):
            new_obj["required"] = list(new_obj["properties"].keys())
            new_obj["additionalProperties"] = False
            new_obj["properties"] = {k: json_schema_to_strict_openai_schema(v) for k, v in new_obj["properties"].items()}

        # Handle array type
        if new_obj.get("type") == "array" and "items" in new_obj:
            new_obj["items"] = json_schema_to_strict_openai_schema(new_obj["items"])

        # Handle defs
        if "$defs" in new_obj:
            new_obj["$defs"] = {k: json_schema_to_strict_openai_schema(v) for k, v in new_obj["$defs"].items()}

        return new_obj
    elif isinstance(obj, list):
        return [json_schema_to_strict_openai_schema(item) for item in obj]
    else:
        return obj
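# [Illustrative sketch — not part of the original module; assumes the strict
#  transformation above, which makes every property required, forbids extra
#  keys, and widens integer to number.]
#
#     json_schema_to_strict_openai_schema(
#         {"type": "object", "properties": {"n": {"type": "integer", "default": 0}}}
#     )
#     # -> {"type": "object", "required": ["n"], "additionalProperties": False,
#     #     "properties": {"n": {"type": "number"}}}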


def clean_schema(schema: dict[str, Any], remove_custom_fields: bool = False, fields_to_remove: list[str] = ["default", "minlength", "maxlength"]) -> dict[str, Any]:
    """
    Recursively remove specified fields from a JSON schema.

    Args:
        schema: The JSON schema to be cleaned.
        remove_custom_fields: If True, also remove fields starting with 'x-'.
        fields_to_remove: List of keys to remove (case-insensitive check).

    Returns:
        The resulting cleaned JSON schema.
    """
    schema = schema.copy()
    lower_fields_to_remove = [f.lower() for f in fields_to_remove]
    for key in list(schema.keys()):
        if not isinstance(key, str):
            continue

        lower_key = key.lower()

        conditions_to_remove = [
            # Empty keys
            not key,
            # Empty subschemas
            isinstance(schema[key], dict) and len(schema[key]) == 0,
            # Fields to remove
            lower_key in lower_fields_to_remove,
            # Custom fields
            remove_custom_fields and lower_key.startswith("x-"),
        ]

        if any(conditions_to_remove):
            schema.pop(key)
            continue

    if "properties" in schema:
        schema["properties"] = {
            prop_key: clean_schema(prop_schema, fields_to_remove=fields_to_remove, remove_custom_fields=remove_custom_fields)
            for prop_key, prop_schema in schema["properties"].items()
        }
    if "items" in schema:
        schema["items"] = clean_schema(schema["items"], fields_to_remove=fields_to_remove, remove_custom_fields=remove_custom_fields)
    if "$defs" in schema:
        schema["$defs"] = {k: clean_schema(v, fields_to_remove=fields_to_remove, remove_custom_fields=remove_custom_fields) for k, v in schema["$defs"].items()}
    if "allOf" in schema:
        schema["allOf"] = [clean_schema(subschema, fields_to_remove=fields_to_remove, remove_custom_fields=remove_custom_fields) for subschema in schema["allOf"]]
    if "anyOf" in schema:
        schema["anyOf"] = [clean_schema(subschema, fields_to_remove=fields_to_remove, remove_custom_fields=remove_custom_fields) for subschema in schema["anyOf"]]

    return schema
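# [Illustrative sketch — not part of the original module; assumes the
#  clean_schema behavior above: listed keys are stripped case-insensitively,
#  and remove_custom_fields also drops any "x-" extension key.]
#
#     clean_schema(
#         {"type": "string", "default": "n/a", "X-FieldPrompt": "hint"},
#         remove_custom_fields=True,
#     )
#     # -> {"type": "string"}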


def add_reasoning_sibling_inplace(properties: dict[str, Any], field_name: str, reasoning_desc: str) -> None:
    """
    Add a reasoning sibling for a given property field_name into properties dict.
    We'll use the naming convention reasoning___<field_name>.
    If the field_name is 'root', we add 'reasoning___root'.
    """
    reasoning_key = f"reasoning___{field_name}"
    new_properties: dict[str, Any]
    if field_name == "root":
        new_properties = {reasoning_key: {"type": "string", "description": reasoning_desc}, **properties}
    else:
        # Insert reasoning_key just above the field_name
        new_properties = {}
        for key, value in properties.items():
            if key == field_name:
                new_properties[reasoning_key] = {"type": "string", "description": reasoning_desc}
            new_properties[key] = value
    properties.clear()
    properties.update(new_properties)


def _insert_reasoning_fields_inner(schema: dict[str, Any]) -> tuple[dict[str, Any], str | None]:
    """
    Inner function that returns (updated_schema, reasoning_desc_for_this_node).
    The parent caller (which handles 'properties') will add the sibling reasoning field if reasoning_desc_for_this_node is not None.
    """
    reasoning_desc = schema.pop("X-ReasoningPrompt", None)

    node_type = schema.get("type")

    # Process children recursively
    # If object: process properties
    if node_type == "object" or "$ref" in schema:
        if "properties" in schema and isinstance(schema["properties"], dict):
            new_props = {}
            for property_key, property_value in schema["properties"].items():
                updated_prop_schema, child_reasoning = _insert_reasoning_fields_inner(property_value)
                new_props[property_key] = updated_prop_schema
                if child_reasoning:
                    add_reasoning_sibling_inplace(new_props, property_key, child_reasoning)
                    # Add the reasoning field to required if the property is required
                    if "required" in schema and property_key in schema["required"]:
                        schema["required"].append(f"reasoning___{property_key}")
            schema["properties"] = new_props

        if "$defs" in schema and isinstance(schema["$defs"], dict):
            new_defs = {}
            for dk, dv in schema["$defs"].items():
                updated_def_schema, _ = _insert_reasoning_fields_inner(dv)
                new_defs[dk] = updated_def_schema
            schema["$defs"] = new_defs

    elif node_type == "array" and "items" in schema:
        # Recurse into items if present
        updated_items, item_reasoning = _insert_reasoning_fields_inner(schema["items"])
        schema["items"] = updated_items

        # If the item schema has a reasoning prompt, create a reasoning field inside the item
        if item_reasoning and updated_items.get("type") == "object":
            # Create reasoning field for array items
            if "properties" not in updated_items:
                updated_items["properties"] = {}

            # Add the reasoning field as first property
            reasoning_key = "reasoning___item"
            new_properties = {reasoning_key: {"type": "string", "description": item_reasoning}}

            # Add the rest of the properties
            for key, value in updated_items["properties"].items():
                new_properties[key] = value

            updated_items["properties"] = new_properties

            # Add to required if we have required fields
            if "required" in updated_items:
                updated_items["required"].insert(0, reasoning_key)
            else:
                updated_items["required"] = [reasoning_key]

    return schema, reasoning_desc


def _insert_quote_fields_inner(schema: dict[str, Any]) -> dict[str, Any]:
    """
    Inner function that processes a schema and adds quote___ fields for leaf nodes with X-ReferenceQuote: true.
    Only applies to leaf fields, never to the root.
    """
    if not isinstance(schema, dict):
        return schema

    # Create a copy to avoid modifying the original
    new_schema = copy.deepcopy(schema)

    # Process children recursively
    if "properties" in new_schema and isinstance(new_schema["properties"], dict):
        new_props = {}
        for property_key, property_value in new_schema["properties"].items():
            updated_prop_schema_value = _insert_quote_fields_inner(property_value)
            has_quote_field = updated_prop_schema_value.get("X-ReferenceQuote") is True

            # Check if this property is a leaf with X-ReferenceQuote: true
            if has_quote_field:
                # Add the quote field
                quote_key = f"quote___{property_key}"
                new_props[quote_key] = {"type": "string"}

                # Add the quote field to required if the property is required
                if "required" in new_schema and property_key in new_schema["required"]:
                    # add the quote field to required just before the property_key
                    new_schema["required"].insert(new_schema["required"].index(property_key), quote_key)

                # Remove the X-ReferenceQuote field
                updated_prop_schema_value.pop("X-ReferenceQuote", None)

            new_props[property_key] = updated_prop_schema_value
        new_schema["properties"] = new_props

    elif "items" in new_schema:
        # Recurse into items if present
        updated_items = _insert_quote_fields_inner(new_schema["items"])
        new_schema["items"] = updated_items

    return new_schema


def _rec_replace_description_with_llm_description(schema: dict[str, Any]) -> dict[str, Any]:
    """
    Recursively replace the description field with X-ReasoningPrompt if present.
    """
    if not isinstance(schema, dict):
        return schema

    new_schema = copy.deepcopy(schema)
    if "description" in new_schema or "X-FieldPrompt" in new_schema:
        new_schema["description"] = new_schema.pop("X-FieldPrompt", new_schema.get("description"))
        if new_schema["description"] is None:
            new_schema.pop("description")
        elif "default" in new_schema:
            new_schema["description"] += f"\nUser Provided a Default Value: {json.dumps(new_schema['default'])}"

    if "properties" in new_schema:
        new_schema["properties"] = {k: _rec_replace_description_with_llm_description(v) for k, v in new_schema["properties"].items()}

    if "items" in new_schema:
        new_schema["items"] = _rec_replace_description_with_llm_description(new_schema["items"])

    if "$defs" in new_schema:
        new_schema["$defs"] = {k: _rec_replace_description_with_llm_description(v) for k, v in new_schema["$defs"].items()}

    return new_schema


def create_reasoning_schema(json_schema: dict[str, Any]) -> dict[str, Any]:
    # Resolve refs first to get expanded schema
    definitions = json_schema.get("$defs", {})
    resolved = expand_refs(copy.deepcopy(json_schema), definitions)
    # resolved.pop("$defs", None)

    expanded_schema = copy.deepcopy(resolved)

    # Insert reasoning fields.
    # We'll handle the root reasoning similarly: if root has reasoning, we add reasoning___root
    updated_schema, root_reasoning = _insert_reasoning_fields_inner(copy.deepcopy(expanded_schema))

    if root_reasoning:
        # Root is an object (assumed). Add reasoning___root at top-level properties
        if "properties" not in updated_schema:
            updated_schema["properties"] = {}
        add_reasoning_sibling_inplace(updated_schema["properties"], "root", root_reasoning)
        if "required" in updated_schema:
            updated_schema["required"].append("reasoning___root")

    # Insert quote fields for leaf nodes with X-ReferenceQuote: true
    updated_schema = _insert_quote_fields_inner(updated_schema)

    # Clean up $defs from inference_schema if desired (optional)
    # if "$defs" in updated_schema:
    #     updated_schema.pop("$defs", None)

    # Replace description with X-FieldPrompt if present
    updated_schema = _rec_replace_description_with_llm_description(updated_schema)

    # Clean the schema (remove defaults, etc)
    updated_schema = clean_schema(updated_schema, remove_custom_fields=True)
    return updated_schema
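# [Illustrative sketch — not part of the original module; assumes the
#  reasoning-field injection above: an X-ReasoningPrompt on a property becomes
#  a string sibling named reasoning___<field>, inserted just before it and
#  mirrored into "required".]
#
#     create_reasoning_schema(
#         {"type": "object",
#          "required": ["total"],
#          "properties": {"total": {"type": "number",
#                                   "X-ReasoningPrompt": "Explain the math."}}}
#     )
#     # -> {"type": "object", "required": ["total", "reasoning___total"],
#     #     "properties": {
#     #         "reasoning___total": {"type": "string", "description": "Explain the math."},
#     #         "total": {"type": "number"}}}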


def cleanup_reasoning(output_data: Any, reasoning_preffix: str = "reasoning___") -> Any:
    """
    Recursively removes all reasoning key/values from the output data. Reasoning keys starts with 'reasoning___'.
    """
    if isinstance(output_data, dict):
        new_dict = {}
        for k, v in output_data.items():
            if not k.startswith(reasoning_preffix):
                new_dict[k] = cleanup_reasoning(v)
        return new_dict
    elif isinstance(output_data, list):
        return [cleanup_reasoning(item) for item in output_data]
    else:
        return output_data


# Other utils


def cast_all_leaves_from_json_schema_to_type(leaf: dict[str, Any], new_type: Literal["string", "boolean"], is_optional: bool = True) -> dict[str, Any]:
    new_leaf: dict[str, Any] = {}
    # new_leaf["description"] = "Here goes the suggestion, if any, or null."
    if leaf.get("type") == "object":
        new_leaf["type"] = "object"
        new_leaf["properties"] = {}
        for key, value in leaf["properties"].items():
            new_leaf["properties"][key] = cast_all_leaves_from_json_schema_to_type(value, new_type, is_optional=is_optional)
    elif leaf.get("type") == "array":
        new_leaf["type"] = "array"
        new_leaf["items"] = cast_all_leaves_from_json_schema_to_type(leaf["items"], new_type, is_optional=is_optional)
    else:
        if is_optional:
            new_leaf["anyOf"] = [{"type": new_type}, {"type": "null"}]
        else:
            new_leaf["type"] = new_type
    return new_leaf


SCHEMA_TYPES = Literal["string", "integer", "number", "boolean", "array", "object"]
# SCHEMA_STRING_DATE_FORMATS = Literal["date", "iso-date"]
# SCHEMA_STRING_TIME_FORMATS = Literal["time", "iso-time"]
# SCHEMA_STRING_DATETIME_FORMATS = Literal["datetime", "iso-datetime"]
# SCHEMA_STRING_CUSTOM_FORMATS = Literal["email", "phone-number", "vat-number"]


def get_pydantic_primitive_field_type(
    type_: SCHEMA_TYPES | str, format_: str | None, is_nullable: bool = False, validator_func: Callable | None = None, enum_values: list[Any] | None = None
) -> Any:
    python_base_type: Any

    if enum_values is not None:
        python_base_type = Literal[tuple(enum_values)]  # type: ignore
    elif type_ == "string":
        if format_ in ("date", "iso-date"):
            python_base_type = datetime.date
        if format_ in ("time", "iso-time"):
            python_base_type = datetime.time
        if format_ in ("datetime", "iso-datetime"):
            python_base_type = datetime.datetime
        else:
            python_base_type = str
    elif type_ == "integer":
        python_base_type = int
    elif type_ == "number":
        python_base_type = float
    elif type_ == "boolean":
        python_base_type = bool
    elif type_ == "array":
        python_base_type = list
    elif type_ == "object":
        python_base_type = dict
    else:
        raise ValueError(f"Unsupported schema type: {type_}")

    field_kwargs: Any = {"json_schema_extra": {"format": format_}} if format_ is not None else {}

    final_type: Any = Annotated[python_base_type, Field(..., **field_kwargs)]
    final_type = Optional[final_type] if is_nullable or validator_func is not None else final_type
    if validator_func is not None:
        return Annotated[final_type, BeforeValidator(validator_func)]
    return final_type


# Defaultdict that returns a no-op lambda for unknown keys, then merges known validators
# Expansive coercion functions (can evolve on time)
KNOWN_COERCIONS: dict[tuple[str | None, str | None], Callable[[Any], Any]] = defaultdict(lambda: lambda x: x) | {
    # ("string", "iso-date"): validate_date,
    # ("string", "iso-time"): validate_time,
    # ("string", "email"): validate_email_address,
    # ("string", "phone-number"): validate_phone_number,
    # ("string", "vat-number"): validate_vat_number,
    ("integer", None): validate_integer,
    ("number", None): validate_float,
    ("boolean", None): validate_bool,
    ("string", None): validate_str,
}


def object_format_coercion(instance: dict[str, Any], schema: dict[str, Any]) -> dict[str, Any]:
    """
    Coerces an instance to conform to a JSON Schema, applying defaults and handling nullable fields.
    Converts empty strings to None only if the field is optional.
    """

    def recursive_coercion(_instance: Any, _schema: dict[str, Any]) -> Any:
        # 1. Handle object type
        if _schema.get("type") == "object":
            if not isinstance(_instance, dict):
                return _schema.get("default", {})
            coerced_instance = {}
            for prop_key, prop_schema in _schema.get("properties", {}).items():
                coerced_instance[prop_key] = recursive_coercion(_instance.get(prop_key), prop_schema)
            return coerced_instance

        # 2. Handle array type
        if _schema.get("type") == "array":
            if not isinstance(_instance, list):
                return _schema.get("default", [])
            return [recursive_coercion(value, _schema.get("items", {})) for value in _instance]

        # 3. Handle anyOf (optional fields)
        if "anyOf" in _schema:
            is_field_optional = any(sub.get("type") == "null" for sub in _schema["anyOf"])
            if is_field_optional and (_instance == "" or _instance is None):
                return None

            # Try to coerce with the first matching subschema
            for subschema in _schema["anyOf"]:
                # Skip null subschema for explicit coercion; handled above
                if subschema.get("type") == "null":
                    continue
                coerced_value = recursive_coercion(_instance, subschema)
                if coerced_value is not None:
                    return coerced_value
            return None  # If none match, return None

        # 4. Handle primitive types and known coercions
        schema_type = _schema.get("type")
        ## Custom Formats that are not supported by default should be supplied as X-format.
        schema_format = _schema.get("X-format") or _schema.get("format")

        # Use default if instance is None
        if _instance is None:
            _instance = _schema.get("default")

        # If schema type is null, just return None
        if schema_type == "null":
            return None

        # Apply known coercion
        if (schema_type, schema_format) in KNOWN_COERCIONS:
            return KNOWN_COERCIONS[(schema_type, schema_format)](_instance)

        return _instance  # Return as-is if no coercion is required

    expanded_schema = expand_refs(schema)
    coerced = recursive_coercion(instance, expanded_schema)
    return coerced if coerced is not None else {}
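# [Illustrative sketch — not part of the original module; assumes the
#  KNOWN_COERCIONS table and schema walk above, which settle loosely-typed
#  values into the declared types.]
#
#     object_format_coercion(
#         {"count": "12.0", "active": "yes"},
#         {"type": "object",
#          "properties": {"count": {"type": "integer"},
#                         "active": {"type": "boolean"}}},
#     )
#     # -> {"count": 12, "active": True}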


def flatten_dict(obj: Any, prefix: str = "", allow_empty_objects: bool = True) -> dict[str, Any]:
    items = []  # type: ignore
    if isinstance(obj, dict):
        if len(obj) == 0 and allow_empty_objects:
            # Keep empty dicts as dicts (so we can keep its structure)
            items.append((prefix, {}))
        else:
            for k, v in obj.items():
                new_key = f"{prefix}.{k}" if prefix else k
                items.extend(flatten_dict(v, new_key, allow_empty_objects=allow_empty_objects).items())

    elif isinstance(obj, list):
        if len(obj) == 0 and allow_empty_objects:
            # Keep empty lists as lists (so we can keep its structure)
            items.append((prefix, []))
        else:
            for i, v in enumerate(obj):
                new_key = f"{prefix}.{i}"
                items.extend(flatten_dict(v, new_key, allow_empty_objects=allow_empty_objects).items())
    else:
        items.append((prefix, obj))
    return dict(items)


def convert_dict_to_list_recursively(_obj: Any, allow_lists: bool = True) -> Any:
    """
    Recursively converts dict[int, Any] to list[Any] if the keys are sequential integers starting from 0.
    Creates a copy of the input object rather than modifying it in place.
    """
    # Handle non-dict types
    if not isinstance(_obj, dict):
        return _obj

    # Create a copy to avoid modifying the original
    result = {}

    # Process all nested dictionaries first
    for key, value in _obj.items():
        result[key] = convert_dict_to_list_recursively(value, allow_lists=allow_lists)

    # Check if this dictionary should be converted to a list
    if result and all(isinstance(k, int) for k in result.keys()):
        # Check if keys are sequential starting from 0
        keys = sorted(result.keys())
        if allow_lists and keys[0] == 0 and keys[-1] == len(keys) - 1:
            # Convert to list
            return [result[i] for i in keys]
        else:
            # Sort the keys and convert to string
            return {str(i): result[i] for i in keys}

    return result


def unflatten_dict(obj: dict[str, Any], allow_lists: bool = True) -> Any:
    """
    Unflattens a dictionary by recursively converting keys with dots into nested dictionaries.
    After building the nested structure, converts dict[int, Any] to list[Any] if the keys
    are sequential integers starting from 0.

    Args:
        obj: The dictionary to unflatten.

    Returns:
        The unflattened dictionary with appropriate dict[int, Any] converted to list[Any].
    """
    # Handle empty input
    if not obj:
        return obj

    # Create a copy of the input object to avoid modifying it
    input_copy = dict(obj)

    # Optionally validate that the dict is indeed flat
    # Commented out to avoid potential equality issues with key ordering
    # assert flatten_dict(input_copy) == input_copy, "Dictionary is not flat"

    # First pass: build everything as nested dictionaries
    result = {}
    for key, value in input_copy.items():
        # Skip invalid keys
        if not isinstance(key, str):
            continue

        parts = key.split(".")
        # Filter out empty parts
        valid_parts = [p for p in parts if p]
        if not valid_parts:
            result[key] = value
            continue

        current = result

        for i, part in enumerate(valid_parts):
            # Check if the part is an integer (for list indices)
            try:
                # More robust integer parsing - handles negative numbers too
                if part.lstrip("-").isdigit():
                    part = int(part)
            except (ValueError, AttributeError):
                # If conversion fails, keep as string
                pass

            # If at the last part, set the value
            if i == len(valid_parts) - 1:
                current[part] = value
            else:
                # Create the container if it doesn't exist
                if part not in current:
                    current[part] = {}
                elif not isinstance(current[part], dict):
                    # Handle case where we're trying to nest under a non-dict
                    # This is a conflict - the path is both a value and used as a prefix
                    current[part] = {}

                current = current[part]

    # Second pass: convert appropriate dict[int, Any] to list[Any]
    return convert_dict_to_list_recursively(result, allow_lists=allow_lists)
|
1320
|
-
|
1321
|
-
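
# --- Illustrative example (not part of the original module) ---
# Dotted keys are split into nested dicts, and integer segments are rebuilt
# into lists by the second pass. The sample payload is hypothetical.
flat = {"person.name": "Ada", "person.tags.0": "math", "person.tags.1": "code"}
assert unflatten_dict(flat) == {"person": {"name": "Ada", "tags": ["math", "code"]}}
# With allow_lists=False the integer keys are kept, but stringified:
assert unflatten_dict(flat, allow_lists=False) == {"person": {"name": "Ada", "tags": {"0": "math", "1": "code"}}}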

def extract_property_type_info(prop_schema: dict[str, Any]) -> tuple[str, Optional[str], bool, list[Any] | None]:
    """
    Extract the property type, possible 'format'/'X-format', and nullability from a property schema.
    - If an 'anyOf' with exactly one 'null' type is used, we unify it into a single schema
      (i.e., prop_schema plus is_nullable=True).
    - This ensures 'enum', 'format', etc. are preserved from the non-null sub-schema.

    Returns:
        (prop_type, prop_format, is_nullable, enum_values)
    """
    is_nullable = False

    if "anyOf" in prop_schema:
        sub_schemas = prop_schema["anyOf"]
        sub_types = [s.get("type") for s in sub_schemas if isinstance(s, dict)]

        # We only handle the scenario: anyOf: [{type=XYZ,...}, {type=null}]
        # If you have more complex unions, you'll need additional logic.
        if len(sub_schemas) == 2 and "null" in sub_types:
            # Identify the non-null sub-schema
            valid_sub = next(s for s in sub_schemas if s.get("type") != "null")
            is_nullable = True

            # Merge *everything* (enum, format, x-, etc.) from the valid_sub
            # into prop_schema. This ensures we don't lose 'enum', 'format', etc.
            prop_schema.update(valid_sub)
            # Remove the anyOf now that it's merged
            prop_schema.pop("anyOf", None)
        else:
            raise ValueError(f"'anyOf' structure not supported or doesn't match a single null type. Found: {sub_schemas}")

    # At this point, we expect a single 'type' in the property
    if "type" not in prop_schema:
        raise ValueError("Property schema must have a 'type' or a supported 'anyOf' pattern.")

    prop_type = prop_schema["type"]
    # Pop 'format' or 'X-format' if any
    prop_format = prop_schema.pop("format", None) or prop_schema.pop("X-format", None)
    enum_values = prop_schema.get("enum", None)

    return prop_type, prop_format, is_nullable, enum_values
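
# --- Illustrative example (not part of the original module) ---
# A nullable enum property collapses to its non-null sub-schema; note the
# call mutates prop_schema in place (anyOf removed, format popped).
prop = {"anyOf": [{"type": "string", "enum": ["draft", "final"], "format": "custom"}, {"type": "null"}]}
assert extract_property_type_info(prop) == ("string", "custom", True, ["draft", "final"])
assert "anyOf" not in prop and "format" not in prop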

def _convert_property_schema_to_type(prop_schema: dict[str, Any]) -> Any:
    """
    Convert a single JSON Schema property to a Python type annotation:
    - If 'enum' => Literal[...]
    - If 'type=object' => nested submodel
    - If 'type=array' => list[sub_type]
    - If 'type=string/integer/number/boolean' => str/int/float/bool
    """
    # If there's an enum, return a Literal of the enum values
    if "enum" in prop_schema:
        # Convert each enum value to the correct Python literal
        enum_values = prop_schema["enum"]
        return Literal[tuple(enum_values)]  # type: ignore

    # Otherwise check 'type'
    prop_type = prop_schema.get("type")

    if prop_type == "object":
        # Nested submodel
        # If 'properties' is missing, that might be an empty dict
        if "properties" in prop_schema:
            return convert_json_schema_to_basemodel(prop_schema)
        else:
            # fallback
            return dict

    if prop_type == "array":
        # Look for 'items' => sub-schema
        items_schema = prop_schema.get("items", {})
        item_type = _convert_property_schema_to_type(items_schema)
        return list[item_type]  # type: ignore

    if prop_type == "string":
        return str
    if prop_type == "boolean":
        return bool
    if prop_type == "integer":
        return int
    if prop_type == "number":
        return float

    # If the schema is "null" or unknown, fallback to object
    return object
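
# --- Illustrative example (not part of the original module) ---
# Primitives map to builtins, arrays recurse on 'items', enums become Literal
# (Literal is assumed to be imported at the top of this module, as the
# function itself uses it).
assert _convert_property_schema_to_type({"type": "integer"}) is int
assert _convert_property_schema_to_type({"type": "array", "items": {"type": "string"}}) == list[str]
assert _convert_property_schema_to_type({"enum": ["a", "b"]}) == Literal["a", "b"]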

def convert_json_schema_to_basemodel(schema: dict[str, Any]) -> Type[BaseModel]:
    """
    Create a Pydantic BaseModel dynamically from a JSON Schema:
    - Expand refs
    - For each property, figure out if it's required
    - Convert 'type': 'object' => nested submodel
    - Convert 'enum' => Literal
    - 'array' => list[submodel or primitive]
    - Primitives => str, int, float, bool
    - Preserves anyOf/oneOf structure for nullable fields
    """
    # 1) Expand references (inlines $refs)
    schema_expanded = expand_refs(copy.deepcopy(schema))

    # 2) Figure out model name
    model_name = schema_expanded.get("title", "DynamicModel")

    # 3) Collect any X-* keys for model config
    x_keys = {k: v for k, v in schema_expanded.items() if k.startswith("X-")}
    model_config = ConfigDict(extra="forbid", json_schema_extra=x_keys) if x_keys else ConfigDict(extra="forbid")

    # 4) Build up the field definitions
    properties = schema_expanded.get("properties", {})
    required_props = set(schema_expanded.get("required", []))

    field_definitions = {}
    for prop_name, prop_schema in properties.items():
        # If property is required => default=...
        # Else => default=None
        if prop_name in required_props:
            default_val = prop_schema.get("default", ...)
        else:
            default_val = prop_schema.get("default", None)

        # We also keep 'description', 'title', 'X-...' and everything else
        # that's needed to preserve schema structure for round-trip conversion
        field_kwargs = {
            "description": prop_schema.get("description"),
            "title": prop_schema.get("title"),
        }

        # Include all original schema structure for proper round-trip conversion
        schema_extra = {}
        for k, v in prop_schema.items():
            if k not in {"description", "title", "default"} and not k.startswith("$"):
                schema_extra[k] = v

        if schema_extra:
            field_kwargs["json_schema_extra"] = schema_extra

        # Handle anyOf for nullable types specially
        if "anyOf" in prop_schema:
            # Check if it's a standard nullable pattern: [type, null]
            sub_schemas = prop_schema["anyOf"]
            null_schemas = [s for s in sub_schemas if s.get("type") == "null"]
            non_null_schemas = [s for s in sub_schemas if s.get("type") != "null"]

            if len(null_schemas) == 1 and len(non_null_schemas) == 1:
                # Standard nullable field pattern
                non_null_schema = non_null_schemas[0]
                inner_type = _convert_property_schema_to_type(non_null_schema)
                python_type = Union[inner_type, None]
            else:
                # More complex anyOf structure - preserve it in schema_extra
                python_type = object

            field_definitions[prop_name] = (python_type, Field(default_val, **field_kwargs))
            continue

        # Convert to a Python type annotation
        python_type = _convert_property_schema_to_type(prop_schema)

        # If a field is not in `required`, we typically wrap it in `Optional[...]`
        if prop_name not in required_props and not is_already_optional(python_type):
            python_type = Union[python_type, None]

        field_definitions[prop_name] = (python_type, Field(default_val, **field_kwargs))

    # 5) Build the dynamic model
    return create_model(
        model_name,
        __config__=model_config,
        __module__="__main__",
        **field_definitions,
    )  # type: ignore
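
# --- Illustrative usage (not part of the original module) ---
# Building a model from a small inline schema; assumes expand_refs is a
# no-op when the schema contains no $refs.
person_schema = {
    "title": "Person",
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "age": {"anyOf": [{"type": "integer"}, {"type": "null"}]},
    },
    "required": ["name"],
}
Person = convert_json_schema_to_basemodel(person_schema)
p = Person.model_validate({"name": "Ada", "age": None})
assert p.name == "Ada" and p.age is None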

def convert_json_schema_to_basemodelold(schema: dict[str, Any]) -> Type[BaseModel]:
    """
    Create a Pydantic BaseModel dynamically from a JSON Schema.
    Steps:
    1. Expand all refs.
    2. For each property, parse type info and create a suitable Pydantic field.
    3. Nested objects -> submodels, arrays -> list[type].
    4. Keep 'enum' and 'format' in the final schema so Pydantic sees them in the
       generated model's JSON schema.
    """
    # 1. Expand references
    schema_expanded = expand_refs(copy.deepcopy(schema))

    # 2. Gather 'X-*' keys from the root for the config
    x_keys = {k: v for k, v in schema_expanded.items() if k.startswith("X-")}

    # 3. Prepare dynamic model fields
    field_definitions: Any = {}

    # 4. Get properties + required
    props = schema_expanded.get("properties", {})
    required_fields = set(schema_expanded.get("required", []))

    for prop_name, prop_schema in props.items():
        # a) Determine the python type, format, and nullability
        prop_type, prop_format, is_nullable, enum_values = extract_property_type_info(prop_schema)
        field_kwargs = {
            "description": prop_schema.get("description"),
            "title": prop_schema.get("title"),
            # Put all schema extras, including 'enum', 'format', 'X-...' etc. into json_schema_extra
            "json_schema_extra": {k: v for k, v in prop_schema.items() if k.startswith("X-")},
        }

        # c) Determine the default or whether it's required
        if prop_name in required_fields:
            default_val = prop_schema.get("default", ...)
        else:
            default_val = prop_schema.get("default", None)

        # d) Dispatch based on prop_type
        if prop_type == "object":
            if "properties" not in prop_schema:
                raise ValueError(f"Schema for object '{prop_name}' must have 'properties' to build a submodel.")
            sub_model = convert_json_schema_to_basemodel(prop_schema)
            final_type = sub_model if not is_nullable else Optional[sub_model]

            field_definitions[prop_name] = (final_type, Field(default_val, **field_kwargs))

        elif prop_type == "array":
            # Handle arrays of both objects and primitive types
            items_schema = prop_schema.get("items", {})
            item_type, item_format, item_nullable, item_enum = extract_property_type_info(items_schema)

            if item_type == "object":
                # Handle array of objects
                sub_model = convert_json_schema_to_basemodel(items_schema)
                array_type = list[sub_model]  # type: ignore
            else:
                # Handle array of primitives
                item_python_type = get_pydantic_primitive_field_type(
                    item_type, item_format, is_nullable=item_nullable, validator_func=KNOWN_COERCIONS.get((item_type, item_format), None), enum_values=item_enum
                )
                array_type = list[item_python_type]  # type: ignore

            field_definitions[prop_name] = (array_type if not is_nullable else Optional[array_type], Field(default_val, **field_kwargs))

        else:
            # e) Primitive
            python_validator = KNOWN_COERCIONS.get((prop_type, prop_format), None)
            python_type = get_pydantic_primitive_field_type(prop_type, prop_format, is_nullable=is_nullable, validator_func=python_validator, enum_values=enum_values)

            # If the field can be null, or we have a validator that must accept None:
            field_definitions[prop_name] = (python_type, Field(default_val, **field_kwargs))

    # 5. Build the model class
    model_name: str = schema_expanded.get("title", "DynamicModel")
    model_config = ConfigDict(extra="forbid", json_schema_extra=x_keys) if x_keys else ConfigDict(extra="forbid")

    return create_model(
        model_name,
        __config__=model_config,
        __module__="__main__",
        **field_definitions,
    )

def is_basemodel_subclass(t: Any) -> bool:
    return isinstance(t, type) and issubclass(t, BaseModel)


def is_already_optional(t: Any) -> bool:
    """Return True if type t is Optional[...] or includes None in a Union."""
    return (get_origin(t) in {Union, types.UnionType}) and type(None) in get_args(t)

def convert_basemodel_to_partial_basemodel(base_model: Type[BaseModel]) -> Type[BaseModel]:
    """
    Convert a BaseModel class to a new BaseModel class where all fields are Optional.
    Handles nested BaseModels, lists, and unions recursively.
    """
    field_definitions: Any = {}
    maybe_optional_type: Any
    for field_name, field_info in base_model.model_fields.items():
        field_type = field_info.annotation

        # Handle nested BaseModel
        if is_basemodel_subclass(field_type):
            partial_nested = convert_basemodel_to_partial_basemodel(cast(Type[BaseModel], field_type))
            maybe_optional_type = Union[partial_nested, None]
        else:
            origin = get_origin(field_type)
            args = get_args(field_type)

            # Handle list[...] or tuple[...]
            if origin in (list, tuple) and args:
                inner_type = args[0]
                if is_basemodel_subclass(inner_type):
                    # Recursively convert the inner model
                    partial_inner = convert_basemodel_to_partial_basemodel(inner_type)
                    container_type = list if origin is list else tuple
                    new_type = container_type[partial_inner]  # type: ignore
                else:
                    new_type = field_type  # type: ignore
                maybe_optional_type = Union[new_type, None]  # type: ignore

            # Handle Union types
            elif origin in {Union, types.UnionType}:
                new_union_args: list[type] = []
                for arg in args:
                    if is_basemodel_subclass(arg):
                        new_union_args.append(convert_basemodel_to_partial_basemodel(arg))
                    else:
                        new_union_args.append(arg)
                # Make sure the union has None in it (to enforce optional)
                if type(None) not in new_union_args:
                    new_union_args.append(type(None))
                maybe_optional_type = Union[tuple(new_union_args)]  # type: ignore

            # Any other type - wrap in Optional unless already optional
            else:
                if is_already_optional(field_type):
                    maybe_optional_type = field_type
                else:
                    maybe_optional_type = Union[field_type, None]  # type: ignore

        field_definitions[field_name] = (cast(type, maybe_optional_type), None)

    # Dynamically create a new model
    return create_model(f"Partial{base_model.__name__}", __config__=base_model.model_config, __module__="__main__", **field_definitions)
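
# --- Illustrative usage (not part of the original module) ---
# Every field (including nested models) becomes Optional with default None,
# which suits validating partial/streamed payloads. The classes below are
# hypothetical examples.
class _Address(BaseModel):
    street: str

class _User(BaseModel):
    name: str
    address: _Address

PartialUser = convert_basemodel_to_partial_basemodel(_User)
assert PartialUser().name is None
assert PartialUser(address={"street": "Main St"}).address.street == "Main St"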

def load_json_schema(json_schema: Union[dict[str, Any], Path, str]) -> dict[str, Any]:
    """
    Load a JSON schema from either a dictionary or a file path.

    Args:
        json_schema: Either a dictionary containing the schema or a path to a JSON file

    Returns:
        dict[str, Any]: The loaded JSON schema

    Raises:
        JSONDecodeError: If the schema file contains invalid JSON
        FileNotFoundError: If the schema file doesn't exist
    """
    if isinstance(json_schema, (str, Path)):
        with open(json_schema) as f:
            return json.load(f)
    return json_schema

def filter_auxiliary_fields(data: dict[str, Any], prefixes: list[str] = ["reasoning___", "quote___"]) -> dict[str, Any]:
    """
    Recursively filters out fields that start with any of the prefixes in `prefixes` from the input data.
    """
    if not isinstance(data, dict):
        return data  # Base case: return non-dict values as is

    filtered: dict[str, Any] = {}
    for key, value in data.items():
        if not key.startswith(tuple(prefixes)):
            if isinstance(value, dict):
                filtered[key] = filter_auxiliary_fields(value, prefixes)
            elif isinstance(value, list):
                filtered[key] = [filter_auxiliary_fields(item, prefixes) if isinstance(item, dict) else item for item in value]
            else:
                filtered[key] = value

    return filtered
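
# --- Illustrative example (not part of the original module) ---
# Auxiliary reasoning___/quote___ keys are stripped at every nesting level.
noisy = {"reasoning___why": "chain of thought", "name": "Ada", "nested": {"quote___src": "page 3", "value": 2}}
assert filter_auxiliary_fields(noisy) == {"name": "Ada", "nested": {"value": 2}}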

def filter_auxiliary_fields_json(data: str, prefixes: list[str] = ["reasoning___", "quote___"]) -> dict[str, Any]:
    """
    Recursively filters out fields that start with any of the prefixes in `prefixes` from the input JSON data.
    """
    data_dict = json.loads(data)
    return filter_auxiliary_fields(data_dict, prefixes)

def get_all_paths(schema: dict[str, Any]) -> list[str]:
    """
    Extract all possible JSON pointer paths from a JSON Schema.

    This function traverses a JSON Schema and generates a list of all possible paths
    that could exist in a document conforming to that schema. For arrays, it uses '*'
    as a wildcard index.

    Args:
        schema (dict[str, Any]): The JSON Schema to analyze

    Returns:
        list[str]: A list of dot-notation paths (e.g. ["person.name", "person.addresses.*.street"])

    Example:
        >>> schema = {
        ...     "type": "object",
        ...     "properties": {
        ...         "name": {"type": "string"},
        ...         "addresses": {
        ...             "type": "array",
        ...             "items": {
        ...                 "type": "object",
        ...                 "properties": {
        ...                     "street": {"type": "string"}
        ...                 }
        ...             }
        ...         }
        ...     }
        ... }
        >>> get_all_paths(schema)
        ['name', 'addresses', 'addresses.*.street']
    """
    paths: list[str] = []

    def _traverse(current_schema: dict[str, Any], current_path: str = "") -> None:
        if any(key in current_schema for key in ["oneOf", "allOf"]):
            raise ValueError("OneOf and AllOf are not supported yet.")

        # Handle array type schemas
        # if current_schema.get("type") == "array":
        if "items" in current_schema:
            paths.append(f"{current_path}")
            _traverse(current_schema["items"], f"{current_path}.*")
            return

        # Handle object type schemas
        if "properties" in current_schema:
            for prop_name, prop_schema in current_schema["properties"].items():
                new_path = f"{current_path}.{prop_name}" if current_path else prop_name

                # If property is a leaf node (has type but no properties/items)
                if not any(key in prop_schema for key in ["properties", "items"]):
                    paths.append(new_path)
                else:
                    _traverse(prop_schema, new_path)

        # Handle $ref schemas
        elif "$ref" in current_schema:
            # Skip refs for now since we don't have access to the full schema with definitions
            pass

        # Handle anyOf/oneOf/allOf schemas
        elif any(key in current_schema for key in ["anyOf", "oneOf", "allOf"]):
            # Take first schema as representative for path generation
            for key in ["anyOf", "oneOf", "allOf"]:
                if key in current_schema and current_schema[key]:
                    _traverse(current_schema[key][0], current_path)
                    break

    _traverse(schema)
    return paths

def convert_schema_to_layout(schema: dict[str, Any]) -> dict[str, Any]:
    """
    Convert a JSON Schema (represented as a Python dict) into a Layout object.
    """
    # Get the definitions from the schema (or empty dict if not provided)
    defs = schema.get("$defs", {})
    converted_defs: dict[str, Column] = {}

    def is_object_schema(sch: dict[str, Any]) -> bool:
        return "properties" in sch and isinstance(sch.get("properties"), dict)

    def extract_ref(sch: dict[str, Any]) -> Optional[str]:
        return sch.get("$ref")

    def extract_ref_schema(ref: Optional[str], defs: dict[str, dict[str, Any]]) -> Optional[dict[str, Any]]:
        if not ref:
            return None
        ref_name = ref.split("/")[-1]
        return defs.get(ref_name)

    def is_object_via_any_of(sch: dict[str, Any]) -> bool:
        any_of = sch.get("anyOf")
        if isinstance(any_of, list):
            return any((extract_ref(option) and extract_ref_schema(extract_ref(option), defs)) or is_object_schema(option) for option in any_of)
        return False

    def property_is_object(prop_schema: dict[str, Any]) -> bool:
        ref = extract_ref(prop_schema)
        if ref:
            ref_schema = extract_ref_schema(ref, defs)
            return bool(ref_schema)
        return is_object_schema(prop_schema) or is_object_via_any_of(prop_schema)

    def property_is_array(prop_schema: dict[str, Any]) -> bool:
        return prop_schema.get("type") == "array"

    def handle_ref_object(prop_name: str, ref: str) -> RefObject:
        ref_name = ref.split("/")[-1]
        if ref_name not in converted_defs:
            ref_schema = extract_ref_schema(ref, defs)
            if ref_schema and is_object_schema(ref_schema):
                result = handle_object(ref_name, ref_schema, drop_name=True)
                assert isinstance(result, Column)
                converted_defs[ref_name] = result
        return RefObject(type="object", size=None, **{"$ref": ref})

    def handle_object(prop_name: str, object_schema: dict[str, Any], drop_name: bool = False) -> Union[RefObject, Column]:
        ref = extract_ref(object_schema)
        if ref:
            return handle_ref_object(prop_name, ref)
        else:
            props = object_schema.get("properties")
            if not props:
                # If no properties, try anyOf (skipping null types)
                any_of = object_schema.get("anyOf")
                if isinstance(any_of, list):
                    for option in any_of:
                        if option.get("type") != "null":
                            props = option.get("properties")
                            if props:
                                break
                if not props:
                    props = {}
            items: list[Row | RowList | FieldItem | RefObject] = []
            for p_name, p_schema in props.items():
                if property_is_object(p_schema):
                    # Wrap object properties in a row
                    items.append(Row(type="row", name=p_name, items=[handle_object(p_name, p_schema)]))
                elif property_is_array(p_schema):
                    items.append(handle_array_items(p_name, p_schema))
                else:
                    items.append(FieldItem(type="field", name=p_name, size=1))
            if drop_name:
                return Column(type="column", size=1, items=items)
            else:
                return Column(type="column", size=1, items=items, name=prop_name)

    def handle_array_items(prop_name: str, array_schema: dict[str, Any]) -> RowList:
        items_schema = array_schema.get("items", {})
        row_items: list[Column | FieldItem | RefObject] = []
        if property_is_object(items_schema):
            row_items.append(handle_object(prop_name, items_schema))
        else:
            row_items.append(FieldItem(type="field", name=prop_name, size=1))
        return RowList(type="rowList", name=prop_name, items=row_items)

    # Process definitions from $defs
    for definition_name, definition_schema in defs.items():
        if is_object_schema(definition_schema):
            result = handle_object(definition_name, definition_schema, drop_name=True)
            assert isinstance(result, Column)
            converted_defs[definition_name] = result

    # Process top-level properties
    top_level_props = schema.get("properties", {})
    top_level_items: list[Row | RowList | FieldItem | RefObject] = []
    for prop_name, prop_schema in top_level_props.items():
        if property_is_object(prop_schema):
            top_level_items.append(Row(type="row", name=prop_name, items=[handle_object(prop_name, prop_schema)]))
        elif property_is_array(prop_schema):
            top_level_items.append(handle_array_items(prop_name, prop_schema))
        else:
            top_level_items.append(FieldItem(type="field", name=prop_name, size=1))

    return Layout(type="column", size=1, items=top_level_items, **{"$defs": converted_defs}).model_dump(by_alias=True)

### Json Schema to NLP Data Structure


def get_type_str(field_schema):
    """
    Recursively determine the type string for a given schema field.
    Handles 'anyOf' unions, enums, arrays, and simple types.
    """
    if "anyOf" in field_schema:
        types = []
        for sub_schema in field_schema["anyOf"]:
            types.append(get_type_str(sub_schema))
        # Remove duplicates while preserving order
        seen = set()
        unique_types = []
        for t in types:
            if t not in seen:
                seen.add(t)
                unique_types.append(t)
        return " | ".join(unique_types)
    elif "enum" in field_schema:
        # Create a union of the literal enum values (as JSON strings)
        return " | ".join(json.dumps(val) for val in field_schema["enum"])
    elif "type" in field_schema:
        typ = field_schema["type"]
        if typ == "array" and "items" in field_schema:
            # For arrays, indicate the type of the items
            item_type = get_type_str(field_schema["items"])
            return f"array of {item_type}"
        return typ
    else:
        return "unknown"
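
# --- Illustrative example (not part of the original module) ---
# Unions join with " | ", enum values render as JSON strings, and arrays
# describe their item type.
assert get_type_str({"anyOf": [{"type": "string"}, {"type": "null"}]}) == "string | null"
assert get_type_str({"enum": ["low", "high"]}) == '"low" | "high"'
assert get_type_str({"type": "array", "items": {"type": "integer"}}) == "array of integer"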

def process_schema_field(field_name, field_schema, level, new_line_sep: str = "\n", field_name_prefix: str = ""):
    """
    Process a single field in the JSON schema.
    'level' indicates the header level (e.g., 3 for root, 4 for nested, etc.).
    Returns a markdown string representing the field.
    """
    md = ""
    field_name_complete = field_name_prefix + field_name

    # Extract type information
    type_str = get_type_str(field_schema)
    # md += f"**Type**: {type_str}{new_line_sep}"

    header = "#" * level + f" {field_name_complete} ({type_str})"
    md += header + new_line_sep

    # Extract description (or use a placeholder if not provided)
    description = field_schema.get("description", None)
    if description is not None:
        md += f"<Description>\n{description}\n</Description>"
    else:
        md += "<Description></Description>"

    md += new_line_sep * 2

    # If the field is an object with its own properties, process those recursively.
    if field_schema.get("type") == "object" and "properties" in field_schema:
        for sub_field_name, sub_field_schema in field_schema["properties"].items():
            md += process_schema_field(sub_field_name, sub_field_schema, level + 1, field_name_prefix=field_name_complete + ".")

    # If the field is an array and its items are objects with properties, process them.
    elif field_schema.get("type") == "array" and "items" in field_schema:
        items_schema = field_schema["items"]
        if items_schema.get("type") == "object" and "properties" in items_schema:
            md += process_schema_field("*", items_schema, level + 1, field_name_prefix=field_name_complete + ".")

    return md

def json_schema_to_nlp_data_structure(schema: dict) -> str:
    """
    Receives a JSON schema (without $defs or $ref) and returns a markdown string
    that documents each field with its name, description, and type (including
    unions and enums). Root-level fields use 3 hashtags, and nested fields
    add one hashtag per level.
    """
    schema_title = schema.get("title", schema.get("name", "Schema"))
    md = f"## {schema_title} -- NLP Data Structure\n\n"
    # Assume the root schema is an object with properties.
    if schema.get("type") == "object" and "properties" in schema:
        for field_name, field_schema in schema["properties"].items():
            md += process_schema_field(field_name, field_schema, 3)
    else:
        md += process_schema_field("root", schema, 3)
    return md
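
# --- Illustrative example (not part of the original module) ---
# A one-field schema renders to one markdown block per field.
invoice_schema = {
    "title": "Invoice",
    "type": "object",
    "properties": {"total": {"type": "number", "description": "Grand total."}},
}
nlp_md = json_schema_to_nlp_data_structure(invoice_schema)
assert "### total (number)" in nlp_md
assert "<Description>\nGrand total.\n</Description>" in nlp_md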

def nlp_data_structure_to_field_descriptions(nlp_data_structure: str) -> dict:
    """
    Extracts the per-field descriptions from an NLP data structure markdown string.

    Args:
        nlp_data_structure: A markdown string created by json_schema_to_nlp_data_structure, potentially with updated descriptions

    Returns:
        A dict mapping each field path (e.g. "parent.child") to its description text
    """

    # Pattern to match headers and extract field_name and type
    # Example: "### field_name (type)" or "#### parent.child (type)"
    header_pattern = re.compile(r"^(#+)\s+([^\s(]+)\s*\(([^)]*)\)")

    # Pattern to extract description between tags
    description_pattern = re.compile(r"<Description>(.*?)</Description>", re.DOTALL)

    # Split the markdown by lines
    lines = nlp_data_structure.split("\n")

    # Process the markdown to extract field names and descriptions
    field_descriptions = {}

    i = 0
    while i < len(lines):
        line = lines[i]

        # Check if this line is a header
        header_match = header_pattern.match(line)
        if header_match:
            field_path = header_match.group(2)  # Field name or path

            # Look for description in subsequent lines until next header
            desc_start = i + 1
            while desc_start < len(lines) and not header_pattern.match(lines[desc_start]):
                desc_start += 1

            # Extract description from the block of text
            description_block = "\n".join(lines[i + 1 : desc_start])
            desc_match = description_pattern.search(description_block)
            if desc_match:
                description_text = desc_match.group(1).strip()
                field_descriptions[field_path] = description_text

            i = desc_start - 1  # Will be incremented in the loop

        i += 1
    return field_descriptions
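
# --- Illustrative round-trip (not part of the original module) ---
# Descriptions edited in the markdown can be read back, keyed by field path
# (nlp_md comes from the json_schema_to_nlp_data_structure example above).
assert nlp_data_structure_to_field_descriptions(nlp_md) == {"total": "Grand total."}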

##### JSON Schema Sanitization #####

SchemaPath = Tuple[Union[str, int], ...]  # e.g. ('address', 'city') or ('items', 3)

def _pick_subschema(schemas: list[dict[str, Any]], value: Any) -> dict[str, Any]:
    """
    Return the first subschema in *schemas* that
      • explicitly allows the Python type of *value*, or
      • has no "type" at all (acts as a wildcard).

    Fallback: the first subschema (so we *always* return something).
    """
    pytypes_to_json = {
        str: "string",
        int: "integer",
        float: "number",
        bool: "boolean",
        type(None): "null",
        dict: "object",
        list: "array",
    }
    jstype = pytypes_to_json.get(type(value))

    for sub in schemas:
        allowed = sub.get("type")
        if allowed is None or allowed == jstype or (isinstance(allowed, list) and jstype in allowed):
            return sub
    return schemas[0]  # last resort
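
# --- Illustrative example (not part of the original module) ---
# The sub-schema matching the runtime type of the value wins.
subs = [{"type": "string", "maxLength": 5}, {"type": "null"}]
assert _pick_subschema(subs, "hello world") == {"type": "string", "maxLength": 5}
assert _pick_subschema(subs, None) == {"type": "null"}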

def __sanitize_instance(instance: Any, schema: dict[str, Any], path: SchemaPath = ()) -> Any:
    """
    Return a **new** instance where every string that violates ``maxLength``
    has been sliced to that length. Mutates nothing in‑place.
    """

    # ------------- unwrap anyOf ------------------------------------
    if "anyOf" in schema:
        schema = _pick_subschema(schema["anyOf"], instance)
        # (We recurse *once*; nested anyOfs will be handled the same way)

    # ------------- objects -----------------
    if schema.get("type") == "object" and isinstance(instance, MutableMapping):
        props = schema.get("properties", {})
        return {k: __sanitize_instance(v, props.get(k, {}), path + (k,)) for k, v in instance.items()}

    # ------------- arrays ------------------
    if schema.get("type") == "array" and isinstance(instance, MutableSequence):
        item_schema = schema.get("items", {})
        return [__sanitize_instance(v, item_schema, path + (i,)) for i, v in enumerate(instance)]

    # ------------- primitive strings -------
    if schema.get("type") == "string" and isinstance(instance, str):
        max_len = schema.get("maxLength")
        if max_len is not None and len(instance) > max_len:
            print("=" * 100)
            _path = ".".join(map(str, path)) or "<root>"
            print(
                f"Trimmed {_path} from {len(instance)} to {max_len} characters",
            )
            print("=" * 100)
            return instance[:max_len]

    # ------------- all other primitives ----
    return instance

def sanitize(instance: Any, schema: dict[str, Any]) -> Any:
    expanded_schema = expand_refs(schema)
    return __sanitize_instance(instance, expanded_schema)
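
# --- Illustrative usage (not part of the original module) ---
# Over-long strings are sliced down to the schema's maxLength (assumes
# expand_refs is a no-op when the schema has no $refs).
note_schema = {"type": "object", "properties": {"note": {"type": "string", "maxLength": 5}}}
assert sanitize({"note": "abcdefgh"}, note_schema) == {"note": "abcde"}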

def compute_schema_data_id(json_schema: dict[str, Any]) -> str:
    """Returns the schema_data_id for a given JSON schema.

    The schema_data_id is a hash of the schema data, ignoring all prompt/description/default fields
    and other non-structural metadata.

    Args:
        json_schema: The JSON schema to compute the ID for

    Returns:
        str: A hash string representing the schema data version with "sch_data_id_" prefix
    """

    return "sch_data_id_" + generate_blake2b_hash_from_string(
        json.dumps(
            clean_schema(
                copy.deepcopy(json_schema),
                remove_custom_fields=True,
                fields_to_remove=["description", "default", "title", "required", "examples", "deprecated", "readOnly", "writeOnly"],
            ),
            sort_keys=True,
        ).strip()
    )
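
# --- Illustrative property (not part of the original module) ---
# Assuming clean_schema strips the listed fields as its arguments suggest,
# two schemas differing only in descriptions hash to the same id.
schema_a = {"type": "object", "properties": {"x": {"type": "integer", "description": "foo"}}}
schema_b = {"type": "object", "properties": {"x": {"type": "integer", "description": "bar"}}}
assert compute_schema_data_id(schema_a) == compute_schema_data_id(schema_b)
assert compute_schema_data_id(schema_a).startswith("sch_data_id_")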

def validate_json_against_schema(
    data: Any,
    schema: dict[str, Any],
    return_instance: bool = False,
) -> Union[None, BaseModel]:
    """
    Validate *data* against *schema*.

    Parameters
    ----------
    data
        A JSON‑serialisable Python object (dict / list / primitives).
    schema
        A JSON‑Schema dict (can contain $defs / $ref – they’ll be expanded
        by ``convert_json_schema_to_basemodel``).
    return_instance
        • ``False`` (default): only validate; raise if invalid; return ``None``.
        • ``True``: on success, return the fully‑validated Pydantic instance
          (handy for downstream type‑safe access).

    Raises
    ------
    pydantic.ValidationError
        If *data* does not conform to *schema*.

    Examples
    --------
    >>> validate_json_against_schema({"foo": 1}, my_schema)  # just checks
    >>> obj = validate_json_against_schema(data, schema, True)  # typed access
    >>> print(obj.foo + 5)
    """
    # 1) Build a Pydantic model on‑the‑fly from the JSON‑Schema
    Model: Type[BaseModel] = convert_json_schema_to_basemodel(schema)

    # 2) Let Pydantic do the heavy lifting
    instance = Model.model_validate(data)  # <- raises ValidationError if bad

    return instance if return_instance else None
|