retab 0.0.42-py3-none-any.whl → 0.0.44-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. retab/__init__.py +2 -1
  2. retab/client.py +26 -51
  3. retab/generate_types.py +180 -0
  4. retab/resources/consensus/client.py +1 -1
  5. retab/resources/consensus/responses.py +1 -1
  6. retab/resources/deployments/__init__.py +3 -0
  7. retab/resources/deployments/automations/__init__.py +9 -0
  8. retab/resources/deployments/automations/client.py +244 -0
  9. retab/resources/deployments/automations/endpoints.py +290 -0
  10. retab/resources/deployments/automations/links.py +303 -0
  11. retab/resources/deployments/automations/logs.py +222 -0
  12. retab/resources/deployments/automations/mailboxes.py +423 -0
  13. retab/resources/deployments/automations/outlook.py +377 -0
  14. retab/resources/deployments/automations/tests.py +161 -0
  15. retab/resources/deployments/client.py +148 -0
  16. retab/resources/documents/client.py +94 -68
  17. retab/resources/documents/extractions.py +55 -46
  18. retab/resources/evaluations/__init__.py +2 -2
  19. retab/resources/evaluations/client.py +61 -77
  20. retab/resources/evaluations/documents.py +48 -37
  21. retab/resources/evaluations/iterations.py +58 -40
  22. retab/resources/jsonlUtils.py +3 -4
  23. retab/resources/processors/automations/endpoints.py +49 -39
  24. retab/resources/processors/automations/links.py +52 -43
  25. retab/resources/processors/automations/mailboxes.py +74 -59
  26. retab/resources/processors/automations/outlook.py +104 -82
  27. retab/resources/processors/client.py +35 -30
  28. retab/resources/projects/__init__.py +3 -0
  29. retab/resources/projects/client.py +285 -0
  30. retab/resources/projects/documents.py +244 -0
  31. retab/resources/projects/iterations.py +470 -0
  32. retab/resources/usage.py +2 -0
  33. retab/types/ai_models.py +2 -1
  34. retab/types/deprecated_evals.py +195 -0
  35. retab/types/evaluations/__init__.py +5 -2
  36. retab/types/evaluations/iterations.py +9 -43
  37. retab/types/evaluations/model.py +19 -24
  38. retab/types/extractions.py +1 -0
  39. retab/types/jobs/base.py +1 -1
  40. retab/types/jobs/evaluation.py +1 -1
  41. retab/types/logs.py +5 -6
  42. retab/types/mime.py +1 -10
  43. retab/types/projects/__init__.py +34 -0
  44. retab/types/projects/documents.py +30 -0
  45. retab/types/projects/iterations.py +78 -0
  46. retab/types/projects/model.py +68 -0
  47. retab/types/schemas/enhance.py +22 -5
  48. retab/types/schemas/evaluate.py +2 -2
  49. retab/types/schemas/object.py +27 -25
  50. retab/types/standards.py +2 -2
  51. retab/utils/__init__.py +3 -0
  52. retab/utils/ai_models.py +127 -12
  53. retab/utils/hashing.py +24 -0
  54. retab/utils/json_schema.py +1 -26
  55. retab/utils/mime.py +0 -17
  56. retab/utils/usage/usage.py +0 -1
  57. {retab-0.0.42.dist-info → retab-0.0.44.dist-info}/METADATA +4 -6
  58. {retab-0.0.42.dist-info → retab-0.0.44.dist-info}/RECORD +60 -55
  59. retab/_utils/__init__.py +0 -0
  60. retab/_utils/_model_cards/anthropic.yaml +0 -59
  61. retab/_utils/_model_cards/auto.yaml +0 -43
  62. retab/_utils/_model_cards/gemini.yaml +0 -117
  63. retab/_utils/_model_cards/openai.yaml +0 -301
  64. retab/_utils/_model_cards/xai.yaml +0 -28
  65. retab/_utils/ai_models.py +0 -138
  66. retab/_utils/benchmarking.py +0 -484
  67. retab/_utils/chat.py +0 -327
  68. retab/_utils/display.py +0 -440
  69. retab/_utils/json_schema.py +0 -2156
  70. retab/_utils/mime.py +0 -165
  71. retab/_utils/responses.py +0 -169
  72. retab/_utils/stream_context_managers.py +0 -52
  73. retab/_utils/usage/__init__.py +0 -0
  74. retab/_utils/usage/usage.py +0 -301
  75. {retab-0.0.42.dist-info → retab-0.0.44.dist-info}/WHEEL +0 -0
  76. {retab-0.0.42.dist-info → retab-0.0.44.dist-info}/top_level.txt +0 -0
retab/_utils/json_schema.py (deleted)
@@ -1,2156 +0,0 @@
- import copy
- import datetime
- import json
- import re
- import types
- from collections import defaultdict
- from pathlib import Path
- from typing import Annotated, Any, Callable, Literal, MutableMapping, MutableSequence, Optional, Tuple, Type, Union, cast, get_args, get_origin
-
- import phonenumbers
- import pycountry
- import stdnum.eu.vat # type: ignore
- from email_validator import validate_email
- from pydantic import BaseModel, BeforeValidator, Field, create_model
- from pydantic.config import ConfigDict
-
- from ..types.schemas.layout import Column, FieldItem, Layout, RefObject, Row, RowList
- from .mime import generate_blake2b_hash_from_string
-
- # **** Validation Functions ****
-
- # 1) Special Objects
-
-
- def generate_schema_data_id(json_schema: dict[str, Any]) -> str:
-     """Generate a SHA1 hash ID for schema data, ignoring prompt/description/default fields.
-
-     Args:
-         json_schema: The JSON schema to generate an ID for
-
-     Returns:
-         str: A SHA1 hash string with "sch_data_id_" prefix
-     """
-     return "sch_data_id_" + generate_blake2b_hash_from_string(
-         json.dumps(
-             clean_schema(
-                 copy.deepcopy(json_schema),
-                 remove_custom_fields=True,
-                 fields_to_remove=["description", "default", "title", "required", "examples", "deprecated", "readOnly", "writeOnly"],
-             ),
-             sort_keys=True,
-         ).strip()
-     )
-
-
- def generate_schema_id(json_schema: dict[str, Any]) -> str:
-     """Generate a SHA1 hash ID for the complete schema.
-
-     Args:
-         json_schema: The JSON schema to generate an ID for
-
-     Returns:
-         str: A SHA1 hash string with "sch_id_" prefix
-     """
-     return "sch_id_" + generate_blake2b_hash_from_string(json.dumps(json_schema, sort_keys=True).strip())
-
-
- def validate_currency(currency_code: Any) -> Optional[str]:
-     """
-     Return the valid currency code (ISO 4217) or None if invalid.
-     """
-     if currency_code is None:
-         return None
-     currency_code = str(currency_code).strip() # convert to str and trim
-     if not currency_code:
-         return None
-     try:
-         if pycountry.currencies.lookup(currency_code):
-             return currency_code
-     except LookupError:
-         pass
-     return None
-
-
- def validate_country_code(v: Any) -> Optional[str]:
-     """
-     Return the valid country code (ISO 3166) or None if invalid.
-     """
-     if v is None:
-         return None
-     v_str = str(v).strip()
-     if not v_str:
-         return None
-     try:
-         if pycountry.countries.lookup(v_str):
-             return v_str
-     except LookupError:
-         pass
-     return None
-
-
- def validate_email_regex(v: Any) -> Optional[str]:
-     """
-     Return the string if it matches a basic email pattern, else None.
-     """
-     if v is None:
-         return None
-     v_str = str(v).strip()
-     if not v_str:
-         return None
-     pattern = r"^[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}$"
-     if re.match(pattern, v_str):
-         return v_str.lower()
-     return None
-
-
- def validate_vat_number(v: Any) -> Optional[str]:
-     """
-     Return the VAT number if valid (EU format) else None.
-     """
-     if v is None:
-         return None
-     v_str = str(v).strip()
-     if not v_str:
-         return None
-     try:
-         if stdnum.eu.vat.is_valid(v_str):
-             return stdnum.eu.vat.validate(v_str)
-     except Exception:
-         pass
-     return None
-
-
- def validate_phone_number(v: Any) -> Optional[str]:
-     """
-     Return E.164 phone number format if valid, else None.
-     """
-     if v is None:
-         return None
-     v_str = str(v).strip()
-     if not v_str:
-         return None
-     try:
-         phone_number = phonenumbers.parse(v_str, "FR") # Default region: FR
-         if phonenumbers.is_valid_number(phone_number):
-             return phonenumbers.format_number(phone_number, phonenumbers.PhoneNumberFormat.E164)
-     except phonenumbers.NumberParseException:
-         pass
-     return None
-
-
- def validate_email_address(v: Any) -> Optional[str]:
-     """
-     Return the normalized email address if valid, else None.
-     """
-     if v is None:
-         return None
-     v_str = str(v).strip()
-     if not v_str:
-         return None
-     try:
-         return validate_email(v_str).normalized
-     except Exception:
-         return None
-
-
- def validate_frenchpostcode(v: Any) -> Optional[str]:
-     """
-     Return a 5-digit postcode if valid, else None.
-     """
-     if v is None:
-         return None
-     v_str = str(v).strip()
-     if not v_str:
-         return None
-     # Zero-pad to 5 digits
-     try:
-         v_str = v_str.zfill(5)
-         # Optionally check numeric
-         if not v_str.isdigit():
-             return None
-         return v_str
-     except Exception:
-         return None
-
-
- def validate_packing_type(v: Any) -> Optional[str]:
-     """
-     Return the packing type if in the known set, else None.
-     """
-     if v is None:
-         return None
-     v_str = str(v).strip().lower()
-     # We'll store the valid set in lower for easy comparison
-     valid_packing_types = {"box", "pallet", "container", "bag", "drum", "other"}
-     if v_str in valid_packing_types:
-         return v_str
-     return None
-
-
- def validate_un_code(v: Any) -> Optional[int]:
-     """
-     Return an integer UN code in range [0..3481], else None.
-     """
-     if v is None:
-         return None
-     v_str = str(v).strip()
-     if not v_str:
-         return None
-     try:
-         val = int(float(v_str)) # handle numeric strings
-         if 0 <= val <= 3481:
-             return val
-     except Exception:
-         pass
-     return None
-
-
- def validate_adr_tunnel_code(v: Any) -> Optional[str]:
-     """
-     Return a valid ADR tunnel code from a known set, else None.
-     """
-     if v is None:
-         return None
-     v_str = str(v).strip().upper() # unify for set comparison
-     valid_codes = {"B", "B1000C", "B/D", "B/E", "C", "C5000D", "C/D", "C/E", "D", "D/E", "E", "-"}
-     return v_str if v_str in valid_codes else None
-
-
- def validate_un_packing_group(v: Any) -> Optional[str]:
-     """
-     Return a valid UN packing group (I, II, or III), else None.
-     """
-     if v is None:
-         return None
-     v_str = str(v).strip().upper()
-     valid_groups = {"I", "II", "III"}
-     return v_str if v_str in valid_groups else None
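For context, the validators above are permissive coercers rather than strict checks: each returns a normalized value, or None when the input cannot be salvaged. A minimal usage sketch (illustrative only, not part of the diff; note that phonenumbers parses with default region "FR"):

    assert validate_currency("eur") == "eur"                # pycountry lookup is case-insensitive
    assert validate_currency("EURO2") is None               # not an ISO 4217 code -> None, no exception
    assert validate_phone_number("06 12 34 56 78") == "+33612345678"  # normalized to E.164
    assert validate_un_packing_group("ii") == "II"          # upper-cased before the set membership check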
-
-
- # 2) General Objects
-
-
- def validate_integer(v: Any) -> Optional[int]:
-     """
-     Return an integer if parseable, else None.
-     """
-     if v is None:
-         return None
-     v_str = str(v).strip()
-     if not v_str:
-         return None
-     try:
-         return int(float(v_str))
-     except Exception:
-         return None
-
-
- def validate_float(v: Any) -> Optional[float]:
-     """
-     Return a float if parseable, else None.
-     """
-     if v is None:
-         return None
-     v_str = str(v).strip()
-     if not v_str:
-         return None
-     try:
-         return float(v_str)
-     except Exception:
-         return None
-
-
- def validate_date(v: Union[str, datetime.date, None]) -> Optional[str]:
-     """
-     Return date in ISO format (YYYY-MM-DD) if valid, else None.
-     """
-     if v is None:
-         return None
-
-     # If it's already a date object
-     if isinstance(v, datetime.date):
-         return v.isoformat()
-
-     # If it's a string
-     v_str = str(v).strip()
-     if not v_str:
-         return None
-
-     # Try ISO or a close variant
-     try:
-         return datetime.date.fromisoformat(v_str).isoformat()
-     except ValueError:
-         # Fallback to strptime
-         try:
-             return datetime.datetime.strptime(v_str, "%Y-%m-%d").date().isoformat()
-         except ValueError:
-             return None
-
-
- def validate_time(v: Union[str, datetime.time, None]) -> Optional[str]:
-     """
-     Return time in ISO format (HH:MM[:SS]) if valid, else None.
-     """
-     if v is None:
-         return None
-
-     # If it's already a time object
-     if isinstance(v, datetime.time):
-         return v.isoformat()
-
-     v_str = str(v).strip()
-     if not v_str:
-         return None
-
-     # Try multiple formats
-     time_formats = ["%H:%M:%S", "%H:%M", "%I:%M %p", "%I:%M:%S %p"]
-     for fmt in time_formats:
-         try:
-             parsed = datetime.datetime.strptime(v_str, fmt).time()
-             return parsed.isoformat()
-         except ValueError:
-             continue
-     return None
-
-
- def validate_bool(v: Any) -> bool:
-     """
-     Convert to bool if matches known true/false strings or actual bool.
-     Otherwise return False.
-     """
-     if v is None:
-         return False
-
-     if isinstance(v, bool):
-         return v
-
-     try:
-         v_str = str(v).strip().lower()
-         true_values = {"true", "t", "yes", "y", "1"}
-         false_values = {"false", "f", "no", "n", "0"}
-         if v_str in true_values:
-             return True
-         elif v_str in false_values:
-             return False
-     except Exception:
-         pass
-
-     return False
-
-
- def validate_strold(v: Any) -> Optional[str]:
-     """
-     Return a stripped string unless it's empty or a known 'null' placeholder, else None.
-     """
-     if v is None:
-         return None
-     v_str = str(v).strip()
-     # Treat these placeholders (and empty) as invalid
-     if v_str.lower() in {"null", "none", "nan", ""}:
-         return None
-     return v_str
-
-
- def validate_str(v: Any) -> Optional[str]:
-     """
-     Return a stripped string unless it's invalid (e.g., placeholders like 'null'), else None.
-     Does NOT convert empty strings to None—leaves them as-is.
-     """
-     if v is None:
-         return None
-     v_str = str(v).strip()
-     if v_str.lower() in {"null", "none", "nan"}: # Only treat explicit placeholders as None
-         return None
-     return v_str # Keep empty strings intact
-
-
- def notnan(x: Any) -> bool:
-     """
-     Return False if x is None, 'null', 'nan', or x != x (NaN check).
-     True otherwise.
-     """
-     if x is None:
-         return False
-     x_str = str(x).lower().strip()
-     if x_str in {"null", "nan"}:
-         return False
-     # Check for actual float NaN (x != x)
-     return not (x != x)
-
-
- def merge_descriptions(outer_schema: dict[str, Any], inner_schema: dict[str, Any]) -> dict[str, Any]:
-     """
-     Merge descriptions from outer and inner schemas, giving preference to outer.
-     Also merges X-ReasoningPrompt similarly.
-     """
-     merged = copy.deepcopy(inner_schema)
-
-     # Outer description preferred if present
-     if outer_schema.get("description", "").strip():
-         merged["description"] = outer_schema["description"]
-
-     # Outer reasoning preferred if present
-     if outer_schema.get("X-ReasoningPrompt", "").strip():
-         merged["X-ReasoningPrompt"] = outer_schema["X-ReasoningPrompt"]
-     elif inner_schema.get("X-ReasoningPrompt", "").strip():
-         merged["X-ReasoningPrompt"] = inner_schema["X-ReasoningPrompt"]
-
-     if not merged.get("X-ReasoningPrompt", "").strip():
-         # delete it
-         merged.pop("X-ReasoningPrompt", None)
-
-     # Outer LLM Description preferred if present
-     if outer_schema.get("X-FieldPrompt", "").strip():
-         merged["X-FieldPrompt"] = outer_schema["X-FieldPrompt"]
-     elif inner_schema.get("X-FieldPrompt", "").strip():
-         merged["X-FieldPrompt"] = inner_schema["X-FieldPrompt"]
-
-     if not merged.get("X-FieldPrompt", "").strip():
-         # delete it
-         merged.pop("X-FieldPrompt", None)
-
-     # System-Prompt
-     if not merged.get("X-SystemPrompt", "").strip():
-         # delete it
-         merged.pop("X-SystemPrompt", None)
-
-     return merged
-
-
- def has_cyclic_refs(schema: dict[str, Any]) -> bool:
-     """Check if the JSON Schema contains cyclic references.
-
-     The function recursively traverses all nested objects and arrays in the schema.
-     It follows any "$ref" that points to a definition (i.e. "#/$defs/<name>")
-     and uses DFS with a current-path stack to detect cycles.
-     """
-     definitions = schema.get("$defs", {})
-     if not definitions:
-         return False
-
-     # Memoize results for each definition to avoid repeated work.
-     memo: dict[str, bool] = {}
-
-     def dfs(def_name: str, stack: set[str]) -> bool:
-         """Perform DFS on a definition (by name) using 'stack' to detect cycles."""
-         if def_name in stack:
-             return True
-         if def_name in memo:
-             return memo[def_name]
-
-         # Add to current path and traverse the definition.
-         stack.add(def_name)
-         node = definitions.get(def_name)
-         if node is None:
-             # No such definition, so nothing to do.
-             stack.remove(def_name)
-             memo[def_name] = False
-             return False
-
-         result = traverse(node, stack)
-         stack.remove(def_name)
-         memo[def_name] = result
-         return result
-
-     def traverse(node: Any, stack: set[str]) -> bool:
-         """Recursively traverse an arbitrary JSON Schema node."""
-         if isinstance(node, dict):
-             # If we see a "$ref", try to follow it.
-             if "$ref" in node:
-                 ref = node["$ref"]
-                 if ref.startswith("#/$defs/"):
-                     target = ref[len("#/$defs/") :]
-                     if dfs(target, stack):
-                         return True
-             # Recursively check all values in the dictionary.
-             for key, value in node.items():
-                 # Skip "$ref" as it has already been processed.
-                 if key == "$ref":
-                     continue
-                 if traverse(value, stack):
-                     return True
-         elif isinstance(node, list):
-             for item in node:
-                 if traverse(item, stack):
-                     return True
-         return False
-
-     # Start DFS on each top-level definition.
-     for def_name in definitions:
-         if dfs(def_name, set()):
-             return True
-
-     return False
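has_cyclic_refs only walks definitions reachable under "$defs", so a self-referential definition is enough to trigger it; expand_refs (below) uses this check to return cyclic schemas untouched instead of recursing forever. An illustrative sketch, not part of the diff:

    node_schema = {
        "type": "object",
        "properties": {"root": {"$ref": "#/$defs/Node"}},
        "$defs": {
            "Node": {
                "type": "object",
                "properties": {"next": {"$ref": "#/$defs/Node"}},  # self-reference -> cycle
            }
        },
    }
    assert has_cyclic_refs(node_schema) is True
    assert has_cyclic_refs({"type": "string"}) is False  # no $defs, trivially acyclic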
-
-
- def expand_refs(schema: dict[str, Any], definitions: dict[str, dict[str, Any]] | None = None) -> dict[str, Any]:
-     """
-     Recursively resolve $ref in the given schema.
-     For each $ref, fetch the target schema, merge descriptions, and resolve further.
-     """
-     if not isinstance(schema, dict):
-         return schema
-
-     # First, we will verify if this schema is expandable, we do this by checking if there are cyclic $refs (infinite loop)
-     # If there are, we will return the schema as is
-
-     if has_cyclic_refs(schema):
-         print("Cyclic refs found, keeping it as is")
-         return schema
-
-     if definitions is None:
-         definitions = schema.pop("$defs", {})
-
-     assert isinstance(definitions, dict)
-
-     if "allOf" in schema:
-         # Some schemas (notably the one converted from a pydantic model) have allOf. We only accept one element in allOf
-         if len(schema["allOf"]) != 1:
-             raise ValueError(f"Property schema must have a single element in 'allOf'. Found: {schema['allOf']}")
-         schema.update(schema.pop("allOf", [{}])[0])
-
-     if "$ref" in schema:
-         ref: str = schema["$ref"]
-         if ref.startswith("#/$defs/"):
-             def_name = ref.removeprefix("#/$defs/")
-             if def_name not in definitions:
-                 raise ValueError(f"Reference {ref} not found in definitions.")
-             target = definitions[def_name]
-             merged = merge_descriptions(schema, target)
-             merged.pop("$ref", None)
-             return expand_refs(merged, definitions)
-         else:
-             raise ValueError(f"Unsupported reference format: {ref}")
-
-     result: dict[str, Any] = {}
-     for annotation, subschema in schema.items():
-         if annotation in ["properties", "$defs"]:
-             if isinstance(subschema, dict):
-                 new_dict = {}
-                 for pk, pv in subschema.items():
-                     new_dict[pk] = expand_refs(pv, definitions)
-                 result[annotation] = new_dict
-             else:
-                 result[annotation] = subschema
-         elif annotation == "items":
-             if isinstance(subschema, list):
-                 result[annotation] = [expand_refs(item, definitions) for item in subschema]
-             else:
-                 result[annotation] = expand_refs(subschema, definitions)
-         else:
-             if isinstance(subschema, dict):
-                 result[annotation] = expand_refs(subschema, definitions)
-             elif isinstance(subschema, list):
-                 new_list = []
-                 for item in subschema:
-                     if isinstance(item, dict):
-                         new_list.append(expand_refs(item, definitions))
-                     else:
-                         new_list.append(item)
-                 result[annotation] = new_list
-             else:
-                 result[annotation] = subschema
-
-     return result
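expand_refs inlines each acyclic "#/$defs/..." reference and merges annotations via merge_descriptions, so a description on the referring site wins over the definition's own. Note that it pops "$defs" off the input, mutating it; callers in this file deepcopy first. Illustrative sketch, not part of the diff:

    schema = {
        "type": "object",
        "properties": {"price": {"$ref": "#/$defs/Money", "description": "Unit price"}},
        "$defs": {"Money": {"type": "number", "description": "A monetary amount"}},
    }
    expanded = expand_refs(schema)
    # The $ref body is inlined and the outer description takes precedence.
    assert expanded["properties"]["price"] == {"type": "number", "description": "Unit price"}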
-
-
- def json_schema_to_typescript_interface(
-     schema: dict[str, Any],
-     name: str = "RootInterface",
-     definitions: Optional[dict[str, dict[str, Any]]] = None,
-     processed_refs: Optional[dict[str, str]] = None,
-     indent: int = 2,
-     add_field_description: bool = False,
- ) -> str:
-     """
-     Convert a JSON Schema to a TypeScript interface.
-
-     :param schema: The JSON schema as a dict.
-     :param name: Name of the interface to generate.
-     :param definitions: A dictionary of named schemas that can be referenced by $ref.
-     :param processed_refs: A dict to keep track of processed $refs to avoid recursion.
-     :param indent: Number of spaces for indentation in the output.
-     :param add_field_description: If True, include field descriptions as comments.
-     :return: A string containing the TypeScript interface.
-     """
-     if definitions is None:
-         # Extract definitions from $defs if present
-         definitions = schema.get("$defs", {})
-
-     if processed_refs is None:
-         processed_refs = {}
-
-     # If we have a top-level object schema
-     if schema.get("type") == "object" or "properties" in schema:
-         interface_lines = [f"interface {name} {{"]
-         indentation = " " * indent
-         properties = schema.get("properties", {})
-         required_fields = set(schema.get("required", []))
-
-         for prop_name, prop_schema in properties.items():
-             is_optional = prop_name not in required_fields
-             field_ts = schema_to_ts_type(prop_schema, definitions or {}, processed_refs, indent, indent, add_field_description=add_field_description)
-             optional_flag = "?" if is_optional else ""
-             line = ""
-             if add_field_description and "description" in prop_schema:
-                 desc = prop_schema["description"].replace("\n", f"\n{indentation}// ")
-                 line = f"{indentation}// {desc}\n"
-             line += f"{indentation}{prop_name}{optional_flag}: {field_ts};"
-             interface_lines.append(line)
-
-         interface_lines.append("}")
-         return "\n".join(interface_lines)
-     else:
-         # Otherwise, produce a type alias if it's not an object
-         ts_type = schema_to_ts_type(schema, definitions or {}, processed_refs, indent, indent, add_field_description=add_field_description)
-         return f"type {name} = {ts_type};"
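Properties outside "required" get TypeScript's optional marker, and integer collapses to number via primitive_type_to_ts below. Sketch of input and output (illustrative only, not part of the diff):

    schema = {
        "type": "object",
        "properties": {"id": {"type": "string"}, "qty": {"type": "integer"}},
        "required": ["id"],
    }
    print(json_schema_to_typescript_interface(schema, name="LineItem"))
    # interface LineItem {
    #   id: string;
    #   qty?: number;
    # }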
-
-
- def schema_to_ts_type(
-     schema: dict[str, Any], definitions: dict[str, dict[str, Any]], processed_refs: dict[str, str], indent: int, increment: int, add_field_description: bool = False
- ) -> str:
-     """
-     Convert a JSON schema snippet to a TypeScript type (string).
-     Handles objects, arrays, primitives, enums, oneOf/anyOf/allOf, and $ref.
-     """
-
-     # Handle $ref upfront
-     if "$ref" in schema:
-         ref = schema["$ref"]
-         if ref in processed_refs:
-             return processed_refs[ref]
-         resolved = resolve_ref(ref, definitions)
-         if resolved is None:
-             return "any"
-         processed_refs[ref] = "" # to avoid recursion
-         ts_type = schema_to_ts_type(resolved, definitions, processed_refs, indent, increment, add_field_description=add_field_description)
-         processed_refs[ref] = ts_type
-         return ts_type
-
-     # Handle allOf, oneOf, anyOf
-     if "allOf" in schema:
-         # allOf means intersection of all subschemas
-         subtypes = [schema_to_ts_type(s, definitions, processed_refs, indent, increment, add_field_description) for s in schema["allOf"]]
-         return "(" + " & ".join(subtypes) + ")"
-
-     if "oneOf" in schema:
-         # oneOf means a union type
-         subtypes = [schema_to_ts_type(s, definitions, processed_refs, indent, increment, add_field_description) for s in schema["oneOf"]]
-         return "(" + " | ".join(subtypes) + ")"
-
-     if "anyOf" in schema:
-         # anyOf means a union type
-         subtypes = [
-             schema_to_ts_type(s, definitions, processed_refs, indent, increment, add_field_description)
-             for s in schema["anyOf"]
-             # Remove "null" from subtypes if it's present
-             # if not (isinstance(s, dict) and s.get("type") == "null")
-         ]
-         if len(subtypes) == 1:
-             return subtypes[0]
-
-         return "(" + " | ".join(subtypes) + ")"
-
-     # Handle enums
-     if "enum" in schema:
-         # Create a union of literal types
-         enum_values = schema["enum"]
-         ts_literals = []
-         for val in enum_values:
-             if isinstance(val, str):
-                 ts_literals.append(f'"{val}"')
-             elif val is None:
-                 ts_literals.append("null")
-             else:
-                 ts_literals.append(str(val).lower() if isinstance(val, bool) else str(val))
-         return " | ".join(ts_literals)
-
-     # Handle type
-     schema_type = schema.get("type")
-     if schema_type == "object" or "properties" in schema:
-         # Inline object
-         return inline_object(schema, definitions, processed_refs, indent, increment, add_field_description)
-     elif schema_type == "array":
-         # items define the type of array elements
-         items_schema = schema.get("items", {})
-         item_type = schema_to_ts_type(items_schema, definitions, processed_refs, indent + increment, increment, add_field_description)
-         return f"Array<{item_type}>"
-     else:
-         # Primitive types or missing type
-         if isinstance(schema_type, list):
-             # union of multiple primitive types
-             primitive_types = [primitive_type_to_ts(t) for t in schema_type]
-             return "(" + " | ".join(primitive_types) + ")"
-         else:
-             # single primitive
-             return primitive_type_to_ts(schema_type)
-
-
- def inline_object(schema: dict[str, Any], definitions: dict[str, dict[str, Any]], processed_refs: dict[str, str], indent: int, increment: int, add_field_description: bool) -> str:
-     """
-     Inline an object type from a JSON schema into a TypeScript type.
-     """
-     properties = schema.get("properties", {})
-     required_fields = set(schema.get("required", []))
-     lines = ["{"]
-     field_indentation = " " * (indent + increment)
-     for prop_name, prop_schema in properties.items():
-         is_optional = prop_name not in required_fields
-         ts_type = schema_to_ts_type(prop_schema, definitions, processed_refs, indent + increment, increment, add_field_description)
-         optional_flag = "?" if is_optional else ""
-         line = ""
-         if add_field_description and "description" in prop_schema:
-             desc = prop_schema["description"].replace("\n", f"\n{field_indentation}// ")
-             line = f"{field_indentation}// {desc}\n"
-         line += f"{field_indentation}{prop_name}{optional_flag}: {ts_type};"
-         lines.append(line)
-     lines.append(" " * indent + "}")
-     return "\n".join(lines)
-
-
- def primitive_type_to_ts(t: Union[str, None]) -> str:
-     """
-     Convert a primitive JSON schema type to a TypeScript type.
-     """
-     if t == "string":
-         return "string"
-     elif t in ("integer", "number"):
-         return "number"
-     elif t == "boolean":
-         return "boolean"
-     elif t == "null":
-         return "null"
-     elif t is None:
-         # no specific type given
-         return "any"
-     else:
-         # fallback
-         return "any"
-
-
- def resolve_ref(ref: str, definitions: dict[str, dict[str, Any]]) -> Optional[dict[str, Any]]:
-     """
-     Resolve a $ref against the given definitions.
-     The schema uses $defs. Ref format: "#/$defs/SomeDefinition"
-     """
-     if ref.startswith("#/$defs/"):
-         key = ref[len("#/$defs/") :]
-         return definitions.get(key)
-     # No known resolution strategy
-     return None
-
-
- def json_schema_to_strict_openai_schema(obj: Union[dict[str, Any], list[Any]]) -> Union[dict[str, Any], list[Any]]:
-     # Gets a json supported by GPT Structured Output from a pydantic Basemodel
-
-     if isinstance(obj, dict):
-         new_obj: dict[str, Any] = copy.deepcopy(obj)
-
-         # Remove some not-supported fields
-         for key in ["default", "format", "X-FieldTranslation", "X-EnumTranslation"]:
-             new_obj.pop(key, None)
-
-         # Handle integer type
-         if "type" in new_obj:
-             if new_obj["type"] == "integer":
-                 new_obj["type"] = "number"
-             elif isinstance(new_obj["type"], list):
-                 new_obj["type"] = ["number" if t == "integer" else t for t in new_obj["type"]]
-
-         # Handle allOf
-         if "allOf" in new_obj:
-             subschemas = new_obj.pop("allOf")
-             merged: dict[str, Any] = {}
-             for subschema in subschemas:
-                 if "$ref" in subschema:
-                     merged.update({"$ref": subschema["$ref"]})
-                 else:
-                     merged.update(json_schema_to_strict_openai_schema(subschema))
-             new_obj.update(merged)
-
-         # Handle anyOf
-         if "anyOf" in new_obj:
-             new_obj["anyOf"] = [json_schema_to_strict_openai_schema(subschema) for subschema in new_obj["anyOf"]]
-
-         # Handle enum (force type to string)
-         if "enum" in new_obj:
-             new_obj["enum"] = [str(e) for e in new_obj["enum"]]
-             new_obj["type"] = "string"
-
-         # Handle object type
-         if new_obj.get("type") == "object" and "properties" in new_obj and isinstance(new_obj["properties"], dict):
-             new_obj["required"] = list(new_obj["properties"].keys())
-             new_obj["additionalProperties"] = False
-             new_obj["properties"] = {k: json_schema_to_strict_openai_schema(v) for k, v in new_obj["properties"].items()}
-
-         # Handle array type
-         if new_obj.get("type") == "array" and "items" in new_obj:
-             new_obj["items"] = json_schema_to_strict_openai_schema(new_obj["items"])
-
-         # Handle defs
-         if "$defs" in new_obj:
-             new_obj["$defs"] = {k: json_schema_to_strict_openai_schema(v) for k, v in new_obj["$defs"].items()}
-
-         return new_obj
-     elif isinstance(obj, list):
-         return [json_schema_to_strict_openai_schema(item) for item in obj]
-     else:
-         return obj
-
-
- def clean_schema(schema: dict[str, Any], remove_custom_fields: bool = False, fields_to_remove: list[str] = ["default", "minlength", "maxlength"]) -> dict[str, Any]:
-     """
-     Recursively remove specified fields from a JSON schema.
-
-     Args:
-         schema: The JSON schema to be cleaned.
-         remove_custom_fields: If True, also remove fields starting with 'x-'.
-         fields_to_remove: List of keys to remove (case-insensitive check).
-
-     Returns:
-         The resulting cleaned JSON schema.
-     """
-     schema = schema.copy()
-     lower_fields_to_remove = [f.lower() for f in fields_to_remove]
-     for key in list(schema.keys()):
-         if not isinstance(key, str):
-             continue
-
-         lower_key = key.lower()
-
-         conditions_to_remove = [
-             # Empty keys
-             not key,
-             # Empty subschemas
-             isinstance(schema[key], dict) and len(schema[key]) == 0,
-             # Fields to remove
-             lower_key in lower_fields_to_remove,
-             # Custom fields
-             remove_custom_fields and lower_key.startswith("x-"),
-         ]
-
-         if any(conditions_to_remove):
-             schema.pop(key)
-             continue
-
-     if "properties" in schema:
-         schema["properties"] = {
-             prop_key: clean_schema(prop_schema, fields_to_remove=fields_to_remove, remove_custom_fields=remove_custom_fields)
-             for prop_key, prop_schema in schema["properties"].items()
-         }
-     if "items" in schema:
-         schema["items"] = clean_schema(schema["items"], fields_to_remove=fields_to_remove, remove_custom_fields=remove_custom_fields)
-     if "$defs" in schema:
-         schema["$defs"] = {k: clean_schema(v, fields_to_remove=fields_to_remove, remove_custom_fields=remove_custom_fields) for k, v in schema["$defs"].items()}
-     if "allOf" in schema:
-         schema["allOf"] = [clean_schema(subschema, fields_to_remove=fields_to_remove, remove_custom_fields=remove_custom_fields) for subschema in schema["allOf"]]
-     if "anyOf" in schema:
-         schema["anyOf"] = [clean_schema(subschema, fields_to_remove=fields_to_remove, remove_custom_fields=remove_custom_fields) for subschema in schema["anyOf"]]
-
-     return schema
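clean_schema strips the listed keys case-insensitively at every level (properties, items, $defs, allOf, anyOf), along with empty sub-dicts and, optionally, X-*/x-* extension fields. Illustrative sketch, not part of the diff:

    schema = {
        "type": "object",
        "X-SystemPrompt": "You are an extraction assistant.",
        "properties": {"total": {"type": "number", "default": 0, "description": "Invoice total"}},
    }
    cleaned = clean_schema(schema, remove_custom_fields=True)
    # "default" (in the default fields_to_remove) and the X-* key are dropped; the rest survives.
    assert cleaned == {"type": "object", "properties": {"total": {"type": "number", "description": "Invoice total"}}}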
-
-
- def add_reasoning_sibling_inplace(properties: dict[str, Any], field_name: str, reasoning_desc: str) -> None:
-     """
-     Add a reasoning sibling for a given property field_name into properties dict.
-     We'll use the naming convention reasoning___<field_name>.
-     If the field_name is 'root', we add 'reasoning___root'.
-     """
-     reasoning_key = f"reasoning___{field_name}"
-     new_properties: dict[str, Any]
-     if field_name == "root":
-         new_properties = {reasoning_key: {"type": "string", "description": reasoning_desc}, **properties}
-     else:
-         # Insert reasoning_key just above the field_name
-         new_properties = {}
-         for key, value in properties.items():
-             if key == field_name:
-                 new_properties[reasoning_key] = {"type": "string", "description": reasoning_desc}
-             new_properties[key] = value
-     properties.clear()
-     properties.update(new_properties)
-
-
- def _insert_reasoning_fields_inner(schema: dict[str, Any]) -> tuple[dict[str, Any], str | None]:
-     """
-     Inner function that returns (updated_schema, reasoning_desc_for_this_node).
-     The parent caller (which handles 'properties') will add the sibling reasoning field if reasoning_desc_for_this_node is not None.
-     """
-     reasoning_desc = schema.pop("X-ReasoningPrompt", None)
-
-     node_type = schema.get("type")
-
-     # Process children recursively
-     # If object: process properties
-     if node_type == "object" or "$ref" in schema:
-         if "properties" in schema and isinstance(schema["properties"], dict):
-             new_props = {}
-             for property_key, property_value in schema["properties"].items():
-                 updated_prop_schema, child_reasoning = _insert_reasoning_fields_inner(property_value)
-                 new_props[property_key] = updated_prop_schema
-                 if child_reasoning:
-                     add_reasoning_sibling_inplace(new_props, property_key, child_reasoning)
-                     # Add the reasoning field to required if the property is required
-                     if "required" in schema and property_key in schema["required"]:
-                         schema["required"].append(f"reasoning___{property_key}")
-             schema["properties"] = new_props
-
-         if "$defs" in schema and isinstance(schema["$defs"], dict):
-             new_defs = {}
-             for dk, dv in schema["$defs"].items():
-                 updated_def_schema, _ = _insert_reasoning_fields_inner(dv)
-                 new_defs[dk] = updated_def_schema
-             schema["$defs"] = new_defs
-
-     elif node_type == "array" and "items" in schema:
-         # Recurse into items if present
-         updated_items, item_reasoning = _insert_reasoning_fields_inner(schema["items"])
-         schema["items"] = updated_items
-
-         # If the item schema has a reasoning prompt, create a reasoning field inside the item
-         if item_reasoning and updated_items.get("type") == "object":
-             # Create reasoning field for array items
-             if "properties" not in updated_items:
-                 updated_items["properties"] = {}
-
-             # Add the reasoning field as first property
-             reasoning_key = "reasoning___item"
-             new_properties = {reasoning_key: {"type": "string", "description": item_reasoning}}
-
-             # Add the rest of the properties
-             for key, value in updated_items["properties"].items():
-                 new_properties[key] = value
-
-             updated_items["properties"] = new_properties
-
-             # Add to required if we have required fields
-             if "required" in updated_items:
-                 updated_items["required"].insert(0, reasoning_key)
-             else:
-                 updated_items["required"] = [reasoning_key]
-
-     return schema, reasoning_desc
-
-
- def _insert_quote_fields_inner(schema: dict[str, Any]) -> dict[str, Any]:
-     """
-     Inner function that processes a schema and adds quote___ fields for leaf nodes with X-ReferenceQuote: true.
-     Only applies to leaf fields, never to the root.
-     """
-     if not isinstance(schema, dict):
-         return schema
-
-     # Create a copy to avoid modifying the original
-     new_schema = copy.deepcopy(schema)
-
-     # Process children recursively
-     if "properties" in new_schema and isinstance(new_schema["properties"], dict):
-         new_props = {}
-         for property_key, property_value in new_schema["properties"].items():
-             updated_prop_schema_value = _insert_quote_fields_inner(property_value)
-             has_quote_field = updated_prop_schema_value.get("X-ReferenceQuote") is True
-
-             # Check if this property is a leaf with X-ReferenceQuote: true
-             if has_quote_field:
-                 # Add the quote field
-                 quote_key = f"quote___{property_key}"
-                 new_props[quote_key] = {"type": "string"}
-
-                 # Add the quote field to required if the property is required
-                 if "required" in new_schema and property_key in new_schema["required"]:
-                     # add the quote field to required just before the property_key
-                     new_schema["required"].insert(new_schema["required"].index(property_key), quote_key)
-
-                 # Remove the X-ReferenceQuote field
-                 updated_prop_schema_value.pop("X-ReferenceQuote", None)
-
-             new_props[property_key] = updated_prop_schema_value
-         new_schema["properties"] = new_props
-
-     elif "items" in new_schema:
-         # Recurse into items if present
-         updated_items = _insert_quote_fields_inner(new_schema["items"])
-         new_schema["items"] = updated_items
-
-     return new_schema
-
-
- def _rec_replace_description_with_llm_description(schema: dict[str, Any]) -> dict[str, Any]:
-     """
-     Recursively replace the description field with X-ReasoningPrompt if present.
-     """
-     if not isinstance(schema, dict):
-         return schema
-
-     new_schema = copy.deepcopy(schema)
-     if "description" in new_schema or "X-FieldPrompt" in new_schema:
-         new_schema["description"] = new_schema.pop("X-FieldPrompt", new_schema.get("description"))
-         if new_schema["description"] is None:
-             new_schema.pop("description")
-         elif "default" in new_schema:
-             new_schema["description"] += f"\nUser Provided a Default Value: {json.dumps(new_schema['default'])}"
-
-     if "properties" in new_schema:
-         new_schema["properties"] = {k: _rec_replace_description_with_llm_description(v) for k, v in new_schema["properties"].items()}
-
-     if "items" in new_schema:
-         new_schema["items"] = _rec_replace_description_with_llm_description(new_schema["items"])
-
-     if "$defs" in new_schema:
-         new_schema["$defs"] = {k: _rec_replace_description_with_llm_description(v) for k, v in new_schema["$defs"].items()}
-
-     return new_schema
-
-
- def create_reasoning_schema(json_schema: dict[str, Any]) -> dict[str, Any]:
-     # Resolve refs first to get expanded schema
-     definitions = json_schema.get("$defs", {})
-     resolved = expand_refs(copy.deepcopy(json_schema), definitions)
-     # resolved.pop("$defs", None)
-
-     expanded_schema = copy.deepcopy(resolved)
-
-     # Insert reasoning fields.
-     # We'll handle the root reasoning similarly: if root has reasoning, we add reasoning___root
-     updated_schema, root_reasoning = _insert_reasoning_fields_inner(copy.deepcopy(expanded_schema))
-
-     if root_reasoning:
-         # Root is an object (assumed). Add reasoning___root at top-level properties
-         if "properties" not in updated_schema:
-             updated_schema["properties"] = {}
-         add_reasoning_sibling_inplace(updated_schema["properties"], "root", root_reasoning)
-         if "required" in updated_schema:
-             updated_schema["required"].append("reasoning___root")
-
-     # Insert quote fields for leaf nodes with X-ReferenceQuote: true
-     updated_schema = _insert_quote_fields_inner(updated_schema)
-
-     # Clean up $defs from inference_schema if desired (optional)
-     # if "$defs" in updated_schema:
-     #     updated_schema.pop("$defs", None)
-
-     # Replace description with X-FieldPrompt if present
-     updated_schema = _rec_replace_description_with_llm_description(updated_schema)
-
-     # Clean the schema (remove defaults, etc)
-     updated_schema = clean_schema(updated_schema, remove_custom_fields=True)
-     return updated_schema
-
-
- def cleanup_reasoning(output_data: Any, reasoning_preffix: str = "reasoning___") -> Any:
-     """
-     Recursively removes all reasoning key/values from the output data. Reasoning keys starts with 'reasoning___'.
-     """
-     if isinstance(output_data, dict):
-         new_dict = {}
-         for k, v in output_data.items():
-             if not k.startswith(reasoning_preffix):
-                 new_dict[k] = cleanup_reasoning(v)
-         return new_dict
-     elif isinstance(output_data, list):
-         return [cleanup_reasoning(item) for item in output_data]
-     else:
-         return output_data
-
-
- # Other utils
-
-
- def cast_all_leaves_from_json_schema_to_type(leaf: dict[str, Any], new_type: Literal["string", "boolean"], is_optional: bool = True) -> dict[str, Any]:
-     new_leaf: dict[str, Any] = {}
-     # new_leaf["description"] = "Here goes the suggestion, if any, or null."
-     if leaf.get("type") == "object":
-         new_leaf["type"] = "object"
-         new_leaf["properties"] = {}
-         for key, value in leaf["properties"].items():
-             new_leaf["properties"][key] = cast_all_leaves_from_json_schema_to_type(value, new_type, is_optional=is_optional)
-     elif leaf.get("type") == "array":
-         new_leaf["type"] = "array"
-         new_leaf["items"] = cast_all_leaves_from_json_schema_to_type(leaf["items"], new_type, is_optional=is_optional)
-     else:
-         if is_optional:
-             new_leaf["anyOf"] = [{"type": new_type}, {"type": "null"}]
-         else:
-             new_leaf["type"] = new_type
-     return new_leaf
-
-
- SCHEMA_TYPES = Literal["string", "integer", "number", "boolean", "array", "object"]
- # SCHEMA_STRING_DATE_FORMATS = Literal["date", "iso-date"]
- # SCHEMA_STRING_TIME_FORMATS = Literal["time", "iso-time"]
- # SCHEMA_STRING_DATETIME_FORMATS = Literal["datetime", "iso-datetime"]
- # SCHEMA_STRING_CUSTOM_FORMATS = Literal["email", "phone-number", "vat-number"]
-
-
- def get_pydantic_primitive_field_type(
-     type_: SCHEMA_TYPES | str, format_: str | None, is_nullable: bool = False, validator_func: Callable | None = None, enum_values: list[Any] | None = None
- ) -> Any:
-     python_base_type: Any
-
-     if enum_values is not None:
-         python_base_type = Literal[tuple(enum_values)] # type: ignore
-     elif type_ == "string":
-         if format_ in ("date", "iso-date"):
-             python_base_type = datetime.date
-         if format_ in ("time", "iso-time"):
-             python_base_type = datetime.time
-         if format_ in ("datetime", "iso-datetime"):
-             python_base_type = datetime.datetime
-         else:
-             python_base_type = str
-     elif type_ == "integer":
-         python_base_type = int
-     elif type_ == "number":
-         python_base_type = float
-     elif type_ == "boolean":
-         python_base_type = bool
-     elif type_ == "array":
-         python_base_type = list
-     elif type_ == "object":
-         python_base_type = dict
-     else:
-         raise ValueError(f"Unsupported schema type: {type_}")
-
-     field_kwargs: Any = {"json_schema_extra": {"format": format_}} if format_ is not None else {}
-
-     final_type: Any = Annotated[python_base_type, Field(..., **field_kwargs)]
-     final_type = Optional[final_type] if is_nullable or validator_func is not None else final_type
-     if validator_func is not None:
-         return Annotated[final_type, BeforeValidator(validator_func)]
-     return final_type
-
-
- # Defaultdict that returns a no-op lambda for unknown keys, then merges known validators
- # Expansive coercion functions (can evolve on time)
- KNOWN_COERCIONS: dict[tuple[str | None, str | None], Callable[[Any], Any]] = defaultdict(lambda: lambda x: x) | {
-     # ("string", "iso-date"): validate_date,
-     # ("string", "iso-time"): validate_time,
-     # ("string", "email"): validate_email_address,
-     # ("string", "phone-number"): validate_phone_number,
-     # ("string", "vat-number"): validate_vat_number,
-     ("integer", None): validate_integer,
-     ("number", None): validate_float,
-     ("boolean", None): validate_bool,
-     ("string", None): validate_str,
- }
-
-
- def object_format_coercion(instance: dict[str, Any], schema: dict[str, Any]) -> dict[str, Any]:
-     """
-     Coerces an instance to conform to a JSON Schema, applying defaults and handling nullable fields.
-     Converts empty strings to None only if the field is optional.
-     """
-
-     def recursive_coercion(_instance: Any, _schema: dict[str, Any]) -> Any:
-         # 1. Handle object type
-         if _schema.get("type") == "object":
-             if not isinstance(_instance, dict):
-                 return _schema.get("default", {})
-             coerced_instance = {}
-             for prop_key, prop_schema in _schema.get("properties", {}).items():
-                 coerced_instance[prop_key] = recursive_coercion(_instance.get(prop_key), prop_schema)
-             return coerced_instance
-
-         # 2. Handle array type
-         if _schema.get("type") == "array":
-             if not isinstance(_instance, list):
-                 return _schema.get("default", [])
-             return [recursive_coercion(value, _schema.get("items", {})) for value in _instance]
-
-         # 3. Handle anyOf (optional fields)
-         if "anyOf" in _schema:
-             is_field_optional = any(sub.get("type") == "null" for sub in _schema["anyOf"])
-             if is_field_optional and (_instance == "" or _instance is None):
-                 return None
-
-             # Try to coerce with the first matching subschema
-             for subschema in _schema["anyOf"]:
-                 # Skip null subschema for explicit coercion; handled above
-                 if subschema.get("type") == "null":
-                     continue
-                 coerced_value = recursive_coercion(_instance, subschema)
-                 if coerced_value is not None:
-                     return coerced_value
-             return None # If none match, return None
-
-         # 4. Handle primitive types and known coercions
-         schema_type = _schema.get("type")
-         ## Custom Formats that are not supported by default should be supplied as X-format.
-         schema_format = _schema.get("X-format") or _schema.get("format")
-
-         # Use default if instance is None
-         if _instance is None:
-             _instance = _schema.get("default")
-
-         # If schema type is null, just return None
-         if schema_type == "null":
-             return None
-
-         # Apply known coercion
-         if (schema_type, schema_format) in KNOWN_COERCIONS:
-             return KNOWN_COERCIONS[(schema_type, schema_format)](_instance)
-
-         return _instance # Return as-is if no coercion is required
-
-     expanded_schema = expand_refs(schema)
-     coerced = recursive_coercion(instance, expanded_schema)
-     return coerced if coerced is not None else {}
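Because KNOWN_COERCIONS is keyed on (type, format) pairs, object_format_coercion can repair loosely typed model output against a schema: numeric strings become numbers, and empty strings collapse to None only where an anyOf marks the field nullable. Illustrative sketch, not part of the diff:

    schema = {
        "type": "object",
        "properties": {
            "qty": {"type": "integer"},
            "notes": {"anyOf": [{"type": "string"}, {"type": "null"}]},
        },
    }
    assert object_format_coercion({"qty": "3.0", "notes": ""}, schema) == {"qty": 3, "notes": None}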
-
-
- def flatten_dict(obj: Any, prefix: str = "", allow_empty_objects: bool = True) -> dict[str, Any]:
-     items = [] # type: ignore
-     if isinstance(obj, dict):
-         if len(obj) == 0 and allow_empty_objects:
-             # Keep empty dicts as dicts (so we can keep its structure)
-             items.append((prefix, {}))
-         else:
-             for k, v in obj.items():
-                 new_key = f"{prefix}.{k}" if prefix else k
-                 items.extend(flatten_dict(v, new_key, allow_empty_objects=allow_empty_objects).items())
-
-     elif isinstance(obj, list):
-         if len(obj) == 0 and allow_empty_objects:
-             # Keep empty lists as lists (so we can keep its structure)
-             items.append((prefix, []))
-         else:
-             for i, v in enumerate(obj):
-                 new_key = f"{prefix}.{i}"
-                 items.extend(flatten_dict(v, new_key, allow_empty_objects=allow_empty_objects).items())
-     else:
-         items.append((prefix, obj))
-     return dict(items)
-
-
- def convert_dict_to_list_recursively(_obj: Any, allow_lists: bool = True) -> Any:
-     """
-     Recursively converts dict[int, Any] to list[Any] if the keys are sequential integers starting from 0.
-     Creates a copy of the input object rather than modifying it in place.
-     """
-     # Handle non-dict types
-     if not isinstance(_obj, dict):
-         return _obj
-
-     # Create a copy to avoid modifying the original
-     result = {}
-
-     # Process all nested dictionaries first
-     for key, value in _obj.items():
-         result[key] = convert_dict_to_list_recursively(value, allow_lists=allow_lists)
-
-     # Check if this dictionary should be converted to a list
-     if result and all(isinstance(k, int) for k in result.keys()):
-         # Check if keys are sequential starting from 0
-         keys = sorted(result.keys())
-         if allow_lists and keys[0] == 0 and keys[-1] == len(keys) - 1:
-             # Convert to list
-             return [result[i] for i in keys]
-         else:
-             # Sort the keys and convert to string
-             return {str(i): result[i] for i in keys}
-
-     return result
-
-
- def unflatten_dict(obj: dict[str, Any], allow_lists: bool = True) -> Any:
-     """
-     Unflattens a dictionary by recursively converting keys with dots into nested dictionaries.
-     After building the nested structure, converts dict[int, Any] to list[Any] if the keys
-     are sequential integers starting from 0.
-
-     Args:
-         obj: The dictionary to unflatten.
-
-     Returns:
-         The unflattened dictionary with appropriate dict[int, Any] converted to list[Any].
-     """
-     # Handle empty input
-     if not obj:
-         return obj
-
-     # Create a copy of the input object to avoid modifying it
-     input_copy = dict(obj)
-
-     # Optionally validate that the dict is indeed flat
-     # Commented out to avoid potential equality issues with key ordering
-     # assert flatten_dict(input_copy) == input_copy, "Dictionary is not flat"
-
-     # First pass: build everything as nested dictionaries
-     result = {}
-     for key, value in input_copy.items():
-         # Skip invalid keys
-         if not isinstance(key, str):
-             continue
-
-         parts = key.split(".")
-         # Filter out empty parts
-         valid_parts = [p for p in parts if p]
-         if not valid_parts:
-             result[key] = value
-             continue
-
-         current = result
-
-         for i, part in enumerate(valid_parts):
-             # Check if the part is an integer (for list indices)
-             try:
-                 # More robust integer parsing - handles negative numbers too
-                 if part.lstrip("-").isdigit():
-                     part = int(part)
-             except (ValueError, AttributeError):
-                 # If conversion fails, keep as string
-                 pass
-
-             # If at the last part, set the value
-             if i == len(valid_parts) - 1:
-                 current[part] = value
-             else:
-                 # Create the container if it doesn't exist
-                 if part not in current:
-                     current[part] = {}
-                 elif not isinstance(current[part], dict):
-                     # Handle case where we're trying to nest under a non-dict
-                     # This is a conflict - the path is both a value and used as a prefix
-                     current[part] = {}
-
-                 current = current[part]
-
-     # Second pass: convert appropriate dict[int, Any] to list[Any]
-     return convert_dict_to_list_recursively(result, allow_lists=allow_lists)
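flatten_dict and unflatten_dict round-trip JSON-like data: dotted keys encode the path, and integer path segments are rebuilt into lists when they form a 0-based run. Illustrative sketch, not part of the diff:

    nested = {"items": [{"sku": "A1"}, {"sku": "B2"}], "total": 9.5}
    flat = flatten_dict(nested)
    assert flat == {"items.0.sku": "A1", "items.1.sku": "B2", "total": 9.5}
    assert unflatten_dict(flat) == nested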
-
-
- def extract_property_type_info(prop_schema: dict[str, Any]) -> tuple[str, Optional[str], bool, list[Any] | None]:
-     """
-     Extract the property type, possible 'format'/'X-format', and nullability from a property schema.
-     - If an 'anyOf' with exactly one 'null' type is used, we unify it into a single schema
-       (i.e., prop_schema plus is_nullable=True).
-     - This ensures 'enum', 'format', etc. are preserved from the non-null sub-schema.
-
-     Returns:
-         (prop_type, prop_format, is_nullable)
-     """
-     is_nullable = False
-
-     if "anyOf" in prop_schema:
-         sub_schemas = prop_schema["anyOf"]
-         sub_types = [s.get("type") for s in sub_schemas if isinstance(s, dict)]
-
-         # We only handle the scenario: anyOf: [{type=XYZ,...}, {type=null}]
-         # If you have more complex unions, you'll need additional logic.
-         if len(sub_schemas) == 2 and "null" in sub_types:
-             # Identify the non-null sub-schema
-             valid_sub = next(s for s in sub_schemas if s.get("type") != "null")
-             is_nullable = True
-
-             # Merge *everything* (enum, format, x-, etc.) from the valid_sub
-             # into prop_schema. This ensures we don't lose 'enum', 'format', etc.
-             prop_schema.update(valid_sub)
-             # Remove the anyOf now that it's merged
-             prop_schema.pop("anyOf", None)
-         else:
-             raise ValueError(f"'anyOf' structure not supported or doesn't match a single null type. Found: {sub_schemas}")
-
-     # At this point, we expect a single 'type' in the property
-     if "type" not in prop_schema:
-         raise ValueError("Property schema must have a 'type' or a supported 'anyOf' pattern.")
-
-     prop_type = prop_schema["type"]
-     # Pop 'format' or 'X-format' if any
-     prop_format = prop_schema.pop("format", None) or prop_schema.pop("X-format", None)
-     enum_values = prop_schema.get("enum", None)
-
-     return prop_type, prop_format, is_nullable, enum_values
-
-
- def _convert_property_schema_to_type(prop_schema: dict[str, Any]) -> Any:
-     """
-     Convert a single JSON Schema property to a Python type annotation:
-     - If 'enum' => Literal[...]
-     - If 'type=object' => nested submodel
-     - If 'type=array' => list[sub_type]
-     - If 'type=string/integer/number/boolean' => str/int/float/bool
-     """
-     # If there's an enum, return a Literal of the enum values
-     if "enum" in prop_schema:
-         # Convert each enum value to the correct Python literal
-         enum_values = prop_schema["enum"]
-         return Literal[tuple(enum_values)] # type: ignore
-
-     # Otherwise check 'type'
-     prop_type = prop_schema.get("type")
-
-     if prop_type == "object":
-         # Nested submodel
-         # If 'properties' is missing, that might be an empty dict
-         if "properties" in prop_schema:
-             return convert_json_schema_to_basemodel(prop_schema)
-         else:
-             # fallback
-             return dict
-
-     if prop_type == "array":
-         # Look for 'items' => sub-schema
-         items_schema = prop_schema.get("items", {})
-         item_type = _convert_property_schema_to_type(items_schema)
-         return list[item_type] # type: ignore
-
-     if prop_type == "string":
-         return str
-     if prop_type == "boolean":
-         return bool
-     if prop_type == "integer":
-         return int
-     if prop_type == "number":
-         return float
-
-     # If the schema is "null" or unknown, fallback to object
-     return object
-
-
- def convert_json_schema_to_basemodel(schema: dict[str, Any]) -> Type[BaseModel]:
-     """
-     Create a Pydantic BaseModel dynamically from a JSON Schema:
-     - Expand refs
-     - For each property, figure out if it's required
-     - Convert 'type': 'object' => nested submodel
-     - Convert 'enum' => Literal
-     - 'array' => list[submodel or primitive]
-     - Primitives => str, int, float, bool
-     - Preserves anyOf/oneOf structure for nullable fields
-     """
-     # 1) Expand references (inlines $refs)
-     schema_expanded = expand_refs(copy.deepcopy(schema))
-
-     # 2) Figure out model name
-     model_name = schema_expanded.get("title", "DynamicModel")
-
-     # 3) Collect any X-* keys for model config
-     x_keys = {k: v for k, v in schema_expanded.items() if k.startswith("X-")}
-     model_config = ConfigDict(extra="forbid", json_schema_extra=x_keys) if x_keys else ConfigDict(extra="forbid")
-
-     # 4) Build up the field definitions
-     properties = schema_expanded.get("properties", {})
-     required_props = set(schema_expanded.get("required", []))
-
-     field_definitions = {}
-     for prop_name, prop_schema in properties.items():
-         # If property is required => default=...
-         # Else => default=None
-         if prop_name in required_props:
-             default_val = prop_schema.get("default", ...)
-         else:
-             default_val = prop_schema.get("default", None)
-
-         # We also keep 'description', 'title', 'X-...' and everything else
-         # that's needed to preserve schema structure for round-trip conversion
-         field_kwargs = {
-             "description": prop_schema.get("description"),
-             "title": prop_schema.get("title"),
-         }
-
-         # Include all original schema structure for proper round-trip conversion
-         schema_extra = {}
-         for k, v in prop_schema.items():
-             if k not in {"description", "title", "default"} and not k.startswith("$"):
-                 schema_extra[k] = v
-
-         if schema_extra:
-             field_kwargs["json_schema_extra"] = schema_extra
-
-         # Handle anyOf for nullable types specially
-         if "anyOf" in prop_schema:
-             # Check if it's a standard nullable pattern: [type, null]
-             sub_schemas = prop_schema["anyOf"]
-             null_schemas = [s for s in sub_schemas if s.get("type") == "null"]
-             non_null_schemas = [s for s in sub_schemas if s.get("type") != "null"]
-
-             if len(null_schemas) == 1 and len(non_null_schemas) == 1:
-                 # Standard nullable field pattern
-                 non_null_schema = non_null_schemas[0]
-                 inner_type = _convert_property_schema_to_type(non_null_schema)
-                 python_type = Union[inner_type, None]
-             else:
-                 # More complex anyOf structure - preserve it in schema_extra
-                 python_type = object
-
-             field_definitions[prop_name] = (python_type, Field(default_val, **field_kwargs))
-             continue
-
-         # Convert to a Python type annotation
-         python_type = _convert_property_schema_to_type(prop_schema)
-
-         # If a field is not in `required`, we typically wrap it in `Optional[...]`
-         if prop_name not in required_props and not is_already_optional(python_type):
-             python_type = Union[python_type, None]
-
-         field_definitions[prop_name] = (python_type, Field(default_val, **field_kwargs))
-
-     # 5) Build the dynamic model
-     return create_model(
-         model_name,
-         __config__=model_config,
-         __module__="__main__",
-         **field_definitions,
-     ) # type: ignore
1495
-
1496
-
1497
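Usage sketch for the dynamic model builder; the `Person` schema below is hypothetical:

```python
schema = {
    "title": "Person",
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "age": {"anyOf": [{"type": "integer"}, {"type": "null"}]},
    },
    "required": ["name"],
}

Person = convert_json_schema_to_basemodel(schema)
person = Person.model_validate({"name": "Ada"})  # 'age' is optional and defaults to None
assert person.name == "Ada" and person.age is None
```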
- def convert_json_schema_to_basemodelold(schema: dict[str, Any]) -> Type[BaseModel]:
-     """
-     Create a Pydantic BaseModel dynamically from a JSON Schema.
-     Steps:
-     1. Expand all refs.
-     2. For each property, parse type info and create a suitable Pydantic field.
-     3. Nested objects -> submodels, arrays -> list[type].
-     4. Keep 'enum' and 'format' in the final schema so Pydantic sees them in the
-        generated model's JSON schema.
-     """
-     # 1. Expand references
-     schema_expanded = expand_refs(copy.deepcopy(schema))
-
-     # 2. Gather 'X-*' keys from the root for the config
-     x_keys = {k: v for k, v in schema_expanded.items() if k.startswith("X-")}
-
-     # 3. Prepare dynamic model fields
-     field_definitions: Any = {}
-
-     # 4. Get properties + required
-     props = schema_expanded.get("properties", {})
-     required_fields = set(schema_expanded.get("required", []))
-
-     for prop_name, prop_schema in props.items():
-         # a) Determine the python type, format, and nullability
-         prop_type, prop_format, is_nullable, enum_values = extract_property_type_info(prop_schema)
-
-         # b) Collect field kwargs; only the 'X-...' extras go into json_schema_extra here
-         field_kwargs = {
-             "description": prop_schema.get("description"),
-             "title": prop_schema.get("title"),
-             "json_schema_extra": {k: v for k, v in prop_schema.items() if k.startswith("X-")},
-         }
-
-         # c) Determine the default or whether it's required
-         if prop_name in required_fields:
-             default_val = prop_schema.get("default", ...)
-         else:
-             default_val = prop_schema.get("default", None)
-
-         # d) Dispatch based on prop_type
-         if prop_type == "object":
-             if "properties" not in prop_schema:
-                 raise ValueError(f"Schema for object '{prop_name}' must have 'properties' to build a submodel.")
-             sub_model = convert_json_schema_to_basemodel(prop_schema)
-             final_type = sub_model if not is_nullable else Optional[sub_model]
-
-             field_definitions[prop_name] = (final_type, Field(default_val, **field_kwargs))
-
-         elif prop_type == "array":
-             # Handle arrays of both objects and primitive types
-             items_schema = prop_schema.get("items", {})
-             item_type, item_format, item_nullable, item_enum = extract_property_type_info(items_schema)
-
-             if item_type == "object":
-                 # Handle array of objects
-                 sub_model = convert_json_schema_to_basemodel(items_schema)
-                 array_type = list[sub_model]  # type: ignore
-             else:
-                 # Handle array of primitives
-                 item_python_type = get_pydantic_primitive_field_type(
-                     item_type, item_format, is_nullable=item_nullable, validator_func=KNOWN_COERCIONS.get((item_type, item_format), None), enum_values=item_enum
-                 )
-                 array_type = list[item_python_type]  # type: ignore
-
-             field_definitions[prop_name] = (array_type if not is_nullable else Optional[array_type], Field(default_val, **field_kwargs))
-
-         else:
-             # e) Primitive
-             python_validator = KNOWN_COERCIONS.get((prop_type, prop_format), None)
-             python_type = get_pydantic_primitive_field_type(prop_type, prop_format, is_nullable=is_nullable, validator_func=python_validator, enum_values=enum_values)
-
-             # If the field can be null, or we have a validator that must accept None:
-             field_definitions[prop_name] = (python_type, Field(default_val, **field_kwargs))
-
-     # 5. Build the model class
-     model_name: str = schema_expanded.get("title", "DynamicModel")
-     model_config = ConfigDict(extra="forbid", json_schema_extra=x_keys) if x_keys else ConfigDict(extra="forbid")
-
-     return create_model(
-         model_name,
-         __config__=model_config,
-         __module__="__main__",
-         **field_definitions,
-     )
-
-
- def is_basemodel_subclass(t: Any) -> bool:
-     return isinstance(t, type) and issubclass(t, BaseModel)
-
-
- def is_already_optional(t: Any) -> bool:
-     """Return True if type t is Optional[...] or includes None in a Union."""
-     return (get_origin(t) in {Union, types.UnionType}) and type(None) in get_args(t)
-
-
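A minimal check of the optionality helper (assumes Python 3.10+ for the `X | None` syntax):

```python
from typing import Optional, Union

assert is_already_optional(Optional[int])        # Union[int, None]
assert is_already_optional(int | None)           # PEP 604 union
assert not is_already_optional(Union[int, str])  # no None member
```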
- def convert_basemodel_to_partial_basemodel(base_model: Type[BaseModel]) -> Type[BaseModel]:
-     """
-     Convert a BaseModel class to a new BaseModel class where all fields are Optional.
-     Handles nested BaseModels, lists, and unions recursively.
-     """
-     field_definitions: Any = {}
-     maybe_optional_type: Any
-     for field_name, field_info in base_model.model_fields.items():
-         field_type = field_info.annotation
-
-         # Handle nested BaseModel
-         if is_basemodel_subclass(field_type):
-             partial_nested = convert_basemodel_to_partial_basemodel(cast(Type[BaseModel], field_type))
-             maybe_optional_type = Union[partial_nested, None]
-         else:
-             origin = get_origin(field_type)
-             args = get_args(field_type)
-
-             # Handle list[...] or tuple[...]
-             if origin in (list, tuple) and args:
-                 inner_type = args[0]
-                 if is_basemodel_subclass(inner_type):
-                     # Recursively convert the inner model
-                     partial_inner = convert_basemodel_to_partial_basemodel(inner_type)
-                     container_type = list if origin is list else tuple
-                     new_type = container_type[partial_inner]  # type: ignore
-                 else:
-                     new_type = field_type  # type: ignore
-                 maybe_optional_type = Union[new_type, None]  # type: ignore
-
-             # Handle Union types
-             elif origin in {Union, types.UnionType}:
-                 new_union_args: list[type] = []
-                 for arg in args:
-                     if is_basemodel_subclass(arg):
-                         new_union_args.append(convert_basemodel_to_partial_basemodel(arg))
-                     else:
-                         new_union_args.append(arg)
-                 # Make sure the union has None in it (to enforce optional)
-                 if type(None) not in new_union_args:
-                     new_union_args.append(type(None))
-                 maybe_optional_type = Union[tuple(new_union_args)]  # type: ignore
-
-             # Any other type - wrap in Optional unless already optional
-             else:
-                 if is_already_optional(field_type):
-                     maybe_optional_type = field_type
-                 else:
-                     maybe_optional_type = Union[field_type, None]  # type: ignore
-
-         field_definitions[field_name] = (cast(type, maybe_optional_type), None)
-
-     # Dynamically create a new model
-     return create_model(f"Partial{base_model.__name__}", __config__=base_model.model_config, __module__="__main__", **field_definitions)
-
-
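A sketch of what the partial conversion is for: validating incomplete payloads such as streamed extractions. The `Person`/`Address` models here are hypothetical:

```python
from pydantic import BaseModel

class Address(BaseModel):
    street: str

class Person(BaseModel):
    name: str
    address: Address

PartialPerson = convert_basemodel_to_partial_basemodel(Person)
# All fields, including nested ones, are now optional with a None default.
partial = PartialPerson.model_validate({"address": {}})
assert partial.name is None and partial.address.street is None
```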
- def load_json_schema(json_schema: Union[dict[str, Any], Path, str]) -> dict[str, Any]:
-     """
-     Load a JSON schema from either a dictionary or a file path.
-
-     Args:
-         json_schema: Either a dictionary containing the schema or a path to a JSON file
-
-     Returns:
-         dict[str, Any]: The loaded JSON schema
-
-     Raises:
-         JSONDecodeError: If the schema file contains invalid JSON
-         FileNotFoundError: If the schema file doesn't exist
-     """
-     if isinstance(json_schema, (str, Path)):
-         with open(json_schema) as f:
-             return json.load(f)
-     return json_schema
-
-
- def filter_auxiliary_fields(data: dict[str, Any], prefixes: list[str] = ["reasoning___", "quote___"]) -> dict[str, Any]:
-     """
-     Recursively filters out fields that start with any of the prefixes in `prefixes` from the input data.
-     """
-     if not isinstance(data, dict):
-         return data  # Base case: return non-dict values as is
-
-     filtered: dict[str, Any] = {}
-     for key, value in data.items():
-         if not key.startswith(tuple(prefixes)):
-             if isinstance(value, dict):
-                 filtered[key] = filter_auxiliary_fields(value, prefixes)
-             elif isinstance(value, list):
-                 filtered[key] = [filter_auxiliary_fields(item, prefixes) if isinstance(item, dict) else item for item in value]
-             else:
-                 filtered[key] = value
-
-     return filtered
-
-
- def filter_auxiliary_fields_json(data: str, prefixes: list[str] = ["reasoning___", "quote___"]) -> dict[str, Any]:
-     """
-     Recursively filters out fields that start with any of the prefixes in `prefixes` from the input JSON data.
-     """
-     data_dict = json.loads(data)
-     return filter_auxiliary_fields(data_dict, prefixes)
-
-
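For example (hypothetical payload), keys with the `reasoning___` or `quote___` prefixes are stripped at every nesting level:

```python
payload = {
    "reasoning___total": "Summed the line items",
    "total": 42,
    "items": [{"quote___name": "from page 1", "name": "Widget"}],
}
assert filter_auxiliary_fields(payload) == {"total": 42, "items": [{"name": "Widget"}]}
```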
- def get_all_paths(schema: dict[str, Any]) -> list[str]:
-     """
-     Extract all possible JSON pointer paths from a JSON Schema.
-
-     This function traverses a JSON Schema and generates a list of all possible paths
-     that could exist in a document conforming to that schema. For arrays, it uses '*'
-     as a wildcard index.
-
-     Args:
-         schema (dict[str, Any]): The JSON Schema to analyze
-
-     Returns:
-         list[str]: A list of dot-notation paths (e.g. ["person.name", "person.addresses.*.street"])
-
-     Example:
-         >>> schema = {
-         ...     "type": "object",
-         ...     "properties": {
-         ...         "name": {"type": "string"},
-         ...         "addresses": {
-         ...             "type": "array",
-         ...             "items": {
-         ...                 "type": "object",
-         ...                 "properties": {
-         ...                     "street": {"type": "string"}
-         ...                 }
-         ...             }
-         ...         }
-         ...     }
-         ... }
-         >>> get_all_paths(schema)
-         ['name', 'addresses', 'addresses.*.street']
-     """
-     paths: list[str] = []
-
-     def _traverse(current_schema: dict[str, Any], current_path: str = "") -> None:
-         if any(key in current_schema for key in ["oneOf", "allOf"]):
-             raise ValueError("OneOf and AllOf are not supported yet.")
-
-         # Handle array type schemas (detected via 'items' rather than 'type')
-         if "items" in current_schema:
-             paths.append(current_path)
-             _traverse(current_schema["items"], f"{current_path}.*")
-             return
-
-         # Handle object type schemas
-         if "properties" in current_schema:
-             for prop_name, prop_schema in current_schema["properties"].items():
-                 new_path = f"{current_path}.{prop_name}" if current_path else prop_name
-
-                 # If property is a leaf node (has type but no properties/items)
-                 if not any(key in prop_schema for key in ["properties", "items"]):
-                     paths.append(new_path)
-                 else:
-                     _traverse(prop_schema, new_path)
-
-         # Handle $ref schemas
-         elif "$ref" in current_schema:
-             # Skip refs for now since we don't have access to the full schema with definitions
-             pass
-
-         # Handle anyOf schemas (oneOf/allOf already raise above)
-         elif any(key in current_schema for key in ["anyOf", "oneOf", "allOf"]):
-             # Take first schema as representative for path generation
-             for key in ["anyOf", "oneOf", "allOf"]:
-                 if key in current_schema and current_schema[key]:
-                     _traverse(current_schema[key][0], current_path)
-                     break
-
-     _traverse(schema)
-     return paths
-
-
- def convert_schema_to_layout(schema: dict[str, Any]) -> dict[str, Any]:
-     """
-     Convert a JSON Schema (represented as a Python dict) into a Layout object.
-     """
-     # Get the definitions from the schema (or empty dict if not provided)
-     defs = schema.get("$defs", {})
-     converted_defs: dict[str, Column] = {}
-
-     def is_object_schema(sch: dict[str, Any]) -> bool:
-         return "properties" in sch and isinstance(sch.get("properties"), dict)
-
-     def extract_ref(sch: dict[str, Any]) -> Optional[str]:
-         return sch.get("$ref")
-
-     def extract_ref_schema(ref: Optional[str], defs: dict[str, dict[str, Any]]) -> Optional[dict[str, Any]]:
-         if not ref:
-             return None
-         ref_name = ref.split("/")[-1]
-         return defs.get(ref_name)
-
-     def is_object_via_any_of(sch: dict[str, Any]) -> bool:
-         any_of = sch.get("anyOf")
-         if isinstance(any_of, list):
-             return any((extract_ref(option) and extract_ref_schema(extract_ref(option), defs)) or is_object_schema(option) for option in any_of)
-         return False
-
-     def property_is_object(prop_schema: dict[str, Any]) -> bool:
-         ref = extract_ref(prop_schema)
-         if ref:
-             ref_schema = extract_ref_schema(ref, defs)
-             return bool(ref_schema)
-         return is_object_schema(prop_schema) or is_object_via_any_of(prop_schema)
-
-     def property_is_array(prop_schema: dict[str, Any]) -> bool:
-         return prop_schema.get("type") == "array"
-
-     def handle_ref_object(prop_name: str, ref: str) -> RefObject:
-         ref_name = ref.split("/")[-1]
-         if ref_name not in converted_defs:
-             ref_schema = extract_ref_schema(ref, defs)
-             if ref_schema and is_object_schema(ref_schema):
-                 result = handle_object(ref_name, ref_schema, drop_name=True)
-                 assert isinstance(result, Column)
-                 converted_defs[ref_name] = result
-         return RefObject(type="object", size=None, **{"$ref": ref})
-
-     def handle_object(prop_name: str, object_schema: dict[str, Any], drop_name: bool = False) -> Union[RefObject, Column]:
-         ref = extract_ref(object_schema)
-         if ref:
-             return handle_ref_object(prop_name, ref)
-         else:
-             props = object_schema.get("properties")
-             if not props:
-                 # If no properties, try anyOf (skipping null types)
-                 any_of = object_schema.get("anyOf")
-                 if isinstance(any_of, list):
-                     for option in any_of:
-                         if option.get("type") != "null":
-                             props = option.get("properties")
-                             if props:
-                                 break
-             if not props:
-                 props = {}
-             items: list[Row | RowList | FieldItem | RefObject] = []
-             for p_name, p_schema in props.items():
-                 if property_is_object(p_schema):
-                     # Wrap object properties in a row
-                     items.append(Row(type="row", name=p_name, items=[handle_object(p_name, p_schema)]))
-                 elif property_is_array(p_schema):
-                     items.append(handle_array_items(p_name, p_schema))
-                 else:
-                     items.append(FieldItem(type="field", name=p_name, size=1))
-             if drop_name:
-                 return Column(type="column", size=1, items=items)
-             else:
-                 return Column(type="column", size=1, items=items, name=prop_name)
-
-     def handle_array_items(prop_name: str, array_schema: dict[str, Any]) -> RowList:
-         items_schema = array_schema.get("items", {})
-         row_items: list[Column | FieldItem | RefObject] = []
-         if property_is_object(items_schema):
-             row_items.append(handle_object(prop_name, items_schema))
-         else:
-             row_items.append(FieldItem(type="field", name=prop_name, size=1))
-         return RowList(type="rowList", name=prop_name, items=row_items)
-
-     # Process definitions from $defs
-     for definition_name, definition_schema in defs.items():
-         if is_object_schema(definition_schema):
-             result = handle_object(definition_name, definition_schema, drop_name=True)
-             assert isinstance(result, Column)
-             converted_defs[definition_name] = result
-
-     # Process top-level properties
-     top_level_props = schema.get("properties", {})
-     top_level_items: list[Row | RowList | FieldItem | RefObject] = []
-     for prop_name, prop_schema in top_level_props.items():
-         if property_is_object(prop_schema):
-             top_level_items.append(Row(type="row", name=prop_name, items=[handle_object(prop_name, prop_schema)]))
-         elif property_is_array(prop_schema):
-             top_level_items.append(handle_array_items(prop_name, prop_schema))
-         else:
-             top_level_items.append(FieldItem(type="field", name=prop_name, size=1))
-
-     return Layout(type="column", size=1, items=top_level_items, **{"$defs": converted_defs}).model_dump(by_alias=True)
-
-
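A rough usage sketch, assuming `Layout`, `Column`, `Row`, `RowList`, `FieldItem`, and `RefObject` are the layout models imported elsewhere in this module; the invoice-style schema is made up:

```python
schema = {
    "properties": {
        "vendor": {"type": "string"},
        "line_items": {"type": "array", "items": {"properties": {"sku": {"type": "string"}}}},
    }
}

layout = convert_schema_to_layout(schema)
# Expected shape: a top-level column holding a field for 'vendor' and a
# rowList for 'line_items' whose row contains a nested column with 'sku'.
print(layout["items"])
```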
- ### Json Schema to NLP Data Structure
-
-
- def get_type_str(field_schema):
-     """
-     Recursively determine the type string for a given schema field.
-     Handles 'anyOf' unions, enums, arrays, and simple types.
-     """
-     if "anyOf" in field_schema:
-         types = []
-         for sub_schema in field_schema["anyOf"]:
-             types.append(get_type_str(sub_schema))
-         # Remove duplicates while preserving order
-         seen = set()
-         unique_types = []
-         for t in types:
-             if t not in seen:
-                 seen.add(t)
-                 unique_types.append(t)
-         return " | ".join(unique_types)
-     elif "enum" in field_schema:
-         # Create a union of the literal enum values (as JSON strings)
-         return " | ".join(json.dumps(val) for val in field_schema["enum"])
-     elif "type" in field_schema:
-         typ = field_schema["type"]
-         if typ == "array" and "items" in field_schema:
-             # For arrays, indicate the type of the items
-             item_type = get_type_str(field_schema["items"])
-             return f"array of {item_type}"
-         return typ
-     else:
-         return "unknown"
-
-
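Spot checks (made-up field schemas):

```python
assert get_type_str({"anyOf": [{"type": "string"}, {"type": "null"}]}) == "string | null"
assert get_type_str({"enum": ["a", "b"]}) == '"a" | "b"'
assert get_type_str({"type": "array", "items": {"type": "integer"}}) == "array of integer"
```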
- def process_schema_field(field_name, field_schema, level, new_line_sep: str = "\n", field_name_prefix: str = ""):
-     """
-     Process a single field in the JSON schema.
-     'level' indicates the header level (e.g., 3 for root, 4 for nested, etc.).
-     Returns a markdown string representing the field.
-     """
-     md = ""
-     field_name_complete = field_name_prefix + field_name
-
-     # Extract type information
-     type_str = get_type_str(field_schema)
-     # md += f"**Type**: {type_str}{new_line_sep}"
-
-     header = "#" * level + f" {field_name_complete} ({type_str})"
-     md += header + new_line_sep
-
-     # Extract description (or use a placeholder if not provided)
-     description = field_schema.get("description", None)
-     if description is not None:
-         md += f"<Description>\n{description}\n</Description>"
-     else:
-         md += "<Description></Description>"
-
-     md += new_line_sep * 2
-
-     # If the field is an object with its own properties, process those recursively.
-     if field_schema.get("type") == "object" and "properties" in field_schema:
-         for sub_field_name, sub_field_schema in field_schema["properties"].items():
-             md += process_schema_field(sub_field_name, sub_field_schema, level + 1, field_name_prefix=field_name_complete + ".")
-
-     # If the field is an array and its items are objects with properties, process them.
-     elif field_schema.get("type") == "array" and "items" in field_schema:
-         items_schema = field_schema["items"]
-         if items_schema.get("type") == "object" and "properties" in items_schema:
-             md += process_schema_field("*", items_schema, level + 1, field_name_prefix=field_name_complete + ".")
-
-     return md
-
-
- def json_schema_to_nlp_data_structure(schema: dict) -> str:
-     """
-     Receives a JSON schema (without $defs or $ref) and returns a markdown string
-     that documents each field with its name, description, and type (including
-     unions and enums). Root-level fields use 3 hashtags, and nested fields
-     add one hashtag per level.
-     """
-     schema_title = schema.get("title", schema.get("name", "Schema"))
-     md = f"## {schema_title} -- NLP Data Structure\n\n"
-     # Assume the root schema is an object with properties.
-     if schema.get("type") == "object" and "properties" in schema:
-         for field_name, field_schema in schema["properties"].items():
-             md += process_schema_field(field_name, field_schema, 3)
-     else:
-         md += process_schema_field("root", schema, 3)
-     return md
-
-
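Sample output for a tiny hypothetical schema, to make the markdown shape concrete:

```python
schema = {
    "title": "Invoice",
    "type": "object",
    "properties": {"total": {"type": "number", "description": "Grand total"}},
}
print(json_schema_to_nlp_data_structure(schema))
# ## Invoice -- NLP Data Structure
#
# ### total (number)
# <Description>
# Grand total
# </Description>
```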
- def nlp_data_structure_to_field_descriptions(nlp_data_structure: str) -> dict:
-     """
-     Extract field descriptions from an NLP data structure markdown string.
-
-     Args:
-         nlp_data_structure: A markdown string created by json_schema_to_nlp_data_structure,
-             potentially with updated descriptions
-
-     Returns:
-         A dict mapping dot-notation field paths to their descriptions
-     """
-
-     # Pattern to match headers and extract field_name and type
-     # Example: "### field_name (type)" or "#### parent.child (type)"
-     header_pattern = re.compile(r"^(#+)\s+([^\s(]+)\s*\(([^)]*)\)")
-
-     # Pattern to extract description between tags
-     description_pattern = re.compile(r"<Description>(.*?)</Description>", re.DOTALL)
-
-     # Split the markdown by lines
-     lines = nlp_data_structure.split("\n")
-
-     # Process the markdown to extract field names and descriptions
-     field_descriptions = {}
-
-     i = 0
-     while i < len(lines):
-         line = lines[i]
-
-         # Check if this line is a header
-         header_match = header_pattern.match(line)
-         if header_match:
-             field_path = header_match.group(2)  # Field name or path
-
-             # Look for description in subsequent lines until next header
-             desc_start = i + 1
-             while desc_start < len(lines) and not header_pattern.match(lines[desc_start]):
-                 desc_start += 1
-
-             # Extract description from the block of text
-             description_block = "\n".join(lines[i + 1 : desc_start])
-             desc_match = description_pattern.search(description_block)
-             if desc_match:
-                 description_text = desc_match.group(1).strip()
-                 field_descriptions[field_path] = description_text
-
-             i = desc_start - 1  # Will be incremented in the loop
-
-         i += 1
-     return field_descriptions
-
-
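And the reverse direction, reusing the hypothetical `Invoice` schema from the sketch above:

```python
md = json_schema_to_nlp_data_structure(schema)
assert nlp_data_structure_to_field_descriptions(md) == {"total": "Grand total"}
```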
- ##### JSON Schema Sanitization #####
-
- SchemaPath = Tuple[Union[str, int], ...]  # e.g. ('address', 'city') or ('items', 3)
-
-
- def _pick_subschema(schemas: list[dict[str, Any]], value: Any) -> dict[str, Any]:
-     """
-     Return the first subschema in *schemas* that
-     • explicitly allows the Python type of *value*, or
-     • has no "type" at all (acts as a wildcard).
-
-     Fallback: the first subschema (so we *always* return something).
-     """
-     pytypes_to_json = {
-         str: "string",
-         int: "integer",
-         float: "number",
-         bool: "boolean",
-         type(None): "null",
-         dict: "object",
-         list: "array",
-     }
-     jstype = pytypes_to_json.get(type(value))
-
-     for sub in schemas:
-         allowed = sub.get("type")
-         if allowed is None or allowed == jstype or (isinstance(allowed, list) and jstype in allowed):
-             return sub
-     return schemas[0]  # last resort
-
-
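For instance (made-up subschemas), a string value selects the string branch of a nullable union and `None` selects the null branch:

```python
subs = [{"type": "string", "maxLength": 5}, {"type": "null"}]
assert _pick_subschema(subs, "hello") == {"type": "string", "maxLength": 5}
assert _pick_subschema(subs, None) == {"type": "null"}
```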
- def __sanitize_instance(instance: Any, schema: dict[str, Any], path: SchemaPath = ()) -> Any:
-     """
-     Return a **new** instance where every string that violates ``maxLength``
-     has been sliced to that length. Mutates nothing in place.
-     """
-
-     # ------------- unwrap anyOf ------------------------------------
-     if "anyOf" in schema:
-         schema = _pick_subschema(schema["anyOf"], instance)
-         # (We recurse *once*; nested anyOfs will be handled the same way)
-
-     # ------------- objects -----------------
-     if schema.get("type") == "object" and isinstance(instance, MutableMapping):
-         props = schema.get("properties", {})
-         return {k: __sanitize_instance(v, props.get(k, {}), path + (k,)) for k, v in instance.items()}
-
-     # ------------- arrays ------------------
-     if schema.get("type") == "array" and isinstance(instance, MutableSequence):
-         item_schema = schema.get("items", {})
-         return [__sanitize_instance(v, item_schema, path + (i,)) for i, v in enumerate(instance)]
-
-     # ------------- primitive strings -------
-     if schema.get("type") == "string" and isinstance(instance, str):
-         max_len = schema.get("maxLength")
-         if max_len is not None and len(instance) > max_len:
-             print("=" * 100)
-             _path = ".".join(map(str, path)) or "<root>"
-             print(f"Trimmed {_path} from {len(instance)} to {max_len} characters")
-             print("=" * 100)
-             return instance[:max_len]
-
-     # ------------- all other primitives ----
-     return instance
-
-
- def sanitize(instance: Any, schema: dict[str, Any]) -> Any:
-     expanded_schema = expand_refs(schema)
-     return __sanitize_instance(instance, expanded_schema)
-
-
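Behavioral sketch (hypothetical schema, and assuming `expand_refs` passes ref-free schemas through unchanged): only over-long strings are replaced, and a new structure is returned:

```python
schema = {"type": "object", "properties": {"note": {"type": "string", "maxLength": 3}}}
assert sanitize({"note": "abcdef"}, schema) == {"note": "abc"}
# Also prints a banner: "Trimmed note from 6 to 3 characters"
```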
- def compute_schema_data_id(json_schema: dict[str, Any]) -> str:
-     """Returns the schema_data_id for a given JSON schema.
-
-     The schema_data_id is a hash of the schema data, ignoring all prompt/description/default fields
-     and other non-structural metadata.
-
-     Args:
-         json_schema: The JSON schema to compute the ID for
-
-     Returns:
-         str: A hash string representing the schema data version with "sch_data_id_" prefix
-     """
-
-     return "sch_data_id_" + generate_blake2b_hash_from_string(
-         json.dumps(
-             clean_schema(
-                 copy.deepcopy(json_schema),
-                 remove_custom_fields=True,
-                 fields_to_remove=["description", "default", "title", "required", "examples", "deprecated", "readOnly", "writeOnly"],
-             ),
-             sort_keys=True,
-         ).strip()
-     )
-
-
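Since descriptions and other prompt-level metadata are removed before hashing, two schemas that differ only in wording should share an ID. A sketch under that assumption (made-up schemas):

```python
a = {"type": "object", "properties": {"total": {"type": "number", "description": "Grand total"}}}
b = {"type": "object", "properties": {"total": {"type": "number", "description": "Sum of lines"}}}
assert compute_schema_data_id(a) == compute_schema_data_id(b)
```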
- def validate_json_against_schema(
-     data: Any,
-     schema: dict[str, Any],
-     return_instance: bool = False,
- ) -> Union[None, BaseModel]:
-     """
-     Validate *data* against *schema*.
-
-     Parameters
-     ----------
-     data
-         A JSON-serialisable Python object (dict / list / primitives).
-     schema
-         A JSON-Schema dict (can contain $defs / $ref; they'll be expanded
-         by ``convert_json_schema_to_basemodel``).
-     return_instance
-         • ``False`` (default): only validate; raise if invalid; return ``None``.
-         • ``True``: on success, return the fully-validated Pydantic instance
-           (handy for downstream type-safe access).
-
-     Raises
-     ------
-     pydantic.ValidationError
-         If *data* does not conform to *schema*.
-
-     Examples
-     --------
-     >>> validate_json_against_schema({"foo": 1}, my_schema)  # just checks
-     >>> obj = validate_json_against_schema(data, schema, True)  # typed access
-     >>> print(obj.foo + 5)
-     """
-     # 1) Build a Pydantic model on the fly from the JSON Schema
-     Model: Type[BaseModel] = convert_json_schema_to_basemodel(schema)
-
-     # 2) Let Pydantic do the heavy lifting
-     instance = Model.model_validate(data)  # <- raises ValidationError if bad
-
-     return instance if return_instance else None