structurize-3.0.1-py3-none-any.whl → structurize-3.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,825 @@
"""Shared schema inference logic for JSON and XML data.

This module provides the core inference logic used by:
- json2a/json2s: Infer schema from JSON files
- xml2a/xml2s: Infer schema from XML files
- sql2a: Infer schema for JSON/XML columns in databases
"""

import copy
import json
import xml.etree.ElementTree as ET
from typing import Any, Dict, List, Tuple, Callable

from avrotize.common import avro_name, get_tree_hash

JsonNode = Dict[str, 'JsonNode'] | List['JsonNode'] | str | bool | int | float | None


class SchemaInferrer:
    """Base class for schema inference from JSON and XML data."""

    def __init__(self, namespace: str = '', type_name_prefix: str = '', altnames_key: str = 'json'):
        """Initialize the schema inferrer.

        Args:
            namespace: Namespace for generated types (Avro) or $id base (JSON Structure)
            type_name_prefix: Prefix for generated type names
            altnames_key: Key to use for altnames mapping (e.g., 'json', 'sql', 'xml')
        """
        self.namespace = namespace
        self.type_name_prefix = type_name_prefix
        self.altnames_key = altnames_key
        self.generated_types: List[str] = []

    def fold_record_types(self, base_record: dict, new_record: dict) -> Tuple[bool, dict]:
        """Merges two record types by combining their fields.

        When two records have overlapping fields with compatible types, they
        are folded into a single record with all fields. Fields that don't
        appear in all records become optional (nullable with null default).

        Args:
            base_record: The base record to merge into
            new_record: The new record to merge

        Returns:
            Tuple of (success, merged_record). If folding fails due to
            incompatible types, returns (False, new_record).
        """
        base_fields = copy.deepcopy(base_record).get("fields", [])
        new_fields = new_record.get("fields", [])

        # Track field names present in each record
        base_field_names = {f["name"] for f in base_fields}
        new_field_names = {f["name"] for f in new_fields}

        # Process fields from the new record
        for field in new_fields:
            base_field = next(
                (f for f in base_fields if f["name"] == field["name"]), None)
            if not base_field:
                # Field only in new record - add it as nullable
                new_field = copy.deepcopy(field)
                new_field["type"] = self._make_nullable(new_field["type"])
                new_field["default"] = None
                base_fields.append(new_field)
            else:
                # Field in both records - merge types
                merged_type = self._merge_field_types(base_field["type"], field["type"])
                if merged_type is None:
                    return False, new_record
                base_field["type"] = merged_type

        # Make fields that are only in base record nullable
        for base_field in base_fields:
            if base_field["name"] not in new_field_names and base_field["name"] in base_field_names:
                if not self._is_nullable(base_field["type"]):
                    base_field["type"] = self._make_nullable(base_field["type"])
                    base_field["default"] = None

        base_record["fields"] = base_fields
        return True, base_record

    def _is_nullable(self, avro_type: JsonNode) -> bool:
        """Check if an Avro type is nullable (contains null in union)."""
        if avro_type == "null":
            return True
        if isinstance(avro_type, list):
            return "null" in avro_type
        return False

    def _make_nullable(self, avro_type: JsonNode) -> JsonNode:
        """Make an Avro type nullable by wrapping in union with null."""
        if self._is_nullable(avro_type):
            return avro_type
        if avro_type == "null":
            return "null"
        if isinstance(avro_type, list):
            # Already a union, add null if not present
            if "null" not in avro_type:
                return ["null"] + list(avro_type)
            return avro_type
        # Wrap in union with null first (for Avro default null)
        return ["null", avro_type]

    def _merge_field_types(self, type1: JsonNode, type2: JsonNode) -> JsonNode | None:
        """Merge two Avro types into a compatible type.

        Returns the merged type, or None if types are incompatible.
        """
        # If types are identical, return as-is
        if type1 == type2:
            return type1

        # Handle null combinations - create nullable type
        if type1 == "null":
            return self._make_nullable(type2)
        if type2 == "null":
            return self._make_nullable(type1)

        # If one is already nullable and other is compatible base type
        if isinstance(type1, list) and "null" in type1:
            non_null_types = [t for t in type1 if t != "null"]
            if len(non_null_types) == 1 and non_null_types[0] == type2:
                return type1
            # Check if type2 is compatible with any non-null type
            for t in non_null_types:
                if t == type2:
                    return type1
                if isinstance(t, dict) and isinstance(type2, dict):
                    if t.get("type") == type2.get("type") == "record":
                        success, merged = self.fold_record_types(t, type2)
                        if success:
                            return ["null", merged]
            # Add type2 to the union
            return type1 + [type2] if type2 not in type1 else type1

        if isinstance(type2, list) and "null" in type2:
            non_null_types = [t for t in type2 if t != "null"]
            if len(non_null_types) == 1 and non_null_types[0] == type1:
                return type2
            # Add type1 to the union
            return type2 + [type1] if type1 not in type2 else type2

        # Both are primitives but different - try to create union
        if isinstance(type1, str) and isinstance(type2, str):
            # Create a nullable union with both types
            return ["null", type1, type2]

        # Both are records - try to fold
        if isinstance(type1, dict) and isinstance(type2, dict):
            if type1.get("type") == type2.get("type") == "record":
                success, merged = self.fold_record_types(type1, type2)
                if success:
                    return merged
            elif type1.get("type") == type2.get("type") == "array":
                # Merge array item types
                items1 = type1.get("items", "string")
                items2 = type2.get("items", "string")
                merged_items = self._merge_field_types(items1, items2)
                if merged_items is not None:
                    return {"type": "array", "items": merged_items}

        return None

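    # Illustrative behaviour of _merge_field_types (hypothetical inputs, not
    # part of the released module):
    #
    #     _merge_field_types("string", "string")       -> "string"
    #     _merge_field_types("null", "string")          -> ["null", "string"]
    #     _merge_field_types("long", "string")          -> ["null", "long", "string"]
    #     _merge_field_types(["null", "long"], "long")  -> ["null", "long"]
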
    def consolidated_type_list(self, type_name: str, python_values: list,
                               type_converter: Callable[[str, Any], JsonNode]) -> List[JsonNode]:
        """Consolidates a list of values into unique types.

        Eliminates duplicate types using tree hashing and attempts to fold
        compatible record types together.

        Args:
            type_name: Base name for generated types
            python_values: List of Python values to analyze
            type_converter: Function to convert Python values to schema types

        Returns:
            List of unique schema types
        """
        list_types = [type_converter(type_name, item) for item in python_values]

        # Eliminate duplicates using tree hashing
        tree_hashes = {}
        for item in list_types:
            tree_hash = get_tree_hash(item)
            if tree_hash.hash_value not in tree_hashes:
                tree_hashes[tree_hash.hash_value] = item
        list_types = list(tree_hashes.values())

        # Try to fold record types together
        unique_types = []
        prior_record = None
        for item in list_types:
            if isinstance(item, dict) and item.get("type") == "record":
                if prior_record is None:
                    prior_record = item
                else:
                    folded, record = self.fold_record_types(prior_record, item)
                    if not folded:
                        unique_types.append(item)
                    else:
                        prior_record = record
            else:
                unique_types.append(item)
        if prior_record is not None:
            unique_types.append(prior_record)

        # Consolidate array and map types
        array_types = [item["items"] for item in unique_types
                       if isinstance(item, dict) and item.get("type") == "array"]
        map_types = [item["values"] for item in unique_types
                     if isinstance(item, dict) and item.get("type") == "map"]
        list_types = [item for item in unique_types
                      if not isinstance(item, dict) or item.get("type") not in ["array", "map"]]

        item_types: List[JsonNode] = []
        for item2 in array_types:
            if isinstance(item2, list):
                item_types.extend(item2)
            else:
                item_types.append(item2)
        if len(item_types) > 0:
            list_types.append({"type": "array", "items": item_types})

        value_types: List[JsonNode] = []
        for item3 in map_types:
            if isinstance(item3, list):
                value_types.extend(item3)
            else:
                value_types.append(item3)
        if len(value_types) > 0:
            list_types.append({"type": "map", "values": value_types})

        return list_types


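# Illustrative sketch (hypothetical sample records, not part of the released
# module): folding two inferred records makes fields that appear in only one
# of them nullable with a null default, while shared fields keep a merged type.
_inferrer = SchemaInferrer()
_rec_a = {"type": "record", "name": "User", "fields": [
    {"name": "id", "type": "long"},
    {"name": "age", "type": "long"}]}
_rec_b = {"type": "record", "name": "User", "fields": [
    {"name": "id", "type": "long"},
    {"name": "email", "type": "string"}]}
_ok, _merged = _inferrer.fold_record_types(_rec_a, _rec_b)
# _ok is True; "id" stays "long", while "age" and "email" become
# ["null", ...] unions with a default of None.

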
class AvroSchemaInferrer(SchemaInferrer):
    """Infers Avro schemas from JSON and XML data."""

    def python_type_to_avro_type(self, type_name: str, python_value: Any) -> JsonNode:
        """Maps Python types to Avro types.

        Args:
            type_name: Name for the type being generated
            python_value: Python value to convert

        Returns:
            Avro schema type
        """
        simple_types = {
            int: "long",  # Use long for safety with large integers
            float: "double",
            str: "string",
            bool: "boolean",
            bytes: "bytes"
        }

        if python_value is None:
            return "null"

        if isinstance(python_value, dict):
            type_name_name = avro_name(type_name.rsplit('.', 1)[-1])
            type_name_namespace = (type_name.rsplit('.', 1)[0]) + "Types" if '.' in type_name else ''
            if self.namespace:
                type_namespace = self.namespace + ('.' if type_name_namespace else '') + type_name_namespace
            else:
                type_namespace = type_name_namespace
            record: Dict[str, JsonNode] = {
                "type": "record",
                "name": type_name_name,
            }
            if type_namespace:
                record["namespace"] = type_namespace
            fields: List[JsonNode] = []
            for key, value in python_value.items():
                original_key = key
                key = avro_name(key)
                field: Dict[str, JsonNode] = {
                    "name": key,
                    "type": self.python_type_to_avro_type(f"{type_name}.{key}", value)
                }
                if original_key != key:
                    field["altnames"] = {self.altnames_key: original_key}
                fields.append(field)
            record["fields"] = fields
            return record

        if isinstance(python_value, list):
            if len(python_value) > 0:
                item_types = self.consolidated_type_list(
                    type_name, python_value, self.python_type_to_avro_type)
            else:
                item_types = ["string"]
            if len(item_types) == 1:
                return {"type": "array", "items": item_types[0]}
            else:
                return {"type": "array", "items": item_types}

        return simple_types.get(type(python_value), "string")

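    # Illustrative mapping (hypothetical input, not part of the released module):
    #
    #     python_type_to_avro_type("Person", {"id": 1, "name": "Ada", "tags": ["a"]})
    #
    # yields, for an inferrer created without a namespace:
    #
    #     {"type": "record", "name": "Person", "fields": [
    #         {"name": "id", "type": "long"},
    #         {"name": "name", "type": "string"},
    #         {"name": "tags", "type": {"type": "array", "items": "string"}}]}
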
    def infer_from_json_values(self, type_name: str, values: List[Any]) -> JsonNode:
        """Infers Avro schema from a list of JSON values.

        Args:
            type_name: Name for the root type
            values: List of parsed JSON values

        Returns:
            Inferred Avro schema
        """
        if not values:
            return "string"

        unique_types = self.consolidated_type_list(
            type_name, values, self.python_type_to_avro_type)

        if len(unique_types) > 1:
            # Try to merge all types into a single compatible type
            merged = unique_types[0]
            for t in unique_types[1:]:
                merged = self._merge_field_types(merged, t)
                if merged is None:
                    # Can't merge - return as union
                    return unique_types
            return merged
        elif len(unique_types) == 1:
            return unique_types[0]
        else:
            return "string"

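    # Illustrative results (hypothetical inputs, not part of the released module):
    #
    #     infer_from_json_values("Value", [1, "a"])
    #         -> ["null", "long", "string"]
    #     infer_from_json_values("Doc", [{"id": 1}, {"id": 2, "ok": True}])
    #         -> a single "Doc" record whose "id" field is "long" and whose
    #            "ok" field is ["null", "boolean"] with a default of None
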
    def infer_from_xml_values(self, type_name: str, xml_strings: List[str]) -> JsonNode:
        """Infers Avro schema from a list of XML strings.

        Args:
            type_name: Name for the root type
            xml_strings: List of XML strings to analyze

        Returns:
            Inferred Avro schema
        """
        xml_structures: List[Dict[str, Any]] = []
        for xml_str in xml_strings:
            try:
                structure = self._parse_xml_to_dict(xml_str)
                if structure:
                    xml_structures.append(structure)
            except ET.ParseError:
                pass

        if not xml_structures:
            return "string"

        unique_types = self.consolidated_type_list(
            type_name, xml_structures, self.python_type_to_avro_type)

        if len(unique_types) > 1:
            # Try to merge all types into a single compatible type
            merged = unique_types[0]
            for t in unique_types[1:]:
                merged = self._merge_field_types(merged, t)
                if merged is None:
                    # Can't merge - return as union
                    return unique_types
            return merged
        elif len(unique_types) == 1:
            return unique_types[0]
        else:
            return "string"

    def _parse_xml_to_dict(self, xml_string: str) -> Dict[str, Any] | None:
        """Parses XML string to a dictionary structure for schema inference."""
        try:
            root = ET.fromstring(xml_string)
            return self._element_to_dict(root)
        except ET.ParseError:
            return None

    def _element_to_dict(self, element: ET.Element) -> Dict[str, Any]:
        """Converts an XML element to a dictionary."""
        result: Dict[str, Any] = {}

        # Handle attributes
        for attr_name, attr_value in element.attrib.items():
            # Strip namespace from attribute name
            attr_name = attr_name.split('}')[-1] if '}' in attr_name else attr_name
            result[f"@{attr_name}"] = attr_value

        # Handle text content
        if element.text and element.text.strip():
            if len(element) == 0 and not element.attrib:
                return element.text.strip()  # type: ignore
            result["#text"] = element.text.strip()

        # Handle child elements
        for child in element:
            child_tag = child.tag.split('}')[-1] if '}' in child.tag else child.tag
            child_dict = self._element_to_dict(child)

            if child_tag in result:
                # Convert to list if multiple children with same tag
                if not isinstance(result[child_tag], list):
                    result[child_tag] = [result[child_tag]]
                result[child_tag].append(child_dict)
            else:
                result[child_tag] = child_dict

        return result


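# Illustrative sketch (hypothetical documents, not part of the released module):
# XML text content is always inferred as strings, while JSON values keep their
# native numeric and boolean types.
_avro_inferrer = AvroSchemaInferrer()
_xml_schema = _avro_inferrer.infer_from_xml_values(
    "Person", ["<person><name>Ada</name><age>36</age></person>"])
# -> {"type": "record", "name": "Person", "fields": [
#        {"name": "name", "type": "string"},
#        {"name": "age", "type": "string"}]}
_json_schema = _avro_inferrer.infer_from_json_values(
    "Order", [{"id": 1, "total": 9.5}, {"id": 2}])
# -> an "Order" record with "id": "long" and "total": ["null", "double"]
#    (default None), because "total" is missing from the second document.

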
class JsonStructureSchemaInferrer(SchemaInferrer):
    """Infers JSON Structure schemas from JSON and XML data."""

    # JSON Structure primitive type mapping
    # Use 'integer' for general integers (accepts native JSON numbers)
    # int64/uint64 etc. are string-encoded for JSON safety with large numbers
    PYTHON_TO_JSTRUCT_TYPES = {
        int: "integer",
        float: "double",
        str: "string",
        bool: "boolean",
        bytes: "binary"
    }

    def __init__(self, namespace: str = '', type_name_prefix: str = '', base_id: str = ''):
        """Initialize the JSON Structure schema inferrer.

        Args:
            namespace: Namespace for generated types
            type_name_prefix: Prefix for generated type names
            base_id: Base URI for $id generation
        """
        super().__init__(namespace, type_name_prefix)
        self.base_id = base_id or 'https://example.com/'
        self.definitions: Dict[str, Any] = {}

    def python_type_to_jstruct_type(self, type_name: str, python_value: Any) -> Dict[str, Any] | str:
        """Maps Python types to JSON Structure types.

        Args:
            type_name: Name for the type being generated
            python_value: Python value to convert

        Returns:
            JSON Structure schema type
        """
        if python_value is None:
            return "null"

        if isinstance(python_value, dict):
            # Generate an object type
            safe_name = avro_name(type_name.rsplit('.', 1)[-1])
            properties: Dict[str, Any] = {}
            required: List[str] = []

            for key, value in python_value.items():
                original_key = key
                safe_key = avro_name(key)
                prop_type = self.python_type_to_jstruct_type(f"{type_name}.{safe_key}", value)

                if isinstance(prop_type, str):
                    properties[safe_key] = {"type": prop_type}
                else:
                    properties[safe_key] = prop_type

                # Add altnames if key was transformed
                if original_key != safe_key:
                    properties[safe_key]["altnames"] = {self.altnames_key: original_key}

                # All inferred properties are required unless null
                if prop_type != "null":
                    required.append(safe_key)

            result: Dict[str, Any] = {
                "type": "object",
                "name": safe_name,
                "properties": properties
            }
            if required:
                result["required"] = required

            return result

        if isinstance(python_value, list):
            if len(python_value) > 0:
                item_types = self.consolidated_jstruct_type_list(
                    type_name, python_value)
                # Simplify single-type arrays
                if len(item_types) == 1:
                    items = item_types[0]
                else:
                    # Use choice for multiple item types
                    items = {"type": "choice", "choices": item_types}
            else:
                items = {"type": "string"}

            if isinstance(items, str):
                return {"type": "array", "items": {"type": items}}
            elif isinstance(items, dict) and "type" not in items:
                return {"type": "array", "items": items}
            else:
                return {"type": "array", "items": items}

        return self.PYTHON_TO_JSTRUCT_TYPES.get(type(python_value), "string")

    def fold_jstruct_record_types(self, base_record: dict, new_record: dict) -> Tuple[bool, dict]:
        """Merges two JSON Structure object types by combining their properties.

        Args:
            base_record: The base object to merge into
            new_record: The new object to merge

        Returns:
            Tuple of (success, merged_object)
        """
        base_props = copy.deepcopy(base_record).get("properties", {})
        new_props = new_record.get("properties", {})
        base_required = set(base_record.get("required", []))
        new_required = set(new_record.get("required", []))

        for prop_name, prop_schema in new_props.items():
            if prop_name not in base_props:
                base_props[prop_name] = prop_schema
                # Property only in some records is not required
            else:
                # Property exists in both - check compatibility
                base_type = base_props[prop_name].get("type") if isinstance(base_props[prop_name], dict) else base_props[prop_name]
                new_type = prop_schema.get("type") if isinstance(prop_schema, dict) else prop_schema

                if base_type != new_type:
                    # Types differ - can't fold simply
                    if base_type == "object" and new_type == "object":
                        # Try to fold nested objects
                        success, merged = self.fold_jstruct_record_types(
                            base_props[prop_name], prop_schema)
                        if success:
                            base_props[prop_name] = merged
                        else:
                            return False, new_record
                    else:
                        return False, new_record

        # Update required - only properties in ALL records are required
        merged_required = base_required & new_required

        base_record["properties"] = base_props
        if merged_required:
            base_record["required"] = list(merged_required)
        elif "required" in base_record:
            del base_record["required"]

        return True, base_record

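    # Illustrative fold (hypothetical objects, not part of the released module):
    # merging an object that requires only "id" with one that requires "id" and
    # "note" succeeds, keeps both properties, and retains only the intersection
    # of the two "required" lists, so "id" stays required and "note" becomes
    # optional.
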
    def consolidated_jstruct_type_list(self, type_name: str, python_values: list) -> List[Any]:
        """Consolidates a list of values into unique JSON Structure types.

        Args:
            type_name: Base name for generated types
            python_values: List of Python values to analyze

        Returns:
            List of unique JSON Structure types
        """
        list_types = [self.python_type_to_jstruct_type(type_name, item) for item in python_values]

        # Eliminate duplicates using tree hashing
        tree_hashes = {}
        for item in list_types:
            tree_hash = get_tree_hash(item)
            if tree_hash.hash_value not in tree_hashes:
                tree_hashes[tree_hash.hash_value] = item
        list_types = list(tree_hashes.values())

        # Try to fold object types together
        unique_types = []
        prior_object = None
        for item in list_types:
            if isinstance(item, dict) and item.get("type") == "object":
                if prior_object is None:
                    prior_object = item
                else:
                    folded, obj = self.fold_jstruct_record_types(prior_object, item)
                    if not folded:
                        unique_types.append(item)
                    else:
                        prior_object = obj
            else:
                unique_types.append(item)
        if prior_object is not None:
            unique_types.append(prior_object)

        # Consolidate array and map types
        array_types = [item.get("items") for item in unique_types
                       if isinstance(item, dict) and item.get("type") == "array"]
        map_types = [item.get("values") for item in unique_types
                     if isinstance(item, dict) and item.get("type") == "map"]
        list_types = [item for item in unique_types
                      if not isinstance(item, dict) or item.get("type") not in ["array", "map"]]

        item_types: List[Any] = []
        for item2 in array_types:
            if isinstance(item2, list):
                item_types.extend(item2)
            elif item2:
                item_types.append(item2)
        if item_types:
            if len(item_types) == 1:
                list_types.append({"type": "array", "items": item_types[0]})
            else:
                list_types.append({"type": "array", "items": {"type": "choice", "choices": item_types}})

        value_types: List[Any] = []
        for item3 in map_types:
            if isinstance(item3, list):
                value_types.extend(item3)
            elif item3:
                value_types.append(item3)
        if value_types:
            if len(value_types) == 1:
                list_types.append({"type": "map", "values": value_types[0]})
            else:
                list_types.append({"type": "map", "values": {"type": "choice", "choices": value_types}})

        return list_types

    def infer_from_json_values(self, type_name: str, values: List[Any]) -> Dict[str, Any]:
        """Infers JSON Structure schema from a list of JSON values.

        Args:
            type_name: Name for the root type
            values: List of parsed JSON values

        Returns:
            Complete JSON Structure schema with $schema and $id
        """
        if not values:
            return self._wrap_schema({"type": "string"}, type_name)

        unique_types = self.consolidated_jstruct_type_list(type_name, values)

        if len(unique_types) > 1:
            # Multiple types -> use choice
            schema = {"type": "choice", "choices": unique_types, "name": avro_name(type_name)}
        elif len(unique_types) == 1:
            schema = unique_types[0]
            if isinstance(schema, str):
                schema = {"type": schema}
            if "name" not in schema:
                schema["name"] = avro_name(type_name)
        else:
            schema = {"type": "string", "name": avro_name(type_name)}

        return self._wrap_schema(schema, type_name)

    def infer_from_xml_values(self, type_name: str, xml_strings: List[str]) -> Dict[str, Any]:
        """Infers JSON Structure schema from a list of XML strings.

        Args:
            type_name: Name for the root type
            xml_strings: List of XML strings to analyze

        Returns:
            Complete JSON Structure schema with $schema and $id
        """
        xml_structures: List[Dict[str, Any]] = []
        for xml_str in xml_strings:
            try:
                structure = self._parse_xml_to_dict(xml_str)
                if structure:
                    xml_structures.append(structure)
            except ET.ParseError:
                pass

        if not xml_structures:
            return self._wrap_schema({"type": "string"}, type_name)

        unique_types = self.consolidated_jstruct_type_list(type_name, xml_structures)

        if len(unique_types) > 1:
            schema = {"type": "choice", "choices": unique_types, "name": avro_name(type_name)}
        elif len(unique_types) == 1:
            schema = unique_types[0]
            if isinstance(schema, str):
                schema = {"type": schema}
            if "name" not in schema:
                schema["name"] = avro_name(type_name)
        else:
            schema = {"type": "string", "name": avro_name(type_name)}

        return self._wrap_schema(schema, type_name)

    def _wrap_schema(self, schema: Dict[str, Any], type_name: str) -> Dict[str, Any]:
        """Wraps a schema with JSON Structure metadata.

        Args:
            schema: The schema body
            type_name: Name for generating $id

        Returns:
            Complete JSON Structure schema
        """
        safe_name = avro_name(type_name)
        schema_id = f"{self.base_id.rstrip('/')}/{safe_name}"

        result = {
            "$schema": "https://json-structure.org/meta/core/v0/#",
            "$id": schema_id,
        }
        result.update(schema)
        return result

    def _parse_xml_to_dict(self, xml_string: str) -> Dict[str, Any] | None:
        """Parses XML string to a dictionary structure for schema inference."""
        try:
            root = ET.fromstring(xml_string)
            return self._element_to_dict(root)
        except ET.ParseError:
            return None

    def _element_to_dict(self, element: ET.Element) -> Dict[str, Any]:
        """Converts an XML element to a dictionary."""
        result: Dict[str, Any] = {}

        # Handle attributes (prefix with @ for XML attributes)
        for attr_name, attr_value in element.attrib.items():
            attr_name = attr_name.split('}')[-1] if '}' in attr_name else attr_name
            result[f"@{attr_name}"] = attr_value

        # Handle text content
        if element.text and element.text.strip():
            if len(element) == 0 and not element.attrib:
                return element.text.strip()  # type: ignore
            result["#text"] = element.text.strip()

        # Handle child elements
        for child in element:
            child_tag = child.tag.split('}')[-1] if '}' in child.tag else child.tag
            child_dict = self._element_to_dict(child)

            if child_tag in result:
                if not isinstance(result[child_tag], list):
                    result[child_tag] = [result[child_tag]]
                result[child_tag].append(child_dict)
            else:
                result[child_tag] = child_dict

        return result


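# Illustrative sketch (hypothetical document, not part of the released module):
# a single JSON object becomes an "object" schema wrapped with $schema and $id.
_jsi = JsonStructureSchemaInferrer(base_id="https://example.com/schemas")
_person_schema = _jsi.infer_from_json_values("Person", [{"name": "Ada", "age": 36}])
# -> {"$schema": "https://json-structure.org/meta/core/v0/#",
#     "$id": "https://example.com/schemas/Person",
#     "type": "object", "name": "Person",
#     "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
#     "required": ["name", "age"]}

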
# Convenience functions for direct use

def infer_avro_schema_from_json(
    json_values: List[Any],
    type_name: str = 'Document',
    namespace: str = ''
) -> JsonNode:
    """Infers Avro schema from JSON values.

    Args:
        json_values: List of parsed JSON values
        type_name: Name for the root type
        namespace: Avro namespace

    Returns:
        Inferred Avro schema
    """
    inferrer = AvroSchemaInferrer(namespace=namespace)
    return inferrer.infer_from_json_values(type_name, json_values)


def infer_avro_schema_from_xml(
    xml_strings: List[str],
    type_name: str = 'Document',
    namespace: str = ''
) -> JsonNode:
    """Infers Avro schema from XML strings.

    Args:
        xml_strings: List of XML strings
        type_name: Name for the root type
        namespace: Avro namespace

    Returns:
        Inferred Avro schema
    """
    inferrer = AvroSchemaInferrer(namespace=namespace)
    return inferrer.infer_from_xml_values(type_name, xml_strings)


def infer_jstruct_schema_from_json(
    json_values: List[Any],
    type_name: str = 'Document',
    base_id: str = 'https://example.com/'
) -> Dict[str, Any]:
    """Infers JSON Structure schema from JSON values.

    Args:
        json_values: List of parsed JSON values
        type_name: Name for the root type
        base_id: Base URI for $id generation

    Returns:
        Complete JSON Structure schema
    """
    inferrer = JsonStructureSchemaInferrer(base_id=base_id)
    return inferrer.infer_from_json_values(type_name, json_values)


def infer_jstruct_schema_from_xml(
    xml_strings: List[str],
    type_name: str = 'Document',
    base_id: str = 'https://example.com/'
) -> Dict[str, Any]:
    """Infers JSON Structure schema from XML strings.

    Args:
        xml_strings: List of XML strings
        type_name: Name for the root type
        base_id: Base URI for $id generation

    Returns:
        Complete JSON Structure schema
    """
    inferrer = JsonStructureSchemaInferrer(base_id=base_id)
    return inferrer.infer_from_xml_values(type_name, xml_strings)
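

if __name__ == "__main__":
    # Minimal smoke test of the convenience wrappers (sample data is
    # hypothetical and not part of the released module).
    _docs = [{"id": 1, "name": "Ada"}, {"id": 2, "name": "Grace", "active": True}]
    print(json.dumps(infer_avro_schema_from_json(_docs, type_name="User",
                                                 namespace="com.example"), indent=2))
    print(json.dumps(infer_jstruct_schema_from_json(_docs, type_name="User"), indent=2))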