structurize-3.0.2-py3-none-any.whl → structurize-3.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- avrotize/_version.py +3 -3
- avrotize/avrotize.py +4 -0
- avrotize/avrotots.py +62 -7
- avrotize/avrovalidator.py +518 -0
- avrotize/commands.json +466 -0
- avrotize/dependencies/typescript/node22/package.json +1 -1
- avrotize/jsontoschema.py +151 -0
- avrotize/schema_inference.py +825 -0
- avrotize/sqltoavro.py +1159 -0
- avrotize/validate.py +242 -0
- avrotize/xmltoschema.py +122 -0
- {structurize-3.0.2.dist-info → structurize-3.1.1.dist-info}/METADATA +1 -1
- {structurize-3.0.2.dist-info → structurize-3.1.1.dist-info}/RECORD +17 -11
- {structurize-3.0.2.dist-info → structurize-3.1.1.dist-info}/WHEEL +1 -1
- {structurize-3.0.2.dist-info → structurize-3.1.1.dist-info}/entry_points.txt +0 -0
- {structurize-3.0.2.dist-info → structurize-3.1.1.dist-info}/licenses/LICENSE +0 -0
- {structurize-3.0.2.dist-info → structurize-3.1.1.dist-info}/top_level.txt +0 -0
avrotize/schema_inference.py
@@ -0,0 +1,825 @@
"""Shared schema inference logic for JSON and XML data.
|
|
2
|
+
|
|
3
|
+
This module provides the core inference logic used by:
|
|
4
|
+
- json2a/json2s: Infer schema from JSON files
|
|
5
|
+
- xml2a/xml2s: Infer schema from XML files
|
|
6
|
+
- sql2a: Infer schema for JSON/XML columns in databases
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import copy
|
|
10
|
+
import json
|
|
11
|
+
import xml.etree.ElementTree as ET
|
|
12
|
+
from typing import Any, Dict, List, Tuple, Callable
|
|
13
|
+
|
|
14
|
+
from avrotize.common import avro_name, get_tree_hash
|
|
15
|
+
|
|
16
|
+
JsonNode = Dict[str, 'JsonNode'] | List['JsonNode'] | str | bool | int | float | None
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class SchemaInferrer:
|
|
20
|
+
"""Base class for schema inference from JSON and XML data."""
|
|
21
|
+
|
|
22
|
+
def __init__(self, namespace: str = '', type_name_prefix: str = '', altnames_key: str = 'json'):
|
|
23
|
+
"""Initialize the schema inferrer.
|
|
24
|
+
|
|
25
|
+
Args:
|
|
26
|
+
namespace: Namespace for generated types (Avro) or $id base (JSON Structure)
|
|
27
|
+
type_name_prefix: Prefix for generated type names
|
|
28
|
+
altnames_key: Key to use for altnames mapping (e.g., 'json', 'sql', 'xml')
|
|
29
|
+
"""
|
|
30
|
+
self.namespace = namespace
|
|
31
|
+
self.type_name_prefix = type_name_prefix
|
|
32
|
+
self.altnames_key = altnames_key
|
|
33
|
+
self.generated_types: List[str] = []
|
|
34
|
+
|
|
35
|
+
def fold_record_types(self, base_record: dict, new_record: dict) -> Tuple[bool, dict]:
|
|
36
|
+
"""Merges two record types by combining their fields.
|
|
37
|
+
|
|
38
|
+
When two records have overlapping fields with compatible types, they
|
|
39
|
+
are folded into a single record with all fields. Fields that don't
|
|
40
|
+
appear in all records become optional (nullable with null default).
|
|
41
|
+
|
|
42
|
+
Args:
|
|
43
|
+
base_record: The base record to merge into
|
|
44
|
+
new_record: The new record to merge
|
|
45
|
+
|
|
46
|
+
Returns:
|
|
47
|
+
Tuple of (success, merged_record). If folding fails due to
|
|
48
|
+
incompatible types, returns (False, new_record).
|
|
49
|
+
"""
|
|
50
|
+
base_fields = copy.deepcopy(base_record).get("fields", [])
|
|
51
|
+
new_fields = new_record.get("fields", [])
|
|
52
|
+
|
|
53
|
+
# Track field names present in each record
|
|
54
|
+
base_field_names = {f["name"] for f in base_fields}
|
|
55
|
+
new_field_names = {f["name"] for f in new_fields}
|
|
56
|
+
|
|
57
|
+
# Process fields from the new record
|
|
58
|
+
for field in new_fields:
|
|
59
|
+
base_field = next(
|
|
60
|
+
(f for f in base_fields if f["name"] == field["name"]), None)
|
|
61
|
+
if not base_field:
|
|
62
|
+
# Field only in new record - add it as nullable
|
|
63
|
+
new_field = copy.deepcopy(field)
|
|
64
|
+
new_field["type"] = self._make_nullable(new_field["type"])
|
|
65
|
+
new_field["default"] = None
|
|
66
|
+
base_fields.append(new_field)
|
|
67
|
+
else:
|
|
68
|
+
# Field in both records - merge types
|
|
69
|
+
merged_type = self._merge_field_types(base_field["type"], field["type"])
|
|
70
|
+
if merged_type is None:
|
|
71
|
+
return False, new_record
|
|
72
|
+
base_field["type"] = merged_type
|
|
73
|
+
|
|
74
|
+
# Make fields that are only in base record nullable
|
|
75
|
+
for base_field in base_fields:
|
|
76
|
+
if base_field["name"] not in new_field_names and base_field["name"] in base_field_names:
|
|
77
|
+
if not self._is_nullable(base_field["type"]):
|
|
78
|
+
base_field["type"] = self._make_nullable(base_field["type"])
|
|
79
|
+
base_field["default"] = None
|
|
80
|
+
|
|
81
|
+
base_record["fields"] = base_fields
|
|
82
|
+
return True, base_record
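
Note (editor's illustration, not part of the package diff): folding two
records inferred from different samples keeps the shared fields and makes
the non-shared ones optional. A minimal sketch against the class above:

    inferrer = SchemaInferrer()
    ok, merged = inferrer.fold_record_types(
        {"type": "record", "name": "Doc", "fields": [
            {"name": "id", "type": "long"},
            {"name": "name", "type": "string"}]},
        {"type": "record", "name": "Doc", "fields": [
            {"name": "id", "type": "long"},
            {"name": "email", "type": "string"}]})
    # ok is True; "id" stays "long", while "name" and "email" each become
    # ["null", "string"] with "default": None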

    def _is_nullable(self, avro_type: JsonNode) -> bool:
        """Check if an Avro type is nullable (contains null in union)."""
        if avro_type == "null":
            return True
        if isinstance(avro_type, list):
            return "null" in avro_type
        return False

    def _make_nullable(self, avro_type: JsonNode) -> JsonNode:
        """Make an Avro type nullable by wrapping in union with null."""
        if self._is_nullable(avro_type):
            return avro_type
        if avro_type == "null":
            return "null"
        if isinstance(avro_type, list):
            # Already a union, add null if not present
            if "null" not in avro_type:
                return ["null"] + list(avro_type)
            return avro_type
        # Wrap in union with null first (for Avro default null)
        return ["null", avro_type]

    def _merge_field_types(self, type1: JsonNode, type2: JsonNode) -> JsonNode | None:
        """Merge two Avro types into a compatible type.

        Returns the merged type, or None if types are incompatible.
        """
        # If types are identical, return as-is
        if type1 == type2:
            return type1

        # Handle null combinations - create nullable type
        if type1 == "null":
            return self._make_nullable(type2)
        if type2 == "null":
            return self._make_nullable(type1)

        # If one is already nullable and other is compatible base type
        if isinstance(type1, list) and "null" in type1:
            non_null_types = [t for t in type1 if t != "null"]
            if len(non_null_types) == 1 and non_null_types[0] == type2:
                return type1
            # Check if type2 is compatible with any non-null type
            for t in non_null_types:
                if t == type2:
                    return type1
                if isinstance(t, dict) and isinstance(type2, dict):
                    if t.get("type") == type2.get("type") == "record":
                        success, merged = self.fold_record_types(t, type2)
                        if success:
                            return ["null", merged]
            # Add type2 to the union
            return type1 + [type2] if type2 not in type1 else type1

        if isinstance(type2, list) and "null" in type2:
            non_null_types = [t for t in type2 if t != "null"]
            if len(non_null_types) == 1 and non_null_types[0] == type1:
                return type2
            # Add type1 to the union
            return type2 + [type1] if type1 not in type2 else type2

        # Both are primitives but different - try to create union
        if isinstance(type1, str) and isinstance(type2, str):
            # Create a nullable union with both types
            return ["null", type1, type2]

        # Both are records - try to fold
        if isinstance(type1, dict) and isinstance(type2, dict):
            if type1.get("type") == type2.get("type") == "record":
                success, merged = self.fold_record_types(type1, type2)
                if success:
                    return merged
            elif type1.get("type") == type2.get("type") == "array":
                # Merge array item types
                items1 = type1.get("items", "string")
                items2 = type2.get("items", "string")
                merged_items = self._merge_field_types(items1, items2)
                if merged_items is not None:
                    return {"type": "array", "items": merged_items}

        return None
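
Note (editor's illustration, not part of the package diff): the merge rules
above, in summary:

    m = SchemaInferrer()
    m._merge_field_types("string", "string")            # -> "string"
    m._merge_field_types("null", "string")              # -> ["null", "string"]
    m._merge_field_types(["null", "string"], "string")  # -> ["null", "string"]
    m._merge_field_types("long", "string")              # -> ["null", "long", "string"]
    m._merge_field_types({"type": "array", "items": "long"},
                         {"type": "array", "items": "string"})
    # -> {"type": "array", "items": ["null", "long", "string"]}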

    def consolidated_type_list(self, type_name: str, python_values: list,
                               type_converter: Callable[[str, Any], JsonNode]) -> List[JsonNode]:
        """Consolidates a list of values into unique types.

        Eliminates duplicate types using tree hashing and attempts to fold
        compatible record types together.

        Args:
            type_name: Base name for generated types
            python_values: List of Python values to analyze
            type_converter: Function to convert Python values to schema types

        Returns:
            List of unique schema types
        """
        list_types = [type_converter(type_name, item) for item in python_values]

        # Eliminate duplicates using tree hashing
        tree_hashes = {}
        for item in list_types:
            tree_hash = get_tree_hash(item)
            if tree_hash.hash_value not in tree_hashes:
                tree_hashes[tree_hash.hash_value] = item
        list_types = list(tree_hashes.values())

        # Try to fold record types together
        unique_types = []
        prior_record = None
        for item in list_types:
            if isinstance(item, dict) and item.get("type") == "record":
                if prior_record is None:
                    prior_record = item
                else:
                    folded, record = self.fold_record_types(prior_record, item)
                    if not folded:
                        unique_types.append(item)
                    else:
                        prior_record = record
            else:
                unique_types.append(item)
        if prior_record is not None:
            unique_types.append(prior_record)

        # Consolidate array and map types
        array_types = [item["items"] for item in unique_types
                       if isinstance(item, dict) and item.get("type") == "array"]
        map_types = [item["values"] for item in unique_types
                     if isinstance(item, dict) and item.get("type") == "map"]
        list_types = [item for item in unique_types
                      if not isinstance(item, dict) or item.get("type") not in ["array", "map"]]

        item_types: List[JsonNode] = []
        for item2 in array_types:
            if isinstance(item2, list):
                item_types.extend(item2)
            else:
                item_types.append(item2)
        if len(item_types) > 0:
            list_types.append({"type": "array", "items": item_types})

        value_types: List[JsonNode] = []
        for item3 in map_types:
            if isinstance(item3, list):
                value_types.extend(item3)
            else:
                value_types.append(item3)
        if len(value_types) > 0:
            list_types.append({"type": "map", "values": value_types})

        return list_types
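
Note (editor's illustration, not part of the package diff): using the
AvroSchemaInferrer defined below as the type converter, three samples
collapse to two unique types; the two records fold into one:

    inf = AvroSchemaInferrer()
    inf.consolidated_type_list(
        "Doc", [{"id": 1}, {"id": 2, "tag": "x"}, "plain text"],
        inf.python_type_to_avro_type)
    # -> ["string",
    #     {"type": "record", "name": "Doc", "fields": [
    #         {"name": "id", "type": "long"},
    #         {"name": "tag", "type": ["null", "string"], "default": None}]}]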


class AvroSchemaInferrer(SchemaInferrer):
    """Infers Avro schemas from JSON and XML data."""

    def python_type_to_avro_type(self, type_name: str, python_value: Any) -> JsonNode:
        """Maps Python types to Avro types.

        Args:
            type_name: Name for the type being generated
            python_value: Python value to convert

        Returns:
            Avro schema type
        """
        simple_types = {
            int: "long",  # Use long for safety with large integers
            float: "double",
            str: "string",
            bool: "boolean",
            bytes: "bytes"
        }

        if python_value is None:
            return "null"

        if isinstance(python_value, dict):
            type_name_name = avro_name(type_name.rsplit('.', 1)[-1])
            type_name_namespace = (type_name.rsplit('.', 1)[0]) + "Types" if '.' in type_name else ''
            if self.namespace:
                type_namespace = self.namespace + ('.' if type_name_namespace else '') + type_name_namespace
            else:
                type_namespace = type_name_namespace
            record: Dict[str, JsonNode] = {
                "type": "record",
                "name": type_name_name,
            }
            if type_namespace:
                record["namespace"] = type_namespace
            fields: List[JsonNode] = []
            for key, value in python_value.items():
                original_key = key
                key = avro_name(key)
                field: Dict[str, JsonNode] = {
                    "name": key,
                    "type": self.python_type_to_avro_type(f"{type_name}.{key}", value)
                }
                if original_key != key:
                    field["altnames"] = {self.altnames_key: original_key}
                fields.append(field)
            record["fields"] = fields
            return record

        if isinstance(python_value, list):
            if len(python_value) > 0:
                item_types = self.consolidated_type_list(
                    type_name, python_value, self.python_type_to_avro_type)
            else:
                item_types = ["string"]
            if len(item_types) == 1:
                return {"type": "array", "items": item_types[0]}
            else:
                return {"type": "array", "items": item_types}

        return simple_types.get(type(python_value), "string")
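
Note (editor's illustration, not part of the package diff; this assumes
avrotize.common.avro_name maps "order-id" to "order_id"):

    inf = AvroSchemaInferrer(namespace="com.example")
    inf.python_type_to_avro_type("Order", {"order-id": 7, "items": ["a", "b"]})
    # -> {"type": "record", "name": "Order", "namespace": "com.example",
    #     "fields": [
    #         {"name": "order_id", "type": "long",
    #          "altnames": {"json": "order-id"}},
    #         {"name": "items", "type": {"type": "array", "items": "string"}}]}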

    def infer_from_json_values(self, type_name: str, values: List[Any]) -> JsonNode:
        """Infers Avro schema from a list of JSON values.

        Args:
            type_name: Name for the root type
            values: List of parsed JSON values

        Returns:
            Inferred Avro schema
        """
        if not values:
            return "string"

        unique_types = self.consolidated_type_list(
            type_name, values, self.python_type_to_avro_type)

        if len(unique_types) > 1:
            # Try to merge all types into a single compatible type
            merged = unique_types[0]
            for t in unique_types[1:]:
                merged = self._merge_field_types(merged, t)
                if merged is None:
                    # Can't merge - return as union
                    return unique_types
            return merged
        elif len(unique_types) == 1:
            return unique_types[0]
        else:
            return "string"

    def infer_from_xml_values(self, type_name: str, xml_strings: List[str]) -> JsonNode:
        """Infers Avro schema from a list of XML strings.

        Args:
            type_name: Name for the root type
            xml_strings: List of XML strings to analyze

        Returns:
            Inferred Avro schema
        """
        xml_structures: List[Dict[str, Any]] = []
        for xml_str in xml_strings:
            try:
                structure = self._parse_xml_to_dict(xml_str)
                if structure:
                    xml_structures.append(structure)
            except ET.ParseError:
                pass

        if not xml_structures:
            return "string"

        unique_types = self.consolidated_type_list(
            type_name, xml_structures, self.python_type_to_avro_type)

        if len(unique_types) > 1:
            # Try to merge all types into a single compatible type
            merged = unique_types[0]
            for t in unique_types[1:]:
                merged = self._merge_field_types(merged, t)
                if merged is None:
                    # Can't merge - return as union
                    return unique_types
            return merged
        elif len(unique_types) == 1:
            return unique_types[0]
        else:
            return "string"

    def _parse_xml_to_dict(self, xml_string: str) -> Dict[str, Any] | None:
        """Parses XML string to a dictionary structure for schema inference."""
        try:
            root = ET.fromstring(xml_string)
            return self._element_to_dict(root)
        except ET.ParseError:
            return None

    def _element_to_dict(self, element: ET.Element) -> Dict[str, Any]:
        """Converts an XML element to a dictionary."""
        result: Dict[str, Any] = {}

        # Handle attributes
        for attr_name, attr_value in element.attrib.items():
            # Strip namespace from attribute name
            attr_name = attr_name.split('}')[-1] if '}' in attr_name else attr_name
            result[f"@{attr_name}"] = attr_value

        # Handle text content
        if element.text and element.text.strip():
            if len(element) == 0 and not element.attrib:
                return element.text.strip()  # type: ignore
            result["#text"] = element.text.strip()

        # Handle child elements
        for child in element:
            child_tag = child.tag.split('}')[-1] if '}' in child.tag else child.tag
            child_dict = self._element_to_dict(child)

            if child_tag in result:
                # Convert to list if multiple children with same tag
                if not isinstance(result[child_tag], list):
                    result[child_tag] = [result[child_tag]]
                result[child_tag].append(child_dict)
            else:
                result[child_tag] = child_dict

        return result
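
Note (editor's illustration, not part of the package diff): the dict shape
produced for XML input, with attributes prefixed by "@" and repeated
children collected into lists:

    el = ET.fromstring('<order id="7"><item>a</item><item>b</item></order>')
    AvroSchemaInferrer()._element_to_dict(el)
    # -> {"@id": "7", "item": ["a", "b"]}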


class JsonStructureSchemaInferrer(SchemaInferrer):
    """Infers JSON Structure schemas from JSON and XML data."""

    # JSON Structure primitive type mapping
    # Use 'integer' for general integers (accepts native JSON numbers)
    # int64/uint64 etc. are string-encoded for JSON safety with large numbers
    PYTHON_TO_JSTRUCT_TYPES = {
        int: "integer",
        float: "double",
        str: "string",
        bool: "boolean",
        bytes: "binary"
    }

    def __init__(self, namespace: str = '', type_name_prefix: str = '', base_id: str = ''):
        """Initialize the JSON Structure schema inferrer.

        Args:
            namespace: Namespace for generated types
            type_name_prefix: Prefix for generated type names
            base_id: Base URI for $id generation
        """
        super().__init__(namespace, type_name_prefix)
        self.base_id = base_id or 'https://example.com/'
        self.definitions: Dict[str, Any] = {}

    def python_type_to_jstruct_type(self, type_name: str, python_value: Any) -> Dict[str, Any] | str:
        """Maps Python types to JSON Structure types.

        Args:
            type_name: Name for the type being generated
            python_value: Python value to convert

        Returns:
            JSON Structure schema type
        """
        if python_value is None:
            return "null"

        if isinstance(python_value, dict):
            # Generate an object type
            safe_name = avro_name(type_name.rsplit('.', 1)[-1])
            properties: Dict[str, Any] = {}
            required: List[str] = []

            for key, value in python_value.items():
                original_key = key
                safe_key = avro_name(key)
                prop_type = self.python_type_to_jstruct_type(f"{type_name}.{safe_key}", value)

                if isinstance(prop_type, str):
                    properties[safe_key] = {"type": prop_type}
                else:
                    properties[safe_key] = prop_type

                # Add altnames if key was transformed
                if original_key != safe_key:
                    properties[safe_key]["altnames"] = {self.altnames_key: original_key}

                # All inferred properties are required unless null
                if prop_type != "null":
                    required.append(safe_key)

            result: Dict[str, Any] = {
                "type": "object",
                "name": safe_name,
                "properties": properties
            }
            if required:
                result["required"] = required

            return result

        if isinstance(python_value, list):
            if len(python_value) > 0:
                item_types = self.consolidated_jstruct_type_list(
                    type_name, python_value)
                # Simplify single-type arrays
                if len(item_types) == 1:
                    items = item_types[0]
                else:
                    # Use choice for multiple item types
                    items = {"type": "choice", "choices": item_types}
            else:
                items = {"type": "string"}

            if isinstance(items, str):
                return {"type": "array", "items": {"type": items}}
            elif isinstance(items, dict) and "type" not in items:
                return {"type": "array", "items": items}
            else:
                return {"type": "array", "items": items}

        return self.PYTHON_TO_JSTRUCT_TYPES.get(type(python_value), "string")
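
Note (editor's illustration, not part of the package diff): the same kind of
value under the JSON Structure mapping, with inline property schemas and a
required list:

    ji = JsonStructureSchemaInferrer()
    ji.python_type_to_jstruct_type("Order", {"id": 7, "tags": ["a", "b"]})
    # -> {"type": "object", "name": "Order",
    #     "properties": {
    #         "id": {"type": "integer"},
    #         "tags": {"type": "array", "items": {"type": "string"}}},
    #     "required": ["id", "tags"]}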

    def fold_jstruct_record_types(self, base_record: dict, new_record: dict) -> Tuple[bool, dict]:
        """Merges two JSON Structure object types by combining their properties.

        Args:
            base_record: The base object to merge into
            new_record: The new object to merge

        Returns:
            Tuple of (success, merged_object)
        """
        base_props = copy.deepcopy(base_record).get("properties", {})
        new_props = new_record.get("properties", {})
        base_required = set(base_record.get("required", []))
        new_required = set(new_record.get("required", []))

        for prop_name, prop_schema in new_props.items():
            if prop_name not in base_props:
                base_props[prop_name] = prop_schema
                # Property only in some records is not required
            else:
                # Property exists in both - check compatibility
                base_type = base_props[prop_name].get("type") if isinstance(base_props[prop_name], dict) else base_props[prop_name]
                new_type = prop_schema.get("type") if isinstance(prop_schema, dict) else prop_schema

                if base_type != new_type:
                    # Types differ - can't fold simply
                    if base_type == "object" and new_type == "object":
                        # Try to fold nested objects
                        success, merged = self.fold_jstruct_record_types(
                            base_props[prop_name], prop_schema)
                        if success:
                            base_props[prop_name] = merged
                        else:
                            return False, new_record
                    else:
                        return False, new_record

        # Update required - only properties in ALL records are required
        merged_required = base_required & new_required

        base_record["properties"] = base_props
        if merged_required:
            base_record["required"] = list(merged_required)
        elif "required" in base_record:
            del base_record["required"]

        return True, base_record
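
Note (editor's illustration, not part of the package diff): when objects
fold, the required sets are intersected, so properties missing from any
sample become optional:

    ji = JsonStructureSchemaInferrer()
    ok, merged = ji.fold_jstruct_record_types(
        {"type": "object", "name": "Doc", "required": ["id", "name"],
         "properties": {"id": {"type": "integer"},
                        "name": {"type": "string"}}},
        {"type": "object", "name": "Doc", "required": ["id"],
         "properties": {"id": {"type": "integer"}}})
    # ok is True; merged keeps both properties, "required" shrinks to ["id"]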

    def consolidated_jstruct_type_list(self, type_name: str, python_values: list) -> List[Any]:
        """Consolidates a list of values into unique JSON Structure types.

        Args:
            type_name: Base name for generated types
            python_values: List of Python values to analyze

        Returns:
            List of unique JSON Structure types
        """
        list_types = [self.python_type_to_jstruct_type(type_name, item) for item in python_values]

        # Eliminate duplicates using tree hashing
        tree_hashes = {}
        for item in list_types:
            tree_hash = get_tree_hash(item)
            if tree_hash.hash_value not in tree_hashes:
                tree_hashes[tree_hash.hash_value] = item
        list_types = list(tree_hashes.values())

        # Try to fold object types together
        unique_types = []
        prior_object = None
        for item in list_types:
            if isinstance(item, dict) and item.get("type") == "object":
                if prior_object is None:
                    prior_object = item
                else:
                    folded, obj = self.fold_jstruct_record_types(prior_object, item)
                    if not folded:
                        unique_types.append(item)
                    else:
                        prior_object = obj
            else:
                unique_types.append(item)
        if prior_object is not None:
            unique_types.append(prior_object)

        # Consolidate array and map types
        array_types = [item.get("items") for item in unique_types
                       if isinstance(item, dict) and item.get("type") == "array"]
        map_types = [item.get("values") for item in unique_types
                     if isinstance(item, dict) and item.get("type") == "map"]
        list_types = [item for item in unique_types
                      if not isinstance(item, dict) or item.get("type") not in ["array", "map"]]

        item_types: List[Any] = []
        for item2 in array_types:
            if isinstance(item2, list):
                item_types.extend(item2)
            elif item2:
                item_types.append(item2)
        if item_types:
            if len(item_types) == 1:
                list_types.append({"type": "array", "items": item_types[0]})
            else:
                list_types.append({"type": "array", "items": {"type": "choice", "choices": item_types}})

        value_types: List[Any] = []
        for item3 in map_types:
            if isinstance(item3, list):
                value_types.extend(item3)
            elif item3:
                value_types.append(item3)
        if value_types:
            if len(value_types) == 1:
                list_types.append({"type": "map", "values": value_types[0]})
            else:
                list_types.append({"type": "map", "values": {"type": "choice", "choices": value_types}})

        return list_types

    def infer_from_json_values(self, type_name: str, values: List[Any]) -> Dict[str, Any]:
        """Infers JSON Structure schema from a list of JSON values.

        Args:
            type_name: Name for the root type
            values: List of parsed JSON values

        Returns:
            Complete JSON Structure schema with $schema and $id
        """
        if not values:
            return self._wrap_schema({"type": "string"}, type_name)

        unique_types = self.consolidated_jstruct_type_list(type_name, values)

        if len(unique_types) > 1:
            # Multiple types -> use choice
            schema = {"type": "choice", "choices": unique_types, "name": avro_name(type_name)}
        elif len(unique_types) == 1:
            schema = unique_types[0]
            if isinstance(schema, str):
                schema = {"type": schema}
            if "name" not in schema:
                schema["name"] = avro_name(type_name)
        else:
            schema = {"type": "string", "name": avro_name(type_name)}

        return self._wrap_schema(schema, type_name)

    def infer_from_xml_values(self, type_name: str, xml_strings: List[str]) -> Dict[str, Any]:
        """Infers JSON Structure schema from a list of XML strings.

        Args:
            type_name: Name for the root type
            xml_strings: List of XML strings to analyze

        Returns:
            Complete JSON Structure schema with $schema and $id
        """
        xml_structures: List[Dict[str, Any]] = []
        for xml_str in xml_strings:
            try:
                structure = self._parse_xml_to_dict(xml_str)
                if structure:
                    xml_structures.append(structure)
            except ET.ParseError:
                pass

        if not xml_structures:
            return self._wrap_schema({"type": "string"}, type_name)

        unique_types = self.consolidated_jstruct_type_list(type_name, xml_structures)

        if len(unique_types) > 1:
            schema = {"type": "choice", "choices": unique_types, "name": avro_name(type_name)}
        elif len(unique_types) == 1:
            schema = unique_types[0]
            if isinstance(schema, str):
                schema = {"type": schema}
            if "name" not in schema:
                schema["name"] = avro_name(type_name)
        else:
            schema = {"type": "string", "name": avro_name(type_name)}

        return self._wrap_schema(schema, type_name)

    def _wrap_schema(self, schema: Dict[str, Any], type_name: str) -> Dict[str, Any]:
        """Wraps a schema with JSON Structure metadata.

        Args:
            schema: The schema body
            type_name: Name for generating $id

        Returns:
            Complete JSON Structure schema
        """
        safe_name = avro_name(type_name)
        schema_id = f"{self.base_id.rstrip('/')}/{safe_name}"

        result = {
            "$schema": "https://json-structure.org/meta/core/v0/#",
            "$id": schema_id,
        }
        result.update(schema)
        return result

    def _parse_xml_to_dict(self, xml_string: str) -> Dict[str, Any] | None:
        """Parses XML string to a dictionary structure for schema inference."""
        try:
            root = ET.fromstring(xml_string)
            return self._element_to_dict(root)
        except ET.ParseError:
            return None

    def _element_to_dict(self, element: ET.Element) -> Dict[str, Any]:
        """Converts an XML element to a dictionary."""
        result: Dict[str, Any] = {}

        # Handle attributes (prefix with @ for XML attributes)
        for attr_name, attr_value in element.attrib.items():
            attr_name = attr_name.split('}')[-1] if '}' in attr_name else attr_name
            result[f"@{attr_name}"] = attr_value

        # Handle text content
        if element.text and element.text.strip():
            if len(element) == 0 and not element.attrib:
                return element.text.strip()  # type: ignore
            result["#text"] = element.text.strip()

        # Handle child elements
        for child in element:
            child_tag = child.tag.split('}')[-1] if '}' in child.tag else child.tag
            child_dict = self._element_to_dict(child)

            if child_tag in result:
                if not isinstance(result[child_tag], list):
                    result[child_tag] = [result[child_tag]]
                result[child_tag].append(child_dict)
            else:
                result[child_tag] = child_dict

        return result
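
Note (editor's illustration, not part of the package diff): the envelope
produced by _wrap_schema:

    ji = JsonStructureSchemaInferrer(base_id="https://example.com/schemas")
    ji._wrap_schema({"type": "string", "name": "Note"}, "Note")
    # -> {"$schema": "https://json-structure.org/meta/core/v0/#",
    #     "$id": "https://example.com/schemas/Note",
    #     "type": "string", "name": "Note"}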


# Convenience functions for direct use

def infer_avro_schema_from_json(
    json_values: List[Any],
    type_name: str = 'Document',
    namespace: str = ''
) -> JsonNode:
    """Infers Avro schema from JSON values.

    Args:
        json_values: List of parsed JSON values
        type_name: Name for the root type
        namespace: Avro namespace

    Returns:
        Inferred Avro schema
    """
    inferrer = AvroSchemaInferrer(namespace=namespace)
    return inferrer.infer_from_json_values(type_name, json_values)


def infer_avro_schema_from_xml(
    xml_strings: List[str],
    type_name: str = 'Document',
    namespace: str = ''
) -> JsonNode:
    """Infers Avro schema from XML strings.

    Args:
        xml_strings: List of XML strings
        type_name: Name for the root type
        namespace: Avro namespace

    Returns:
        Inferred Avro schema
    """
    inferrer = AvroSchemaInferrer(namespace=namespace)
    return inferrer.infer_from_xml_values(type_name, xml_strings)


def infer_jstruct_schema_from_json(
    json_values: List[Any],
    type_name: str = 'Document',
    base_id: str = 'https://example.com/'
) -> Dict[str, Any]:
    """Infers JSON Structure schema from JSON values.

    Args:
        json_values: List of parsed JSON values
        type_name: Name for the root type
        base_id: Base URI for $id generation

    Returns:
        Complete JSON Structure schema
    """
    inferrer = JsonStructureSchemaInferrer(base_id=base_id)
    return inferrer.infer_from_json_values(type_name, json_values)


def infer_jstruct_schema_from_xml(
    xml_strings: List[str],
    type_name: str = 'Document',
    base_id: str = 'https://example.com/'
) -> Dict[str, Any]:
    """Infers JSON Structure schema from XML strings.

    Args:
        xml_strings: List of XML strings
        type_name: Name for the root type
        base_id: Base URI for $id generation

    Returns:
        Complete JSON Structure schema
    """
    inferrer = JsonStructureSchemaInferrer(base_id=base_id)
    return inferrer.infer_from_xml_values(type_name, xml_strings)
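
Note (editor's illustration, not part of the package diff; the input file
name is hypothetical): end-to-end use of the module-level helpers:

    import json
    from avrotize.schema_inference import (
        infer_avro_schema_from_json, infer_jstruct_schema_from_json)

    with open("samples.jsonl", encoding="utf-8") as f:
        docs = [json.loads(line) for line in f if line.strip()]

    avro_schema = infer_avro_schema_from_json(
        docs, type_name="Sample", namespace="com.example")
    jstruct_schema = infer_jstruct_schema_from_json(
        docs, type_name="Sample", base_id="https://example.com/")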