avrotize 3.0.1__py3-none-any.whl → 3.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- avrotize/_version.py +2 -2
- avrotize/avrotize.py +4 -0
- avrotize/avrotoiceberg.py +111 -13
- avrotize/avrotots/class_core.ts.jinja +2 -2
- avrotize/avrotots.py +62 -7
- avrotize/avrovalidator.py +518 -0
- avrotize/commands.json +485 -2
- avrotize/dependencies/cs/net90/dependencies.csproj +4 -4
- avrotize/dependencies/java/jdk21/pom.xml +6 -6
- avrotize/dependencies/typescript/node22/package.json +1 -1
- avrotize/jsontoschema.py +151 -0
- avrotize/schema_inference.py +825 -0
- avrotize/sqltoavro.py +1159 -0
- avrotize/structuretodb.py +1 -1
- avrotize/structuretoiceberg.py +113 -13
- avrotize/validate.py +242 -0
- avrotize/xmltoschema.py +122 -0
- {avrotize-3.0.1.dist-info → avrotize-3.1.0.dist-info}/METADATA +224 -4
- {avrotize-3.0.1.dist-info → avrotize-3.1.0.dist-info}/RECORD +22 -16
- {avrotize-3.0.1.dist-info → avrotize-3.1.0.dist-info}/WHEEL +0 -0
- {avrotize-3.0.1.dist-info → avrotize-3.1.0.dist-info}/entry_points.txt +0 -0
- {avrotize-3.0.1.dist-info → avrotize-3.1.0.dist-info}/licenses/LICENSE +0 -0
avrotize/structuretodb.py
CHANGED
|
@@ -426,7 +426,7 @@ def structure_type_to_sql_type(structure_type: Any, dialect: str) -> str:
|
|
|
426
426
|
type_map["sqlanywhere"] = type_map["sqlserver"].copy()
|
|
427
427
|
type_map["bigquery"] = {k: v.replace("VARCHAR", "STRING").replace("JSONB", "STRING").replace("BYTEA", "BYTES") for k, v in type_map["postgres"].items()}
|
|
428
428
|
type_map["snowflake"] = {k: v.replace("JSONB", "VARIANT").replace("BYTEA", "BINARY") for k, v in type_map["postgres"].items()}
|
|
429
|
-
type_map["redshift"] = type_map["postgres"].
|
|
429
|
+
type_map["redshift"] = {k: v.replace("JSONB", "SUPER").replace("BYTEA", "VARBYTE") for k, v in type_map["postgres"].items()}
|
|
430
430
|
|
|
431
431
|
# Handle type resolution
|
|
432
432
|
if isinstance(structure_type, str):
|
avrotize/structuretoiceberg.py
CHANGED
|
@@ -5,6 +5,7 @@ import sys
|
|
|
5
5
|
from typing import Dict, List, Any, Optional
|
|
6
6
|
import pyarrow as pa
|
|
7
7
|
from pyiceberg.schema import Schema, NestedField
|
|
8
|
+
from pyiceberg.io.pyarrow import PyArrowFileIO, schema_to_pyarrow
|
|
8
9
|
from pyiceberg.types import (
|
|
9
10
|
BooleanType,
|
|
10
11
|
IntegerType,
|
|
@@ -22,11 +23,76 @@ from pyiceberg.types import (
|
|
|
22
23
|
StructType,
|
|
23
24
|
TimeType
|
|
24
25
|
)
|
|
25
|
-
from pyiceberg.io.pyarrow import PyArrowFileIO, schema_to_pyarrow
|
|
26
26
|
|
|
27
27
|
JsonNode = Dict[str, 'JsonNode'] | List['JsonNode'] | str | bool | int | None
|
|
28
28
|
|
|
29
29
|
|
|
30
|
+
def iceberg_type_to_json(iceberg_type) -> str | Dict:
    """
    Render an Iceberg type in the JSON form of Iceberg Table Spec Appendix C.

    Primitive types become plain strings; list, map and struct types become
    dictionaries whose nested types are rendered recursively. Any type not
    recognised here is rendered with ``str()``.
    """
    # Parameterless primitives, tested with isinstance in this exact order.
    simple_names = (
        (BooleanType, "boolean"),
        (IntegerType, "int"),
        (LongType, "long"),
        (FloatType, "float"),
        (DoubleType, "double"),
        (StringType, "string"),
        (BinaryType, "binary"),
        (DateType, "date"),
        (TimeType, "time"),
        (TimestampType, "timestamp"),
    )
    for type_class, spec_name in simple_names:
        if isinstance(iceberg_type, type_class):
            return spec_name

    # Parameterised primitives carry their arguments inside the string.
    if isinstance(iceberg_type, DecimalType):
        return f"decimal({iceberg_type.precision},{iceberg_type.scale})"
    if isinstance(iceberg_type, FixedType):
        return f"fixed[{iceberg_type.length}]"

    # Nested types: emit an object and recurse into the contained types.
    if isinstance(iceberg_type, ListType):
        return {
            "type": "list",
            "element-id": iceberg_type.element_id,
            "element-required": iceberg_type.element_required,
            "element": iceberg_type_to_json(iceberg_type.element_type)
        }
    if isinstance(iceberg_type, MapType):
        return {
            "type": "map",
            "key-id": iceberg_type.key_id,
            "key": iceberg_type_to_json(iceberg_type.key_type),
            "value-id": iceberg_type.value_id,
            "value-required": iceberg_type.value_required,
            "value": iceberg_type_to_json(iceberg_type.value_type)
        }
    if isinstance(iceberg_type, StructType):
        def describe_field(member):
            # One struct member: id, name, required flag and its rendered type.
            return {
                "id": member.field_id,
                "name": member.name,
                "required": member.required,
                "type": iceberg_type_to_json(member.field_type)
            }

        return {
            "type": "struct",
            "fields": [describe_field(member) for member in iceberg_type.fields]
        }

    # Fallback for unknown types
    return str(iceberg_type)
|
|
94
|
+
|
|
95
|
+
|
|
30
96
|
class StructureToIcebergConverter:
|
|
31
97
|
"""Class to convert JSON Structure schema to Iceberg schema."""
|
|
32
98
|
|
|
@@ -45,8 +111,16 @@ class StructureToIcebergConverter:
|
|
|
45
111
|
"""Get the full name of a record type."""
|
|
46
112
|
return f"{namespace}.{name}" if namespace else name
|
|
47
113
|
|
|
48
|
-
def convert_structure_to_iceberg(self, structure_schema_path: str, structure_record_type: Optional[str], output_path: str, emit_cloudevents_columns: bool=False):
|
|
49
|
-
"""Convert a JSON Structure schema to an Iceberg schema.
|
|
114
|
+
def convert_structure_to_iceberg(self, structure_schema_path: str, structure_record_type: Optional[str], output_path: str, emit_cloudevents_columns: bool=False, output_format: str="arrow"):
|
|
115
|
+
"""Convert a JSON Structure schema to an Iceberg schema.
|
|
116
|
+
|
|
117
|
+
Args:
|
|
118
|
+
structure_schema_path: Path to the JSON Structure schema file
|
|
119
|
+
structure_record_type: Record type to convert (or None for the root)
|
|
120
|
+
output_path: Path to write the Iceberg schema
|
|
121
|
+
emit_cloudevents_columns: Whether to add CloudEvents columns
|
|
122
|
+
output_format: Output format - 'arrow' for binary Arrow IPC (default), 'schema' for JSON
|
|
123
|
+
"""
|
|
50
124
|
schema_file = structure_schema_path
|
|
51
125
|
if not schema_file:
|
|
52
126
|
print("Please specify the JSON Structure schema file")
|
|
@@ -114,14 +188,32 @@ class StructureToIcebergConverter:
|
|
|
114
188
|
])
|
|
115
189
|
|
|
116
190
|
iceberg_schema = Schema(*iceberg_fields)
|
|
117
|
-
|
|
118
|
-
print(f"Iceberg schema created: {arrow_schema}")
|
|
191
|
+
print(f"Iceberg schema created: {iceberg_schema}")
|
|
119
192
|
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
193
|
+
if output_format == "arrow":
|
|
194
|
+
# Write as binary PyArrow schema
|
|
195
|
+
arrow_schema = schema_to_pyarrow(iceberg_schema)
|
|
196
|
+
file_io = PyArrowFileIO()
|
|
197
|
+
output_file = file_io.new_output("file://" + output_path)
|
|
198
|
+
with output_file.create(overwrite=True) as f:
|
|
199
|
+
pa.output_stream(f).write(arrow_schema.serialize().to_pybytes())
|
|
200
|
+
else:
|
|
201
|
+
# Write Iceberg schema as spec-compliant JSON (per Iceberg Table Spec Appendix C)
|
|
202
|
+
schema_json = {
|
|
203
|
+
"type": "struct",
|
|
204
|
+
"schema-id": 0,
|
|
205
|
+
"fields": [
|
|
206
|
+
{
|
|
207
|
+
"id": field.field_id,
|
|
208
|
+
"name": field.name,
|
|
209
|
+
"required": field.required,
|
|
210
|
+
"type": iceberg_type_to_json(field.field_type)
|
|
211
|
+
}
|
|
212
|
+
for field in iceberg_schema.fields
|
|
213
|
+
]
|
|
214
|
+
}
|
|
215
|
+
with open(output_path, "w", encoding="utf-8") as f:
|
|
216
|
+
json.dump(schema_json, f, indent=2)
|
|
125
217
|
|
|
126
218
|
def resolve_ref(self, ref: str) -> Dict[str, Any]:
|
|
127
219
|
"""Resolve a $ref reference."""
|
|
@@ -348,8 +440,16 @@ class StructureToIcebergConverter:
|
|
|
348
440
|
return type_mapping.get(type_name, StringType())
|
|
349
441
|
|
|
350
442
|
|
|
351
|
-
def convert_structure_to_iceberg(structure_schema_path, structure_record_type, output_path, emit_cloudevents_columns=False):
|
|
352
|
-
"""Convert a JSON Structure schema to an Iceberg schema.
|
|
443
|
+
def convert_structure_to_iceberg(structure_schema_path, structure_record_type, output_path, emit_cloudevents_columns=False, output_format="arrow"):
|
|
444
|
+
"""Convert a JSON Structure schema to an Iceberg schema.
|
|
445
|
+
|
|
446
|
+
Args:
|
|
447
|
+
structure_schema_path: Path to the JSON Structure schema file
|
|
448
|
+
structure_record_type: Record type to convert (or None for the root)
|
|
449
|
+
output_path: Path to write the Iceberg schema
|
|
450
|
+
emit_cloudevents_columns: Whether to add CloudEvents columns
|
|
451
|
+
output_format: Output format - 'arrow' for binary Arrow IPC (default), 'schema' for JSON
|
|
452
|
+
"""
|
|
353
453
|
converter = StructureToIcebergConverter()
|
|
354
454
|
converter.convert_structure_to_iceberg(
|
|
355
|
-
structure_schema_path, structure_record_type, output_path, emit_cloudevents_columns)
|
|
455
|
+
structure_schema_path, structure_record_type, output_path, emit_cloudevents_columns, output_format)
|
avrotize/validate.py
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
"""Validates JSON instances against Avro or JSON Structure schemas.
|
|
2
|
+
|
|
3
|
+
This module provides a unified interface for validating JSON data against
|
|
4
|
+
both Avro schemas and JSON Structure schemas.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
import os
|
|
9
|
+
from typing import Any, Dict, List, Tuple
|
|
10
|
+
|
|
11
|
+
from avrotize.avrovalidator import AvroValidator, AvroValidationError, validate_json_against_avro
|
|
12
|
+
|
|
13
|
+
# JSON Structure SDK for validation
|
|
14
|
+
try:
|
|
15
|
+
from json_structure import SchemaValidator as JStructSchemaValidator
|
|
16
|
+
from json_structure import InstanceValidator as JStructInstanceValidator
|
|
17
|
+
HAS_JSTRUCT_SDK = True
|
|
18
|
+
except ImportError:
|
|
19
|
+
HAS_JSTRUCT_SDK = False
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class ValidationResult:
    """Outcome of checking one JSON instance against a schema.

    Holds the pass/fail flag, the list of error messages (always a list,
    possibly empty) and an optional label identifying the instance.
    """

    def __init__(self, is_valid: bool, errors: List[str] = None, instance_path: str = None):
        self.is_valid = is_valid
        # Any falsy value (None or an empty list) is replaced by a fresh list.
        self.errors = errors if errors else []
        self.instance_path = instance_path

    def __str__(self) -> str:
        location = self.instance_path
        if not self.is_valid:
            lead = f"{location}: " if location else ""
            return f"✗ Invalid: {lead}" + "; ".join(self.errors)
        if location:
            return "✓ Valid" + f": {location}"
        return "✓ Valid"

    def __repr__(self) -> str:
        # The instance path is intentionally not part of the repr.
        return "ValidationResult(is_valid={0}, errors={1})".format(self.is_valid, self.errors)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def detect_schema_type(schema: Dict[str, Any]) -> str:
    """Detects whether a schema is Avro or JSON Structure.

    Args:
        schema: The parsed schema object. Normally a dict; a list is treated
            as an Avro union, and any other value yields 'unknown'.

    Returns:
        'avro' or 'jstruct' or 'unknown'
    """
    # A top-level list is an Avro union. This must be decided before any
    # dict-only operation (``.get``) is attempted on the value.
    if isinstance(schema, list):
        return 'avro'

    # Anything that is not a mapping cannot be inspected further.
    if not isinstance(schema, dict):
        return 'unknown'

    # JSON Structure schemas name their meta-schema in '$schema'. Guard
    # against non-string values so the substring test cannot raise.
    meta_schema = schema.get('$schema')
    if isinstance(meta_schema, str) and 'json-structure' in meta_schema:
        return 'jstruct'

    # Tuples (not sets) are used for membership so that an unhashable 'type'
    # value, e.g. a nested list or dict, simply fails to match.
    schema_type = schema.get('type')
    if schema_type in ('record', 'enum', 'fixed', 'array', 'map'):
        return 'avro'
    if schema_type in ('null', 'boolean', 'int', 'long', 'float', 'double', 'bytes', 'string'):
        return 'avro'

    # JSON Structure object type
    if schema_type == 'object' and 'properties' in schema:
        return 'jstruct'

    return 'unknown'
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def validate_instance(
    instance: Any,
    schema: Dict[str, Any],
    schema_type: str = None
) -> ValidationResult:
    """Checks one JSON value against an Avro or JSON Structure schema.

    Args:
        instance: The JSON value to validate
        schema: The schema (Avro or JSON Structure)
        schema_type: 'avro' or 'jstruct'; detected from the schema when None

    Returns:
        ValidationResult carrying the pass/fail flag and any error messages
    """
    resolved_type = detect_schema_type(schema) if schema_type is None else schema_type

    if resolved_type == 'avro':
        avro_errors = validate_json_against_avro(instance, schema)
        return ValidationResult(is_valid=len(avro_errors) == 0, errors=avro_errors)

    if resolved_type != 'jstruct':
        # Neither of the two supported formats.
        return ValidationResult(
            is_valid=False,
            errors=["Unknown schema type. Cannot auto-detect schema format."]
        )

    if not HAS_JSTRUCT_SDK:
        # The optional SDK import failed at module load time.
        return ValidationResult(
            is_valid=False,
            errors=["JSON Structure SDK not installed. Install with: pip install json-structure"]
        )

    try:
        sdk_validator = JStructInstanceValidator(schema)
        found = sdk_validator.validate(instance)
        return ValidationResult(is_valid=len(found) == 0, errors=found if found else [])
    except Exception as e:
        # Any failure inside the SDK is reported as a validation error.
        return ValidationResult(is_valid=False, errors=[str(e)])
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def validate_file(
    instance_file: str,
    schema_file: str,
    schema_type: str = None
) -> List[ValidationResult]:
    """Validates JSON instance file(s) against a schema file.

    The instance file is first parsed as one JSON document: a top-level
    array yields one instance per element, any other value is one instance.
    If that parse fails, the file is read as JSON Lines, one instance per
    non-blank line. A line that is not valid JSON is reported as an invalid
    result rather than being skipped, so malformed input cannot pass
    unnoticed.

    Args:
        instance_file: Path to JSON file (single object, array, or JSONL)
        schema_file: Path to schema file (.avsc or .jstruct.json)
        schema_type: 'avro' or 'jstruct', auto-detected if not provided

    Returns:
        List of ValidationResult for each instance in the file, in file order
    """
    # Load schema
    with open(schema_file, 'r', encoding='utf-8') as f:
        schema = json.load(f)

    # Prefer the file extension; fall back to inspecting the schema content.
    if schema_type is None:
        if schema_file.endswith('.avsc'):
            schema_type = 'avro'
        elif schema_file.endswith(('.jstruct.json', '.jstruct')):
            schema_type = 'jstruct'
        else:
            schema_type = detect_schema_type(schema)

    # Load instances
    with open(instance_file, 'r', encoding='utf-8') as f:
        content = f.read().strip()

    results: List[ValidationResult] = []

    def _validate_one(instance: Any, path: str) -> None:
        # Validate a parsed instance and label the result with its origin.
        result = validate_instance(instance, schema, schema_type)
        result.instance_path = path
        results.append(result)

    try:
        data = json.loads(content)
    except json.JSONDecodeError:
        # Not a single JSON document: treat as JSONL (1-based line labels).
        for line_number, line in enumerate(content.split('\n'), start=1):
            line = line.strip()
            if not line:
                continue
            path = f"{instance_file}:{line_number}"
            try:
                instance = json.loads(line)
            except json.JSONDecodeError as exc:
                # Surface the parse failure instead of silently dropping it.
                results.append(ValidationResult(
                    is_valid=False,
                    errors=[f"Invalid JSON: {exc}"],
                    instance_path=path
                ))
                continue
            _validate_one(instance, path)
    else:
        if isinstance(data, list):
            for index, instance in enumerate(data):
                _validate_one(instance, f"{instance_file}[{index}]")
        else:
            _validate_one(data, instance_file)

    return results
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def validate_json_instances(
    input_files: List[str],
    schema_file: str,
    schema_type: str = None,
    verbose: bool = False
) -> Tuple[int, int]:
    """Validates every instance in several JSON files against one schema.

    Args:
        input_files: List of JSON file paths to validate
        schema_file: Path to schema file
        schema_type: 'avro' or 'jstruct', auto-detected if not provided
        verbose: When true, print each individual result

    Returns:
        Tuple of (valid_count, invalid_count)
    """
    # Tally keyed by the truthiness of each result's is_valid flag.
    tally = {True: 0, False: 0}

    for path in input_files:
        for outcome in validate_file(path, schema_file, schema_type):
            tally[bool(outcome.is_valid)] += 1
            if verbose:
                print(outcome)

    return tally[True], tally[False]
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
# Command entry point for avrotize CLI
|
|
216
|
+
def validate(
    input: List[str],
    schema: str,
    schema_type: str = None,
    quiet: bool = False
) -> None:
    """Validates JSON instances against an Avro or JSON Structure schema.

    Args:
        input: List of JSON files to validate
        schema: Path to schema file (.avsc or .jstruct.json)
        schema_type: Schema type ('avro' or 'jstruct'), auto-detected if not provided
        quiet: Suppress the per-instance results and the summary line

    Raises:
        SystemExit: With exit code 1 when at least one instance is invalid,
            regardless of ``quiet``.
    """
    valid_count, invalid_count = validate_json_instances(
        input_files=input,
        schema_file=schema,
        schema_type=schema_type,
        verbose=not quiet
    )

    if not quiet:
        total = valid_count + invalid_count
        print(f"\nValidation summary: {valid_count}/{total} instances valid")

    if invalid_count > 0:
        # Raise SystemExit directly: the bare exit() helper is injected by
        # the site module and is not guaranteed to exist in every interpreter.
        raise SystemExit(1)
|
avrotize/xmltoschema.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
"""Infers schema from XML files and converts to Avro or JSON Structure format.
|
|
2
|
+
|
|
3
|
+
This module provides:
|
|
4
|
+
- xml2a: Infer Avro schema from XML files
|
|
5
|
+
- xml2s: Infer JSON Structure schema from XML files
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
import os
|
|
10
|
+
from typing import List
|
|
11
|
+
|
|
12
|
+
from avrotize.schema_inference import (
|
|
13
|
+
AvroSchemaInferrer,
|
|
14
|
+
JsonStructureSchemaInferrer,
|
|
15
|
+
JsonNode
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def convert_xml_to_avro(
    input_files: List[str],
    avro_schema_file: str,
    type_name: str = 'Document',
    avro_namespace: str = '',
    sample_size: int = 0
) -> None:
    """Infers an Avro schema from XML files and writes it as JSON.

    The contents of all input files are handed to a single inference call,
    and the resulting schema is written to the output path with two-space
    indentation.

    Args:
        input_files: List of XML file paths to analyze
        avro_schema_file: Output path for the Avro schema
        type_name: Name for the root type
        avro_namespace: Namespace for generated Avro types
        sample_size: Maximum number of documents to sample (0 = all)

    Raises:
        ValueError: If no input files are given or none has any content.
    """
    if not input_files:
        raise ValueError("At least one input file is required")

    documents = _load_xml_strings(input_files, sample_size)
    if not documents:
        raise ValueError("No valid XML data found in input files")

    avro_inferrer = AvroSchemaInferrer(namespace=avro_namespace)
    inferred = avro_inferrer.infer_from_xml_values(type_name, documents)

    # Create the parent directory only when the path names one that is missing.
    target_dir = os.path.dirname(avro_schema_file)
    if target_dir:
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

    with open(avro_schema_file, 'w', encoding='utf-8') as out:
        json.dump(inferred, out, indent=2)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def convert_xml_to_jstruct(
    input_files: List[str],
    jstruct_schema_file: str,
    type_name: str = 'Document',
    base_id: str = 'https://example.com/',
    sample_size: int = 0
) -> None:
    """Infers a JSON Structure schema from XML files and writes it as JSON.

    The contents of all input files are handed to a single inference call,
    and the resulting schema is written to the output path with two-space
    indentation.

    Args:
        input_files: List of XML file paths to analyze
        jstruct_schema_file: Output path for the JSON Structure schema
        type_name: Name for the root type
        base_id: Base URI for $id generation
        sample_size: Maximum number of documents to sample (0 = all)

    Raises:
        ValueError: If no input files are given or none has any content.
    """
    if not input_files:
        raise ValueError("At least one input file is required")

    documents = _load_xml_strings(input_files, sample_size)
    if not documents:
        raise ValueError("No valid XML data found in input files")

    structure_inferrer = JsonStructureSchemaInferrer(base_id=base_id)
    inferred = structure_inferrer.infer_from_xml_values(type_name, documents)

    # Create the parent directory only when the path names one that is missing.
    target_dir = os.path.dirname(jstruct_schema_file)
    if target_dir:
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

    with open(jstruct_schema_file, 'w', encoding='utf-8') as out:
        json.dump(inferred, out, indent=2)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _load_xml_strings(input_files: List[str], sample_size: int) -> List[str]:
    """Reads each file as one XML document and returns the non-empty texts.

    Files are opened as UTF-8 text and surrounding whitespace is stripped.
    Files that are empty after stripping contribute nothing. Once the limit
    is reached, the remaining files are not opened.

    Args:
        input_files: List of file paths
        sample_size: Maximum documents to load (0 = all)

    Returns:
        List of XML strings
    """
    capped = sample_size > 0
    documents: List[str] = []

    for path in input_files:
        # Stop before opening another file once enough documents are held.
        if capped and not len(documents) < sample_size:
            break

        with open(path, 'r', encoding='utf-8') as handle:
            text = handle.read().strip()

        if not text:
            continue
        documents.append(text)

    return documents
|