structurize 3.0.2__py3-none-any.whl → 3.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
avrotize/validate.py ADDED
@@ -0,0 +1,242 @@
1
+ """Validates JSON instances against Avro or JSON Structure schemas.
2
+
3
+ This module provides a unified interface for validating JSON data against
4
+ both Avro schemas and JSON Structure schemas.
5
+ """
6
+
7
+ import json
8
+ import os
9
+ from typing import Any, Dict, List, Tuple
10
+
11
+ from avrotize.avrovalidator import AvroValidator, AvroValidationError, validate_json_against_avro
12
+
13
+ # JSON Structure SDK for validation
14
+ try:
15
+ from json_structure import SchemaValidator as JStructSchemaValidator
16
+ from json_structure import InstanceValidator as JStructInstanceValidator
17
+ HAS_JSTRUCT_SDK = True
18
+ except ImportError:
19
+ HAS_JSTRUCT_SDK = False
20
+
21
+
22
class ValidationResult:
    """Outcome of checking one JSON instance against a schema.

    Public attributes:
        is_valid: Whether the instance passed validation.
        errors: Error messages; ``None`` or an empty value passed to the
            constructor is stored as an empty list.
        instance_path: Optional label identifying the validated instance.
    """

    def __init__(self, is_valid: bool, errors: List[str] = None, instance_path: str = None):
        self.is_valid = is_valid
        self.errors = errors or []
        self.instance_path = instance_path

    def __str__(self) -> str:
        # Failure case first: "✗ Invalid: <path>: <err>; <err>".
        if not self.is_valid:
            location = f"{self.instance_path}: " if self.instance_path else ""
            return "✗ Invalid: " + location + "; ".join(self.errors)

        # Success case: the path is appended only when one is known.
        suffix = f": {self.instance_path}" if self.instance_path else ""
        return "✓ Valid" + suffix

    def __repr__(self) -> str:
        # instance_path is intentionally not part of the repr.
        return "ValidationResult(is_valid={}, errors={})".format(self.is_valid, self.errors)
39
+
40
+
41
def detect_schema_type(schema: Dict[str, Any]) -> str:
    """Detects whether a schema is Avro or JSON Structure.

    Detection rules, in order:

    1. A top-level list is treated as an Avro union.
    2. Anything else that is not a dict is 'unknown'.
    3. A string ``$schema`` value containing 'json-structure' means 'jstruct'.
    4. A ``type`` naming an Avro complex or primitive type means 'avro'.
    5. ``type == 'object'`` together with a ``properties`` key means 'jstruct'.

    Args:
        schema: The parsed schema object (normally a dict; a list is accepted
            for Avro unions).

    Returns:
        'avro' or 'jstruct' or 'unknown'
    """
    # The union check must come before any dict-style access: lists have no
    # .get(), and "'type' in schema" on a list is an element test, not a key
    # test, so a union such as ["null", "type"] used to raise AttributeError.
    if isinstance(schema, list):
        return 'avro'

    # Bare strings, numbers, None, ... cannot be inspected any further.
    if not isinstance(schema, dict):
        return 'unknown'

    # JSON Structure schemas identify themselves through $schema. Guard the
    # substring test so a non-string value (e.g. null) does not raise.
    schema_uri = schema.get('$schema')
    if isinstance(schema_uri, str) and 'json-structure' in schema_uri:
        return 'jstruct'

    # Avro schemas have 'type' at root and may have 'namespace', 'fields', etc.
    schema_type = schema.get('type')
    if schema_type in ('record', 'enum', 'fixed', 'array', 'map'):
        return 'avro'
    if schema_type in ('null', 'boolean', 'int', 'long', 'float', 'double', 'bytes', 'string'):
        return 'avro'

    # JSON Structure object type without an explicit $schema marker.
    if schema_type == 'object' and 'properties' in schema:
        return 'jstruct'

    return 'unknown'
71
+
72
+
73
def validate_instance(
    instance: Any,
    schema: Dict[str, Any],
    schema_type: str = None
) -> ValidationResult:
    """Validates a JSON instance against a schema.

    Args:
        instance: The JSON value to validate
        schema: The schema (Avro or JSON Structure)
        schema_type: 'avro' or 'jstruct', auto-detected if not provided

    Returns:
        ValidationResult with validation status and any errors. Failures of
        the JSON Structure validator itself (any exception it raises) are
        reported as an invalid result carrying the exception text rather than
        being propagated.
    """
    auto_detected = schema_type is None
    if auto_detected:
        schema_type = detect_schema_type(schema)

    if schema_type == 'avro':
        errors = validate_json_against_avro(instance, schema)
        return ValidationResult(is_valid=len(errors) == 0, errors=errors)

    if schema_type == 'jstruct':
        if not HAS_JSTRUCT_SDK:
            return ValidationResult(
                is_valid=False,
                errors=["JSON Structure SDK not installed. Install with: pip install json-structure"]
            )
        try:
            validator = JStructInstanceValidator(schema)
            # Normalise before inspecting: a None/empty return means "no
            # errors". Previously len() was applied first, so a None return
            # raised TypeError and was reported as a validation failure.
            raw_errors = validator.validate(instance) or []
            # ValidationResult.__str__ joins the errors with "; ", so every
            # entry has to be a string. NOTE(review): the SDK's error element
            # type is not visible from here - confirm it is str or has a
            # meaningful str().
            errors = [err if isinstance(err, str) else str(err) for err in raw_errors]
        except Exception as e:
            # Deliberate best-effort: surface SDK failures as invalid results.
            return ValidationResult(is_valid=False, errors=[str(e)])
        return ValidationResult(is_valid=not errors, errors=errors)

    # 'unknown' is what detect_schema_type() returns on failure, so it keeps
    # the original auto-detect wording even when passed in by a caller.
    if auto_detected or schema_type == 'unknown':
        message = "Unknown schema type. Cannot auto-detect schema format."
    else:
        # An explicitly requested type that is not supported: say so instead
        # of blaming auto-detection, which never ran.
        message = f"Unsupported schema type {schema_type!r}. Expected 'avro' or 'jstruct'."
    return ValidationResult(is_valid=False, errors=[message])
113
+
114
+
115
def validate_file(
    instance_file: str,
    schema_file: str,
    schema_type: str = None
) -> List[ValidationResult]:
    """Validates JSON instance file(s) against a schema file.

    The instance file is first parsed as one JSON document. A top-level array
    yields one instance per element (labelled ``file[i]``); any other value is
    a single instance (labelled ``file``). If that parse fails, the file is
    treated as JSONL: every non-blank line is parsed on its own (labelled
    ``file:line``). A line that is not valid JSON produces an invalid
    ValidationResult describing the parse error instead of being dropped, so
    corrupt input can no longer pass silently.

    An empty (or whitespace-only) instance file yields an empty result list.

    Args:
        instance_file: Path to JSON file (single object, array, or JSONL)
        schema_file: Path to schema file (.avsc or .jstruct.json)
        schema_type: 'avro' or 'jstruct', auto-detected if not provided

    Returns:
        List of ValidationResult for each instance in the file

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If the schema file is not valid JSON.
    """
    # Load schema
    with open(schema_file, 'r', encoding='utf-8') as f:
        schema = json.load(f)

    # The file extension takes precedence; content sniffing is the fallback.
    if schema_type is None:
        if schema_file.endswith('.avsc'):
            schema_type = 'avro'
        elif schema_file.endswith(('.jstruct.json', '.jstruct')):
            schema_type = 'jstruct'
        else:
            schema_type = detect_schema_type(schema)

    # Load instances
    with open(instance_file, 'r', encoding='utf-8') as f:
        content = f.read().strip()

    # Each entry is (path label, parsed instance, parse error message or None).
    entries = []
    try:
        data = json.loads(content)
    except json.JSONDecodeError:
        # Not a single JSON document: fall back to JSONL, one value per line.
        for line_number, line in enumerate(content.split('\n'), start=1):
            line = line.strip()
            if not line:
                continue
            path = f"{instance_file}:{line_number}"
            try:
                entries.append((path, json.loads(line), None))
            except json.JSONDecodeError as e:
                entries.append((path, None, f"Invalid JSON: {e}"))
    else:
        if isinstance(data, list):
            entries = [
                (f"{instance_file}[{i}]", item, None)
                for i, item in enumerate(data)
            ]
        else:
            entries = [(instance_file, data, None)]

    # Validate each instance, keeping file order.
    results = []
    for path, instance, parse_error in entries:
        if parse_error is not None:
            result = ValidationResult(is_valid=False, errors=[parse_error])
        else:
            result = validate_instance(instance, schema, schema_type)
        result.instance_path = path
        results.append(result)

    return results
178
+
179
+
180
def validate_json_instances(
    input_files: List[str],
    schema_file: str,
    schema_type: str = None,
    verbose: bool = False
) -> Tuple[int, int]:
    """Validates several JSON instance files against one schema and tallies the outcome.

    Every file is handed to ``validate_file``; each result it returns counts
    as one instance.

    Args:
        input_files: List of JSON file paths to validate
        schema_file: Path to schema file
        schema_type: 'avro' or 'jstruct', auto-detected if not provided
        verbose: When true, each individual result is printed

    Returns:
        Tuple of (valid_count, invalid_count)
    """
    passed = 0
    failed = 0

    for path in input_files:
        for outcome in validate_file(path, schema_file, schema_type):
            if outcome.is_valid:
                passed += 1
            else:
                failed += 1
            # Valid and invalid results are printed alike in verbose mode.
            if verbose:
                print(outcome)

    return passed, failed
213
+
214
+
215
+ # Command entry point for avrotize CLI
216
def validate(
    input: List[str],
    schema: str,
    schema_type: str = None,
    quiet: bool = False
) -> None:
    """Validates JSON instances against an Avro or JSON Structure schema.

    Args:
        input: List of JSON files to validate
        schema: Path to schema file (.avsc or .jstruct.json)
        schema_type: Schema type ('avro' or 'jstruct'), auto-detected if not provided
        quiet: Suppress the per-instance output and the summary line

    Raises:
        SystemExit: With exit code 1 when at least one instance is invalid,
            regardless of ``quiet``. When everything is valid the function
            simply returns.
    """
    valid_count, invalid_count = validate_json_instances(
        input_files=input,
        schema_file=schema,
        schema_type=schema_type,
        verbose=not quiet
    )

    if not quiet:
        total = valid_count + invalid_count
        print(f"\nValidation summary: {valid_count}/{total} instances valid")

    if invalid_count > 0:
        # The bare exit() helper is injected by the `site` module and is not
        # guaranteed to exist (e.g. `python -S`, frozen executables). Raising
        # SystemExit directly has the same effect and needs no import.
        raise SystemExit(1)
@@ -0,0 +1,122 @@
1
+ """Infers schema from XML files and converts to Avro or JSON Structure format.
2
+
3
+ This module provides:
4
+ - xml2a: Infer Avro schema from XML files
5
+ - xml2s: Infer JSON Structure schema from XML files
6
+ """
7
+
8
+ import json
9
+ import os
10
+ from typing import List
11
+
12
+ from avrotize.schema_inference import (
13
+ AvroSchemaInferrer,
14
+ JsonStructureSchemaInferrer,
15
+ JsonNode
16
+ )
17
+
18
+
19
def convert_xml_to_avro(
    input_files: List[str],
    avro_schema_file: str,
    type_name: str = 'Document',
    avro_namespace: str = '',
    sample_size: int = 0
) -> None:
    """Infers Avro schema from XML files.

    Loads the XML files as strings, passes them together to
    ``AvroSchemaInferrer.infer_from_xml_values`` and writes the resulting
    schema as indented JSON to ``avro_schema_file``.

    Args:
        input_files: List of XML file paths to analyze
        avro_schema_file: Output path for the Avro schema
        type_name: Name for the root type
        avro_namespace: Namespace for generated Avro types
        sample_size: Maximum number of documents to sample (0 = all)

    Raises:
        ValueError: If ``input_files`` is empty or no file contains any
            non-whitespace content.
    """
    if not input_files:
        raise ValueError("At least one input file is required")

    xml_strings = _load_xml_strings(input_files, sample_size)

    if not xml_strings:
        raise ValueError("No valid XML data found in input files")

    inferrer = AvroSchemaInferrer(namespace=avro_namespace)
    schema = inferrer.infer_from_xml_values(type_name, xml_strings)

    # Ensure output directory exists. exist_ok avoids the check-then-create
    # race of a separate os.path.exists() test. An empty dirname means the
    # current directory, so there is nothing to create.
    output_dir = os.path.dirname(avro_schema_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(avro_schema_file, 'w', encoding='utf-8') as f:
        json.dump(schema, f, indent=2)
57
+
58
+
59
def convert_xml_to_jstruct(
    input_files: List[str],
    jstruct_schema_file: str,
    type_name: str = 'Document',
    base_id: str = 'https://example.com/',
    sample_size: int = 0
) -> None:
    """Infers JSON Structure schema from XML files.

    Loads the XML files as strings, passes them together to
    ``JsonStructureSchemaInferrer.infer_from_xml_values`` and writes the
    resulting schema as indented JSON to ``jstruct_schema_file``.

    Args:
        input_files: List of XML file paths to analyze
        jstruct_schema_file: Output path for the JSON Structure schema
        type_name: Name for the root type
        base_id: Base URI for $id generation
        sample_size: Maximum number of documents to sample (0 = all)

    Raises:
        ValueError: If ``input_files`` is empty or no file contains any
            non-whitespace content.
    """
    if not input_files:
        raise ValueError("At least one input file is required")

    xml_strings = _load_xml_strings(input_files, sample_size)

    if not xml_strings:
        raise ValueError("No valid XML data found in input files")

    inferrer = JsonStructureSchemaInferrer(base_id=base_id)
    schema = inferrer.infer_from_xml_values(type_name, xml_strings)

    # Ensure output directory exists. exist_ok avoids the check-then-create
    # race of a separate os.path.exists() test. An empty dirname means the
    # current directory, so there is nothing to create.
    output_dir = os.path.dirname(jstruct_schema_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(jstruct_schema_file, 'w', encoding='utf-8') as f:
        json.dump(schema, f, indent=2)
96
+
97
+
98
+ def _load_xml_strings(input_files: List[str], sample_size: int) -> List[str]:
99
+ """Loads XML content from files.
100
+
101
+ Each file is treated as a single XML document.
102
+
103
+ Args:
104
+ input_files: List of file paths
105
+ sample_size: Maximum documents to load (0 = all)
106
+
107
+ Returns:
108
+ List of XML strings
109
+ """
110
+ xml_strings: List[str] = []
111
+
112
+ for file_path in input_files:
113
+ if sample_size > 0 and len(xml_strings) >= sample_size:
114
+ break
115
+
116
+ with open(file_path, 'r', encoding='utf-8') as f:
117
+ content = f.read().strip()
118
+
119
+ if content:
120
+ xml_strings.append(content)
121
+
122
+ return xml_strings
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: structurize
3
- Version: 3.0.2
3
+ Version: 3.1.1
4
4
  Summary: Tools to convert from and to JSON Structure from various other schema languages.
5
5
  Author-email: Clemens Vasters <clemensv@microsoft.com>
6
6
  Classifier: Programming Language :: Python :: 3
@@ -1,8 +1,8 @@
1
1
  avrotize/__init__.py,sha256=t5h5wkHXr6M0mmHAB5rhjZ3Gxy9xutGTGIfojfao9rI,3820
2
2
  avrotize/__main__.py,sha256=5pY8dYAURcOnFRvgb6fgaOIa_SOzPLIWbU8-ZTQ0jG4,88
3
- avrotize/_version.py,sha256=oe-LtbcDFd-ELWKNVft1PeQ3Tp3QrrPnPsINIJYW4ZE,712
3
+ avrotize/_version.py,sha256=jKplN91giDOWFov4V2npiJtBYgOtJkkFOjorEJUKChc,712
4
4
  avrotize/asn1toavro.py,sha256=QDNwfBfXMxSH-k487CA3CaGCGDzOLs4PpVbbENm5uF0,8386
5
- avrotize/avrotize.py,sha256=VHFpBltMVBpyt0ju3ZWW725BKjQ4Fk-nrAy8udW-X44,5713
5
+ avrotize/avrotize.py,sha256=z5snX7ZxlXwL6XC2vmhymnHvhbqIW7Tz7ZW2jRWrMgY,5966
6
6
  avrotize/avrotocpp.py,sha256=hRZV247_TDD7Sm6_8sFx-UH5SueLLx2Wg6TvAVUX0iE,25693
7
7
  avrotize/avrotocsharp.py,sha256=jOx9ctuUSsdpNDEXnNNfFHOb8zWWHclUvYTF5rInOaM,72339
8
8
  avrotize/avrotocsv.py,sha256=yqVbP4Ar8bZyEgOAmHuHAsQKfCVjIO5_pa5dtaKLZKE,4575
@@ -22,10 +22,11 @@ avrotize/avrotoparquet.py,sha256=qm5hfia5elW1Yn4KACG8bbudLAqQSwGk3fIkTvdT5Rg,908
22
22
  avrotize/avrotoproto.py,sha256=STqbdGjVrgKrlKXt-6dZlekW_Oq0W0StRx80St1XqIc,22486
23
23
  avrotize/avrotopython.py,sha256=7s73rKyaQDu5-uLnF4mK7Jat9F4jipQ-lLapPXg6aPI,34168
24
24
  avrotize/avrotorust.py,sha256=HEcDirRBCbXQNNs_FmkT-sp1dWQgZ8A23qkQYUxVuXE,24255
25
- avrotize/avrotots.py,sha256=u_XLjlHN0Gof5QYlpqK4X9WoX9rL30TjQMPg4TiyYnI,33241
25
+ avrotize/avrotots.py,sha256=QBHcrJ90gCELFYL2msOrpsJA49l84JT-qPXw6dDnTIE,34404
26
26
  avrotize/avrotoxsd.py,sha256=iGQq_8kC0kfKsqvqS6s_mO-kJ8N5G8vXOwqRI_DZUxc,17744
27
+ avrotize/avrovalidator.py,sha256=qwrjt7e97t53GK06iiY2KKv7DPnRf_2Uwv72ciom9s8,21369
27
28
  avrotize/cddltostructure.py,sha256=MA2c-P3CIEAxEaBX-FF299gR55xcLEV3FrfTr2QfayM,74491
28
- avrotize/commands.json,sha256=c6yV5vSey4GuoLVllPvL0Zb5rfmMQ8dgXUzKAtgKMZw,98904
29
+ avrotize/commands.json,sha256=tmOaoxKnxUetJmnwtWHz1t0vgoF8zKZB68f9NWYoD_Y,111718
29
30
  avrotize/common.py,sha256=enqNR1I9-SbW7fNJE3w7N2R87kiN6_9Oa7VB4b2AUBc,31913
30
31
  avrotize/constants.py,sha256=LlgHrvT6RsRPrhFGRNHmFuIj3b1bSd45yC4rBCIGGVA,2753
31
32
  avrotize/csvtoavro.py,sha256=TuIYm_Xv8gioEHl1YgWQKOYkFGGHfuwmK5RuEAEXbt8,4293
@@ -34,6 +35,7 @@ avrotize/dependency_resolver.py,sha256=LGOTutpobJ4kMjAwvs-l0Mt2tEoZFaXCazs-u38qn
34
35
  avrotize/dependency_version.py,sha256=tvbpO2VstTSTmNA5jbzQl48u6jnIM7BHyASQrrgsRYU,16844
35
36
  avrotize/jsonstoavro.py,sha256=ZzigsCjAxw_TflXCjTLKHTrPmkiZRZMpuaZICfT_r_I,120069
36
37
  avrotize/jsonstostructure.py,sha256=AXqrh3gJsQveWmC9Pfj1rTcUY_4KaDP7L9ObBSQehF4,147687
38
+ avrotize/jsontoschema.py,sha256=bRLhBqN1tkKVFvmNWBtzMeWkNVj__dMspTxtzMwZoc4,4714
37
39
  avrotize/jstructtoavro.py,sha256=sOq7Ru1b8_ZLCEsnBqx3gsMWd7dPAaYxoraAD0bz6rk,33891
38
40
  avrotize/kstructtoavro.py,sha256=t97JY22n0uILK3WcvQu_Yp9ONvouJRLAC2bZ3rvZ1L0,2856
39
41
  avrotize/kustotoavro.py,sha256=1oEk9mbqmP3N5-V7mBHSXpbSlYFzjJ7ajIDNJZxA1r8,21756
@@ -42,6 +44,8 @@ avrotize/parquettoavro.py,sha256=iAPrSYNkiH3fBKNVDfIgeXkQbAiopa252ULJrGgmBDI,553
42
44
  avrotize/proto2parser.py,sha256=__9R3cqiUJXc_efPCZZcF7rt18kA7mfhmX0qm2v0eSw,19742
43
45
  avrotize/proto3parser.py,sha256=MfE84c-oAWWuzYmKlEZ5g5LUF7YzZaASFh2trX3UCaw,15604
44
46
  avrotize/prototoavro.py,sha256=hqXBGRxYojaEbEgoHZxXwMG4R1nWC7UMl_XNLWfqH38,17346
47
+ avrotize/schema_inference.py,sha256=puhn81PxhZiWldD7rZ2IfUQY7-hCw5pcMkVcEsP1FGY,31413
48
+ avrotize/sqltoavro.py,sha256=BpvsN8KJTp_0MLWl7JsbcQHvnA2hQgbwsJymtDKYXjo,48264
45
49
  avrotize/structuretocddl.py,sha256=RK_dTJf0oAo6BIBM48NHRcWC96OtUjlgUC6HzXs5Lkk,21234
46
50
  avrotize/structuretocpp.py,sha256=tBWOvyZPYQ1CHN6RgDnWlmzJ1giOyQ9SlHBHWvhPyiw,35898
47
51
  avrotize/structuretocsharp.py,sha256=NnHzeJcOWDbZ_LnGV6AW9hXgP7rQW6Lld9utzxO8l-g,128208
@@ -61,12 +65,14 @@ avrotize/structuretopython.py,sha256=d9EZVDHq7r-x0ZYZIRYfCP6kub7MkEROuvzjTJfNVv0
61
65
  avrotize/structuretorust.py,sha256=ChRmO7uzU-pMdDdS0Vtg-MVUaOaNhNUPwH-ZKKOHglU,35134
62
66
  avrotize/structuretots.py,sha256=zsN4ssX8rU4izOqXIiQcE0LQJ5wouydvCPoTWwSKXj4,35289
63
67
  avrotize/structuretoxsd.py,sha256=01VpasyWSMOx04sILHLP7H-WkhGdXAEGKohUUfgrNf0,32797
68
+ avrotize/validate.py,sha256=isaDJ4E3O8SKehOROKCMnwE1-vPhkrVU5E-DDeThwlQ,7747
69
+ avrotize/xmltoschema.py,sha256=I125hZtbzxzbE3BSYemYJBB8d-X9Zle0ooKVaiNu_bk,3726
64
70
  avrotize/xsdtoavro.py,sha256=nQtNH_3pEZBp67oUCPqzhvItEExHTe-8obsIfNRXt8Y,19064
65
71
  avrotize/dependencies/cpp/vcpkg/vcpkg.json,sha256=se5qnUVQ1Q6wN_DqgIioqKg_y7ouh9oly2iBAJJXkgw,414
66
- avrotize/dependencies/typescript/node22/package.json,sha256=oAW_2X-b715kV7aajwuONZEMkyQUJGviG3GwnoUa7hU,387
67
- structurize-3.0.2.dist-info/licenses/LICENSE,sha256=xGtQGygTETTtDQJafZCUbpsed3GxO6grmqig-jGEuSk,11348
68
- structurize-3.0.2.dist-info/METADATA,sha256=3c5_rlTtVt9hq44svI_ve5VVxLzIMEZH98HwWIm8BWw,3669
69
- structurize-3.0.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
70
- structurize-3.0.2.dist-info/entry_points.txt,sha256=biIH7jA5auhVqfbwHVk2gmD_gvrPYKgjpCAn0JWZ-Rs,55
71
- structurize-3.0.2.dist-info/top_level.txt,sha256=yn-yQ0Cm1O9fbF8KJgv4IIvX4YRGelKgPqZF1wS5P50,9
72
- structurize-3.0.2.dist-info/RECORD,,
72
+ avrotize/dependencies/typescript/node22/package.json,sha256=XEiq9nTES4aSHdLMOTp3lt9urzg3pOtQHOZ6gTOCW9s,386
73
+ structurize-3.1.1.dist-info/licenses/LICENSE,sha256=xGtQGygTETTtDQJafZCUbpsed3GxO6grmqig-jGEuSk,11348
74
+ structurize-3.1.1.dist-info/METADATA,sha256=vQ6IokgDsD6K3Pb8uS4fbRyraW7IBaX8T4DGg8eJOG8,3669
75
+ structurize-3.1.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
76
+ structurize-3.1.1.dist-info/entry_points.txt,sha256=biIH7jA5auhVqfbwHVk2gmD_gvrPYKgjpCAn0JWZ-Rs,55
77
+ structurize-3.1.1.dist-info/top_level.txt,sha256=yn-yQ0Cm1O9fbF8KJgv4IIvX4YRGelKgPqZF1wS5P50,9
78
+ structurize-3.1.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5