avrotize 3.0.1__py3-none-any.whl → 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
avrotize/structuretodb.py CHANGED
@@ -426,7 +426,7 @@ def structure_type_to_sql_type(structure_type: Any, dialect: str) -> str:
426
426
  type_map["sqlanywhere"] = type_map["sqlserver"].copy()
427
427
  type_map["bigquery"] = {k: v.replace("VARCHAR", "STRING").replace("JSONB", "STRING").replace("BYTEA", "BYTES") for k, v in type_map["postgres"].items()}
428
428
  type_map["snowflake"] = {k: v.replace("JSONB", "VARIANT").replace("BYTEA", "BINARY") for k, v in type_map["postgres"].items()}
429
- type_map["redshift"] = type_map["postgres"].copy()
429
+ type_map["redshift"] = {k: v.replace("JSONB", "SUPER").replace("BYTEA", "VARBYTE") for k, v in type_map["postgres"].items()}
430
430
 
431
431
  # Handle type resolution
432
432
  if isinstance(structure_type, str):
@@ -5,6 +5,7 @@ import sys
5
5
  from typing import Dict, List, Any, Optional
6
6
  import pyarrow as pa
7
7
  from pyiceberg.schema import Schema, NestedField
8
+ from pyiceberg.io.pyarrow import PyArrowFileIO, schema_to_pyarrow
8
9
  from pyiceberg.types import (
9
10
  BooleanType,
10
11
  IntegerType,
@@ -22,11 +23,76 @@ from pyiceberg.types import (
22
23
  StructType,
23
24
  TimeType
24
25
  )
25
- from pyiceberg.io.pyarrow import PyArrowFileIO, schema_to_pyarrow
26
26
 
27
27
  JsonNode = Dict[str, 'JsonNode'] | List['JsonNode'] | str | bool | int | None
28
28
 
29
29
 
30
def iceberg_type_to_json(iceberg_type) -> str | Dict:
    """
    Render an Iceberg type in the JSON form of Iceberg Table Spec Appendix C.

    Primitive types come back as plain strings (``decimal`` and ``fixed``
    embed their parameters in the string).  List, map and struct types come
    back as dictionaries whose nested types are rendered recursively.  A type
    that matches none of the known classes is rendered with ``str()``.
    """
    # Parameter-free primitives, tested in this order; each has a fixed name.
    fixed_names = (
        (BooleanType, "boolean"),
        (IntegerType, "int"),
        (LongType, "long"),
        (FloatType, "float"),
        (DoubleType, "double"),
        (StringType, "string"),
        (BinaryType, "binary"),
        (DateType, "date"),
        (TimeType, "time"),
        (TimestampType, "timestamp"),
    )
    for type_class, spec_name in fixed_names:
        if isinstance(iceberg_type, type_class):
            return spec_name

    # Parameterised primitives carry their arguments inside the string.
    if isinstance(iceberg_type, DecimalType):
        return f"decimal({iceberg_type.precision},{iceberg_type.scale})"
    if isinstance(iceberg_type, FixedType):
        return f"fixed[{iceberg_type.length}]"

    if isinstance(iceberg_type, ListType):
        return {
            "type": "list",
            "element-id": iceberg_type.element_id,
            "element-required": iceberg_type.element_required,
            "element": iceberg_type_to_json(iceberg_type.element_type)
        }

    if isinstance(iceberg_type, MapType):
        return {
            "type": "map",
            "key-id": iceberg_type.key_id,
            "key": iceberg_type_to_json(iceberg_type.key_type),
            "value-id": iceberg_type.value_id,
            "value-required": iceberg_type.value_required,
            "value": iceberg_type_to_json(iceberg_type.value_type)
        }

    if isinstance(iceberg_type, StructType):
        rendered_fields = []
        for member in iceberg_type.fields:
            rendered_fields.append({
                "id": member.field_id,
                "name": member.name,
                "required": member.required,
                "type": iceberg_type_to_json(member.field_type)
            })
        return {"type": "struct", "fields": rendered_fields}

    # Fallback for unknown types
    return str(iceberg_type)
94
+
95
+
30
96
  class StructureToIcebergConverter:
31
97
  """Class to convert JSON Structure schema to Iceberg schema."""
32
98
 
@@ -45,8 +111,16 @@ class StructureToIcebergConverter:
45
111
  """Get the full name of a record type."""
46
112
  return f"{namespace}.{name}" if namespace else name
47
113
 
48
- def convert_structure_to_iceberg(self, structure_schema_path: str, structure_record_type: Optional[str], output_path: str, emit_cloudevents_columns: bool=False):
49
- """Convert a JSON Structure schema to an Iceberg schema."""
114
+ def convert_structure_to_iceberg(self, structure_schema_path: str, structure_record_type: Optional[str], output_path: str, emit_cloudevents_columns: bool=False, output_format: str="arrow"):
115
+ """Convert a JSON Structure schema to an Iceberg schema.
116
+
117
+ Args:
118
+ structure_schema_path: Path to the JSON Structure schema file
119
+ structure_record_type: Record type to convert (or None for the root)
120
+ output_path: Path to write the Iceberg schema
121
+ emit_cloudevents_columns: Whether to add CloudEvents columns
122
+ output_format: Output format - 'arrow' for binary Arrow IPC (default), 'schema' for JSON
123
+ """
50
124
  schema_file = structure_schema_path
51
125
  if not schema_file:
52
126
  print("Please specify the JSON Structure schema file")
@@ -114,14 +188,32 @@ class StructureToIcebergConverter:
114
188
  ])
115
189
 
116
190
  iceberg_schema = Schema(*iceberg_fields)
117
- arrow_schema = schema_to_pyarrow(iceberg_schema)
118
- print(f"Iceberg schema created: {arrow_schema}")
191
+ print(f"Iceberg schema created: {iceberg_schema}")
119
192
 
120
- # Write to Iceberg table (for demonstration, using local file system)
121
- file_io = PyArrowFileIO()
122
- output_file = file_io.new_output("file://"+output_path)
123
- with output_file.create(overwrite=True) as f:
124
- pa.output_stream(f).write(arrow_schema.serialize().to_pybytes())
193
+ if output_format == "arrow":
194
+ # Write as binary PyArrow schema
195
+ arrow_schema = schema_to_pyarrow(iceberg_schema)
196
+ file_io = PyArrowFileIO()
197
+ output_file = file_io.new_output("file://" + output_path)
198
+ with output_file.create(overwrite=True) as f:
199
+ pa.output_stream(f).write(arrow_schema.serialize().to_pybytes())
200
+ else:
201
+ # Write Iceberg schema as spec-compliant JSON (per Iceberg Table Spec Appendix C)
202
+ schema_json = {
203
+ "type": "struct",
204
+ "schema-id": 0,
205
+ "fields": [
206
+ {
207
+ "id": field.field_id,
208
+ "name": field.name,
209
+ "required": field.required,
210
+ "type": iceberg_type_to_json(field.field_type)
211
+ }
212
+ for field in iceberg_schema.fields
213
+ ]
214
+ }
215
+ with open(output_path, "w", encoding="utf-8") as f:
216
+ json.dump(schema_json, f, indent=2)
125
217
 
126
218
  def resolve_ref(self, ref: str) -> Dict[str, Any]:
127
219
  """Resolve a $ref reference."""
@@ -348,8 +440,16 @@ class StructureToIcebergConverter:
348
440
  return type_mapping.get(type_name, StringType())
349
441
 
350
442
 
351
- def convert_structure_to_iceberg(structure_schema_path, structure_record_type, output_path, emit_cloudevents_columns=False):
352
- """Convert a JSON Structure schema to an Iceberg schema."""
443
+ def convert_structure_to_iceberg(structure_schema_path, structure_record_type, output_path, emit_cloudevents_columns=False, output_format="arrow"):
444
+ """Convert a JSON Structure schema to an Iceberg schema.
445
+
446
+ Args:
447
+ structure_schema_path: Path to the JSON Structure schema file
448
+ structure_record_type: Record type to convert (or None for the root)
449
+ output_path: Path to write the Iceberg schema
450
+ emit_cloudevents_columns: Whether to add CloudEvents columns
451
+ output_format: Output format - 'arrow' for binary Arrow IPC (default), 'schema' for JSON
452
+ """
353
453
  converter = StructureToIcebergConverter()
354
454
  converter.convert_structure_to_iceberg(
355
- structure_schema_path, structure_record_type, output_path, emit_cloudevents_columns)
455
+ structure_schema_path, structure_record_type, output_path, emit_cloudevents_columns, output_format)
avrotize/validate.py ADDED
@@ -0,0 +1,242 @@
1
+ """Validates JSON instances against Avro or JSON Structure schemas.
2
+
3
+ This module provides a unified interface for validating JSON data against
4
+ both Avro schemas and JSON Structure schemas.
5
+ """
6
+
7
+ import json
8
+ import os
9
+ from typing import Any, Dict, List, Tuple
10
+
11
+ from avrotize.avrovalidator import AvroValidator, AvroValidationError, validate_json_against_avro
12
+
13
+ # JSON Structure SDK for validation
14
+ try:
15
+ from json_structure import SchemaValidator as JStructSchemaValidator
16
+ from json_structure import InstanceValidator as JStructInstanceValidator
17
+ HAS_JSTRUCT_SDK = True
18
+ except ImportError:
19
+ HAS_JSTRUCT_SDK = False
20
+
21
+
22
class ValidationResult:
    """Result of validating a JSON instance against a schema."""

    def __init__(self, is_valid: bool, errors: List[str] = None, instance_path: str = None):
        """Store the outcome.

        Args:
            is_valid: Whether the instance passed validation.
            errors: Error entries; ``None`` (or any falsy value) is stored as
                a new empty list.
            instance_path: Optional label for where the instance came from.
        """
        self.is_valid = is_valid
        self.errors = errors or []
        self.instance_path = instance_path

    def __str__(self) -> str:
        """Return a one-line, human-readable summary of the result."""
        if self.is_valid:
            return "✓ Valid" + (f": {self.instance_path}" if self.instance_path else "")
        prefix = f"{self.instance_path}: " if self.instance_path else ""
        # Stringify each entry so that error objects (exceptions, validator
        # error records) can be reported instead of breaking str.join().
        return f"✗ Invalid: {prefix}" + "; ".join(str(error) for error in self.errors)

    def __repr__(self) -> str:
        """Return a debugging representation with validity and errors."""
        return f"ValidationResult(is_valid={self.is_valid}, errors={self.errors})"
39
+
40
+
41
def detect_schema_type(schema: Dict[str, Any]) -> str:
    """Detects whether a schema is Avro or JSON Structure.

    Args:
        schema: The parsed schema value (normally a dict; a top-level list is
            treated as an Avro union)

    Returns:
        'avro' or 'jstruct' or 'unknown'
    """
    # A top-level list is an Avro union. This must be decided before any
    # dict-style access, otherwise a union that happens to contain the string
    # "type" or "$schema" would reach .get() and raise AttributeError.
    if isinstance(schema, list):
        return 'avro'

    # Anything else that is not a mapping cannot be classified here.
    if not isinstance(schema, dict):
        return 'unknown'

    # JSON Structure schemas are recognised by a '$schema' URI mentioning
    # 'json-structure'; a non-string '$schema' is simply not a match.
    meta_schema = schema.get('$schema')
    if isinstance(meta_schema, str) and 'json-structure' in meta_schema:
        return 'jstruct'

    # Avro schemas have 'type' at root and may have 'namespace', 'fields', etc.
    if 'type' in schema:
        schema_type = schema.get('type')
        # Avro named/complex types
        if schema_type in ('record', 'enum', 'fixed', 'array', 'map'):
            return 'avro'
        # Avro primitive types
        if schema_type in ('null', 'boolean', 'int', 'long', 'float', 'double', 'bytes', 'string'):
            return 'avro'
        # JSON Structure object type
        if schema_type == 'object' and 'properties' in schema:
            return 'jstruct'

    return 'unknown'
71
+
72
+
73
def validate_instance(
    instance: Any,
    schema: Dict[str, Any],
    schema_type: str = None
) -> ValidationResult:
    """Check one JSON value against an Avro or JSON Structure schema.

    Args:
        instance: The JSON value to validate
        schema: The parsed schema
        schema_type: 'avro' or 'jstruct'; when None it is derived from the
            schema via detect_schema_type

    Returns:
        A ValidationResult carrying the verdict and any error entries.
        Problems raised by the JSON Structure validator are reported as an
        invalid result rather than propagated.
    """
    kind = detect_schema_type(schema) if schema_type is None else schema_type

    if kind == 'avro':
        avro_errors = validate_json_against_avro(instance, schema)
        return ValidationResult(is_valid=len(avro_errors) == 0, errors=avro_errors)

    if kind != 'jstruct':
        return ValidationResult(
            is_valid=False,
            errors=["Unknown schema type. Cannot auto-detect schema format."]
        )

    # JSON Structure validation needs the optional SDK.
    if not HAS_JSTRUCT_SDK:
        return ValidationResult(
            is_valid=False,
            errors=["JSON Structure SDK not installed. Install with: pip install json-structure"]
        )

    try:
        found = JStructInstanceValidator(schema).validate(instance)
        return ValidationResult(is_valid=len(found) == 0, errors=found if found else [])
    except Exception as exc:
        # Any failure inside the SDK is surfaced as a validation error.
        return ValidationResult(is_valid=False, errors=[str(exc)])
113
+
114
+
115
def validate_file(
    instance_file: str,
    schema_file: str,
    schema_type: str = None
) -> List[ValidationResult]:
    """Validates JSON instance file(s) against a schema file.

    The instance file is first parsed as a single JSON document: a top-level
    array contributes one instance per element, any other value is one
    instance. If that fails, the file is read as JSON Lines, one instance
    per non-blank line.

    Args:
        instance_file: Path to JSON file (single object, array, or JSONL)
        schema_file: Path to schema file (.avsc or .jstruct.json)
        schema_type: 'avro' or 'jstruct', auto-detected if not provided

    Returns:
        List of ValidationResult for each instance in the file. A JSONL line
        that is not valid JSON yields an invalid result for that line instead
        of being skipped, so corrupt input can never pass unnoticed.
    """
    # Load schema
    with open(schema_file, 'r', encoding='utf-8') as f:
        schema = json.load(f)

    # Auto-detect schema type from file extension, falling back to content
    if schema_type is None:
        if schema_file.endswith('.avsc'):
            schema_type = 'avro'
        elif schema_file.endswith(('.jstruct.json', '.jstruct')):
            schema_type = 'jstruct'
        else:
            schema_type = detect_schema_type(schema)

    # Load instances
    with open(instance_file, 'r', encoding='utf-8') as f:
        content = f.read().strip()

    # Each entry is (path label, parsed instance, parse error message or None).
    entries = []

    try:
        data = json.loads(content)
    except json.JSONDecodeError:
        # Not a single JSON document: treat as JSONL
        for line_number, line in enumerate(content.split('\n'), start=1):
            line = line.strip()
            if not line:
                continue
            path = f"{instance_file}:{line_number}"
            try:
                entries.append((path, json.loads(line), None))
            except json.JSONDecodeError as exc:
                entries.append((path, None, f"Invalid JSON: {exc}"))
    else:
        if isinstance(data, list):
            entries = [
                (f"{instance_file}[{index}]", item, None)
                for index, item in enumerate(data)
            ]
        else:
            entries = [(instance_file, data, None)]

    # Validate each instance, reporting unparseable lines as failures
    results = []
    for path, instance, parse_error in entries:
        if parse_error is not None:
            results.append(
                ValidationResult(is_valid=False, errors=[parse_error], instance_path=path)
            )
            continue
        result = validate_instance(instance, schema, schema_type)
        result.instance_path = path
        results.append(result)

    return results
178
+
179
+
180
def validate_json_instances(
    input_files: List[str],
    schema_file: str,
    schema_type: str = None,
    verbose: bool = False
) -> Tuple[int, int]:
    """Run validate_file over several instance files and tally the outcomes.

    Args:
        input_files: JSON file paths to validate
        schema_file: Path to schema file
        schema_type: 'avro' or 'jstruct', auto-detected if not provided
        verbose: When true, print every individual result

    Returns:
        Tuple of (valid_count, invalid_count)
    """
    passed = 0
    failed = 0

    for path in input_files:
        for outcome in validate_file(path, schema_file, schema_type):
            if outcome.is_valid:
                passed += 1
            else:
                failed += 1
            # Valid and invalid results are printed alike in verbose mode.
            if verbose:
                print(outcome)

    return passed, failed
213
+
214
+
215
+ # Command entry point for avrotize CLI
216
def validate(
    input: List[str],
    schema: str,
    schema_type: str = None,
    quiet: bool = False
) -> None:
    """Validates JSON instances against an Avro or JSON Structure schema.

    Args:
        input: List of JSON files to validate (the name shadows the builtin
            but is kept because callers may pass it by keyword)
        schema: Path to schema file (.avsc or .jstruct.json)
        schema_type: Schema type ('avro' or 'jstruct'), auto-detected if not provided
        quiet: Suppress per-instance output and the summary line

    Raises:
        SystemExit: With exit code 1 when at least one instance is invalid.
            When everything is valid the function simply returns.
    """
    valid_count, invalid_count = validate_json_instances(
        input_files=input,
        schema_file=schema,
        schema_type=schema_type,
        verbose=not quiet
    )

    if not quiet:
        total = valid_count + invalid_count
        print(f"\nValidation summary: {valid_count}/{total} instances valid")

    if invalid_count > 0:
        # Raise SystemExit directly: the bare exit() helper is injected by the
        # site module and is missing under `python -S` or in frozen builds.
        raise SystemExit(1)
@@ -0,0 +1,122 @@
1
+ """Infers schema from XML files and converts to Avro or JSON Structure format.
2
+
3
+ This module provides:
4
+ - xml2a: Infer Avro schema from XML files
5
+ - xml2s: Infer JSON Structure schema from XML files
6
+ """
7
+
8
+ import json
9
+ import os
10
+ from typing import List
11
+
12
+ from avrotize.schema_inference import (
13
+ AvroSchemaInferrer,
14
+ JsonStructureSchemaInferrer,
15
+ JsonNode
16
+ )
17
+
18
+
19
def convert_xml_to_avro(
    input_files: List[str],
    avro_schema_file: str,
    type_name: str = 'Document',
    avro_namespace: str = '',
    sample_size: int = 0
) -> None:
    """Infers Avro schema from XML files.

    Loads the XML documents, hands them together to AvroSchemaInferrer so a
    single schema is inferred for all of them, and writes that schema as
    indented JSON.

    Args:
        input_files: List of XML file paths to analyze
        avro_schema_file: Output path for the Avro schema
        type_name: Name for the root type
        avro_namespace: Namespace for generated Avro types
        sample_size: Maximum number of documents to sample (0 = all)

    Raises:
        ValueError: If no input files are given, or none of them has content.
    """
    if not input_files:
        raise ValueError("At least one input file is required")

    xml_strings = _load_xml_strings(input_files, sample_size)

    if not xml_strings:
        raise ValueError("No valid XML data found in input files")

    inferrer = AvroSchemaInferrer(namespace=avro_namespace)
    schema = inferrer.infer_from_xml_values(type_name, xml_strings)

    # Ensure output directory exists. exist_ok avoids the check-then-create
    # race when another process creates the directory in between.
    output_dir = os.path.dirname(avro_schema_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(avro_schema_file, 'w', encoding='utf-8') as f:
        json.dump(schema, f, indent=2)
57
+
58
+
59
def convert_xml_to_jstruct(
    input_files: List[str],
    jstruct_schema_file: str,
    type_name: str = 'Document',
    base_id: str = 'https://example.com/',
    sample_size: int = 0
) -> None:
    """Infers JSON Structure schema from XML files.

    Loads the XML documents, hands them together to
    JsonStructureSchemaInferrer so a single schema is inferred for all of
    them, and writes that schema as indented JSON.

    Args:
        input_files: List of XML file paths to analyze
        jstruct_schema_file: Output path for the JSON Structure schema
        type_name: Name for the root type
        base_id: Base URI for $id generation
        sample_size: Maximum number of documents to sample (0 = all)

    Raises:
        ValueError: If no input files are given, or none of them has content.
    """
    if not input_files:
        raise ValueError("At least one input file is required")

    xml_strings = _load_xml_strings(input_files, sample_size)

    if not xml_strings:
        raise ValueError("No valid XML data found in input files")

    inferrer = JsonStructureSchemaInferrer(base_id=base_id)
    schema = inferrer.infer_from_xml_values(type_name, xml_strings)

    # Ensure output directory exists. exist_ok avoids the check-then-create
    # race when another process creates the directory in between.
    output_dir = os.path.dirname(jstruct_schema_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(jstruct_schema_file, 'w', encoding='utf-8') as f:
        json.dump(schema, f, indent=2)
96
+
97
+
98
def _load_xml_strings(input_files: List[str], sample_size: int) -> List[str]:
    """Read each file as one XML document and collect the non-empty ones.

    Files are read as UTF-8 and stripped of surrounding whitespace; a file
    that is empty after stripping is skipped and does not count towards the
    sample limit.

    Args:
        input_files: Paths to read, in order
        sample_size: Stop once this many documents are collected (0 = no limit)

    Returns:
        The collected XML strings, in input order
    """
    documents: List[str] = []
    limited = sample_size > 0

    for path in input_files:
        # The limit is checked before opening the next file.
        if limited and len(documents) >= sample_size:
            break

        with open(path, 'r', encoding='utf-8') as handle:
            text = handle.read().strip()

        if not text:
            continue
        documents.append(text)

    return documents