avrotize 3.0.2__py3-none-any.whl → 3.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
avrotize/validate.py ADDED
@@ -0,0 +1,242 @@
1
+ """Validates JSON instances against Avro or JSON Structure schemas.
2
+
3
+ This module provides a unified interface for validating JSON data against
4
+ both Avro schemas and JSON Structure schemas.
5
+ """
6
+
7
+ import json
8
+ import os
9
+ from typing import Any, Dict, List, Tuple
10
+
11
+ from avrotize.avrovalidator import AvroValidator, AvroValidationError, validate_json_against_avro
12
+
13
+ # JSON Structure SDK for validation
14
+ try:
15
+ from json_structure import SchemaValidator as JStructSchemaValidator
16
+ from json_structure import InstanceValidator as JStructInstanceValidator
17
+ HAS_JSTRUCT_SDK = True
18
+ except ImportError:
19
+ HAS_JSTRUCT_SDK = False
20
+
21
+
22
class ValidationResult:
    """Result of validating a JSON instance against a schema.

    Attributes are set directly from the constructor arguments:
    ``is_valid`` is the pass/fail flag, ``errors`` is the list of
    problems (an empty list when none were given) and ``instance_path``
    is an optional label identifying the instance that was checked.
    """

    def __init__(self, is_valid: bool, errors: List[str] = None, instance_path: str = None):
        self.is_valid = is_valid
        # A missing or empty error collection is normalised to a fresh list.
        self.errors = errors or []
        self.instance_path = instance_path

    def __str__(self) -> str:
        """Return a one-line, human-readable summary of the result."""
        if self.is_valid:
            return "✓ Valid" + (f": {self.instance_path}" if self.instance_path else "")
        prefix = f"{self.instance_path}: " if self.instance_path else ""
        # Errors are stringified here because validators may hand back
        # exception or error objects rather than plain strings; joining
        # those directly would raise TypeError while printing a result.
        return f"✗ Invalid: {prefix}" + "; ".join(str(error) for error in self.errors)

    def __repr__(self) -> str:
        return f"ValidationResult(is_valid={self.is_valid}, errors={self.errors})"
39
+
40
+
41
def detect_schema_type(schema: Dict[str, Any]) -> str:
    """Detects whether a schema is Avro or JSON Structure.

    Args:
        schema: The parsed schema object. A dict is expected, but a list
            (an Avro union) is accepted as well; any other value is
            reported as 'unknown' rather than raising.

    Returns:
        'avro' or 'jstruct' or 'unknown'
    """
    # A union is a JSON array. It is checked first because the dict-style
    # probing below (.get) is not available on lists.
    if isinstance(schema, list):
        return 'avro'

    # Anything that is not a dict cannot be inspected by key. Bare strings
    # were never recognised by this function, so they stay 'unknown'.
    if not isinstance(schema, dict):
        return 'unknown'

    # JSON Structure schemas are recognised by a '$schema' URI that
    # mentions 'json-structure'. Non-string values are ignored.
    meta_schema = schema.get('$schema')
    if isinstance(meta_schema, str) and 'json-structure' in meta_schema:
        return 'jstruct'

    schema_type = schema.get('type')

    # Avro named/complex types.
    if schema_type in ('record', 'enum', 'fixed', 'array', 'map'):
        return 'avro'
    # Avro primitive types.
    if schema_type in ('null', 'boolean', 'int', 'long', 'float', 'double', 'bytes', 'string'):
        return 'avro'
    # JSON Structure object type without a '$schema' marker.
    if schema_type == 'object' and 'properties' in schema:
        return 'jstruct'

    return 'unknown'
71
+
72
+
73
def validate_instance(
    instance: Any,
    schema: Dict[str, Any],
    schema_type: str = None
) -> ValidationResult:
    """Validates a JSON instance against a schema.

    Problems are reported through the returned ValidationResult. Errors
    raised by the JSON Structure validator are caught and converted into
    a failed result.

    Args:
        instance: The JSON value to validate
        schema: The schema (Avro or JSON Structure)
        schema_type: 'avro' or 'jstruct', auto-detected if not provided

    Returns:
        ValidationResult with validation status and any errors
    """
    if schema_type is None:
        schema_type = detect_schema_type(schema)

    if schema_type == 'avro':
        errors = validate_json_against_avro(instance, schema)
        return ValidationResult(is_valid=len(errors) == 0, errors=errors)

    if schema_type == 'jstruct':
        if not HAS_JSTRUCT_SDK:
            return ValidationResult(
                is_valid=False,
                errors=["JSON Structure SDK not installed. Install with: pip install json-structure"]
            )
        try:
            validator = JStructInstanceValidator(schema)
            # NOTE(review): the previous code guarded against a falsy
            # result only after calling len() on it, so a None return was
            # reported as a failure. Normalise first; confirm against the
            # SDK whether None can actually be returned.
            errors = validator.validate(instance) or []
            # Stringify so the result always carries printable messages,
            # whatever objects the SDK uses to describe problems.
            messages = [str(error) for error in errors]
            return ValidationResult(is_valid=not messages, errors=messages)
        except Exception as e:
            # Boundary with a third-party validator: surface any failure
            # as a validation error instead of aborting the whole run.
            return ValidationResult(is_valid=False, errors=[str(e)])

    if schema_type == 'unknown':
        # Detection ran (here or in the caller) and found nothing usable.
        return ValidationResult(
            is_valid=False,
            errors=["Unknown schema type. Cannot auto-detect schema format."]
        )

    # The caller named a type explicitly, but it is not one we support.
    return ValidationResult(
        is_valid=False,
        errors=[f"Unsupported schema type '{schema_type}'. Expected 'avro' or 'jstruct'."]
    )
113
+
114
+
115
def validate_file(
    instance_file: str,
    schema_file: str,
    schema_type: str = None
) -> List[ValidationResult]:
    """Validates JSON instance file(s) against a schema file.

    The instance file is first parsed as a single JSON document. A
    top-level array contributes one instance per element; any other value
    is a single instance. If that parse fails, the file is read as JSONL,
    one JSON value per non-blank line.

    Unparseable input is reported rather than skipped: a JSONL line that
    is not valid JSON yields a failed result for that line, and a file in
    which no line parses yields one failed result for the whole file. An
    empty (or whitespace-only) file yields no results.

    Args:
        instance_file: Path to JSON file (single object, array, or JSONL)
        schema_file: Path to schema file (.avsc or .jstruct.json)
        schema_type: 'avro' or 'jstruct', auto-detected if not provided

    Returns:
        List of ValidationResult for each instance in the file, each with
        ``instance_path`` set to the file path, ``path[index]`` for array
        elements, or ``path:line`` for JSONL lines.
    """
    # Load schema
    with open(schema_file, 'r', encoding='utf-8') as f:
        schema = json.load(f)

    # Prefer the file extension; fall back to inspecting the schema itself.
    if schema_type is None:
        if schema_file.endswith('.avsc'):
            schema_type = 'avro'
        elif schema_file.endswith(('.jstruct.json', '.jstruct')):
            schema_type = 'jstruct'
        else:
            schema_type = detect_schema_type(schema)

    # Load instances. The raw text is kept unstripped so that JSONL line
    # numbers refer to real lines of the file.
    with open(instance_file, 'r', encoding='utf-8') as f:
        raw_text = f.read()

    # Each entry is (path, instance, parse_error). The error slot is kept
    # separate because None is itself a legitimate instance (JSON null).
    entries = []
    stripped = raw_text.strip()

    if stripped:
        try:
            data = json.loads(stripped)
        except json.JSONDecodeError as document_error:
            # Not a single JSON document: try one value per line (JSONL).
            parsed_any = False
            for line_no, raw_line in enumerate(raw_text.split('\n'), start=1):
                line = raw_line.strip()
                if not line:
                    continue
                path = f"{instance_file}:{line_no}"
                try:
                    entries.append((path, json.loads(line), None))
                    parsed_any = True
                except json.JSONDecodeError as line_error:
                    entries.append((path, None, f"Invalid JSON: {line_error}"))
            if not parsed_any:
                # Nothing parsed line by line either, so this is most
                # likely a malformed document and not JSONL. One error
                # for the file is clearer than one per line.
                entries = [(instance_file, None, f"Invalid JSON: {document_error}")]
        else:
            if isinstance(data, list):
                entries = [
                    (f"{instance_file}[{i}]", item, None)
                    for i, item in enumerate(data)
                ]
            else:
                entries = [(instance_file, data, None)]

    # Validate each instance
    results = []
    for path, instance, parse_error in entries:
        if parse_error is not None:
            result = ValidationResult(is_valid=False, errors=[parse_error])
        else:
            result = validate_instance(instance, schema, schema_type)
        result.instance_path = path
        results.append(result)

    return results
178
+
179
+
180
def validate_json_instances(
    input_files: List[str],
    schema_file: str,
    schema_type: str = None,
    verbose: bool = False
) -> Tuple[int, int]:
    """Validates several JSON instance files against one schema and tallies the outcome.

    Every file is handed to ``validate_file`` and each result it returns
    counts as either valid or invalid.

    Args:
        input_files: List of JSON file paths to validate
        schema_file: Path to schema file
        schema_type: 'avro' or 'jstruct', auto-detected if not provided
        verbose: Whether to print each validation result

    Returns:
        Tuple of (valid_count, invalid_count)
    """
    passed = 0
    failed = 0

    for path in input_files:
        for outcome in validate_file(path, schema_file, schema_type):
            # Printing does not depend on the outcome, only on verbosity.
            if verbose:
                print(outcome)
            if outcome.is_valid:
                passed += 1
            else:
                failed += 1

    return passed, failed
213
+
214
+
215
+ # Command entry point for avrotize CLI
216
def validate(
    input: List[str],
    schema: str,
    schema_type: str = None,
    quiet: bool = False
) -> None:
    """Validates JSON instances against an Avro or JSON Structure schema.

    Args:
        input: List of JSON files to validate
        schema: Path to schema file (.avsc or .jstruct.json)
        schema_type: Schema type ('avro' or 'jstruct'), auto-detected if not provided
        quiet: Suppress the per-instance output and the summary line

    Raises:
        SystemExit: With exit code 1 if any instance is invalid. This
            happens whether or not ``quiet`` is set. When every instance
            is valid the function returns normally.
    """
    valid_count, invalid_count = validate_json_instances(
        input_files=input,
        schema_file=schema,
        schema_type=schema_type,
        verbose=not quiet
    )

    if not quiet:
        total = valid_count + invalid_count
        print(f"\nValidation summary: {valid_count}/{total} instances valid")

    if invalid_count > 0:
        # Raise SystemExit directly: the bare exit() helper is injected by
        # the site module for interactive use and is not guaranteed to
        # exist (for example under `python -S` or in frozen builds).
        raise SystemExit(1)
@@ -0,0 +1,122 @@
1
+ """Infers schema from XML files and converts to Avro or JSON Structure format.
2
+
3
+ This module provides:
4
+ - xml2a: Infer Avro schema from XML files
5
+ - xml2s: Infer JSON Structure schema from XML files
6
+ """
7
+
8
+ import json
9
+ import os
10
+ from typing import List
11
+
12
+ from avrotize.schema_inference import (
13
+ AvroSchemaInferrer,
14
+ JsonStructureSchemaInferrer,
15
+ JsonNode
16
+ )
17
+
18
+
19
def convert_xml_to_avro(
    input_files: List[str],
    avro_schema_file: str,
    type_name: str = 'Document',
    avro_namespace: str = '',
    sample_size: int = 0
) -> None:
    """Infers Avro schema from XML files.

    Loads the XML documents, passes them together to an
    AvroSchemaInferrer, and writes the inferred schema as indented JSON.

    Args:
        input_files: List of XML file paths to analyze
        avro_schema_file: Output path for the Avro schema
        type_name: Name for the root type
        avro_namespace: Namespace for generated Avro types
        sample_size: Maximum number of documents to sample (0 = all)

    Raises:
        ValueError: If no input files are given, or if none of them
            contains any content.
    """
    if not input_files:
        raise ValueError("At least one input file is required")

    xml_strings = _load_xml_strings(input_files, sample_size)

    if not xml_strings:
        raise ValueError("No valid XML data found in input files")

    inferrer = AvroSchemaInferrer(namespace=avro_namespace)
    schema = inferrer.infer_from_xml_values(type_name, xml_strings)

    # Ensure output directory exists. exist_ok avoids the check-then-create
    # race when another process creates the directory in between.
    output_dir = os.path.dirname(avro_schema_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(avro_schema_file, 'w', encoding='utf-8') as f:
        json.dump(schema, f, indent=2)
57
+
58
+
59
def convert_xml_to_jstruct(
    input_files: List[str],
    jstruct_schema_file: str,
    type_name: str = 'Document',
    base_id: str = 'https://example.com/',
    sample_size: int = 0
) -> None:
    """Infers JSON Structure schema from XML files.

    Loads the XML documents, passes them together to a
    JsonStructureSchemaInferrer, and writes the inferred schema as
    indented JSON.

    Args:
        input_files: List of XML file paths to analyze
        jstruct_schema_file: Output path for the JSON Structure schema
        type_name: Name for the root type
        base_id: Base URI for $id generation
        sample_size: Maximum number of documents to sample (0 = all)

    Raises:
        ValueError: If no input files are given, or if none of them
            contains any content.
    """
    if not input_files:
        raise ValueError("At least one input file is required")

    xml_strings = _load_xml_strings(input_files, sample_size)

    if not xml_strings:
        raise ValueError("No valid XML data found in input files")

    inferrer = JsonStructureSchemaInferrer(base_id=base_id)
    schema = inferrer.infer_from_xml_values(type_name, xml_strings)

    # Ensure output directory exists. exist_ok avoids the check-then-create
    # race when another process creates the directory in between.
    output_dir = os.path.dirname(jstruct_schema_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(jstruct_schema_file, 'w', encoding='utf-8') as f:
        json.dump(schema, f, indent=2)
96
+
97
+
98
def _load_xml_strings(input_files: List[str], sample_size: int) -> List[str]:
    """Loads XML content from files.

    Each file is treated as a single XML document. Files are read as
    UTF-8, a leading byte-order mark is dropped, and surrounding
    whitespace is stripped. Files that are empty after stripping are
    skipped and do not count towards ``sample_size``.

    Args:
        input_files: List of file paths
        sample_size: Maximum documents to load (0 or less = all)

    Returns:
        List of XML strings, in input order
    """
    xml_strings: List[str] = []

    for file_path in input_files:
        # Stop before opening further files once the sample is full.
        if sample_size > 0 and len(xml_strings) >= sample_size:
            break

        # 'utf-8-sig' decodes plain UTF-8 as usual but also consumes a
        # leading BOM. With 'utf-8' the BOM survives as U+FEFF, which
        # str.strip() does not remove, and ends up in front of the
        # XML declaration.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            content = f.read().strip()

        if content:
            xml_strings.append(content)

    return xml_strings
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: avrotize
3
- Version: 3.0.2
3
+ Version: 3.1.1
4
4
  Summary: Tools to convert from and to Avro Schema from various other schema languages.
5
5
  Author-email: Clemens Vasters <clemensv@microsoft.com>
6
6
  Requires-Python: >=3.10
@@ -26,6 +26,10 @@ Requires-Dist: pandas>=2.2.2
26
26
  Requires-Dist: docker>=7.1.0
27
27
  Requires-Dist: cddlparser>=0.5.0
28
28
  Requires-Dist: json-structure>=0.1.8
29
+ Requires-Dist: psycopg2-binary>=2.9.9 ; extra == "all-sql"
30
+ Requires-Dist: pymysql>=1.1.1 ; extra == "all-sql"
31
+ Requires-Dist: pyodbc>=5.1.0 ; extra == "all-sql"
32
+ Requires-Dist: oracledb>=2.3.0 ; extra == "all-sql"
29
33
  Requires-Dist: pytest>=8.3.2 ; extra == "dev"
30
34
  Requires-Dist: fastavro>=1.9.5 ; extra == "dev"
31
35
  Requires-Dist: xmlschema>=3.3.2 ; extra == "dev"
@@ -37,14 +41,23 @@ Requires-Dist: pydantic>=2.8.2 ; extra == "dev"
37
41
  Requires-Dist: avro>=1.12.0 ; extra == "dev"
38
42
  Requires-Dist: testcontainers>=4.7.2 ; extra == "dev"
39
43
  Requires-Dist: pymysql>=1.1.1 ; extra == "dev"
40
- Requires-Dist: psycopg2>=2.9.9 ; extra == "dev"
44
+ Requires-Dist: psycopg2-binary>=2.9.9 ; extra == "dev"
41
45
  Requires-Dist: pyodbc>=5.1.0 ; extra == "dev"
42
46
  Requires-Dist: pymongo>=4.8.0 ; extra == "dev"
43
47
  Requires-Dist: oracledb>=2.3.0 ; extra == "dev"
44
48
  Requires-Dist: cassandra-driver>=3.29.1 ; extra == "dev"
45
49
  Requires-Dist: sqlalchemy>=2.0.32 ; extra == "dev"
46
50
  Requires-Dist: graphql-core>=3.2.0 ; extra == "dev"
51
+ Requires-Dist: pymysql>=1.1.1 ; extra == "mysql"
52
+ Requires-Dist: oracledb>=2.3.0 ; extra == "oracle"
53
+ Requires-Dist: psycopg2-binary>=2.9.9 ; extra == "postgres"
54
+ Requires-Dist: pyodbc>=5.1.0 ; extra == "sqlserver"
55
+ Provides-Extra: all-sql
47
56
  Provides-Extra: dev
57
+ Provides-Extra: mysql
58
+ Provides-Extra: oracle
59
+ Provides-Extra: postgres
60
+ Provides-Extra: sqlserver
48
61
 
49
62
  # Avrotize & Structurize
50
63
 
@@ -75,6 +88,22 @@ You can install Avrotize from PyPI, [having installed Python 3.10 or later](http
75
88
  pip install avrotize
76
89
  ```
77
90
 
91
+ For SQL database support (`sql2a` command), install the optional database drivers:
92
+
93
+ ```bash
94
+ # PostgreSQL
95
+ pip install avrotize[postgres]
96
+
97
+ # MySQL
98
+ pip install avrotize[mysql]
99
+
100
+ # SQL Server
101
+ pip install avrotize[sqlserver]
102
+
103
+ # All SQL databases
104
+ pip install avrotize[all-sql]
105
+ ```
106
+
78
107
  ## Usage
79
108
 
80
109
  Avrotize provides several commands for converting schema formats via Avrotize Schema.
@@ -86,6 +115,11 @@ Converting to Avrotize Schema:
86
115
  - [`avrotize x2a`](#convert-xml-schema-xsd-to-avrotize-schema) - Convert XML schema to Avrotize Schema.
87
116
  - [`avrotize asn2a`](#convert-asn1-schema-to-avrotize-schema) - Convert ASN.1 to Avrotize Schema.
88
117
  - [`avrotize k2a`](#convert-kusto-table-definition-to-avrotize-schema) - Convert Kusto table definitions to Avrotize Schema.
118
+ - [`avrotize sql2a`](#convert-sql-database-schema-to-avrotize-schema) - Convert SQL database schema to Avrotize Schema.
119
+ - [`avrotize json2a`](#infer-avro-schema-from-json-files) - Infer Avro schema from JSON files.
120
+ - [`avrotize json2s`](#infer-json-structure-schema-from-json-files) - Infer JSON Structure schema from JSON files.
121
+ - [`avrotize xml2a`](#infer-avro-schema-from-xml-files) - Infer Avro schema from XML files.
122
+ - [`avrotize xml2s`](#infer-json-structure-schema-from-xml-files) - Infer JSON Structure schema from XML files.
89
123
  - [`avrotize pq2a`](#convert-parquet-schema-to-avrotize-schema) - Convert Parquet schema to Avrotize Schema.
90
124
  - [`avrotize csv2a`](#convert-csv-file-to-avrotize-schema) - Convert CSV file to Avrotize Schema.
91
125
  - [`avrotize kstruct2a`](#convert-kafka-connect-schema-to-avrotize-schema) - Convert Kafka Connect Schema to Avrotize Schema.
@@ -153,6 +187,7 @@ Direct JSON Structure conversions:
153
187
  Other commands:
154
188
 
155
189
  - [`avrotize pcf`](#create-the-parsing-canonical-form-pcf-of-an-avrotize-schema) - Create the Parsing Canonical Form (PCF) of an Avrotize Schema.
190
+ - [`avrotize validate`](#validate-json-instances-against-schemas) - Validate JSON instances against Avro or JSON Structure schemas.
156
191
 
157
192
  JSON Structure conversions:
158
193
 
@@ -426,6 +461,150 @@ Conversion notes:
426
461
  - For `dynamic` columns, the tool will sample the data in the table to determine the structure of the dynamic column. The tool will map the dynamic column to an Avro record type with fields that correspond to the fields found in the dynamic column. If the dynamic column contains nested dynamic columns, the tool will recursively map those to Avro record types. If records with conflicting structures are found in the dynamic column, the tool will emit a union of record types for the dynamic column.
427
462
  - If the `--emit-cloudevents-xregistry` option is set, the tool will emit an [xRegistry](http://xregistry.io) registry manifest file with a CloudEvent message definition for each table in the Kusto database and a separate Avro Schema for each table in the embedded schema registry. If one or more tables are found to contain CloudEvent data (as indicated by the presence of the CloudEvents attribute columns), the tool will inspect the content of the `type` (or `__type`) columns to determine which CloudEvent types have been stored in the table and will emit a CloudEvent definition and schema for each unique type.
428
463
 
464
+ ### Convert SQL database schema to Avrotize Schema
465
+
466
+ ```bash
467
+ avrotize sql2a --connection-string <connection_string> [--username <user>] [--password <pass>] [--dialect <dialect>] [--database <database>] [--table-name <table>] [--out <path_to_avro_schema_file>] [--namespace <namespace>] [--infer-json] [--infer-xml] [--sample-size <n>] [--emit-cloudevents] [--emit-xregistry]
468
+ ```
469
+
470
+ Parameters:
471
+
472
+ - `--connection-string`: The database connection string. Supports SSL/TLS and integrated authentication options (see examples below).
473
+ - `--username`: (optional) Database username. Overrides any username in the connection string. Use this to avoid credentials in command history.
474
+ - `--password`: (optional) Database password. Overrides any password in the connection string. Use this to avoid credentials in command history.
475
+ - `--dialect`: (optional) The SQL dialect: `postgres` (default), `mysql`, `sqlserver`, `oracle`, or `sqlite`.
476
+ - `--database`: (optional) The database name if not specified in the connection string.
477
+ - `--table-name`: (optional) A specific table to convert. If omitted, all tables are converted.
478
+ - `--out`: The path to the Avrotize Schema file. If omitted, output goes to stdout.
479
+ - `--namespace`: (optional) The Avro namespace for the generated schema.
480
+ - `--infer-json`: (optional, default: true) Infer schema for JSON/JSONB columns by sampling data.
481
+ - `--infer-xml`: (optional, default: true) Infer schema for XML columns by sampling data.
482
+ - `--sample-size`: (optional, default: 100) Number of rows to sample for JSON/XML schema inference.
483
+ - `--emit-cloudevents`: (optional) Detect CloudEvents tables and emit CloudEvents declarations.
484
+ - `--emit-xregistry`: (optional) Emit an xRegistry manifest instead of a single schema file.
485
+
486
+ Connection string examples:
487
+
488
+ ```bash
489
+ # PostgreSQL with separate credentials (preferred for security)
490
+ avrotize sql2a --connection-string "postgresql://host:5432/mydb?sslmode=require" --username myuser --password mypass --out schema.avsc
491
+
492
+ # PostgreSQL with SSL (credentials in URL)
493
+ avrotize sql2a --connection-string "postgresql://user:pass@host:5432/mydb?sslmode=require" --out schema.avsc
494
+
495
+ # MySQL with SSL
496
+ avrotize sql2a --connection-string "mysql://user:pass@host:3306/mydb?ssl=true" --dialect mysql --out schema.avsc
497
+
498
+ # SQL Server with Windows Authentication (omit user/password)
499
+ avrotize sql2a --connection-string "mssql://@host:1433/mydb" --dialect sqlserver --out schema.avsc
500
+
501
+ # SQL Server with TLS encryption
502
+ avrotize sql2a --connection-string "mssql://user:pass@host:1433/mydb?encrypt=true" --dialect sqlserver --out schema.avsc
503
+
504
+ # SQLite file
505
+ avrotize sql2a --connection-string "/path/to/database.db" --dialect sqlite --out schema.avsc
506
+ ```
507
+
508
+ Conversion notes:
509
+
510
+ - The tool connects to a live database and reads the schema from the information schema or system catalogs.
511
+ - Type mappings for each dialect:
512
+ - **PostgreSQL**: All standard types including `uuid`, `jsonb`, `xml`, arrays, and custom types.
513
+ - **MySQL**: Standard types including `json`, `enum`, `set`, and spatial types.
514
+ - **SQL Server**: Standard types including `uniqueidentifier`, `xml`, `money`, and `hierarchyid`.
515
+ - **Oracle**: Standard types including `number`, `clob`, `blob`, and Oracle-specific types.
516
+ - **SQLite**: Dynamic typing mapped based on declared type affinity.
517
+ - For JSON/JSONB columns (PostgreSQL, MySQL) and XML columns, the tool samples data to infer the structure. Fields that appear in some but not all records are folded together. If field types conflict across records, the tool emits a union of record types.
518
+ - For columns with keys that cannot be valid Avro identifiers (UUIDs, URLs, special characters), the tool generates `map<string, T>` types instead of record types.
519
+ - Table and column comments are preserved as Avro `doc` attributes where available.
520
+ - Primary key columns are noted in the schema's `unique` attribute.
521
+
522
+ ### Infer Avro schema from JSON files
523
+
524
+ ```bash
525
+ avrotize json2a <json_files...> [--out <path_to_avro_schema_file>] [--type-name <name>] [--namespace <namespace>] [--sample-size <n>]
526
+ ```
527
+
528
+ Parameters:
529
+
530
+ - `<json_files...>`: One or more JSON files to analyze. Supports JSON arrays, single objects, and JSONL (JSON Lines) format.
531
+ - `--out`: The path to the Avro schema file. If omitted, output goes to stdout.
532
+ - `--type-name`: (optional) Name for the root type (default: "Document").
533
+ - `--namespace`: (optional) Avro namespace for generated types.
534
+ - `--sample-size`: (optional) Maximum number of records to sample (0 = all, default: 0).
535
+
536
+ Example:
537
+
538
+ ```bash
539
+ # Infer schema from multiple JSON files
540
+ avrotize json2a data1.json data2.json --out schema.avsc --type-name Event --namespace com.example
541
+
542
+ # Infer schema from JSONL file
543
+ avrotize json2a events.jsonl --out events.avsc --type-name LogEntry
544
+ ```
545
+
546
+ ### Infer JSON Structure schema from JSON files
547
+
548
+ ```bash
549
+ avrotize json2s <json_files...> [--out <path_to_jstruct_schema_file>] [--type-name <name>] [--base-id <uri>] [--sample-size <n>]
550
+ ```
551
+
552
+ Parameters:
553
+
554
+ - `<json_files...>`: One or more JSON files to analyze.
555
+ - `--out`: The path to the JSON Structure schema file. If omitted, output goes to stdout.
556
+ - `--type-name`: (optional) Name for the root type (default: "Document").
557
+ - `--base-id`: (optional) Base URI for $id generation (default: "https://example.com/").
558
+ - `--sample-size`: (optional) Maximum number of records to sample (0 = all, default: 0).
559
+
560
+ Example:
561
+
562
+ ```bash
563
+ avrotize json2s data.json --out schema.jstruct.json --type-name Person --base-id https://myapi.example.com/schemas/
564
+ ```
565
+
566
+ ### Infer Avro schema from XML files
567
+
568
+ ```bash
569
+ avrotize xml2a <xml_files...> [--out <path_to_avro_schema_file>] [--type-name <name>] [--namespace <namespace>] [--sample-size <n>]
570
+ ```
571
+
572
+ Parameters:
573
+
574
+ - `<xml_files...>`: One or more XML files to analyze.
575
+ - `--out`: The path to the Avro schema file. If omitted, output goes to stdout.
576
+ - `--type-name`: (optional) Name for the root type (default: "Document").
577
+ - `--namespace`: (optional) Avro namespace for generated types.
578
+ - `--sample-size`: (optional) Maximum number of documents to sample (0 = all, default: 0).
579
+
580
+ Example:
581
+
582
+ ```bash
583
+ avrotize xml2a config.xml --out config.avsc --type-name Configuration --namespace com.example.config
584
+ ```
585
+
586
+ ### Infer JSON Structure schema from XML files
587
+
588
+ ```bash
589
+ avrotize xml2s <xml_files...> [--out <path_to_jstruct_schema_file>] [--type-name <name>] [--base-id <uri>] [--sample-size <n>]
590
+ ```
591
+
592
+ Parameters:
593
+
594
+ - `<xml_files...>`: One or more XML files to analyze.
595
+ - `--out`: The path to the JSON Structure schema file. If omitted, output goes to stdout.
596
+ - `--type-name`: (optional) Name for the root type (default: "Document").
597
+ - `--base-id`: (optional) Base URI for $id generation (default: "https://example.com/").
598
+ - `--sample-size`: (optional) Maximum number of documents to sample (0 = all, default: 0).
599
+
600
+ Conversion notes (apply to all inference commands):
601
+
602
+ - XML attributes are converted to fields prefixed with `@` (normalized to valid identifiers).
603
+ - Text content in mixed-content elements becomes a `#text` field.
604
+ - Repeated elements are inferred as arrays.
605
+ - Multiple files with different structures are merged into a unified schema.
606
+ - Sparse data (fields that appear in some but not all records) is folded into a single type.
607
+
429
608
  ### Convert Avrotize Schema to Kusto table declaration
430
609
 
431
610
  ```bash
@@ -1308,6 +1487,45 @@ Conversion notes:
1308
1487
  - The tool generates the Parsing Canonical Form (PCF) of the Avrotize Schema. The PCF is a normalized form of the schema that is used for schema comparison and compatibility checking.
1309
1488
  - The PCF is a JSON object that is written to stdout.
1310
1489
 
1490
+ ### Validate JSON instances against schemas
1491
+
1492
+ ```bash
1493
+ avrotize validate <json_files...> --schema <schema_file> [--schema-type <type>] [--quiet]
1494
+ ```
1495
+
1496
+ Parameters:
1497
+
1498
+ - `<json_files...>`: One or more JSON files to validate. Supports single JSON objects, JSON arrays, and JSONL (newline-delimited JSON) formats.
1499
+ - `--schema <schema_file>`: Path to the schema file (`.avsc` for Avro, `.jstruct.json` for JSON Structure).
1500
+ - `--schema-type`: (optional) Schema type: `avro` or `jstruct`. If omitted, it is auto-detected from the schema file extension, falling back to inspecting the schema content.
1501
+ - `--quiet`: (optional) Suppress all output so that only the exit code reports the result. With or without this flag, the exit code is 0 if all instances are valid and 1 if any are invalid.
1502
+
1503
+ Validation notes:
1504
+
1505
+ - Validates JSON instances against Avro schemas per the [Avrotize Schema specification](specs/avrotize-schema.md).
1506
+ - Supports all Avro primitive types: null, boolean, int, long, float, double, bytes, string.
1507
+ - Supports all Avro complex types: record, enum, array, map, fixed.
1508
+ - Supports logical types with both native and string encodings: decimal, uuid, date, time-millis, time-micros, timestamp-millis, timestamp-micros, duration.
1509
+ - Supports field `altnames` for JSON field name mapping.
1510
+ - Supports enum `altsymbols` for JSON symbol mapping.
1511
+ - For JSON Structure validation, requires the `json-structure` package.
1512
+
1513
+ Example:
1514
+
1515
+ ```bash
1516
+ # Validate JSON file against Avro schema
1517
+ avrotize validate data.json --schema schema.avsc
1518
+
1519
+ # Validate multiple files
1520
+ avrotize validate file1.json file2.json --schema schema.avsc
1521
+
1522
+ # Validate JSONL file against JSON Structure schema
1523
+ avrotize validate events.jsonl --schema events.jstruct.json
1524
+
1525
+ # Quiet mode for CI/CD pipelines (exit code only)
1526
+ avrotize validate data.json --schema schema.avsc --quiet
1527
+ ```
1528
+
1311
1529
  ### Convert JSON Structure schema to GraphQL schema
1312
1530
 
1313
1531
  ```bash