structurize 3.0.2__py3-none-any.whl → 3.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
avrotize/commands.json CHANGED
@@ -997,6 +997,472 @@
997
997
  "suggested_output_file_path": "{input_file_name}.sql",
998
998
  "prompts": []
999
999
  },
1000
+ {
1001
+ "command": "sql2a",
1002
+ "description": "Convert SQL schema to Avrotize schema (connects to live database)",
1003
+ "group": "5_SQL",
1004
+ "skip_input_file_handling": true,
1005
+ "function": {
1006
+ "name": "avrotize.sqltoavro.convert_sql_to_avro",
1007
+ "args": {
1008
+ "connection_string": "args.connection_string",
1009
+ "database": "args.database",
1010
+ "table_name": "args.table_name",
1011
+ "avro_namespace": "args.namespace",
1012
+ "avro_schema_file": "output_file_path",
1013
+ "dialect": "args.dialect",
1014
+ "emit_cloudevents": "args.emit_cloudevents",
1015
+ "emit_cloudevents_xregistry": "args.emit_xregistry",
1016
+ "sample_size": "args.sample_size",
1017
+ "infer_json_schema": "args.infer_json",
1018
+ "infer_xml_schema": "args.infer_xml",
1019
+ "username": "args.username",
1020
+ "password": "args.password"
1021
+ }
1022
+ },
1023
+ "extensions": [],
1024
+ "args": [
1025
+ {
1026
+ "name": "input",
1027
+ "type": "str",
1028
+ "nargs": "?",
1029
+ "help": "Not used (database connection is live)",
1030
+ "required": false
1031
+ },
1032
+ {
1033
+ "name": "--out",
1034
+ "type": "str",
1035
+ "help": "Path to the Avrotize schema file",
1036
+ "required": false
1037
+ },
1038
+ {
1039
+ "name": "--connection-string",
1040
+ "type": "str",
1041
+ "help": "Database connection string. Examples: postgresql://host:port/db (credentials via --username/--password), postgresql://user:pass@host:port/db?sslmode=require (PostgreSQL with SSL), mysql://user:pass@host/db?ssl=true (MySQL with SSL), mssql://@host/db (SQL Server with Windows Auth)",
1042
+ "required": true
1043
+ },
1044
+ {
1045
+ "name": "--username",
1046
+ "type": "str",
1047
+ "help": "Database username (overrides credentials in connection string, avoids credentials in command history)",
1048
+ "required": false
1049
+ },
1050
+ {
1051
+ "name": "--password",
1052
+ "type": "str",
1053
+ "help": "Database password (overrides credentials in connection string, avoids credentials in command history)",
1054
+ "required": false
1055
+ },
1056
+ {
1057
+ "name": "--database",
1058
+ "type": "str",
1059
+ "help": "Database name (if not in connection string)",
1060
+ "required": false
1061
+ },
1062
+ {
1063
+ "name": "--table-name",
1064
+ "type": "str",
1065
+ "help": "Specific table name (omit for all tables)",
1066
+ "required": false
1067
+ },
1068
+ {
1069
+ "name": "--namespace",
1070
+ "type": "str",
1071
+ "help": "Namespace for the Avrotize schema",
1072
+ "required": false
1073
+ },
1074
+ {
1075
+ "name": "--dialect",
1076
+ "type": "str",
1077
+ "help": "SQL dialect",
1078
+ "choices": [
1079
+ "postgres",
1080
+ "mysql",
1081
+ "sqlserver",
1082
+ "oracle",
1083
+ "sqlite"
1084
+ ],
1085
+ "default": "postgres",
1086
+ "required": false
1087
+ },
1088
+ {
1089
+ "name": "--emit-cloudevents",
1090
+ "type": "bool",
1091
+ "help": "Emit CloudEvents declarations for tables with type/source/data/id columns",
1092
+ "required": false
1093
+ },
1094
+ {
1095
+ "name": "--emit-xregistry",
1096
+ "type": "bool",
1097
+ "help": "Emit an xRegistry manifest instead of a single Avrotize schema",
1098
+ "required": false
1099
+ },
1100
+ {
1101
+ "name": "--sample-size",
1102
+ "type": "int",
1103
+ "help": "Number of rows to sample for JSON/XML inference",
1104
+ "default": 100,
1105
+ "required": false
1106
+ },
1107
+ {
1108
+ "name": "--infer-json",
1109
+ "type": "bool",
1110
+ "help": "Infer schema for JSON/JSONB columns",
1111
+ "default": true,
1112
+ "required": false
1113
+ },
1114
+ {
1115
+ "name": "--infer-xml",
1116
+ "type": "bool",
1117
+ "help": "Infer schema for XML columns",
1118
+ "default": true,
1119
+ "required": false
1120
+ }
1121
+ ],
1122
+ "suggested_output_file_path": "{database}.avsc",
1123
+ "prompts": [
1124
+ {
1125
+ "name": "--namespace",
1126
+ "message": "Enter the namespace for the Avro schema",
1127
+ "type": "str",
1128
+ "required": false
1129
+ },
1130
+ {
1131
+ "name": "--dialect",
1132
+ "message": "Select the SQL dialect",
1133
+ "choices": [
1134
+ "postgres",
1135
+ "mysql",
1136
+ "sqlserver",
1137
+ "oracle",
1138
+ "sqlite"
1139
+ ],
1140
+ "default": "postgres",
1141
+ "required": true
1142
+ }
1143
+ ]
1144
+ },
1145
+ {
1146
+ "command": "json2a",
1147
+ "description": "Infer Avro schema from JSON files",
1148
+ "group": "6_Inference",
1149
+ "skip_input_file_handling": true,
1150
+ "function": {
1151
+ "name": "avrotize.jsontoschema.convert_json_to_avro",
1152
+ "args": {
1153
+ "input_files": "args.input",
1154
+ "avro_schema_file": "output_file_path",
1155
+ "type_name": "args.type_name",
1156
+ "avro_namespace": "args.namespace",
1157
+ "sample_size": "args.sample_size"
1158
+ }
1159
+ },
1160
+ "extensions": [".json", ".jsonl", ".ndjson"],
1161
+ "args": [
1162
+ {
1163
+ "name": "input",
1164
+ "type": "str",
1165
+ "nargs": "+",
1166
+ "help": "JSON file(s) to analyze for schema inference",
1167
+ "required": true
1168
+ },
1169
+ {
1170
+ "name": "--out",
1171
+ "type": "str",
1172
+ "help": "Path to the output Avro schema file",
1173
+ "required": false
1174
+ },
1175
+ {
1176
+ "name": "--type-name",
1177
+ "type": "str",
1178
+ "help": "Name for the root type",
1179
+ "default": "Document",
1180
+ "required": false
1181
+ },
1182
+ {
1183
+ "name": "--namespace",
1184
+ "type": "str",
1185
+ "help": "Namespace for the Avro schema",
1186
+ "required": false
1187
+ },
1188
+ {
1189
+ "name": "--sample-size",
1190
+ "type": "int",
1191
+ "help": "Maximum number of records to sample (0 = all)",
1192
+ "default": 0,
1193
+ "required": false
1194
+ }
1195
+ ],
1196
+ "suggested_output_file_path": "{input_file_name}.avsc",
1197
+ "prompts": [
1198
+ {
1199
+ "name": "--type-name",
1200
+ "message": "Enter the name for the root type",
1201
+ "type": "str",
1202
+ "default": "Document",
1203
+ "required": false
1204
+ },
1205
+ {
1206
+ "name": "--namespace",
1207
+ "message": "Enter the namespace for the Avro schema",
1208
+ "type": "str",
1209
+ "required": false
1210
+ }
1211
+ ]
1212
+ },
1213
+ {
1214
+ "command": "json2s",
1215
+ "description": "Infer JSON Structure schema from JSON files",
1216
+ "group": "6_Inference",
1217
+ "skip_input_file_handling": true,
1218
+ "function": {
1219
+ "name": "avrotize.jsontoschema.convert_json_to_jstruct",
1220
+ "args": {
1221
+ "input_files": "args.input",
1222
+ "jstruct_schema_file": "output_file_path",
1223
+ "type_name": "args.type_name",
1224
+ "base_id": "args.base_id",
1225
+ "sample_size": "args.sample_size"
1226
+ }
1227
+ },
1228
+ "extensions": [".json", ".jsonl", ".ndjson"],
1229
+ "args": [
1230
+ {
1231
+ "name": "input",
1232
+ "type": "str",
1233
+ "nargs": "+",
1234
+ "help": "JSON file(s) to analyze for schema inference",
1235
+ "required": true
1236
+ },
1237
+ {
1238
+ "name": "--out",
1239
+ "type": "str",
1240
+ "help": "Path to the output JSON Structure schema file",
1241
+ "required": false
1242
+ },
1243
+ {
1244
+ "name": "--type-name",
1245
+ "type": "str",
1246
+ "help": "Name for the root type",
1247
+ "default": "Document",
1248
+ "required": false
1249
+ },
1250
+ {
1251
+ "name": "--base-id",
1252
+ "type": "str",
1253
+ "help": "Base URI for $id generation",
1254
+ "default": "https://example.com/",
1255
+ "required": false
1256
+ },
1257
+ {
1258
+ "name": "--sample-size",
1259
+ "type": "int",
1260
+ "help": "Maximum number of records to sample (0 = all)",
1261
+ "default": 0,
1262
+ "required": false
1263
+ }
1264
+ ],
1265
+ "suggested_output_file_path": "{input_file_name}.jstruct.json",
1266
+ "prompts": [
1267
+ {
1268
+ "name": "--type-name",
1269
+ "message": "Enter the name for the root type",
1270
+ "type": "str",
1271
+ "default": "Document",
1272
+ "required": false
1273
+ },
1274
+ {
1275
+ "name": "--base-id",
1276
+ "message": "Enter the base URI for $id generation",
1277
+ "type": "str",
1278
+ "default": "https://example.com/",
1279
+ "required": false
1280
+ }
1281
+ ]
1282
+ },
1283
+ {
1284
+ "command": "xml2a",
1285
+ "description": "Infer Avro schema from XML files",
1286
+ "group": "6_Inference",
1287
+ "skip_input_file_handling": true,
1288
+ "function": {
1289
+ "name": "avrotize.xmltoschema.convert_xml_to_avro",
1290
+ "args": {
1291
+ "input_files": "args.input",
1292
+ "avro_schema_file": "output_file_path",
1293
+ "type_name": "args.type_name",
1294
+ "avro_namespace": "args.namespace",
1295
+ "sample_size": "args.sample_size"
1296
+ }
1297
+ },
1298
+ "extensions": [".xml"],
1299
+ "args": [
1300
+ {
1301
+ "name": "input",
1302
+ "type": "str",
1303
+ "nargs": "+",
1304
+ "help": "XML file(s) to analyze for schema inference",
1305
+ "required": true
1306
+ },
1307
+ {
1308
+ "name": "--out",
1309
+ "type": "str",
1310
+ "help": "Path to the output Avro schema file",
1311
+ "required": false
1312
+ },
1313
+ {
1314
+ "name": "--type-name",
1315
+ "type": "str",
1316
+ "help": "Name for the root type",
1317
+ "default": "Document",
1318
+ "required": false
1319
+ },
1320
+ {
1321
+ "name": "--namespace",
1322
+ "type": "str",
1323
+ "help": "Namespace for the Avro schema",
1324
+ "required": false
1325
+ },
1326
+ {
1327
+ "name": "--sample-size",
1328
+ "type": "int",
1329
+ "help": "Maximum number of documents to sample (0 = all)",
1330
+ "default": 0,
1331
+ "required": false
1332
+ }
1333
+ ],
1334
+ "suggested_output_file_path": "{input_file_name}.avsc",
1335
+ "prompts": [
1336
+ {
1337
+ "name": "--type-name",
1338
+ "message": "Enter the name for the root type",
1339
+ "type": "str",
1340
+ "default": "Document",
1341
+ "required": false
1342
+ },
1343
+ {
1344
+ "name": "--namespace",
1345
+ "message": "Enter the namespace for the Avro schema",
1346
+ "type": "str",
1347
+ "required": false
1348
+ }
1349
+ ]
1350
+ },
1351
+ {
1352
+ "command": "xml2s",
1353
+ "description": "Infer JSON Structure schema from XML files",
1354
+ "group": "6_Inference",
1355
+ "skip_input_file_handling": true,
1356
+ "function": {
1357
+ "name": "avrotize.xmltoschema.convert_xml_to_jstruct",
1358
+ "args": {
1359
+ "input_files": "args.input",
1360
+ "jstruct_schema_file": "output_file_path",
1361
+ "type_name": "args.type_name",
1362
+ "base_id": "args.base_id",
1363
+ "sample_size": "args.sample_size"
1364
+ }
1365
+ },
1366
+ "extensions": [".xml"],
1367
+ "args": [
1368
+ {
1369
+ "name": "input",
1370
+ "type": "str",
1371
+ "nargs": "+",
1372
+ "help": "XML file(s) to analyze for schema inference",
1373
+ "required": true
1374
+ },
1375
+ {
1376
+ "name": "--out",
1377
+ "type": "str",
1378
+ "help": "Path to the output JSON Structure schema file",
1379
+ "required": false
1380
+ },
1381
+ {
1382
+ "name": "--type-name",
1383
+ "type": "str",
1384
+ "help": "Name for the root type",
1385
+ "default": "Document",
1386
+ "required": false
1387
+ },
1388
+ {
1389
+ "name": "--base-id",
1390
+ "type": "str",
1391
+ "help": "Base URI for $id generation",
1392
+ "default": "https://example.com/",
1393
+ "required": false
1394
+ },
1395
+ {
1396
+ "name": "--sample-size",
1397
+ "type": "int",
1398
+ "help": "Maximum number of documents to sample (0 = all)",
1399
+ "default": 0,
1400
+ "required": false
1401
+ }
1402
+ ],
1403
+ "suggested_output_file_path": "{input_file_name}.jstruct.json",
1404
+ "prompts": [
1405
+ {
1406
+ "name": "--type-name",
1407
+ "message": "Enter the name for the root type",
1408
+ "type": "str",
1409
+ "default": "Document",
1410
+ "required": false
1411
+ },
1412
+ {
1413
+ "name": "--base-id",
1414
+ "message": "Enter the base URI for $id generation",
1415
+ "type": "str",
1416
+ "default": "https://example.com/",
1417
+ "required": false
1418
+ }
1419
+ ]
1420
+ },
1421
+ {
1422
+ "command": "validate",
1423
+ "description": "Validate JSON instances against Avro or JSON Structure schemas",
1424
+ "group": "6_Inference",
1425
+ "skip_input_file_handling": true,
1426
+ "function": {
1427
+ "name": "avrotize.validate.validate",
1428
+ "args": {
1429
+ "input": "args.input",
1430
+ "schema": "args.schema",
1431
+ "schema_type": "args.schema_type",
1432
+ "quiet": "args.quiet"
1433
+ }
1434
+ },
1435
+ "extensions": [".json", ".jsonl"],
1436
+ "args": [
1437
+ {
1438
+ "name": "input",
1439
+ "type": "str",
1440
+ "nargs": "+",
1441
+ "help": "JSON file(s) to validate",
1442
+ "required": true
1443
+ },
1444
+ {
1445
+ "name": "--schema",
1446
+ "type": "str",
1447
+ "help": "Path to schema file (.avsc for Avro, .jstruct.json for JSON Structure)",
1448
+ "required": true
1449
+ },
1450
+ {
1451
+ "name": "--schema-type",
1452
+ "type": "str",
1453
+ "help": "Schema type: 'avro' or 'jstruct'. Auto-detected from file extension if omitted.",
1454
+ "required": false
1455
+ },
1456
+ {
1457
+ "name": "--quiet",
1458
+ "type": "bool",
1459
+ "help": "Suppress output. Exit code 0 if valid, 1 if invalid.",
1460
+ "default": false,
1461
+ "required": false
1462
+ }
1463
+ ],
1464
+ "prompts": []
1465
+ },
1000
1466
  {
1001
1467
  "command": "a2mongo",
1002
1468
  "description": "Convert Avrotize schema to MongoDB schema",
@@ -8,7 +8,7 @@
8
8
  "kafkajs": "^2.2.4"
9
9
  },
10
10
  "devDependencies": {
11
- "@types/node": "^24.10.1",
11
+ "@types/node": "^25.0.3",
12
12
  "typescript": "^5.7.2",
13
13
  "jest": "^30.2.0",
14
14
  "@types/jest": "^30.0.0"
@@ -0,0 +1,151 @@
1
+ """Infers schema from JSON files and converts to Avro or JSON Structure format.
2
+
3
+ This module provides:
4
+ - json2a: Infer Avro schema from JSON files
5
+ - json2s: Infer JSON Structure schema from JSON files
6
+ """
7
+
8
+ import json
9
+ import os
10
+ from typing import Any, Dict, List
11
+
12
+ from avrotize.schema_inference import (
13
+ AvroSchemaInferrer,
14
+ JsonStructureSchemaInferrer,
15
+ JsonNode
16
+ )
17
+
18
+
19
def convert_json_to_avro(
    input_files: List[str],
    avro_schema_file: str,
    type_name: str = 'Document',
    avro_namespace: str = '',
    sample_size: int = 0
) -> None:
    """Infer an Avro schema from JSON files and write it to disk.

    The values of all input files are loaded with ``_load_json_values`` and
    passed together to ``AvroSchemaInferrer.infer_from_json_values``, so a
    single schema is produced for the whole input set. The result is written
    as indented JSON to ``avro_schema_file``.

    Args:
        input_files: List of JSON file paths to analyze
        avro_schema_file: Output path for the Avro schema
        type_name: Name for the root type
        avro_namespace: Namespace for generated Avro types
        sample_size: Maximum number of records to sample (0 = all)

    Raises:
        ValueError: If ``input_files`` is empty, or if no JSON value could be
            loaded from any of the files.
    """
    if not input_files:
        raise ValueError("At least one input file is required")

    values = _load_json_values(input_files, sample_size)
    if not values:
        raise ValueError("No valid JSON data found in input files")

    inferrer = AvroSchemaInferrer(namespace=avro_namespace)
    schema = inferrer.infer_from_json_values(type_name, values)

    # Ensure the output directory exists. exist_ok=True avoids the
    # check-then-create race of testing os.path.exists() first. A bare file
    # name has an empty dirname and needs no directory.
    output_dir = os.path.dirname(avro_schema_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(avro_schema_file, 'w', encoding='utf-8') as f:
        json.dump(schema, f, indent=2)
57
+
58
+
59
def convert_json_to_jstruct(
    input_files: List[str],
    jstruct_schema_file: str,
    type_name: str = 'Document',
    base_id: str = 'https://example.com/',
    sample_size: int = 0
) -> None:
    """Infer a JSON Structure schema from JSON files and write it to disk.

    The values of all input files are loaded with ``_load_json_values`` and
    passed together to ``JsonStructureSchemaInferrer.infer_from_json_values``.
    The result is written as indented JSON to ``jstruct_schema_file``.

    Args:
        input_files: List of JSON file paths to analyze
        jstruct_schema_file: Output path for the JSON Structure schema
        type_name: Name for the root type
        base_id: Base URI for $id generation
        sample_size: Maximum number of records to sample (0 = all)

    Raises:
        ValueError: If ``input_files`` is empty, or if no JSON value could be
            loaded from any of the files.
    """
    if not input_files:
        raise ValueError("At least one input file is required")

    values = _load_json_values(input_files, sample_size)
    if not values:
        raise ValueError("No valid JSON data found in input files")

    inferrer = JsonStructureSchemaInferrer(base_id=base_id)
    schema = inferrer.infer_from_json_values(type_name, values)

    # Ensure the output directory exists. exist_ok=True avoids the
    # check-then-create race of testing os.path.exists() first. A bare file
    # name has an empty dirname and needs no directory.
    output_dir = os.path.dirname(jstruct_schema_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(jstruct_schema_file, 'w', encoding='utf-8') as f:
        json.dump(schema, f, indent=2)
96
+
97
+
98
def _load_json_values(input_files: List[str], sample_size: int) -> List[Any]:
    """Load JSON values from files.

    Each file is first parsed as one JSON document. A root-level array is
    flattened into its elements; any other document contributes one value.
    If the file is not a single valid document, it is parsed line by line as
    JSON Lines (JSONL); lines that are blank or not valid JSON are skipped.
    Empty files contribute nothing.

    Files are decoded as UTF-8, and a leading byte-order mark is dropped so
    that BOM-prefixed files are parsed rather than silently ignored.

    Args:
        input_files: List of file paths
        sample_size: Maximum values to load (0 = all)

    Returns:
        List of parsed JSON values, in file order.
    """
    limited = sample_size > 0
    values: List[Any] = []

    for file_path in input_files:
        if limited and len(values) >= sample_size:
            break

        # utf-8-sig reads plain UTF-8 unchanged but strips a leading BOM,
        # which json.loads() would otherwise reject.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            content = f.read().strip()

        if not content:
            continue

        # Try parsing as a single JSON document first
        try:
            data = json.loads(content)
        except json.JSONDecodeError:
            pass
        else:
            if isinstance(data, list):
                # Root-level array: each element is a separate value. The
                # check at the top of the loop guarantees room for at least
                # one more value when a limit is set.
                if limited:
                    values.extend(data[:sample_size - len(values)])
                else:
                    values.extend(data)
            else:
                values.append(data)
            continue

        # Try parsing as JSON Lines (JSONL). Split on '\n' only:
        # str.splitlines() would also break on characters such as U+2028,
        # which may appear unescaped inside JSON strings.
        for line in content.split('\n'):
            line = line.strip()
            if not line:
                continue
            try:
                values.append(json.loads(line))
            except json.JSONDecodeError:
                # Best effort: ignore lines that are not valid JSON.
                continue
            if limited and len(values) >= sample_size:
                break

    return values