structurize-3.0.2-py3-none-any.whl → structurize-3.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- avrotize/_version.py +3 -3
- avrotize/avrotize.py +4 -0
- avrotize/avrotots.py +62 -7
- avrotize/avrovalidator.py +518 -0
- avrotize/commands.json +466 -0
- avrotize/dependencies/typescript/node22/package.json +1 -1
- avrotize/jsontoschema.py +151 -0
- avrotize/schema_inference.py +825 -0
- avrotize/sqltoavro.py +1159 -0
- avrotize/validate.py +242 -0
- avrotize/xmltoschema.py +122 -0
- {structurize-3.0.2.dist-info → structurize-3.1.1.dist-info}/METADATA +1 -1
- {structurize-3.0.2.dist-info → structurize-3.1.1.dist-info}/RECORD +17 -11
- {structurize-3.0.2.dist-info → structurize-3.1.1.dist-info}/WHEEL +1 -1
- {structurize-3.0.2.dist-info → structurize-3.1.1.dist-info}/entry_points.txt +0 -0
- {structurize-3.0.2.dist-info → structurize-3.1.1.dist-info}/licenses/LICENSE +0 -0
- {structurize-3.0.2.dist-info → structurize-3.1.1.dist-info}/top_level.txt +0 -0
avrotize/commands.json
CHANGED
@@ -997,6 +997,472 @@
     "suggested_output_file_path": "{input_file_name}.sql",
     "prompts": []
   },
+  {
+    "command": "sql2a",
+    "description": "Convert SQL schema to Avrotize schema (connects to live database)",
+    "group": "5_SQL",
+    "skip_input_file_handling": true,
+    "function": {
+      "name": "avrotize.sqltoavro.convert_sql_to_avro",
+      "args": {
+        "connection_string": "args.connection_string",
+        "database": "args.database",
+        "table_name": "args.table_name",
+        "avro_namespace": "args.namespace",
+        "avro_schema_file": "output_file_path",
+        "dialect": "args.dialect",
+        "emit_cloudevents": "args.emit_cloudevents",
+        "emit_cloudevents_xregistry": "args.emit_xregistry",
+        "sample_size": "args.sample_size",
+        "infer_json_schema": "args.infer_json",
+        "infer_xml_schema": "args.infer_xml",
+        "username": "args.username",
+        "password": "args.password"
+      }
+    },
+    "extensions": [],
+    "args": [
+      {
+        "name": "input",
+        "type": "str",
+        "nargs": "?",
+        "help": "Not used (database connection is live)",
+        "required": false
+      },
+      {
+        "name": "--out",
+        "type": "str",
+        "help": "Path to the Avrotize schema file",
+        "required": false
+      },
+      {
+        "name": "--connection-string",
+        "type": "str",
+        "help": "Database connection string. Examples: postgresql://host:port/db (credentials via --username/--password), postgresql://user:pass@host:port/db?sslmode=require (PostgreSQL with SSL), mysql://user:pass@host/db?ssl=true (MySQL with SSL), mssql://@host/db (SQL Server with Windows Auth)",
+        "required": true
+      },
+      {
+        "name": "--username",
+        "type": "str",
+        "help": "Database username (overrides credentials in connection string, avoids credentials in command history)",
+        "required": false
+      },
+      {
+        "name": "--password",
+        "type": "str",
+        "help": "Database password (overrides credentials in connection string, avoids credentials in command history)",
+        "required": false
+      },
+      {
+        "name": "--database",
+        "type": "str",
+        "help": "Database name (if not in connection string)",
+        "required": false
+      },
+      {
+        "name": "--table-name",
+        "type": "str",
+        "help": "Specific table name (omit for all tables)",
+        "required": false
+      },
+      {
+        "name": "--namespace",
+        "type": "str",
+        "help": "Namespace for the Avrotize schema",
+        "required": false
+      },
+      {
+        "name": "--dialect",
+        "type": "str",
+        "help": "SQL dialect",
+        "choices": [
+          "postgres",
+          "mysql",
+          "sqlserver",
+          "oracle",
+          "sqlite"
+        ],
+        "default": "postgres",
+        "required": false
+      },
+      {
+        "name": "--emit-cloudevents",
+        "type": "bool",
+        "help": "Emit CloudEvents declarations for tables with type/source/data/id columns",
+        "required": false
+      },
+      {
+        "name": "--emit-xregistry",
+        "type": "bool",
+        "help": "Emit an xRegistry manifest instead of a single Avrotize schema",
+        "required": false
+      },
+      {
+        "name": "--sample-size",
+        "type": "int",
+        "help": "Number of rows to sample for JSON/XML inference",
+        "default": 100,
+        "required": false
+      },
+      {
+        "name": "--infer-json",
+        "type": "bool",
+        "help": "Infer schema for JSON/JSONB columns",
+        "default": true,
+        "required": false
+      },
+      {
+        "name": "--infer-xml",
+        "type": "bool",
+        "help": "Infer schema for XML columns",
+        "default": true,
+        "required": false
+      }
+    ],
+    "suggested_output_file_path": "{database}.avsc",
+    "prompts": [
+      {
+        "name": "--namespace",
+        "message": "Enter the namespace for the Avro schema",
+        "type": "str",
+        "required": false
+      },
+      {
+        "name": "--dialect",
+        "message": "Select the SQL dialect",
+        "choices": [
+          "postgres",
+          "mysql",
+          "sqlserver",
+          "oracle",
+          "sqlite"
+        ],
+        "default": "postgres",
+        "required": true
+      }
+    ]
+  },
+  {
+    "command": "json2a",
+    "description": "Infer Avro schema from JSON files",
+    "group": "6_Inference",
+    "skip_input_file_handling": true,
+    "function": {
+      "name": "avrotize.jsontoschema.convert_json_to_avro",
+      "args": {
+        "input_files": "args.input",
+        "avro_schema_file": "output_file_path",
+        "type_name": "args.type_name",
+        "avro_namespace": "args.namespace",
+        "sample_size": "args.sample_size"
+      }
+    },
+    "extensions": [".json", ".jsonl", ".ndjson"],
+    "args": [
+      {
+        "name": "input",
+        "type": "str",
+        "nargs": "+",
+        "help": "JSON file(s) to analyze for schema inference",
+        "required": true
+      },
+      {
+        "name": "--out",
+        "type": "str",
+        "help": "Path to the output Avro schema file",
+        "required": false
+      },
+      {
+        "name": "--type-name",
+        "type": "str",
+        "help": "Name for the root type",
+        "default": "Document",
+        "required": false
+      },
+      {
+        "name": "--namespace",
+        "type": "str",
+        "help": "Namespace for the Avro schema",
+        "required": false
+      },
+      {
+        "name": "--sample-size",
+        "type": "int",
+        "help": "Maximum number of records to sample (0 = all)",
+        "default": 0,
+        "required": false
+      }
+    ],
+    "suggested_output_file_path": "{input_file_name}.avsc",
+    "prompts": [
+      {
+        "name": "--type-name",
+        "message": "Enter the name for the root type",
+        "type": "str",
+        "default": "Document",
+        "required": false
+      },
+      {
+        "name": "--namespace",
+        "message": "Enter the namespace for the Avro schema",
+        "type": "str",
+        "required": false
+      }
+    ]
+  },
+  {
+    "command": "json2s",
+    "description": "Infer JSON Structure schema from JSON files",
+    "group": "6_Inference",
+    "skip_input_file_handling": true,
+    "function": {
+      "name": "avrotize.jsontoschema.convert_json_to_jstruct",
+      "args": {
+        "input_files": "args.input",
+        "jstruct_schema_file": "output_file_path",
+        "type_name": "args.type_name",
+        "base_id": "args.base_id",
+        "sample_size": "args.sample_size"
+      }
+    },
+    "extensions": [".json", ".jsonl", ".ndjson"],
+    "args": [
+      {
+        "name": "input",
+        "type": "str",
+        "nargs": "+",
+        "help": "JSON file(s) to analyze for schema inference",
+        "required": true
+      },
+      {
+        "name": "--out",
+        "type": "str",
+        "help": "Path to the output JSON Structure schema file",
+        "required": false
+      },
+      {
+        "name": "--type-name",
+        "type": "str",
+        "help": "Name for the root type",
+        "default": "Document",
+        "required": false
+      },
+      {
+        "name": "--base-id",
+        "type": "str",
+        "help": "Base URI for $id generation",
+        "default": "https://example.com/",
+        "required": false
+      },
+      {
+        "name": "--sample-size",
+        "type": "int",
+        "help": "Maximum number of records to sample (0 = all)",
+        "default": 0,
+        "required": false
+      }
+    ],
+    "suggested_output_file_path": "{input_file_name}.jstruct.json",
+    "prompts": [
+      {
+        "name": "--type-name",
+        "message": "Enter the name for the root type",
+        "type": "str",
+        "default": "Document",
+        "required": false
+      },
+      {
+        "name": "--base-id",
+        "message": "Enter the base URI for $id generation",
+        "type": "str",
+        "default": "https://example.com/",
+        "required": false
+      }
+    ]
+  },
+  {
+    "command": "xml2a",
+    "description": "Infer Avro schema from XML files",
+    "group": "6_Inference",
+    "skip_input_file_handling": true,
+    "function": {
+      "name": "avrotize.xmltoschema.convert_xml_to_avro",
+      "args": {
+        "input_files": "args.input",
+        "avro_schema_file": "output_file_path",
+        "type_name": "args.type_name",
+        "avro_namespace": "args.namespace",
+        "sample_size": "args.sample_size"
+      }
+    },
+    "extensions": [".xml"],
+    "args": [
+      {
+        "name": "input",
+        "type": "str",
+        "nargs": "+",
+        "help": "XML file(s) to analyze for schema inference",
+        "required": true
+      },
+      {
+        "name": "--out",
+        "type": "str",
+        "help": "Path to the output Avro schema file",
+        "required": false
+      },
+      {
+        "name": "--type-name",
+        "type": "str",
+        "help": "Name for the root type",
+        "default": "Document",
+        "required": false
+      },
+      {
+        "name": "--namespace",
+        "type": "str",
+        "help": "Namespace for the Avro schema",
+        "required": false
+      },
+      {
+        "name": "--sample-size",
+        "type": "int",
+        "help": "Maximum number of documents to sample (0 = all)",
+        "default": 0,
+        "required": false
+      }
+    ],
+    "suggested_output_file_path": "{input_file_name}.avsc",
+    "prompts": [
+      {
+        "name": "--type-name",
+        "message": "Enter the name for the root type",
+        "type": "str",
+        "default": "Document",
+        "required": false
+      },
+      {
+        "name": "--namespace",
+        "message": "Enter the namespace for the Avro schema",
+        "type": "str",
+        "required": false
+      }
+    ]
+  },
+  {
+    "command": "xml2s",
+    "description": "Infer JSON Structure schema from XML files",
+    "group": "6_Inference",
+    "skip_input_file_handling": true,
+    "function": {
+      "name": "avrotize.xmltoschema.convert_xml_to_jstruct",
+      "args": {
+        "input_files": "args.input",
+        "jstruct_schema_file": "output_file_path",
+        "type_name": "args.type_name",
+        "base_id": "args.base_id",
+        "sample_size": "args.sample_size"
+      }
+    },
+    "extensions": [".xml"],
+    "args": [
+      {
+        "name": "input",
+        "type": "str",
+        "nargs": "+",
+        "help": "XML file(s) to analyze for schema inference",
+        "required": true
+      },
+      {
+        "name": "--out",
+        "type": "str",
+        "help": "Path to the output JSON Structure schema file",
+        "required": false
+      },
+      {
+        "name": "--type-name",
+        "type": "str",
+        "help": "Name for the root type",
+        "default": "Document",
+        "required": false
+      },
+      {
+        "name": "--base-id",
+        "type": "str",
+        "help": "Base URI for $id generation",
+        "default": "https://example.com/",
+        "required": false
+      },
+      {
+        "name": "--sample-size",
+        "type": "int",
+        "help": "Maximum number of documents to sample (0 = all)",
+        "default": 0,
+        "required": false
+      }
+    ],
+    "suggested_output_file_path": "{input_file_name}.jstruct.json",
+    "prompts": [
+      {
+        "name": "--type-name",
+        "message": "Enter the name for the root type",
+        "type": "str",
+        "default": "Document",
+        "required": false
+      },
+      {
+        "name": "--base-id",
+        "message": "Enter the base URI for $id generation",
+        "type": "str",
+        "default": "https://example.com/",
+        "required": false
+      }
+    ]
+  },
+  {
+    "command": "validate",
+    "description": "Validate JSON instances against Avro or JSON Structure schemas",
+    "group": "6_Inference",
+    "skip_input_file_handling": true,
+    "function": {
+      "name": "avrotize.validate.validate",
+      "args": {
+        "input": "args.input",
+        "schema": "args.schema",
+        "schema_type": "args.schema_type",
+        "quiet": "args.quiet"
+      }
+    },
+    "extensions": [".json", ".jsonl"],
+    "args": [
+      {
+        "name": "input",
+        "type": "str",
+        "nargs": "+",
+        "help": "JSON file(s) to validate",
+        "required": true
+      },
+      {
+        "name": "--schema",
+        "type": "str",
+        "help": "Path to schema file (.avsc for Avro, .jstruct.json for JSON Structure)",
+        "required": true
+      },
+      {
+        "name": "--schema-type",
+        "type": "str",
+        "help": "Schema type: 'avro' or 'jstruct'. Auto-detected from file extension if omitted.",
+        "required": false
+      },
+      {
+        "name": "--quiet",
+        "type": "bool",
+        "help": "Suppress output. Exit code 0 if valid, 1 if invalid.",
+        "default": false,
+        "required": false
+      }
+    ],
+    "prompts": []
+  },
   {
     "command": "a2mongo",
     "description": "Convert Avrotize schema to MongoDB schema",
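The six new command entries above (sql2a, json2a, json2s, xml2a, xml2s, validate) all follow the same pattern: a CLI "command" name, an "args" list for the argument parser, and a "function" block whose "args" mapping renames parsed CLI options to keyword arguments of a dotted Python function. The sketch below illustrates that mapping for the new json2a entry. It is illustrative only: the real dispatch lives in avrotize/avrotize.py (whose changes are not shown in this diff), and the file names, type name, and namespace are placeholders.

# Hypothetical sketch: resolving one commands.json entry to a function call.
# The entry dict is copied from the diff above; the dispatch code is assumed.
import importlib

entry = {
    "command": "json2a",
    "function": {
        "name": "avrotize.jsontoschema.convert_json_to_avro",
        "args": {
            "input_files": "args.input",
            "avro_schema_file": "output_file_path",
            "type_name": "args.type_name",
            "avro_namespace": "args.namespace",
            "sample_size": "args.sample_size",
        },
    },
}

# Resolve the dotted "function.name" to a callable.
module_name, func_name = entry["function"]["name"].rsplit(".", 1)
func = getattr(importlib.import_module(module_name), func_name)

# Keyword names come from the "args" mapping; the values here are made up.
func(
    input_files=["orders.json"],
    avro_schema_file="orders.avsc",
    type_name="Order",
    avro_namespace="com.example.orders",
    sample_size=0,
)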
avrotize/jsontoschema.py
ADDED
@@ -0,0 +1,151 @@
+"""Infers schema from JSON files and converts to Avro or JSON Structure format.
+
+This module provides:
+- json2a: Infer Avro schema from JSON files
+- json2s: Infer JSON Structure schema from JSON files
+"""
+
+import json
+import os
+from typing import Any, Dict, List
+
+from avrotize.schema_inference import (
+    AvroSchemaInferrer,
+    JsonStructureSchemaInferrer,
+    JsonNode
+)
+
+
+def convert_json_to_avro(
+    input_files: List[str],
+    avro_schema_file: str,
+    type_name: str = 'Document',
+    avro_namespace: str = '',
+    sample_size: int = 0
+) -> None:
+    """Infers Avro schema from JSON files.
+
+    Reads JSON files, analyzes their structure, and generates an Avro schema
+    that can represent all the data. Multiple files are analyzed together to
+    produce a unified schema.
+
+    Args:
+        input_files: List of JSON file paths to analyze
+        avro_schema_file: Output path for the Avro schema
+        type_name: Name for the root type
+        avro_namespace: Namespace for generated Avro types
+        sample_size: Maximum number of records to sample (0 = all)
+    """
+    if not input_files:
+        raise ValueError("At least one input file is required")
+
+    values = _load_json_values(input_files, sample_size)
+
+    if not values:
+        raise ValueError("No valid JSON data found in input files")
+
+    inferrer = AvroSchemaInferrer(namespace=avro_namespace)
+    schema = inferrer.infer_from_json_values(type_name, values)
+
+    # Ensure output directory exists
+    output_dir = os.path.dirname(avro_schema_file)
+    if output_dir and not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    with open(avro_schema_file, 'w', encoding='utf-8') as f:
+        json.dump(schema, f, indent=2)
+
+
+def convert_json_to_jstruct(
+    input_files: List[str],
+    jstruct_schema_file: str,
+    type_name: str = 'Document',
+    base_id: str = 'https://example.com/',
+    sample_size: int = 0
+) -> None:
+    """Infers JSON Structure schema from JSON files.
+
+    Reads JSON files, analyzes their structure, and generates a JSON Structure
+    schema that validates with the official JSON Structure SDK.
+
+    Args:
+        input_files: List of JSON file paths to analyze
+        jstruct_schema_file: Output path for the JSON Structure schema
+        type_name: Name for the root type
+        base_id: Base URI for $id generation
+        sample_size: Maximum number of records to sample (0 = all)
+    """
+    if not input_files:
+        raise ValueError("At least one input file is required")
+
+    values = _load_json_values(input_files, sample_size)
+
+    if not values:
+        raise ValueError("No valid JSON data found in input files")
+
+    inferrer = JsonStructureSchemaInferrer(base_id=base_id)
+    schema = inferrer.infer_from_json_values(type_name, values)
+
+    # Ensure output directory exists
+    output_dir = os.path.dirname(jstruct_schema_file)
+    if output_dir and not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    with open(jstruct_schema_file, 'w', encoding='utf-8') as f:
+        json.dump(schema, f, indent=2)
+
+
+def _load_json_values(input_files: List[str], sample_size: int) -> List[Any]:
+    """Loads JSON values from files.
+
+    Handles both single JSON documents and JSON Lines (JSONL) files.
+    Arrays at the root level are flattened into individual values.
+
+    Args:
+        input_files: List of file paths
+        sample_size: Maximum values to load (0 = all)
+
+    Returns:
+        List of parsed JSON values
+    """
+    values: List[Any] = []
+
+    for file_path in input_files:
+        if sample_size > 0 and len(values) >= sample_size:
+            break
+
+        with open(file_path, 'r', encoding='utf-8') as f:
+            content = f.read().strip()
+
+        if not content:
+            continue
+
+        # Try parsing as a single JSON document first
+        try:
+            data = json.loads(content)
+            if isinstance(data, list):
+                # Root-level array: each element is a separate value
+                for item in data:
+                    values.append(item)
+                    if sample_size > 0 and len(values) >= sample_size:
+                        break
+            else:
+                values.append(data)
+            continue
+        except json.JSONDecodeError:
+            pass
+
+        # Try parsing as JSON Lines (JSONL)
+        for line in content.split('\n'):
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                data = json.loads(line)
+                values.append(data)
+                if sample_size > 0 and len(values) >= sample_size:
+                    break
+            except json.JSONDecodeError:
+                pass
+
+    return values
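A short usage sketch for the new module, based on the signatures above; the file names, type name, namespace, and base URI are placeholders. Both functions go through the same _load_json_values loader, so single .json documents, .jsonl/.ndjson files, and root-level arrays are all accepted.

from avrotize.jsontoschema import convert_json_to_avro, convert_json_to_jstruct

# Infer an Avro schema from a JSON Lines sample, capping inference at 500 records.
convert_json_to_avro(
    input_files=["events.jsonl"],
    avro_schema_file="events.avsc",
    type_name="Event",
    avro_namespace="com.example.telemetry",
    sample_size=500,
)

# Infer a JSON Structure schema from the same data, with a custom $id base URI.
convert_json_to_jstruct(
    input_files=["events.jsonl"],
    jstruct_schema_file="events.jstruct.json",
    type_name="Event",
    base_id="https://schemas.example.com/",
    sample_size=500,
)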