structurize 3.2.1__tar.gz → 3.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {structurize-3.2.1/structurize.egg-info → structurize-3.3.0}/PKG-INFO +1 -1
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/_version.py +3 -3
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotize.py +4 -1
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/choice_inference.py +121 -2
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/commands.json +26 -2
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/jsontoschema.py +11 -4
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/schema_inference.py +788 -8
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretocsharp.py +59 -19
- {structurize-3.2.1 → structurize-3.3.0/structurize.egg-info}/PKG-INFO +1 -1
- {structurize-3.2.1 → structurize-3.3.0}/.gitignore +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/LICENSE +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/MANIFEST.in +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/README.md +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/__init__.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/__main__.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/asn1toavro.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotocpp.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotocsharp.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotocsv.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotodatapackage.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotodb.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotogo.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotographql.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotoiceberg.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotojava.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotojs.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotojsons.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotojstruct.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotokusto.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotomd.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotools.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotoparquet.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotoproto.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotopython.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotorust.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotots.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrotoxsd.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/avrovalidator.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/cddltostructure.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/common.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/constants.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/csvtoavro.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/datapackagetoavro.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/dependencies/cpp/vcpkg/vcpkg.json +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/dependencies/typescript/node22/package.json +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/dependency_resolver.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/dependency_version.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/jsonstoavro.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/jsonstostructure.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/jstructtoavro.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/kstructtoavro.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/kustotoavro.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/openapitostructure.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/parquettoavro.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/proto2parser.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/proto3parser.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/prototoavro.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/sqltoavro.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretocddl.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretocpp.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretocsv.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretodatapackage.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretodb.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretogo.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretographql.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretoiceberg.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretojava.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretojs.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretojsons.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretokusto.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretomd.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretoproto.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretopython.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretorust.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretots.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/structuretoxsd.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/validate.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/xmltoschema.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/avrotize/xsdtoavro.py +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/build.ps1 +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/build.sh +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/pyproject.toml +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/setup.cfg +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/structurize.egg-info/SOURCES.txt +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/structurize.egg-info/dependency_links.txt +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/structurize.egg-info/entry_points.txt +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/structurize.egg-info/requires.txt +0 -0
- {structurize-3.2.1 → structurize-3.3.0}/structurize.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: structurize
|
|
3
|
-
Version: 3.2.1
|
|
3
|
+
Version: 3.3.0
|
|
4
4
|
Summary: Tools to convert from and to JSON Structure from various other schema languages.
|
|
5
5
|
Author-email: Clemens Vasters <clemensv@microsoft.com>
|
|
6
6
|
Classifier: Programming Language :: Python :: 3
|
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '3.2.1'
|
|
32
|
-
__version_tuple__ = version_tuple = (3, 2, 1)
|
|
31
|
+
__version__ = version = '3.3.0'
|
|
32
|
+
__version_tuple__ = version_tuple = (3, 3, 0)
|
|
33
33
|
|
|
34
|
-
__commit_id__ = commit_id = '
|
|
34
|
+
__commit_id__ = commit_id = 'g09d8d822a'
|
|
@@ -67,7 +67,10 @@ def dynamic_import(module, func):
|
|
|
67
67
|
def main():
|
|
68
68
|
"""Main function for the command line utility."""
|
|
69
69
|
commands = load_commands()
|
|
70
|
-
parser = argparse.ArgumentParser(
|
|
70
|
+
parser = argparse.ArgumentParser(
|
|
71
|
+
description='Convert a variety of schema formats to Avrotize schema and vice versa.',
|
|
72
|
+
fromfile_prefix_chars='@'
|
|
73
|
+
)
|
|
71
74
|
parser.add_argument('--version', action='store_true', help='Print the version of Avrotize.')
|
|
72
75
|
|
|
73
76
|
subparsers = parser.add_subparsers(dest='command')
|
|
@@ -222,6 +222,23 @@ def _detect_discriminators(
|
|
|
222
222
|
value_to_docs[doc.field_values[field_name]].append(doc)
|
|
223
223
|
|
|
224
224
|
if len(value_to_docs) >= 2:
|
|
225
|
+
# Check 1: Does each value map to a consistent signature?
|
|
226
|
+
# A perfect discriminator has each value producing identical signatures
|
|
227
|
+
value_to_sigs: Dict[str, Set[tuple]] = {}
|
|
228
|
+
for val, val_docs in value_to_docs.items():
|
|
229
|
+
sigs = set(tuple(sorted(d.field_signature)) for d in val_docs)
|
|
230
|
+
value_to_sigs[val] = sigs
|
|
231
|
+
|
|
232
|
+
# Count values with perfectly consistent signatures (all docs same sig)
|
|
233
|
+
consistent_values = sum(1 for sigs in value_to_sigs.values() if len(sigs) == 1)
|
|
234
|
+
consistency_ratio = consistent_values / len(value_to_sigs)
|
|
235
|
+
|
|
236
|
+
# Check 2: Are signatures distinct across values?
|
|
237
|
+
all_primary_sigs = [list(sigs)[0] for sigs in value_to_sigs.values() if sigs]
|
|
238
|
+
distinct_sigs = set(all_primary_sigs)
|
|
239
|
+
distinctness_ratio = len(distinct_sigs) / len(all_primary_sigs) if all_primary_sigs else 0
|
|
240
|
+
|
|
241
|
+
# Check 3: Original inter-similarity check (relaxed to 0.85)
|
|
225
242
|
all_values = list(value_to_docs.keys())
|
|
226
243
|
inter_sims = []
|
|
227
244
|
for i, v1 in enumerate(all_values):
|
|
@@ -234,13 +251,97 @@ def _detect_discriminators(
|
|
|
234
251
|
|
|
235
252
|
avg_inter_sim = sum(inter_sims) / len(inter_sims) if inter_sims else 1.0
|
|
236
253
|
|
|
237
|
-
|
|
254
|
+
# Check 4: Discriminator-field correlation (envelope pattern)
|
|
255
|
+
# If discriminator value matches a unique payload field name, it's legitimate
|
|
256
|
+
# e.g., _subtype: "play" -> has field "play" that only appears for this value
|
|
257
|
+
discriminator_field_matches = 0
|
|
258
|
+
unique_fields_per_value: Dict[str, Set[str]] = {}
|
|
259
|
+
for disc_val, val_docs in value_to_docs.items():
|
|
260
|
+
if not val_docs:
|
|
261
|
+
continue
|
|
262
|
+
# Get fields unique to this discriminator value
|
|
263
|
+
this_sig = val_docs[0].field_signature
|
|
264
|
+
other_sigs = [d.field_signature for v, docs in value_to_docs.items()
|
|
265
|
+
if v != disc_val for d in docs[:1]]
|
|
266
|
+
if other_sigs:
|
|
267
|
+
common_with_others = this_sig.intersection(*other_sigs)
|
|
268
|
+
unique_fields = this_sig - common_with_others - {field_name}
|
|
269
|
+
unique_fields_per_value[disc_val] = unique_fields
|
|
270
|
+
# Check if discriminator value matches any unique field (case-insensitive)
|
|
271
|
+
disc_val_lower = disc_val.lower() if isinstance(disc_val, str) else str(disc_val).lower()
|
|
272
|
+
if any(uf.lower() == disc_val_lower for uf in unique_fields):
|
|
273
|
+
discriminator_field_matches += 1
|
|
274
|
+
|
|
275
|
+
has_envelope_pattern = discriminator_field_matches >= len(value_to_docs) * 0.5
|
|
276
|
+
|
|
277
|
+
# Check 5: Structural quality - detect sparse data false positives
|
|
278
|
+
# If unique fields are very few AND overlap across variants, it's likely sparse data
|
|
279
|
+
should_reject_sparse = False
|
|
280
|
+
if not has_envelope_pattern and unique_fields_per_value:
|
|
281
|
+
all_unique_fields = [ufs for ufs in unique_fields_per_value.values() if ufs]
|
|
282
|
+
if all_unique_fields:
|
|
283
|
+
# Count total unique fields across all variants
|
|
284
|
+
total_unique = set().union(*all_unique_fields)
|
|
285
|
+
avg_unique = sum(len(ufs) for ufs in all_unique_fields) / len(all_unique_fields)
|
|
286
|
+
|
|
287
|
+
# Check for "sparse optional field" pattern:
|
|
288
|
+
# - Very few unique fields per variant (1-2)
|
|
289
|
+
# - Moderate to high similarity (>0.6)
|
|
290
|
+
# - No envelope pattern
|
|
291
|
+
# - Few total samples per variant (could be sample noise)
|
|
292
|
+
# - Only 2 variants (binary split is more likely to be accidental)
|
|
293
|
+
# - NOT a subset pattern (where one variant is just base + extras)
|
|
294
|
+
min_samples = min(len(docs) for docs in value_to_docs.values())
|
|
295
|
+
num_variants = len(value_to_docs)
|
|
296
|
+
|
|
297
|
+
# Check for subset pattern (inheritance-like polymorphism)
|
|
298
|
+
# If one variant's signature is a subset of another, it's likely real
|
|
299
|
+
is_subset_pattern = False
|
|
300
|
+
all_sigs = [list(sigs)[0] for v, sigs in value_to_sigs.items() if sigs and len(sigs) == 1]
|
|
301
|
+
if len(all_sigs) >= 2:
|
|
302
|
+
for i, sig1 in enumerate(all_sigs):
|
|
303
|
+
for sig2 in all_sigs[i+1:]:
|
|
304
|
+
# Check if one is subset of the other (ignoring discriminator)
|
|
305
|
+
sig1_set = set(sig1) - {field_name}
|
|
306
|
+
sig2_set = set(sig2) - {field_name}
|
|
307
|
+
if sig1_set < sig2_set or sig2_set < sig1_set:
|
|
308
|
+
is_subset_pattern = True
|
|
309
|
+
break
|
|
310
|
+
if is_subset_pattern:
|
|
311
|
+
break
|
|
312
|
+
|
|
313
|
+
# Binary splits with few samples and minimal structural difference
|
|
314
|
+
# are most likely sparse data artifacts (unless it's a subset pattern)
|
|
315
|
+
if (num_variants == 2 and
|
|
316
|
+
avg_unique <= 1.5 and
|
|
317
|
+
avg_inter_sim > 0.6 and
|
|
318
|
+
min_samples < 5 and
|
|
319
|
+
not is_subset_pattern):
|
|
320
|
+
should_reject_sparse = True
|
|
321
|
+
|
|
322
|
+
if should_reject_sparse:
|
|
323
|
+
continue
|
|
324
|
+
|
|
325
|
+
# Accept if: (a) low similarity, OR (b) high consistency + distinct signatures,
|
|
326
|
+
# OR (c) envelope pattern detected
|
|
327
|
+
is_discriminator = (
|
|
328
|
+
avg_inter_sim < 0.7 or
|
|
329
|
+
(consistency_ratio > 0.9 and distinctness_ratio > 0.9) or
|
|
330
|
+
has_envelope_pattern
|
|
331
|
+
)
|
|
332
|
+
|
|
333
|
+
if is_discriminator:
|
|
334
|
+
# Use distinctness as correlation score when similarity is high
|
|
335
|
+
# Boost score if envelope pattern detected
|
|
336
|
+
score = max(1.0 - avg_inter_sim, distinctness_ratio)
|
|
337
|
+
if has_envelope_pattern:
|
|
338
|
+
score = max(score, 0.95)
|
|
238
339
|
correlation = {v: i for i, v in enumerate(all_values)}
|
|
239
340
|
candidates.append(DiscriminatorCandidate(
|
|
240
341
|
field_name=field_name,
|
|
241
342
|
values=values,
|
|
242
343
|
correlation=correlation,
|
|
243
|
-
correlation_score=
|
|
344
|
+
correlation_score=score
|
|
244
345
|
))
|
|
245
346
|
continue
|
|
246
347
|
|
|
@@ -395,6 +496,24 @@ def infer_choice_type(
|
|
|
395
496
|
if best.correlation_score > 0.3:
|
|
396
497
|
clusters = _recluster_by_discriminator(documents, best)
|
|
397
498
|
|
|
499
|
+
# Fallback: if multi-cluster but no discriminator, try single-cluster analysis
|
|
500
|
+
# This handles cases where clustering threshold merged distinct types
|
|
501
|
+
if len(clusters) > 1 and not discriminators:
|
|
502
|
+
# Treat all documents as one cluster for discriminator detection
|
|
503
|
+
all_sigs = [d.field_signature for d in documents]
|
|
504
|
+
merged_sig = set().union(*all_sigs) if all_sigs else set()
|
|
505
|
+
required_sig = set(all_sigs[0]).intersection(*all_sigs[1:]) if len(all_sigs) > 1 else (set(all_sigs[0]) if all_sigs else set())
|
|
506
|
+
single_cluster = [SchemaCluster(
|
|
507
|
+
id=0,
|
|
508
|
+
documents=documents,
|
|
509
|
+
merged_signature=merged_sig,
|
|
510
|
+
required_fields=required_sig
|
|
511
|
+
)]
|
|
512
|
+
fallback_discriminators = _detect_discriminators(documents, single_cluster)
|
|
513
|
+
if fallback_discriminators and fallback_discriminators[0].correlation_score > 0.3:
|
|
514
|
+
discriminators = fallback_discriminators
|
|
515
|
+
clusters = _recluster_by_discriminator(documents, fallback_discriminators[0])
|
|
516
|
+
|
|
398
517
|
# Single cluster = check for nested discriminator or sparse data
|
|
399
518
|
if len(clusters) == 1:
|
|
400
519
|
nested_result = None
|
|
@@ -1163,7 +1163,8 @@
|
|
|
1163
1163
|
"type_name": "args.type_name",
|
|
1164
1164
|
"avro_namespace": "args.namespace",
|
|
1165
1165
|
"sample_size": "args.sample_size",
|
|
1166
|
-
"infer_choices": "args.infer_choices"
|
|
1166
|
+
"infer_choices": "args.infer_choices",
|
|
1167
|
+
"choice_depth": "args.choice_depth"
|
|
1167
1168
|
}
|
|
1168
1169
|
},
|
|
1169
1170
|
"extensions": [".json", ".jsonl", ".ndjson"],
|
|
@@ -1207,6 +1208,13 @@
|
|
|
1207
1208
|
"action": "store_true",
|
|
1208
1209
|
"help": "Detect discriminated unions and emit as Avro unions with discriminator field defaults",
|
|
1209
1210
|
"required": false
|
|
1211
|
+
},
|
|
1212
|
+
{
|
|
1213
|
+
"name": "--choice-depth",
|
|
1214
|
+
"type": "int",
|
|
1215
|
+
"help": "Maximum nesting depth for recursive choice inference (1 = root only, 2+ = nested objects)",
|
|
1216
|
+
"default": 1,
|
|
1217
|
+
"required": false
|
|
1210
1218
|
}
|
|
1211
1219
|
],
|
|
1212
1220
|
"suggested_output_file_path": "{input_file_name}.avsc",
|
|
@@ -1239,7 +1247,9 @@
|
|
|
1239
1247
|
"type_name": "args.type_name",
|
|
1240
1248
|
"base_id": "args.base_id",
|
|
1241
1249
|
"sample_size": "args.sample_size",
|
|
1242
|
-
"infer_choices": "args.infer_choices"
|
|
1250
|
+
"infer_choices": "args.infer_choices",
|
|
1251
|
+
"choice_depth": "args.choice_depth",
|
|
1252
|
+
"infer_enums": "args.infer_enums"
|
|
1243
1253
|
}
|
|
1244
1254
|
},
|
|
1245
1255
|
"extensions": [".json", ".jsonl", ".ndjson"],
|
|
@@ -1284,6 +1294,20 @@
|
|
|
1284
1294
|
"action": "store_true",
|
|
1285
1295
|
"help": "Detect discriminated unions and emit as choice types with discriminator field defaults",
|
|
1286
1296
|
"required": false
|
|
1297
|
+
},
|
|
1298
|
+
{
|
|
1299
|
+
"name": "--choice-depth",
|
|
1300
|
+
"type": "int",
|
|
1301
|
+
"help": "Maximum nesting depth for recursive choice inference (1 = root only, 2+ = nested objects)",
|
|
1302
|
+
"default": 1,
|
|
1303
|
+
"required": false
|
|
1304
|
+
},
|
|
1305
|
+
{
|
|
1306
|
+
"name": "--infer-enums",
|
|
1307
|
+
"type": "bool",
|
|
1308
|
+
"action": "store_true",
|
|
1309
|
+
"help": "Detect enum types from repeated string values with low cardinality",
|
|
1310
|
+
"required": false
|
|
1287
1311
|
}
|
|
1288
1312
|
],
|
|
1289
1313
|
"suggested_output_file_path": "{input_file_name}.jstruct.json",
|
|
@@ -22,7 +22,8 @@ def convert_json_to_avro(
|
|
|
22
22
|
type_name: str = 'Document',
|
|
23
23
|
avro_namespace: str = '',
|
|
24
24
|
sample_size: int = 0,
|
|
25
|
-
infer_choices: bool = False
|
|
25
|
+
infer_choices: bool = False,
|
|
26
|
+
choice_depth: int = 1
|
|
26
27
|
) -> None:
|
|
27
28
|
"""Infers Avro schema from JSON files.
|
|
28
29
|
|
|
@@ -37,6 +38,7 @@ def convert_json_to_avro(
|
|
|
37
38
|
avro_namespace: Namespace for generated Avro types
|
|
38
39
|
sample_size: Maximum number of records to sample (0 = all)
|
|
39
40
|
infer_choices: Detect discriminated unions and emit as Avro unions with discriminator defaults
|
|
41
|
+
choice_depth: Maximum nesting depth for recursive choice inference (1 = root only)
|
|
40
42
|
"""
|
|
41
43
|
if not input_files:
|
|
42
44
|
raise ValueError("At least one input file is required")
|
|
@@ -46,7 +48,7 @@ def convert_json_to_avro(
|
|
|
46
48
|
if not values:
|
|
47
49
|
raise ValueError("No valid JSON data found in input files")
|
|
48
50
|
|
|
49
|
-
inferrer = AvroSchemaInferrer(namespace=avro_namespace, infer_choices=infer_choices)
|
|
51
|
+
inferrer = AvroSchemaInferrer(namespace=avro_namespace, infer_choices=infer_choices, choice_depth=choice_depth)
|
|
50
52
|
schema = inferrer.infer_from_json_values(type_name, values)
|
|
51
53
|
|
|
52
54
|
# Ensure output directory exists
|
|
@@ -64,7 +66,9 @@ def convert_json_to_jstruct(
|
|
|
64
66
|
type_name: str = 'Document',
|
|
65
67
|
base_id: str = 'https://example.com/',
|
|
66
68
|
sample_size: int = 0,
|
|
67
|
-
infer_choices: bool = False
|
|
69
|
+
infer_choices: bool = False,
|
|
70
|
+
choice_depth: int = 1,
|
|
71
|
+
infer_enums: bool = False
|
|
68
72
|
) -> None:
|
|
69
73
|
"""Infers JSON Structure schema from JSON files.
|
|
70
74
|
|
|
@@ -78,6 +82,8 @@ def convert_json_to_jstruct(
|
|
|
78
82
|
base_id: Base URI for $id generation
|
|
79
83
|
sample_size: Maximum number of records to sample (0 = all)
|
|
80
84
|
infer_choices: Detect discriminated unions and emit as choice types with discriminator defaults
|
|
85
|
+
choice_depth: Maximum nesting depth for recursive choice inference (1 = root only)
|
|
86
|
+
infer_enums: Detect enum types from repeated string values with low cardinality
|
|
81
87
|
"""
|
|
82
88
|
if not input_files:
|
|
83
89
|
raise ValueError("At least one input file is required")
|
|
@@ -87,7 +93,8 @@ def convert_json_to_jstruct(
|
|
|
87
93
|
if not values:
|
|
88
94
|
raise ValueError("No valid JSON data found in input files")
|
|
89
95
|
|
|
90
|
-
inferrer = JsonStructureSchemaInferrer(base_id=base_id, infer_choices=infer_choices)
|
|
96
|
+
inferrer = JsonStructureSchemaInferrer(base_id=base_id, infer_choices=infer_choices,
|
|
97
|
+
choice_depth=choice_depth, infer_enums=infer_enums)
|
|
91
98
|
schema = inferrer.infer_from_json_values(type_name, values)
|
|
92
99
|
|
|
93
100
|
# Ensure output directory exists
|