structurize 3.2.2__tar.gz → 3.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {structurize-3.2.2/structurize.egg-info → structurize-3.3.1}/PKG-INFO +1 -1
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/_version.py +3 -3
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotize.py +4 -1
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/choice_inference.py +128 -2
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/commands.json +207 -11
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/jsontoschema.py +14 -13
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/kustotoavro.py +17 -14
- structurize-3.3.1/avrotize/kustotojstruct.py +247 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/schema_inference.py +989 -16
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/validate.py +11 -2
- {structurize-3.2.2 → structurize-3.3.1/structurize.egg-info}/PKG-INFO +1 -1
- {structurize-3.2.2 → structurize-3.3.1}/structurize.egg-info/SOURCES.txt +2 -0
- {structurize-3.2.2 → structurize-3.3.1}/.gitignore +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/LICENSE +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/MANIFEST.in +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/README.md +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/__init__.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/__main__.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/asn1toavro.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotocpp.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotocsharp.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotocsv.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotodatapackage.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotodb.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotogo.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotographql.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotoiceberg.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotojava.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotojs.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotojsons.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotojstruct.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotokusto.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotomd.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotools.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotoparquet.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotoproto.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotopython.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotorust.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotots.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotoxsd.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrovalidator.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/cddltostructure.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/common.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/constants.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/csvtoavro.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/datapackagetoavro.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/dependencies/cpp/vcpkg/vcpkg.json +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/dependencies/typescript/node22/package.json +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/dependency_resolver.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/dependency_version.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/jsonstoavro.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/jsonstostructure.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/jstructtoavro.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/kstructtoavro.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/openapitostructure.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/parquettoavro.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/proto2parser.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/proto3parser.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/prototoavro.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/sqltoavro.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretocddl.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretocpp.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretocsharp.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretocsv.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretodatapackage.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretodb.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretogo.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretographql.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretoiceberg.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretojava.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretojs.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretojsons.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretokusto.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretomd.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretoproto.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretopython.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretorust.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretots.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretoxsd.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/xmltoschema.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/avrotize/xsdtoavro.py +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/build.ps1 +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/build.sh +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/pyproject.toml +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/setup.cfg +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/structurize.egg-info/dependency_links.txt +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/structurize.egg-info/entry_points.txt +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/structurize.egg-info/requires.txt +0 -0
- {structurize-3.2.2 → structurize-3.3.1}/structurize.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: structurize
|
|
3
|
-
Version: 3.
|
|
3
|
+
Version: 3.3.1
|
|
4
4
|
Summary: Tools to convert from and to JSON Structure from various other schema languages.
|
|
5
5
|
Author-email: Clemens Vasters <clemensv@microsoft.com>
|
|
6
6
|
Classifier: Programming Language :: Python :: 3
|
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '3.
|
|
32
|
-
__version_tuple__ = version_tuple = (3,
|
|
31
|
+
__version__ = version = '3.3.1'
|
|
32
|
+
__version_tuple__ = version_tuple = (3, 3, 1)
|
|
33
33
|
|
|
34
|
-
__commit_id__ = commit_id = '
|
|
34
|
+
__commit_id__ = commit_id = 'g670e64099'
|
|
@@ -67,7 +67,10 @@ def dynamic_import(module, func):
|
|
|
67
67
|
def main():
|
|
68
68
|
"""Main function for the command line utility."""
|
|
69
69
|
commands = load_commands()
|
|
70
|
-
parser = argparse.ArgumentParser(
|
|
70
|
+
parser = argparse.ArgumentParser(
|
|
71
|
+
description='Convert a variety of schema formats to Avrotize schema and vice versa.',
|
|
72
|
+
fromfile_prefix_chars='@'
|
|
73
|
+
)
|
|
71
74
|
parser.add_argument('--version', action='store_true', help='Print the version of Avrotize.')
|
|
72
75
|
|
|
73
76
|
subparsers = parser.add_subparsers(dest='command')
|
|
@@ -214,6 +214,13 @@ def _detect_discriminators(
|
|
|
214
214
|
if len(values) < 2:
|
|
215
215
|
continue
|
|
216
216
|
|
|
217
|
+
# Skip boolean-like string values - these are flags, not discriminators
|
|
218
|
+
# A field with only "true"/"false" (or similar) values is not a type discriminator
|
|
219
|
+
normalized_values = {v.lower() if isinstance(v, str) else str(v).lower() for v in values}
|
|
220
|
+
boolean_values = {'true', 'false', 'yes', 'no', '0', '1'}
|
|
221
|
+
if normalized_values <= boolean_values:
|
|
222
|
+
continue
|
|
223
|
+
|
|
217
224
|
# Single cluster with multiple values - check if values create distinct groups
|
|
218
225
|
if len(clusters) == 1:
|
|
219
226
|
value_to_docs: Dict[str, List[DocumentInfo]] = defaultdict(list)
|
|
@@ -222,6 +229,23 @@ def _detect_discriminators(
|
|
|
222
229
|
value_to_docs[doc.field_values[field_name]].append(doc)
|
|
223
230
|
|
|
224
231
|
if len(value_to_docs) >= 2:
|
|
232
|
+
# Check 1: Does each value map to a consistent signature?
|
|
233
|
+
# A perfect discriminator has each value producing identical signatures
|
|
234
|
+
value_to_sigs: Dict[str, Set[tuple]] = {}
|
|
235
|
+
for val, val_docs in value_to_docs.items():
|
|
236
|
+
sigs = set(tuple(sorted(d.field_signature)) for d in val_docs)
|
|
237
|
+
value_to_sigs[val] = sigs
|
|
238
|
+
|
|
239
|
+
# Count values with perfectly consistent signatures (all docs same sig)
|
|
240
|
+
consistent_values = sum(1 for sigs in value_to_sigs.values() if len(sigs) == 1)
|
|
241
|
+
consistency_ratio = consistent_values / len(value_to_sigs)
|
|
242
|
+
|
|
243
|
+
# Check 2: Are signatures distinct across values?
|
|
244
|
+
all_primary_sigs = [list(sigs)[0] for sigs in value_to_sigs.values() if sigs]
|
|
245
|
+
distinct_sigs = set(all_primary_sigs)
|
|
246
|
+
distinctness_ratio = len(distinct_sigs) / len(all_primary_sigs) if all_primary_sigs else 0
|
|
247
|
+
|
|
248
|
+
# Check 3: Original inter-similarity check (relaxed to 0.85)
|
|
225
249
|
all_values = list(value_to_docs.keys())
|
|
226
250
|
inter_sims = []
|
|
227
251
|
for i, v1 in enumerate(all_values):
|
|
@@ -234,13 +258,97 @@ def _detect_discriminators(
|
|
|
234
258
|
|
|
235
259
|
avg_inter_sim = sum(inter_sims) / len(inter_sims) if inter_sims else 1.0
|
|
236
260
|
|
|
237
|
-
|
|
261
|
+
# Check 4: Discriminator-field correlation (envelope pattern)
|
|
262
|
+
# If discriminator value matches a unique payload field name, it's legitimate
|
|
263
|
+
# e.g., _subtype: "play" -> has field "play" that only appears for this value
|
|
264
|
+
discriminator_field_matches = 0
|
|
265
|
+
unique_fields_per_value: Dict[str, Set[str]] = {}
|
|
266
|
+
for disc_val, val_docs in value_to_docs.items():
|
|
267
|
+
if not val_docs:
|
|
268
|
+
continue
|
|
269
|
+
# Get fields unique to this discriminator value
|
|
270
|
+
this_sig = val_docs[0].field_signature
|
|
271
|
+
other_sigs = [d.field_signature for v, docs in value_to_docs.items()
|
|
272
|
+
if v != disc_val for d in docs[:1]]
|
|
273
|
+
if other_sigs:
|
|
274
|
+
common_with_others = this_sig.intersection(*other_sigs)
|
|
275
|
+
unique_fields = this_sig - common_with_others - {field_name}
|
|
276
|
+
unique_fields_per_value[disc_val] = unique_fields
|
|
277
|
+
# Check if discriminator value matches any unique field (case-insensitive)
|
|
278
|
+
disc_val_lower = disc_val.lower() if isinstance(disc_val, str) else str(disc_val).lower()
|
|
279
|
+
if any(uf.lower() == disc_val_lower for uf in unique_fields):
|
|
280
|
+
discriminator_field_matches += 1
|
|
281
|
+
|
|
282
|
+
has_envelope_pattern = discriminator_field_matches >= len(value_to_docs) * 0.5
|
|
283
|
+
|
|
284
|
+
# Check 5: Structural quality - detect sparse data false positives
|
|
285
|
+
# If unique fields are very few AND overlap across variants, it's likely sparse data
|
|
286
|
+
should_reject_sparse = False
|
|
287
|
+
if not has_envelope_pattern and unique_fields_per_value:
|
|
288
|
+
all_unique_fields = [ufs for ufs in unique_fields_per_value.values() if ufs]
|
|
289
|
+
if all_unique_fields:
|
|
290
|
+
# Count total unique fields across all variants
|
|
291
|
+
total_unique = set().union(*all_unique_fields)
|
|
292
|
+
avg_unique = sum(len(ufs) for ufs in all_unique_fields) / len(all_unique_fields)
|
|
293
|
+
|
|
294
|
+
# Check for "sparse optional field" pattern:
|
|
295
|
+
# - Very few unique fields per variant (1-2)
|
|
296
|
+
# - Moderate to high similarity (>0.6)
|
|
297
|
+
# - No envelope pattern
|
|
298
|
+
# - Few total samples per variant (could be sample noise)
|
|
299
|
+
# - Only 2 variants (binary split is more likely to be accidental)
|
|
300
|
+
# - NOT a subset pattern (where one variant is just base + extras)
|
|
301
|
+
min_samples = min(len(docs) for docs in value_to_docs.values())
|
|
302
|
+
num_variants = len(value_to_docs)
|
|
303
|
+
|
|
304
|
+
# Check for subset pattern (inheritance-like polymorphism)
|
|
305
|
+
# If one variant's signature is a subset of another, it's likely real
|
|
306
|
+
is_subset_pattern = False
|
|
307
|
+
all_sigs = [list(sigs)[0] for v, sigs in value_to_sigs.items() if sigs and len(sigs) == 1]
|
|
308
|
+
if len(all_sigs) >= 2:
|
|
309
|
+
for i, sig1 in enumerate(all_sigs):
|
|
310
|
+
for sig2 in all_sigs[i+1:]:
|
|
311
|
+
# Check if one is subset of the other (ignoring discriminator)
|
|
312
|
+
sig1_set = set(sig1) - {field_name}
|
|
313
|
+
sig2_set = set(sig2) - {field_name}
|
|
314
|
+
if sig1_set < sig2_set or sig2_set < sig1_set:
|
|
315
|
+
is_subset_pattern = True
|
|
316
|
+
break
|
|
317
|
+
if is_subset_pattern:
|
|
318
|
+
break
|
|
319
|
+
|
|
320
|
+
# Binary splits with few samples and minimal structural difference
|
|
321
|
+
# are most likely sparse data artifacts (unless it's a subset pattern)
|
|
322
|
+
if (num_variants == 2 and
|
|
323
|
+
avg_unique <= 1.5 and
|
|
324
|
+
avg_inter_sim > 0.6 and
|
|
325
|
+
min_samples < 5 and
|
|
326
|
+
not is_subset_pattern):
|
|
327
|
+
should_reject_sparse = True
|
|
328
|
+
|
|
329
|
+
if should_reject_sparse:
|
|
330
|
+
continue
|
|
331
|
+
|
|
332
|
+
# Accept if: (a) low similarity, OR (b) high consistency + distinct signatures,
|
|
333
|
+
# OR (c) envelope pattern detected
|
|
334
|
+
is_discriminator = (
|
|
335
|
+
avg_inter_sim < 0.7 or
|
|
336
|
+
(consistency_ratio > 0.9 and distinctness_ratio > 0.9) or
|
|
337
|
+
has_envelope_pattern
|
|
338
|
+
)
|
|
339
|
+
|
|
340
|
+
if is_discriminator:
|
|
341
|
+
# Use distinctness as correlation score when similarity is high
|
|
342
|
+
# Boost score if envelope pattern detected
|
|
343
|
+
score = max(1.0 - avg_inter_sim, distinctness_ratio)
|
|
344
|
+
if has_envelope_pattern:
|
|
345
|
+
score = max(score, 0.95)
|
|
238
346
|
correlation = {v: i for i, v in enumerate(all_values)}
|
|
239
347
|
candidates.append(DiscriminatorCandidate(
|
|
240
348
|
field_name=field_name,
|
|
241
349
|
values=values,
|
|
242
350
|
correlation=correlation,
|
|
243
|
-
correlation_score=
|
|
351
|
+
correlation_score=score
|
|
244
352
|
))
|
|
245
353
|
continue
|
|
246
354
|
|
|
@@ -395,6 +503,24 @@ def infer_choice_type(
|
|
|
395
503
|
if best.correlation_score > 0.3:
|
|
396
504
|
clusters = _recluster_by_discriminator(documents, best)
|
|
397
505
|
|
|
506
|
+
# Fallback: if multi-cluster but no discriminator, try single-cluster analysis
|
|
507
|
+
# This handles cases where clustering threshold merged distinct types
|
|
508
|
+
if len(clusters) > 1 and not discriminators:
|
|
509
|
+
# Treat all documents as one cluster for discriminator detection
|
|
510
|
+
all_sigs = [d.field_signature for d in documents]
|
|
511
|
+
merged_sig = set().union(*all_sigs) if all_sigs else set()
|
|
512
|
+
required_sig = set(all_sigs[0]).intersection(*all_sigs[1:]) if len(all_sigs) > 1 else (set(all_sigs[0]) if all_sigs else set())
|
|
513
|
+
single_cluster = [SchemaCluster(
|
|
514
|
+
id=0,
|
|
515
|
+
documents=documents,
|
|
516
|
+
merged_signature=merged_sig,
|
|
517
|
+
required_fields=required_sig
|
|
518
|
+
)]
|
|
519
|
+
fallback_discriminators = _detect_discriminators(documents, single_cluster)
|
|
520
|
+
if fallback_discriminators and fallback_discriminators[0].correlation_score > 0.3:
|
|
521
|
+
discriminators = fallback_discriminators
|
|
522
|
+
clusters = _recluster_by_discriminator(documents, fallback_discriminators[0])
|
|
523
|
+
|
|
398
524
|
# Single cluster = check for nested discriminator or sparse data
|
|
399
525
|
if len(clusters) == 1:
|
|
400
526
|
nested_result = None
|
|
@@ -764,7 +764,10 @@
|
|
|
764
764
|
"avro_namespace": "args.namespace",
|
|
765
765
|
"avro_schema_file": "output_file_path",
|
|
766
766
|
"emit_cloudevents": "args.emit_cloudevents",
|
|
767
|
-
"emit_cloudevents_xregistry": "args.emit_xregistry"
|
|
767
|
+
"emit_cloudevents_xregistry": "args.emit_xregistry",
|
|
768
|
+
"sample_size": "args.sample_size",
|
|
769
|
+
"infer_choices": "args.infer_choices",
|
|
770
|
+
"choice_depth": "args.choice_depth"
|
|
768
771
|
}
|
|
769
772
|
},
|
|
770
773
|
"extensions": [
|
|
@@ -819,6 +822,26 @@
|
|
|
819
822
|
"type": "bool",
|
|
820
823
|
"help": "Emit an xRegistry manifest with CloudEvents declarations for each table instead of a single Avrotize schema",
|
|
821
824
|
"required": false
|
|
825
|
+
},
|
|
826
|
+
{
|
|
827
|
+
"name": "--sample-size",
|
|
828
|
+
"type": "int",
|
|
829
|
+
"help": "Maximum number of records to sample for dynamic field inference (0 = all)",
|
|
830
|
+
"default": 100,
|
|
831
|
+
"required": false
|
|
832
|
+
},
|
|
833
|
+
{
|
|
834
|
+
"name": "--infer-choices",
|
|
835
|
+
"type": "bool",
|
|
836
|
+
"help": "Detect discriminated unions in dynamic fields and emit as Avro unions with discriminator defaults",
|
|
837
|
+
"required": false
|
|
838
|
+
},
|
|
839
|
+
{
|
|
840
|
+
"name": "--choice-depth",
|
|
841
|
+
"type": "int",
|
|
842
|
+
"help": "Maximum nesting depth for recursive choice inference in dynamic fields (1 = root only)",
|
|
843
|
+
"default": 1,
|
|
844
|
+
"required": false
|
|
822
845
|
}
|
|
823
846
|
],
|
|
824
847
|
"suggested_output_file_path": "{kusto_database}.avsc",
|
|
@@ -846,6 +869,131 @@
|
|
|
846
869
|
],
|
|
847
870
|
"skip_input_file_handling": true
|
|
848
871
|
},
|
|
872
|
+
{
|
|
873
|
+
"command": "k2s",
|
|
874
|
+
"description": "Convert Kusto schema to JSON Structure schema",
|
|
875
|
+
"group": "1_Schemas",
|
|
876
|
+
"function": {
|
|
877
|
+
"name": "avrotize.kustotojstruct.convert_kusto_to_jstruct",
|
|
878
|
+
"args": {
|
|
879
|
+
"kusto_uri": "args.kusto_uri",
|
|
880
|
+
"kusto_database": "args.kusto_database",
|
|
881
|
+
"table_name": "args.table_name",
|
|
882
|
+
"base_id": "args.base_id",
|
|
883
|
+
"jstruct_schema_file": "output_file_path",
|
|
884
|
+
"emit_cloudevents": "args.emit_cloudevents",
|
|
885
|
+
"emit_cloudevents_xregistry": "args.emit_xregistry",
|
|
886
|
+
"sample_size": "args.sample_size",
|
|
887
|
+
"infer_choices": "args.infer_choices",
|
|
888
|
+
"choice_depth": "args.choice_depth",
|
|
889
|
+
"infer_enums": "args.infer_enums"
|
|
890
|
+
}
|
|
891
|
+
},
|
|
892
|
+
"extensions": [
|
|
893
|
+
".kusto"
|
|
894
|
+
],
|
|
895
|
+
"args": [
|
|
896
|
+
{
|
|
897
|
+
"name": "input",
|
|
898
|
+
"type": "str",
|
|
899
|
+
"nargs": "?",
|
|
900
|
+
"help": "Kusto file",
|
|
901
|
+
"required": false
|
|
902
|
+
},
|
|
903
|
+
{
|
|
904
|
+
"name": "--out",
|
|
905
|
+
"type": "str",
|
|
906
|
+
"help": "Path to the JSON Structure schema file",
|
|
907
|
+
"required": false
|
|
908
|
+
},
|
|
909
|
+
{
|
|
910
|
+
"name": "--kusto-uri",
|
|
911
|
+
"type": "str",
|
|
912
|
+
"help": "Kusto Cluster URI",
|
|
913
|
+
"required": false
|
|
914
|
+
},
|
|
915
|
+
{
|
|
916
|
+
"name": "--kusto-database",
|
|
917
|
+
"type": "str",
|
|
918
|
+
"help": "Kusto database",
|
|
919
|
+
"required": false
|
|
920
|
+
},
|
|
921
|
+
{
|
|
922
|
+
"name": "--table-name",
|
|
923
|
+
"type": "str",
|
|
924
|
+
"help": "Kusto table name",
|
|
925
|
+
"required": false
|
|
926
|
+
},
|
|
927
|
+
{
|
|
928
|
+
"name": "--base-id",
|
|
929
|
+
"type": "str",
|
|
930
|
+
"help": "Base URI for $id generation",
|
|
931
|
+
"required": false
|
|
932
|
+
},
|
|
933
|
+
{
|
|
934
|
+
"name": "--emit-cloudevents",
|
|
935
|
+
"type": "bool",
|
|
936
|
+
"help": "Emit CloudEvents declarations for each table",
|
|
937
|
+
"required": false
|
|
938
|
+
},
|
|
939
|
+
{
|
|
940
|
+
"name": "--emit-xregistry",
|
|
941
|
+
"type": "bool",
|
|
942
|
+
"help": "Emit an xRegistry manifest with CloudEvents declarations for each table instead of a single JSON Structure schema",
|
|
943
|
+
"required": false
|
|
944
|
+
},
|
|
945
|
+
{
|
|
946
|
+
"name": "--sample-size",
|
|
947
|
+
"type": "int",
|
|
948
|
+
"help": "Maximum number of records to sample for dynamic field inference (0 = all)",
|
|
949
|
+
"default": 100,
|
|
950
|
+
"required": false
|
|
951
|
+
},
|
|
952
|
+
{
|
|
953
|
+
"name": "--infer-choices",
|
|
954
|
+
"type": "bool",
|
|
955
|
+
"help": "Detect discriminated unions in dynamic fields and emit as choice types with discriminator defaults",
|
|
956
|
+
"required": false
|
|
957
|
+
},
|
|
958
|
+
{
|
|
959
|
+
"name": "--choice-depth",
|
|
960
|
+
"type": "int",
|
|
961
|
+
"help": "Maximum nesting depth for recursive choice inference in dynamic fields (1 = root only)",
|
|
962
|
+
"default": 1,
|
|
963
|
+
"required": false
|
|
964
|
+
},
|
|
965
|
+
{
|
|
966
|
+
"name": "--infer-enums",
|
|
967
|
+
"type": "bool",
|
|
968
|
+
"help": "Detect enum types from repeated string values with low cardinality in dynamic fields",
|
|
969
|
+
"required": false
|
|
970
|
+
}
|
|
971
|
+
],
|
|
972
|
+
"suggested_output_file_path": "{kusto_database}.jstruct.json",
|
|
973
|
+
"prompts": [
|
|
974
|
+
{
|
|
975
|
+
"name": "--base-id",
|
|
976
|
+
"message": "Enter the base URI for $id generation",
|
|
977
|
+
"type": "str",
|
|
978
|
+
"required": false
|
|
979
|
+
},
|
|
980
|
+
{
|
|
981
|
+
"name": "--emit-cloudevents",
|
|
982
|
+
"message": "Emit CloudEvents declarations for each table?",
|
|
983
|
+
"type": "bool",
|
|
984
|
+
"default": false,
|
|
985
|
+
"required": false
|
|
986
|
+
},
|
|
987
|
+
{
|
|
988
|
+
"name": "--emit-xregistry",
|
|
989
|
+
"message": "Emit an xRegistry manifest with CloudEvents declarations?",
|
|
990
|
+
"type": "bool",
|
|
991
|
+
"default": false,
|
|
992
|
+
"required": false
|
|
993
|
+
}
|
|
994
|
+
],
|
|
995
|
+
"skip_input_file_handling": true
|
|
996
|
+
},
|
|
849
997
|
{
|
|
850
998
|
"command": "a2sql",
|
|
851
999
|
"description": "Convert Avrotize schema to SQL schema",
|
|
@@ -1163,10 +1311,15 @@
|
|
|
1163
1311
|
"type_name": "args.type_name",
|
|
1164
1312
|
"avro_namespace": "args.namespace",
|
|
1165
1313
|
"sample_size": "args.sample_size",
|
|
1166
|
-
"infer_choices": "args.infer_choices"
|
|
1314
|
+
"infer_choices": "args.infer_choices",
|
|
1315
|
+
"choice_depth": "args.choice_depth"
|
|
1167
1316
|
}
|
|
1168
1317
|
},
|
|
1169
|
-
"extensions": [
|
|
1318
|
+
"extensions": [
|
|
1319
|
+
".json",
|
|
1320
|
+
".jsonl",
|
|
1321
|
+
".ndjson"
|
|
1322
|
+
],
|
|
1170
1323
|
"args": [
|
|
1171
1324
|
{
|
|
1172
1325
|
"name": "input",
|
|
@@ -1207,6 +1360,13 @@
|
|
|
1207
1360
|
"action": "store_true",
|
|
1208
1361
|
"help": "Detect discriminated unions and emit as Avro unions with discriminator field defaults",
|
|
1209
1362
|
"required": false
|
|
1363
|
+
},
|
|
1364
|
+
{
|
|
1365
|
+
"name": "--choice-depth",
|
|
1366
|
+
"type": "int",
|
|
1367
|
+
"help": "Maximum nesting depth for recursive choice inference (1 = root only, 2+ = nested objects)",
|
|
1368
|
+
"default": 1,
|
|
1369
|
+
"required": false
|
|
1210
1370
|
}
|
|
1211
1371
|
],
|
|
1212
1372
|
"suggested_output_file_path": "{input_file_name}.avsc",
|
|
@@ -1239,10 +1399,16 @@
|
|
|
1239
1399
|
"type_name": "args.type_name",
|
|
1240
1400
|
"base_id": "args.base_id",
|
|
1241
1401
|
"sample_size": "args.sample_size",
|
|
1242
|
-
"infer_choices": "args.infer_choices"
|
|
1402
|
+
"infer_choices": "args.infer_choices",
|
|
1403
|
+
"choice_depth": "args.choice_depth",
|
|
1404
|
+
"infer_enums": "args.infer_enums"
|
|
1243
1405
|
}
|
|
1244
1406
|
},
|
|
1245
|
-
"extensions": [
|
|
1407
|
+
"extensions": [
|
|
1408
|
+
".json",
|
|
1409
|
+
".jsonl",
|
|
1410
|
+
".ndjson"
|
|
1411
|
+
],
|
|
1246
1412
|
"args": [
|
|
1247
1413
|
{
|
|
1248
1414
|
"name": "input",
|
|
@@ -1284,6 +1450,20 @@
|
|
|
1284
1450
|
"action": "store_true",
|
|
1285
1451
|
"help": "Detect discriminated unions and emit as choice types with discriminator field defaults",
|
|
1286
1452
|
"required": false
|
|
1453
|
+
},
|
|
1454
|
+
{
|
|
1455
|
+
"name": "--choice-depth",
|
|
1456
|
+
"type": "int",
|
|
1457
|
+
"help": "Maximum nesting depth for recursive choice inference (1 = root only, 2+ = nested objects)",
|
|
1458
|
+
"default": 1,
|
|
1459
|
+
"required": false
|
|
1460
|
+
},
|
|
1461
|
+
{
|
|
1462
|
+
"name": "--infer-enums",
|
|
1463
|
+
"type": "bool",
|
|
1464
|
+
"action": "store_true",
|
|
1465
|
+
"help": "Detect enum types from repeated string values with low cardinality",
|
|
1466
|
+
"required": false
|
|
1287
1467
|
}
|
|
1288
1468
|
],
|
|
1289
1469
|
"suggested_output_file_path": "{input_file_name}.jstruct.json",
|
|
@@ -1319,7 +1499,9 @@
|
|
|
1319
1499
|
"sample_size": "args.sample_size"
|
|
1320
1500
|
}
|
|
1321
1501
|
},
|
|
1322
|
-
"extensions": [
|
|
1502
|
+
"extensions": [
|
|
1503
|
+
".xml"
|
|
1504
|
+
],
|
|
1323
1505
|
"args": [
|
|
1324
1506
|
{
|
|
1325
1507
|
"name": "input",
|
|
@@ -1387,7 +1569,9 @@
|
|
|
1387
1569
|
"sample_size": "args.sample_size"
|
|
1388
1570
|
}
|
|
1389
1571
|
},
|
|
1390
|
-
"extensions": [
|
|
1572
|
+
"extensions": [
|
|
1573
|
+
".xml"
|
|
1574
|
+
],
|
|
1391
1575
|
"args": [
|
|
1392
1576
|
{
|
|
1393
1577
|
"name": "input",
|
|
@@ -1456,7 +1640,10 @@
|
|
|
1456
1640
|
"quiet": "args.quiet"
|
|
1457
1641
|
}
|
|
1458
1642
|
},
|
|
1459
|
-
"extensions": [
|
|
1643
|
+
"extensions": [
|
|
1644
|
+
".json",
|
|
1645
|
+
".jsonl"
|
|
1646
|
+
],
|
|
1460
1647
|
"args": [
|
|
1461
1648
|
{
|
|
1462
1649
|
"name": "input",
|
|
@@ -1643,7 +1830,10 @@
|
|
|
1643
1830
|
"name": "--format",
|
|
1644
1831
|
"type": "str",
|
|
1645
1832
|
"help": "Output format: 'arrow' for binary Arrow IPC (default), 'schema' for JSON",
|
|
1646
|
-
"choices": [
|
|
1833
|
+
"choices": [
|
|
1834
|
+
"schema",
|
|
1835
|
+
"arrow"
|
|
1836
|
+
],
|
|
1647
1837
|
"default": "arrow",
|
|
1648
1838
|
"required": false
|
|
1649
1839
|
}
|
|
@@ -1707,7 +1897,10 @@
|
|
|
1707
1897
|
"name": "--format",
|
|
1708
1898
|
"type": "str",
|
|
1709
1899
|
"help": "Output format: 'arrow' for binary Arrow IPC (default), 'schema' for JSON",
|
|
1710
|
-
"choices": [
|
|
1900
|
+
"choices": [
|
|
1901
|
+
"schema",
|
|
1902
|
+
"arrow"
|
|
1903
|
+
],
|
|
1711
1904
|
"default": "arrow",
|
|
1712
1905
|
"required": false
|
|
1713
1906
|
}
|
|
@@ -2676,7 +2869,10 @@
|
|
|
2676
2869
|
"avro_annotation": "args.avro_annotation"
|
|
2677
2870
|
}
|
|
2678
2871
|
},
|
|
2679
|
-
"extensions": [
|
|
2872
|
+
"extensions": [
|
|
2873
|
+
".struct.json",
|
|
2874
|
+
".json"
|
|
2875
|
+
],
|
|
2680
2876
|
"args": [
|
|
2681
2877
|
{
|
|
2682
2878
|
"name": "input",
|
|
@@ -22,7 +22,8 @@ def convert_json_to_avro(
|
|
|
22
22
|
type_name: str = 'Document',
|
|
23
23
|
avro_namespace: str = '',
|
|
24
24
|
sample_size: int = 0,
|
|
25
|
-
infer_choices: bool = False
|
|
25
|
+
infer_choices: bool = False,
|
|
26
|
+
choice_depth: int = 1
|
|
26
27
|
) -> None:
|
|
27
28
|
"""Infers Avro schema from JSON files.
|
|
28
29
|
|
|
@@ -37,6 +38,7 @@ def convert_json_to_avro(
|
|
|
37
38
|
avro_namespace: Namespace for generated Avro types
|
|
38
39
|
sample_size: Maximum number of records to sample (0 = all)
|
|
39
40
|
infer_choices: Detect discriminated unions and emit as Avro unions with discriminator defaults
|
|
41
|
+
choice_depth: Maximum nesting depth for recursive choice inference (1 = root only)
|
|
40
42
|
"""
|
|
41
43
|
if not input_files:
|
|
42
44
|
raise ValueError("At least one input file is required")
|
|
@@ -46,7 +48,7 @@ def convert_json_to_avro(
|
|
|
46
48
|
if not values:
|
|
47
49
|
raise ValueError("No valid JSON data found in input files")
|
|
48
50
|
|
|
49
|
-
inferrer = AvroSchemaInferrer(namespace=avro_namespace, infer_choices=infer_choices)
|
|
51
|
+
inferrer = AvroSchemaInferrer(namespace=avro_namespace, infer_choices=infer_choices, choice_depth=choice_depth)
|
|
50
52
|
schema = inferrer.infer_from_json_values(type_name, values)
|
|
51
53
|
|
|
52
54
|
# Ensure output directory exists
|
|
@@ -64,7 +66,9 @@ def convert_json_to_jstruct(
|
|
|
64
66
|
type_name: str = 'Document',
|
|
65
67
|
base_id: str = 'https://example.com/',
|
|
66
68
|
sample_size: int = 0,
|
|
67
|
-
infer_choices: bool = False
|
|
69
|
+
infer_choices: bool = False,
|
|
70
|
+
choice_depth: int = 1,
|
|
71
|
+
infer_enums: bool = False
|
|
68
72
|
) -> None:
|
|
69
73
|
"""Infers JSON Structure schema from JSON files.
|
|
70
74
|
|
|
@@ -78,6 +82,8 @@ def convert_json_to_jstruct(
|
|
|
78
82
|
base_id: Base URI for $id generation
|
|
79
83
|
sample_size: Maximum number of records to sample (0 = all)
|
|
80
84
|
infer_choices: Detect discriminated unions and emit as choice types with discriminator defaults
|
|
85
|
+
choice_depth: Maximum nesting depth for recursive choice inference (1 = root only)
|
|
86
|
+
infer_enums: Detect enum types from repeated string values with low cardinality
|
|
81
87
|
"""
|
|
82
88
|
if not input_files:
|
|
83
89
|
raise ValueError("At least one input file is required")
|
|
@@ -87,7 +93,8 @@ def convert_json_to_jstruct(
|
|
|
87
93
|
if not values:
|
|
88
94
|
raise ValueError("No valid JSON data found in input files")
|
|
89
95
|
|
|
90
|
-
inferrer = JsonStructureSchemaInferrer(base_id=base_id, infer_choices=infer_choices
|
|
96
|
+
inferrer = JsonStructureSchemaInferrer(base_id=base_id, infer_choices=infer_choices,
|
|
97
|
+
choice_depth=choice_depth, infer_enums=infer_enums)
|
|
91
98
|
schema = inferrer.infer_from_json_values(type_name, values)
|
|
92
99
|
|
|
93
100
|
# Ensure output directory exists
|
|
@@ -103,7 +110,7 @@ def _load_json_values(input_files: List[str], sample_size: int) -> List[Any]:
|
|
|
103
110
|
"""Loads JSON values from files.
|
|
104
111
|
|
|
105
112
|
Handles both single JSON documents and JSON Lines (JSONL) files.
|
|
106
|
-
|
|
113
|
+
Top-level arrays are treated as single array values, not flattened.
|
|
107
114
|
|
|
108
115
|
Args:
|
|
109
116
|
input_files: List of file paths
|
|
@@ -127,14 +134,8 @@ def _load_json_values(input_files: List[str], sample_size: int) -> List[Any]:
|
|
|
127
134
|
# Try parsing as a single JSON document first
|
|
128
135
|
try:
|
|
129
136
|
data = json.loads(content)
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
for item in data:
|
|
133
|
-
values.append(item)
|
|
134
|
-
if sample_size > 0 and len(values) >= sample_size:
|
|
135
|
-
break
|
|
136
|
-
else:
|
|
137
|
-
values.append(data)
|
|
137
|
+
# Treat any valid JSON (including arrays) as a single value
|
|
138
|
+
values.append(data)
|
|
138
139
|
continue
|
|
139
140
|
except json.JSONDecodeError:
|
|
140
141
|
pass
|
|
@@ -7,6 +7,7 @@ from typing import Any, Dict, List, Tuple
|
|
|
7
7
|
from azure.kusto.data import KustoClient, KustoConnectionStringBuilder
|
|
8
8
|
from avrotize.common import get_tree_hash
|
|
9
9
|
from avrotize.constants import AVRO_VERSION
|
|
10
|
+
from avrotize.schema_inference import AvroSchemaInferrer
|
|
10
11
|
|
|
11
12
|
JsonNode = Dict[str, 'JsonNode'] | List['JsonNode'] | str | bool | int | None
|
|
12
13
|
|
|
@@ -14,7 +15,7 @@ JsonNode = Dict[str, 'JsonNode'] | List['JsonNode'] | str | bool | int | None
|
|
|
14
15
|
class KustoToAvro:
|
|
15
16
|
""" Converts Kusto table schemas to Avro schema format."""
|
|
16
17
|
|
|
17
|
-
def __init__(self, kusto_uri, kusto_database, table_name: str | None, avro_namespace: str, avro_schema_path, emit_cloudevents: bool, emit_cloudevents_xregistry: bool, token_provider=None):
|
|
18
|
+
def __init__(self, kusto_uri, kusto_database, table_name: str | None, avro_namespace: str, avro_schema_path, emit_cloudevents: bool, emit_cloudevents_xregistry: bool, token_provider=None, sample_size: int = 100, infer_choices: bool = False, choice_depth: int = 1):
|
|
18
19
|
""" Initializes the KustoToAvro class with the Kusto URI and database name. """
|
|
19
20
|
kcsb = KustoConnectionStringBuilder.with_az_cli_authentication(kusto_uri) if not token_provider else KustoConnectionStringBuilder.with_token_provider(kusto_uri, token_provider)
|
|
20
21
|
self.client = KustoClient(kcsb)
|
|
@@ -24,6 +25,9 @@ class KustoToAvro:
|
|
|
24
25
|
self.avro_schema_path = avro_schema_path
|
|
25
26
|
self.emit_xregistry = emit_cloudevents_xregistry
|
|
26
27
|
self.emit_cloudevents = emit_cloudevents or emit_cloudevents_xregistry
|
|
28
|
+
self.sample_size = sample_size if sample_size > 0 else 100
|
|
29
|
+
self.infer_choices = infer_choices
|
|
30
|
+
self.choice_depth = choice_depth
|
|
27
31
|
if self.emit_xregistry:
|
|
28
32
|
if not self.avro_namespace:
|
|
29
33
|
raise ValueError(
|
|
@@ -172,20 +176,19 @@ class KustoToAvro:
|
|
|
172
176
|
type_value: The value of the type column (if any)
|
|
173
177
|
"""
|
|
174
178
|
type_column_name = type_column['Name'] if type_column else None
|
|
175
|
-
query = f"{table_name}"+(f' | where {type_column_name}=="{type_value}"' if type_column_name and type_value else '') + f" | project {column_name} | take
|
|
179
|
+
query = f"{table_name}"+(f' | where {type_column_name}=="{type_value}"' if type_column_name and type_value else '') + f" | project {column_name} | take {self.sample_size}"
|
|
176
180
|
rows = self.client.execute(self.kusto_database, query)
|
|
177
181
|
values = [row[column_name] for row in rows.primary_results[0]]
|
|
178
182
|
type_name = type_value if type_value else f"{table_name}.{column_name}"
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
return "string"
|
|
183
|
+
|
|
184
|
+
# Use the new AvroSchemaInferrer for consistent inference
|
|
185
|
+
inferrer = AvroSchemaInferrer(
|
|
186
|
+
namespace=self.avro_namespace,
|
|
187
|
+
altnames_key='kql',
|
|
188
|
+
infer_choices=self.infer_choices,
|
|
189
|
+
choice_depth=self.choice_depth
|
|
190
|
+
)
|
|
191
|
+
return inferrer.infer_from_json_values(type_name, values)
|
|
189
192
|
|
|
190
193
|
type_map : Dict[str, JsonNode] = {
|
|
191
194
|
"int": "int",
|
|
@@ -440,7 +443,7 @@ class KustoToAvro:
|
|
|
440
443
|
json.dump(output, avro_file, indent=4)
|
|
441
444
|
|
|
442
445
|
|
|
443
|
-
def convert_kusto_to_avro(kusto_uri: str, kusto_database: str, table_name: str | None, avro_namespace: str, avro_schema_file: str, emit_cloudevents:bool, emit_cloudevents_xregistry: bool, token_provider=None):
|
|
446
|
+
def convert_kusto_to_avro(kusto_uri: str, kusto_database: str, table_name: str | None, avro_namespace: str, avro_schema_file: str, emit_cloudevents:bool, emit_cloudevents_xregistry: bool, token_provider=None, sample_size: int = 100, infer_choices: bool = False, choice_depth: int = 1):
|
|
444
447
|
""" Converts Kusto table schemas to Avro schema format."""
|
|
445
448
|
|
|
446
449
|
if not kusto_uri:
|
|
@@ -451,5 +454,5 @@ def convert_kusto_to_avro(kusto_uri: str, kusto_database: str, table_name: str |
|
|
|
451
454
|
avro_namespace = kusto_database
|
|
452
455
|
|
|
453
456
|
kusto_to_avro = KustoToAvro(
|
|
454
|
-
kusto_uri, kusto_database, table_name, avro_namespace, avro_schema_file,emit_cloudevents, emit_cloudevents_xregistry, token_provider=token_provider)
|
|
457
|
+
kusto_uri, kusto_database, table_name, avro_namespace, avro_schema_file, emit_cloudevents, emit_cloudevents_xregistry, token_provider=token_provider, sample_size=sample_size, infer_choices=infer_choices, choice_depth=choice_depth)
|
|
455
458
|
return kusto_to_avro.process_all_tables()
|