structurize 3.2.2__tar.gz → 3.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. {structurize-3.2.2/structurize.egg-info → structurize-3.3.1}/PKG-INFO +1 -1
  2. {structurize-3.2.2 → structurize-3.3.1}/avrotize/_version.py +3 -3
  3. {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotize.py +4 -1
  4. {structurize-3.2.2 → structurize-3.3.1}/avrotize/choice_inference.py +128 -2
  5. {structurize-3.2.2 → structurize-3.3.1}/avrotize/commands.json +207 -11
  6. {structurize-3.2.2 → structurize-3.3.1}/avrotize/jsontoschema.py +14 -13
  7. {structurize-3.2.2 → structurize-3.3.1}/avrotize/kustotoavro.py +17 -14
  8. structurize-3.3.1/avrotize/kustotojstruct.py +247 -0
  9. {structurize-3.2.2 → structurize-3.3.1}/avrotize/schema_inference.py +989 -16
  10. {structurize-3.2.2 → structurize-3.3.1}/avrotize/validate.py +11 -2
  11. {structurize-3.2.2 → structurize-3.3.1/structurize.egg-info}/PKG-INFO +1 -1
  12. {structurize-3.2.2 → structurize-3.3.1}/structurize.egg-info/SOURCES.txt +2 -0
  13. {structurize-3.2.2 → structurize-3.3.1}/.gitignore +0 -0
  14. {structurize-3.2.2 → structurize-3.3.1}/LICENSE +0 -0
  15. {structurize-3.2.2 → structurize-3.3.1}/MANIFEST.in +0 -0
  16. {structurize-3.2.2 → structurize-3.3.1}/README.md +0 -0
  17. {structurize-3.2.2 → structurize-3.3.1}/avrotize/__init__.py +0 -0
  18. {structurize-3.2.2 → structurize-3.3.1}/avrotize/__main__.py +0 -0
  19. {structurize-3.2.2 → structurize-3.3.1}/avrotize/asn1toavro.py +0 -0
  20. {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotocpp.py +0 -0
  21. {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotocsharp.py +0 -0
  22. {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotocsv.py +0 -0
  23. {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotodatapackage.py +0 -0
  24. {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotodb.py +0 -0
  25. {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotogo.py +0 -0
  26. {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotographql.py +0 -0
  27. {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotoiceberg.py +0 -0
  28. {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotojava.py +0 -0
  29. {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotojs.py +0 -0
  30. {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotojsons.py +0 -0
  31. {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotojstruct.py +0 -0
  32. {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotokusto.py +0 -0
  33. {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotomd.py +0 -0
  34. {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotools.py +0 -0
  35. {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotoparquet.py +0 -0
  36. {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotoproto.py +0 -0
  37. {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotopython.py +0 -0
  38. {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotorust.py +0 -0
  39. {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotots.py +0 -0
  40. {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrotoxsd.py +0 -0
  41. {structurize-3.2.2 → structurize-3.3.1}/avrotize/avrovalidator.py +0 -0
  42. {structurize-3.2.2 → structurize-3.3.1}/avrotize/cddltostructure.py +0 -0
  43. {structurize-3.2.2 → structurize-3.3.1}/avrotize/common.py +0 -0
  44. {structurize-3.2.2 → structurize-3.3.1}/avrotize/constants.py +0 -0
  45. {structurize-3.2.2 → structurize-3.3.1}/avrotize/csvtoavro.py +0 -0
  46. {structurize-3.2.2 → structurize-3.3.1}/avrotize/datapackagetoavro.py +0 -0
  47. {structurize-3.2.2 → structurize-3.3.1}/avrotize/dependencies/cpp/vcpkg/vcpkg.json +0 -0
  48. {structurize-3.2.2 → structurize-3.3.1}/avrotize/dependencies/typescript/node22/package.json +0 -0
  49. {structurize-3.2.2 → structurize-3.3.1}/avrotize/dependency_resolver.py +0 -0
  50. {structurize-3.2.2 → structurize-3.3.1}/avrotize/dependency_version.py +0 -0
  51. {structurize-3.2.2 → structurize-3.3.1}/avrotize/jsonstoavro.py +0 -0
  52. {structurize-3.2.2 → structurize-3.3.1}/avrotize/jsonstostructure.py +0 -0
  53. {structurize-3.2.2 → structurize-3.3.1}/avrotize/jstructtoavro.py +0 -0
  54. {structurize-3.2.2 → structurize-3.3.1}/avrotize/kstructtoavro.py +0 -0
  55. {structurize-3.2.2 → structurize-3.3.1}/avrotize/openapitostructure.py +0 -0
  56. {structurize-3.2.2 → structurize-3.3.1}/avrotize/parquettoavro.py +0 -0
  57. {structurize-3.2.2 → structurize-3.3.1}/avrotize/proto2parser.py +0 -0
  58. {structurize-3.2.2 → structurize-3.3.1}/avrotize/proto3parser.py +0 -0
  59. {structurize-3.2.2 → structurize-3.3.1}/avrotize/prototoavro.py +0 -0
  60. {structurize-3.2.2 → structurize-3.3.1}/avrotize/sqltoavro.py +0 -0
  61. {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretocddl.py +0 -0
  62. {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretocpp.py +0 -0
  63. {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretocsharp.py +0 -0
  64. {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretocsv.py +0 -0
  65. {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretodatapackage.py +0 -0
  66. {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretodb.py +0 -0
  67. {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretogo.py +0 -0
  68. {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretographql.py +0 -0
  69. {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretoiceberg.py +0 -0
  70. {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretojava.py +0 -0
  71. {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretojs.py +0 -0
  72. {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretojsons.py +0 -0
  73. {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretokusto.py +0 -0
  74. {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretomd.py +0 -0
  75. {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretoproto.py +0 -0
  76. {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretopython.py +0 -0
  77. {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretorust.py +0 -0
  78. {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretots.py +0 -0
  79. {structurize-3.2.2 → structurize-3.3.1}/avrotize/structuretoxsd.py +0 -0
  80. {structurize-3.2.2 → structurize-3.3.1}/avrotize/xmltoschema.py +0 -0
  81. {structurize-3.2.2 → structurize-3.3.1}/avrotize/xsdtoavro.py +0 -0
  82. {structurize-3.2.2 → structurize-3.3.1}/build.ps1 +0 -0
  83. {structurize-3.2.2 → structurize-3.3.1}/build.sh +0 -0
  84. {structurize-3.2.2 → structurize-3.3.1}/pyproject.toml +0 -0
  85. {structurize-3.2.2 → structurize-3.3.1}/setup.cfg +0 -0
  86. {structurize-3.2.2 → structurize-3.3.1}/structurize.egg-info/dependency_links.txt +0 -0
  87. {structurize-3.2.2 → structurize-3.3.1}/structurize.egg-info/entry_points.txt +0 -0
  88. {structurize-3.2.2 → structurize-3.3.1}/structurize.egg-info/requires.txt +0 -0
  89. {structurize-3.2.2 → structurize-3.3.1}/structurize.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: structurize
3
- Version: 3.2.2
3
+ Version: 3.3.1
4
4
  Summary: Tools to convert from and to JSON Structure from various other schema languages.
5
5
  Author-email: Clemens Vasters <clemensv@microsoft.com>
6
6
  Classifier: Programming Language :: Python :: 3
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '3.2.2'
32
- __version_tuple__ = version_tuple = (3, 2, 2)
31
+ __version__ = version = '3.3.1'
32
+ __version_tuple__ = version_tuple = (3, 3, 1)
33
33
 
34
- __commit_id__ = commit_id = 'g4fc2eb920'
34
+ __commit_id__ = commit_id = 'g670e64099'
@@ -67,7 +67,10 @@ def dynamic_import(module, func):
67
67
  def main():
68
68
  """Main function for the command line utility."""
69
69
  commands = load_commands()
70
- parser = argparse.ArgumentParser(description='Convert a variety of schema formats to Avrotize schema and vice versa.')
70
+ parser = argparse.ArgumentParser(
71
+ description='Convert a variety of schema formats to Avrotize schema and vice versa.',
72
+ fromfile_prefix_chars='@'
73
+ )
71
74
  parser.add_argument('--version', action='store_true', help='Print the version of Avrotize.')
72
75
 
73
76
  subparsers = parser.add_subparsers(dest='command')
@@ -214,6 +214,13 @@ def _detect_discriminators(
214
214
  if len(values) < 2:
215
215
  continue
216
216
 
217
+ # Skip boolean-like string values - these are flags, not discriminators
218
+ # A field with only "true"/"false" (or similar) values is not a type discriminator
219
+ normalized_values = {v.lower() if isinstance(v, str) else str(v).lower() for v in values}
220
+ boolean_values = {'true', 'false', 'yes', 'no', '0', '1'}
221
+ if normalized_values <= boolean_values:
222
+ continue
223
+
217
224
  # Single cluster with multiple values - check if values create distinct groups
218
225
  if len(clusters) == 1:
219
226
  value_to_docs: Dict[str, List[DocumentInfo]] = defaultdict(list)
@@ -222,6 +229,23 @@ def _detect_discriminators(
222
229
  value_to_docs[doc.field_values[field_name]].append(doc)
223
230
 
224
231
  if len(value_to_docs) >= 2:
232
+ # Check 1: Does each value map to a consistent signature?
233
+ # A perfect discriminator has each value producing identical signatures
234
+ value_to_sigs: Dict[str, Set[tuple]] = {}
235
+ for val, val_docs in value_to_docs.items():
236
+ sigs = set(tuple(sorted(d.field_signature)) for d in val_docs)
237
+ value_to_sigs[val] = sigs
238
+
239
+ # Count values with perfectly consistent signatures (all docs same sig)
240
+ consistent_values = sum(1 for sigs in value_to_sigs.values() if len(sigs) == 1)
241
+ consistency_ratio = consistent_values / len(value_to_sigs)
242
+
243
+ # Check 2: Are signatures distinct across values?
244
+ all_primary_sigs = [list(sigs)[0] for sigs in value_to_sigs.values() if sigs]
245
+ distinct_sigs = set(all_primary_sigs)
246
+ distinctness_ratio = len(distinct_sigs) / len(all_primary_sigs) if all_primary_sigs else 0
247
+
248
+ # Check 3: Original inter-similarity check (relaxed to 0.85)
225
249
  all_values = list(value_to_docs.keys())
226
250
  inter_sims = []
227
251
  for i, v1 in enumerate(all_values):
@@ -234,13 +258,97 @@ def _detect_discriminators(
234
258
 
235
259
  avg_inter_sim = sum(inter_sims) / len(inter_sims) if inter_sims else 1.0
236
260
 
237
- if avg_inter_sim < 0.7:
261
+ # Check 4: Discriminator-field correlation (envelope pattern)
262
+ # If discriminator value matches a unique payload field name, it's legitimate
263
+ # e.g., _subtype: "play" -> has field "play" that only appears for this value
264
+ discriminator_field_matches = 0
265
+ unique_fields_per_value: Dict[str, Set[str]] = {}
266
+ for disc_val, val_docs in value_to_docs.items():
267
+ if not val_docs:
268
+ continue
269
+ # Get fields unique to this discriminator value
270
+ this_sig = val_docs[0].field_signature
271
+ other_sigs = [d.field_signature for v, docs in value_to_docs.items()
272
+ if v != disc_val for d in docs[:1]]
273
+ if other_sigs:
274
+ common_with_others = this_sig.intersection(*other_sigs)
275
+ unique_fields = this_sig - common_with_others - {field_name}
276
+ unique_fields_per_value[disc_val] = unique_fields
277
+ # Check if discriminator value matches any unique field (case-insensitive)
278
+ disc_val_lower = disc_val.lower() if isinstance(disc_val, str) else str(disc_val).lower()
279
+ if any(uf.lower() == disc_val_lower for uf in unique_fields):
280
+ discriminator_field_matches += 1
281
+
282
+ has_envelope_pattern = discriminator_field_matches >= len(value_to_docs) * 0.5
283
+
284
+ # Check 5: Structural quality - detect sparse data false positives
285
+ # If unique fields are very few AND overlap across variants, it's likely sparse data
286
+ should_reject_sparse = False
287
+ if not has_envelope_pattern and unique_fields_per_value:
288
+ all_unique_fields = [ufs for ufs in unique_fields_per_value.values() if ufs]
289
+ if all_unique_fields:
290
+ # Count total unique fields across all variants
291
+ total_unique = set().union(*all_unique_fields)
292
+ avg_unique = sum(len(ufs) for ufs in all_unique_fields) / len(all_unique_fields)
293
+
294
+ # Check for "sparse optional field" pattern:
295
+ # - Very few unique fields per variant (1-2)
296
+ # - Moderate to high similarity (>0.6)
297
+ # - No envelope pattern
298
+ # - Few total samples per variant (could be sample noise)
299
+ # - Only 2 variants (binary split is more likely to be accidental)
300
+ # - NOT a subset pattern (where one variant is just base + extras)
301
+ min_samples = min(len(docs) for docs in value_to_docs.values())
302
+ num_variants = len(value_to_docs)
303
+
304
+ # Check for subset pattern (inheritance-like polymorphism)
305
+ # If one variant's signature is a subset of another, it's likely real
306
+ is_subset_pattern = False
307
+ all_sigs = [list(sigs)[0] for v, sigs in value_to_sigs.items() if sigs and len(sigs) == 1]
308
+ if len(all_sigs) >= 2:
309
+ for i, sig1 in enumerate(all_sigs):
310
+ for sig2 in all_sigs[i+1:]:
311
+ # Check if one is subset of the other (ignoring discriminator)
312
+ sig1_set = set(sig1) - {field_name}
313
+ sig2_set = set(sig2) - {field_name}
314
+ if sig1_set < sig2_set or sig2_set < sig1_set:
315
+ is_subset_pattern = True
316
+ break
317
+ if is_subset_pattern:
318
+ break
319
+
320
+ # Binary splits with few samples and minimal structural difference
321
+ # are most likely sparse data artifacts (unless it's a subset pattern)
322
+ if (num_variants == 2 and
323
+ avg_unique <= 1.5 and
324
+ avg_inter_sim > 0.6 and
325
+ min_samples < 5 and
326
+ not is_subset_pattern):
327
+ should_reject_sparse = True
328
+
329
+ if should_reject_sparse:
330
+ continue
331
+
332
+ # Accept if: (a) low similarity, OR (b) high consistency + distinct signatures,
333
+ # OR (c) envelope pattern detected
334
+ is_discriminator = (
335
+ avg_inter_sim < 0.7 or
336
+ (consistency_ratio > 0.9 and distinctness_ratio > 0.9) or
337
+ has_envelope_pattern
338
+ )
339
+
340
+ if is_discriminator:
341
+ # Use distinctness as correlation score when similarity is high
342
+ # Boost score if envelope pattern detected
343
+ score = max(1.0 - avg_inter_sim, distinctness_ratio)
344
+ if has_envelope_pattern:
345
+ score = max(score, 0.95)
238
346
  correlation = {v: i for i, v in enumerate(all_values)}
239
347
  candidates.append(DiscriminatorCandidate(
240
348
  field_name=field_name,
241
349
  values=values,
242
350
  correlation=correlation,
243
- correlation_score=1.0 - avg_inter_sim
351
+ correlation_score=score
244
352
  ))
245
353
  continue
246
354
 
@@ -395,6 +503,24 @@ def infer_choice_type(
395
503
  if best.correlation_score > 0.3:
396
504
  clusters = _recluster_by_discriminator(documents, best)
397
505
 
506
+ # Fallback: if multi-cluster but no discriminator, try single-cluster analysis
507
+ # This handles cases where clustering threshold merged distinct types
508
+ if len(clusters) > 1 and not discriminators:
509
+ # Treat all documents as one cluster for discriminator detection
510
+ all_sigs = [d.field_signature for d in documents]
511
+ merged_sig = set().union(*all_sigs) if all_sigs else set()
512
+ required_sig = set(all_sigs[0]).intersection(*all_sigs[1:]) if len(all_sigs) > 1 else (set(all_sigs[0]) if all_sigs else set())
513
+ single_cluster = [SchemaCluster(
514
+ id=0,
515
+ documents=documents,
516
+ merged_signature=merged_sig,
517
+ required_fields=required_sig
518
+ )]
519
+ fallback_discriminators = _detect_discriminators(documents, single_cluster)
520
+ if fallback_discriminators and fallback_discriminators[0].correlation_score > 0.3:
521
+ discriminators = fallback_discriminators
522
+ clusters = _recluster_by_discriminator(documents, fallback_discriminators[0])
523
+
398
524
  # Single cluster = check for nested discriminator or sparse data
399
525
  if len(clusters) == 1:
400
526
  nested_result = None
@@ -764,7 +764,10 @@
764
764
  "avro_namespace": "args.namespace",
765
765
  "avro_schema_file": "output_file_path",
766
766
  "emit_cloudevents": "args.emit_cloudevents",
767
- "emit_cloudevents_xregistry": "args.emit_xregistry"
767
+ "emit_cloudevents_xregistry": "args.emit_xregistry",
768
+ "sample_size": "args.sample_size",
769
+ "infer_choices": "args.infer_choices",
770
+ "choice_depth": "args.choice_depth"
768
771
  }
769
772
  },
770
773
  "extensions": [
@@ -819,6 +822,26 @@
819
822
  "type": "bool",
820
823
  "help": "Emit an xRegistry manifest with CloudEvents declarations for each table instead of a single Avrotize schema",
821
824
  "required": false
825
+ },
826
+ {
827
+ "name": "--sample-size",
828
+ "type": "int",
829
+ "help": "Maximum number of records to sample for dynamic field inference (0 = all)",
830
+ "default": 100,
831
+ "required": false
832
+ },
833
+ {
834
+ "name": "--infer-choices",
835
+ "type": "bool",
836
+ "help": "Detect discriminated unions in dynamic fields and emit as Avro unions with discriminator defaults",
837
+ "required": false
838
+ },
839
+ {
840
+ "name": "--choice-depth",
841
+ "type": "int",
842
+ "help": "Maximum nesting depth for recursive choice inference in dynamic fields (1 = root only)",
843
+ "default": 1,
844
+ "required": false
822
845
  }
823
846
  ],
824
847
  "suggested_output_file_path": "{kusto_database}.avsc",
@@ -846,6 +869,131 @@
846
869
  ],
847
870
  "skip_input_file_handling": true
848
871
  },
872
+ {
873
+ "command": "k2s",
874
+ "description": "Convert Kusto schema to JSON Structure schema",
875
+ "group": "1_Schemas",
876
+ "function": {
877
+ "name": "avrotize.kustotojstruct.convert_kusto_to_jstruct",
878
+ "args": {
879
+ "kusto_uri": "args.kusto_uri",
880
+ "kusto_database": "args.kusto_database",
881
+ "table_name": "args.table_name",
882
+ "base_id": "args.base_id",
883
+ "jstruct_schema_file": "output_file_path",
884
+ "emit_cloudevents": "args.emit_cloudevents",
885
+ "emit_cloudevents_xregistry": "args.emit_xregistry",
886
+ "sample_size": "args.sample_size",
887
+ "infer_choices": "args.infer_choices",
888
+ "choice_depth": "args.choice_depth",
889
+ "infer_enums": "args.infer_enums"
890
+ }
891
+ },
892
+ "extensions": [
893
+ ".kusto"
894
+ ],
895
+ "args": [
896
+ {
897
+ "name": "input",
898
+ "type": "str",
899
+ "nargs": "?",
900
+ "help": "Kusto file",
901
+ "required": false
902
+ },
903
+ {
904
+ "name": "--out",
905
+ "type": "str",
906
+ "help": "Path to the JSON Structure schema file",
907
+ "required": false
908
+ },
909
+ {
910
+ "name": "--kusto-uri",
911
+ "type": "str",
912
+ "help": "Kusto Cluster URI",
913
+ "required": false
914
+ },
915
+ {
916
+ "name": "--kusto-database",
917
+ "type": "str",
918
+ "help": "Kusto database",
919
+ "required": false
920
+ },
921
+ {
922
+ "name": "--table-name",
923
+ "type": "str",
924
+ "help": "Kusto table name",
925
+ "required": false
926
+ },
927
+ {
928
+ "name": "--base-id",
929
+ "type": "str",
930
+ "help": "Base URI for $id generation",
931
+ "required": false
932
+ },
933
+ {
934
+ "name": "--emit-cloudevents",
935
+ "type": "bool",
936
+ "help": "Emit CloudEvents declarations for each table",
937
+ "required": false
938
+ },
939
+ {
940
+ "name": "--emit-xregistry",
941
+ "type": "bool",
942
+ "help": "Emit an xRegistry manifest with CloudEvents declarations for each table instead of a single JSON Structure schema",
943
+ "required": false
944
+ },
945
+ {
946
+ "name": "--sample-size",
947
+ "type": "int",
948
+ "help": "Maximum number of records to sample for dynamic field inference (0 = all)",
949
+ "default": 100,
950
+ "required": false
951
+ },
952
+ {
953
+ "name": "--infer-choices",
954
+ "type": "bool",
955
+ "help": "Detect discriminated unions in dynamic fields and emit as choice types with discriminator defaults",
956
+ "required": false
957
+ },
958
+ {
959
+ "name": "--choice-depth",
960
+ "type": "int",
961
+ "help": "Maximum nesting depth for recursive choice inference in dynamic fields (1 = root only)",
962
+ "default": 1,
963
+ "required": false
964
+ },
965
+ {
966
+ "name": "--infer-enums",
967
+ "type": "bool",
968
+ "help": "Detect enum types from repeated string values with low cardinality in dynamic fields",
969
+ "required": false
970
+ }
971
+ ],
972
+ "suggested_output_file_path": "{kusto_database}.jstruct.json",
973
+ "prompts": [
974
+ {
975
+ "name": "--base-id",
976
+ "message": "Enter the base URI for $id generation",
977
+ "type": "str",
978
+ "required": false
979
+ },
980
+ {
981
+ "name": "--emit-cloudevents",
982
+ "message": "Emit CloudEvents declarations for each table?",
983
+ "type": "bool",
984
+ "default": false,
985
+ "required": false
986
+ },
987
+ {
988
+ "name": "--emit-xregistry",
989
+ "message": "Emit an xRegistry manifest with CloudEvents declarations?",
990
+ "type": "bool",
991
+ "default": false,
992
+ "required": false
993
+ }
994
+ ],
995
+ "skip_input_file_handling": true
996
+ },
849
997
  {
850
998
  "command": "a2sql",
851
999
  "description": "Convert Avrotize schema to SQL schema",
@@ -1163,10 +1311,15 @@
1163
1311
  "type_name": "args.type_name",
1164
1312
  "avro_namespace": "args.namespace",
1165
1313
  "sample_size": "args.sample_size",
1166
- "infer_choices": "args.infer_choices"
1314
+ "infer_choices": "args.infer_choices",
1315
+ "choice_depth": "args.choice_depth"
1167
1316
  }
1168
1317
  },
1169
- "extensions": [".json", ".jsonl", ".ndjson"],
1318
+ "extensions": [
1319
+ ".json",
1320
+ ".jsonl",
1321
+ ".ndjson"
1322
+ ],
1170
1323
  "args": [
1171
1324
  {
1172
1325
  "name": "input",
@@ -1207,6 +1360,13 @@
1207
1360
  "action": "store_true",
1208
1361
  "help": "Detect discriminated unions and emit as Avro unions with discriminator field defaults",
1209
1362
  "required": false
1363
+ },
1364
+ {
1365
+ "name": "--choice-depth",
1366
+ "type": "int",
1367
+ "help": "Maximum nesting depth for recursive choice inference (1 = root only, 2+ = nested objects)",
1368
+ "default": 1,
1369
+ "required": false
1210
1370
  }
1211
1371
  ],
1212
1372
  "suggested_output_file_path": "{input_file_name}.avsc",
@@ -1239,10 +1399,16 @@
1239
1399
  "type_name": "args.type_name",
1240
1400
  "base_id": "args.base_id",
1241
1401
  "sample_size": "args.sample_size",
1242
- "infer_choices": "args.infer_choices"
1402
+ "infer_choices": "args.infer_choices",
1403
+ "choice_depth": "args.choice_depth",
1404
+ "infer_enums": "args.infer_enums"
1243
1405
  }
1244
1406
  },
1245
- "extensions": [".json", ".jsonl", ".ndjson"],
1407
+ "extensions": [
1408
+ ".json",
1409
+ ".jsonl",
1410
+ ".ndjson"
1411
+ ],
1246
1412
  "args": [
1247
1413
  {
1248
1414
  "name": "input",
@@ -1284,6 +1450,20 @@
1284
1450
  "action": "store_true",
1285
1451
  "help": "Detect discriminated unions and emit as choice types with discriminator field defaults",
1286
1452
  "required": false
1453
+ },
1454
+ {
1455
+ "name": "--choice-depth",
1456
+ "type": "int",
1457
+ "help": "Maximum nesting depth for recursive choice inference (1 = root only, 2+ = nested objects)",
1458
+ "default": 1,
1459
+ "required": false
1460
+ },
1461
+ {
1462
+ "name": "--infer-enums",
1463
+ "type": "bool",
1464
+ "action": "store_true",
1465
+ "help": "Detect enum types from repeated string values with low cardinality",
1466
+ "required": false
1287
1467
  }
1288
1468
  ],
1289
1469
  "suggested_output_file_path": "{input_file_name}.jstruct.json",
@@ -1319,7 +1499,9 @@
1319
1499
  "sample_size": "args.sample_size"
1320
1500
  }
1321
1501
  },
1322
- "extensions": [".xml"],
1502
+ "extensions": [
1503
+ ".xml"
1504
+ ],
1323
1505
  "args": [
1324
1506
  {
1325
1507
  "name": "input",
@@ -1387,7 +1569,9 @@
1387
1569
  "sample_size": "args.sample_size"
1388
1570
  }
1389
1571
  },
1390
- "extensions": [".xml"],
1572
+ "extensions": [
1573
+ ".xml"
1574
+ ],
1391
1575
  "args": [
1392
1576
  {
1393
1577
  "name": "input",
@@ -1456,7 +1640,10 @@
1456
1640
  "quiet": "args.quiet"
1457
1641
  }
1458
1642
  },
1459
- "extensions": [".json", ".jsonl"],
1643
+ "extensions": [
1644
+ ".json",
1645
+ ".jsonl"
1646
+ ],
1460
1647
  "args": [
1461
1648
  {
1462
1649
  "name": "input",
@@ -1643,7 +1830,10 @@
1643
1830
  "name": "--format",
1644
1831
  "type": "str",
1645
1832
  "help": "Output format: 'arrow' for binary Arrow IPC (default), 'schema' for JSON",
1646
- "choices": ["schema", "arrow"],
1833
+ "choices": [
1834
+ "schema",
1835
+ "arrow"
1836
+ ],
1647
1837
  "default": "arrow",
1648
1838
  "required": false
1649
1839
  }
@@ -1707,7 +1897,10 @@
1707
1897
  "name": "--format",
1708
1898
  "type": "str",
1709
1899
  "help": "Output format: 'arrow' for binary Arrow IPC (default), 'schema' for JSON",
1710
- "choices": ["schema", "arrow"],
1900
+ "choices": [
1901
+ "schema",
1902
+ "arrow"
1903
+ ],
1711
1904
  "default": "arrow",
1712
1905
  "required": false
1713
1906
  }
@@ -2676,7 +2869,10 @@
2676
2869
  "avro_annotation": "args.avro_annotation"
2677
2870
  }
2678
2871
  },
2679
- "extensions": [".struct.json", ".json"],
2872
+ "extensions": [
2873
+ ".struct.json",
2874
+ ".json"
2875
+ ],
2680
2876
  "args": [
2681
2877
  {
2682
2878
  "name": "input",
@@ -22,7 +22,8 @@ def convert_json_to_avro(
22
22
  type_name: str = 'Document',
23
23
  avro_namespace: str = '',
24
24
  sample_size: int = 0,
25
- infer_choices: bool = False
25
+ infer_choices: bool = False,
26
+ choice_depth: int = 1
26
27
  ) -> None:
27
28
  """Infers Avro schema from JSON files.
28
29
 
@@ -37,6 +38,7 @@ def convert_json_to_avro(
37
38
  avro_namespace: Namespace for generated Avro types
38
39
  sample_size: Maximum number of records to sample (0 = all)
39
40
  infer_choices: Detect discriminated unions and emit as Avro unions with discriminator defaults
41
+ choice_depth: Maximum nesting depth for recursive choice inference (1 = root only)
40
42
  """
41
43
  if not input_files:
42
44
  raise ValueError("At least one input file is required")
@@ -46,7 +48,7 @@ def convert_json_to_avro(
46
48
  if not values:
47
49
  raise ValueError("No valid JSON data found in input files")
48
50
 
49
- inferrer = AvroSchemaInferrer(namespace=avro_namespace, infer_choices=infer_choices)
51
+ inferrer = AvroSchemaInferrer(namespace=avro_namespace, infer_choices=infer_choices, choice_depth=choice_depth)
50
52
  schema = inferrer.infer_from_json_values(type_name, values)
51
53
 
52
54
  # Ensure output directory exists
@@ -64,7 +66,9 @@ def convert_json_to_jstruct(
64
66
  type_name: str = 'Document',
65
67
  base_id: str = 'https://example.com/',
66
68
  sample_size: int = 0,
67
- infer_choices: bool = False
69
+ infer_choices: bool = False,
70
+ choice_depth: int = 1,
71
+ infer_enums: bool = False
68
72
  ) -> None:
69
73
  """Infers JSON Structure schema from JSON files.
70
74
 
@@ -78,6 +82,8 @@ def convert_json_to_jstruct(
78
82
  base_id: Base URI for $id generation
79
83
  sample_size: Maximum number of records to sample (0 = all)
80
84
  infer_choices: Detect discriminated unions and emit as choice types with discriminator defaults
85
+ choice_depth: Maximum nesting depth for recursive choice inference (1 = root only)
86
+ infer_enums: Detect enum types from repeated string values with low cardinality
81
87
  """
82
88
  if not input_files:
83
89
  raise ValueError("At least one input file is required")
@@ -87,7 +93,8 @@ def convert_json_to_jstruct(
87
93
  if not values:
88
94
  raise ValueError("No valid JSON data found in input files")
89
95
 
90
- inferrer = JsonStructureSchemaInferrer(base_id=base_id, infer_choices=infer_choices)
96
+ inferrer = JsonStructureSchemaInferrer(base_id=base_id, infer_choices=infer_choices,
97
+ choice_depth=choice_depth, infer_enums=infer_enums)
91
98
  schema = inferrer.infer_from_json_values(type_name, values)
92
99
 
93
100
  # Ensure output directory exists
@@ -103,7 +110,7 @@ def _load_json_values(input_files: List[str], sample_size: int) -> List[Any]:
103
110
  """Loads JSON values from files.
104
111
 
105
112
  Handles both single JSON documents and JSON Lines (JSONL) files.
106
- Arrays at the root level are flattened into individual values.
113
+ Top-level arrays are treated as single array values, not flattened.
107
114
 
108
115
  Args:
109
116
  input_files: List of file paths
@@ -127,14 +134,8 @@ def _load_json_values(input_files: List[str], sample_size: int) -> List[Any]:
127
134
  # Try parsing as a single JSON document first
128
135
  try:
129
136
  data = json.loads(content)
130
- if isinstance(data, list):
131
- # Root-level array: each element is a separate value
132
- for item in data:
133
- values.append(item)
134
- if sample_size > 0 and len(values) >= sample_size:
135
- break
136
- else:
137
- values.append(data)
137
+ # Treat any valid JSON (including arrays) as a single value
138
+ values.append(data)
138
139
  continue
139
140
  except json.JSONDecodeError:
140
141
  pass
@@ -7,6 +7,7 @@ from typing import Any, Dict, List, Tuple
7
7
  from azure.kusto.data import KustoClient, KustoConnectionStringBuilder
8
8
  from avrotize.common import get_tree_hash
9
9
  from avrotize.constants import AVRO_VERSION
10
+ from avrotize.schema_inference import AvroSchemaInferrer
10
11
 
11
12
  JsonNode = Dict[str, 'JsonNode'] | List['JsonNode'] | str | bool | int | None
12
13
 
@@ -14,7 +15,7 @@ JsonNode = Dict[str, 'JsonNode'] | List['JsonNode'] | str | bool | int | None
14
15
  class KustoToAvro:
15
16
  """ Converts Kusto table schemas to Avro schema format."""
16
17
 
17
- def __init__(self, kusto_uri, kusto_database, table_name: str | None, avro_namespace: str, avro_schema_path, emit_cloudevents: bool, emit_cloudevents_xregistry: bool, token_provider=None):
18
+ def __init__(self, kusto_uri, kusto_database, table_name: str | None, avro_namespace: str, avro_schema_path, emit_cloudevents: bool, emit_cloudevents_xregistry: bool, token_provider=None, sample_size: int = 100, infer_choices: bool = False, choice_depth: int = 1):
18
19
  """ Initializes the KustoToAvro class with the Kusto URI and database name. """
19
20
  kcsb = KustoConnectionStringBuilder.with_az_cli_authentication(kusto_uri) if not token_provider else KustoConnectionStringBuilder.with_token_provider(kusto_uri, token_provider)
20
21
  self.client = KustoClient(kcsb)
@@ -24,6 +25,9 @@ class KustoToAvro:
24
25
  self.avro_schema_path = avro_schema_path
25
26
  self.emit_xregistry = emit_cloudevents_xregistry
26
27
  self.emit_cloudevents = emit_cloudevents or emit_cloudevents_xregistry
28
+ self.sample_size = sample_size if sample_size > 0 else 100
29
+ self.infer_choices = infer_choices
30
+ self.choice_depth = choice_depth
27
31
  if self.emit_xregistry:
28
32
  if not self.avro_namespace:
29
33
  raise ValueError(
@@ -172,20 +176,19 @@ class KustoToAvro:
172
176
  type_value: The value of the type column (if any)
173
177
  """
174
178
  type_column_name = type_column['Name'] if type_column else None
175
- query = f"{table_name}"+(f' | where {type_column_name}=="{type_value}"' if type_column_name and type_value else '') + f" | project {column_name} | take 100"
179
+ query = f"{table_name}"+(f' | where {type_column_name}=="{type_value}"' if type_column_name and type_value else '') + f" | project {column_name} | take {self.sample_size}"
176
180
  rows = self.client.execute(self.kusto_database, query)
177
181
  values = [row[column_name] for row in rows.primary_results[0]]
178
182
  type_name = type_value if type_value else f"{table_name}.{column_name}"
179
- unique_types = self.consolidated_type_list(type_name, values)
180
- if len(unique_types) > 1:
181
- # Using a union of inferred types
182
- return unique_types
183
- elif len(unique_types) == 1:
184
- # Single type, no need for union
185
- return unique_types[0]
186
- else:
187
- # No values, default to string
188
- return "string"
183
+
184
+ # Use the new AvroSchemaInferrer for consistent inference
185
+ inferrer = AvroSchemaInferrer(
186
+ namespace=self.avro_namespace,
187
+ altnames_key='kql',
188
+ infer_choices=self.infer_choices,
189
+ choice_depth=self.choice_depth
190
+ )
191
+ return inferrer.infer_from_json_values(type_name, values)
189
192
 
190
193
  type_map : Dict[str, JsonNode] = {
191
194
  "int": "int",
@@ -440,7 +443,7 @@ class KustoToAvro:
440
443
  json.dump(output, avro_file, indent=4)
441
444
 
442
445
 
443
- def convert_kusto_to_avro(kusto_uri: str, kusto_database: str, table_name: str | None, avro_namespace: str, avro_schema_file: str, emit_cloudevents:bool, emit_cloudevents_xregistry: bool, token_provider=None):
446
+ def convert_kusto_to_avro(kusto_uri: str, kusto_database: str, table_name: str | None, avro_namespace: str, avro_schema_file: str, emit_cloudevents:bool, emit_cloudevents_xregistry: bool, token_provider=None, sample_size: int = 100, infer_choices: bool = False, choice_depth: int = 1):
444
447
  """ Converts Kusto table schemas to Avro schema format."""
445
448
 
446
449
  if not kusto_uri:
@@ -451,5 +454,5 @@ def convert_kusto_to_avro(kusto_uri: str, kusto_database: str, table_name: str |
451
454
  avro_namespace = kusto_database
452
455
 
453
456
  kusto_to_avro = KustoToAvro(
454
- kusto_uri, kusto_database, table_name, avro_namespace, avro_schema_file,emit_cloudevents, emit_cloudevents_xregistry, token_provider=token_provider)
457
+ kusto_uri, kusto_database, table_name, avro_namespace, avro_schema_file, emit_cloudevents, emit_cloudevents_xregistry, token_provider=token_provider, sample_size=sample_size, infer_choices=infer_choices, choice_depth=choice_depth)
455
458
  return kusto_to_avro.process_all_tables()