structurize 3.3.0__tar.gz → 3.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. {structurize-3.3.0/structurize.egg-info → structurize-3.3.1}/PKG-INFO +1 -1
  2. {structurize-3.3.0 → structurize-3.3.1}/avrotize/_version.py +3 -3
  3. {structurize-3.3.0 → structurize-3.3.1}/avrotize/choice_inference.py +7 -0
  4. {structurize-3.3.0 → structurize-3.3.1}/avrotize/commands.json +181 -9
  5. {structurize-3.3.0 → structurize-3.3.1}/avrotize/jsontoschema.py +3 -9
  6. {structurize-3.3.0 → structurize-3.3.1}/avrotize/kustotoavro.py +17 -14
  7. structurize-3.3.1/avrotize/kustotojstruct.py +247 -0
  8. {structurize-3.3.0 → structurize-3.3.1}/avrotize/schema_inference.py +206 -13
  9. {structurize-3.3.0 → structurize-3.3.1}/avrotize/validate.py +11 -2
  10. {structurize-3.3.0 → structurize-3.3.1/structurize.egg-info}/PKG-INFO +1 -1
  11. {structurize-3.3.0 → structurize-3.3.1}/structurize.egg-info/SOURCES.txt +2 -0
  12. {structurize-3.3.0 → structurize-3.3.1}/.gitignore +0 -0
  13. {structurize-3.3.0 → structurize-3.3.1}/LICENSE +0 -0
  14. {structurize-3.3.0 → structurize-3.3.1}/MANIFEST.in +0 -0
  15. {structurize-3.3.0 → structurize-3.3.1}/README.md +0 -0
  16. {structurize-3.3.0 → structurize-3.3.1}/avrotize/__init__.py +0 -0
  17. {structurize-3.3.0 → structurize-3.3.1}/avrotize/__main__.py +0 -0
  18. {structurize-3.3.0 → structurize-3.3.1}/avrotize/asn1toavro.py +0 -0
  19. {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotize.py +0 -0
  20. {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotocpp.py +0 -0
  21. {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotocsharp.py +0 -0
  22. {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotocsv.py +0 -0
  23. {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotodatapackage.py +0 -0
  24. {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotodb.py +0 -0
  25. {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotogo.py +0 -0
  26. {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotographql.py +0 -0
  27. {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotoiceberg.py +0 -0
  28. {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotojava.py +0 -0
  29. {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotojs.py +0 -0
  30. {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotojsons.py +0 -0
  31. {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotojstruct.py +0 -0
  32. {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotokusto.py +0 -0
  33. {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotomd.py +0 -0
  34. {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotools.py +0 -0
  35. {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotoparquet.py +0 -0
  36. {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotoproto.py +0 -0
  37. {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotopython.py +0 -0
  38. {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotorust.py +0 -0
  39. {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotots.py +0 -0
  40. {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotoxsd.py +0 -0
  41. {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrovalidator.py +0 -0
  42. {structurize-3.3.0 → structurize-3.3.1}/avrotize/cddltostructure.py +0 -0
  43. {structurize-3.3.0 → structurize-3.3.1}/avrotize/common.py +0 -0
  44. {structurize-3.3.0 → structurize-3.3.1}/avrotize/constants.py +0 -0
  45. {structurize-3.3.0 → structurize-3.3.1}/avrotize/csvtoavro.py +0 -0
  46. {structurize-3.3.0 → structurize-3.3.1}/avrotize/datapackagetoavro.py +0 -0
  47. {structurize-3.3.0 → structurize-3.3.1}/avrotize/dependencies/cpp/vcpkg/vcpkg.json +0 -0
  48. {structurize-3.3.0 → structurize-3.3.1}/avrotize/dependencies/typescript/node22/package.json +0 -0
  49. {structurize-3.3.0 → structurize-3.3.1}/avrotize/dependency_resolver.py +0 -0
  50. {structurize-3.3.0 → structurize-3.3.1}/avrotize/dependency_version.py +0 -0
  51. {structurize-3.3.0 → structurize-3.3.1}/avrotize/jsonstoavro.py +0 -0
  52. {structurize-3.3.0 → structurize-3.3.1}/avrotize/jsonstostructure.py +0 -0
  53. {structurize-3.3.0 → structurize-3.3.1}/avrotize/jstructtoavro.py +0 -0
  54. {structurize-3.3.0 → structurize-3.3.1}/avrotize/kstructtoavro.py +0 -0
  55. {structurize-3.3.0 → structurize-3.3.1}/avrotize/openapitostructure.py +0 -0
  56. {structurize-3.3.0 → structurize-3.3.1}/avrotize/parquettoavro.py +0 -0
  57. {structurize-3.3.0 → structurize-3.3.1}/avrotize/proto2parser.py +0 -0
  58. {structurize-3.3.0 → structurize-3.3.1}/avrotize/proto3parser.py +0 -0
  59. {structurize-3.3.0 → structurize-3.3.1}/avrotize/prototoavro.py +0 -0
  60. {structurize-3.3.0 → structurize-3.3.1}/avrotize/sqltoavro.py +0 -0
  61. {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretocddl.py +0 -0
  62. {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretocpp.py +0 -0
  63. {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretocsharp.py +0 -0
  64. {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretocsv.py +0 -0
  65. {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretodatapackage.py +0 -0
  66. {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretodb.py +0 -0
  67. {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretogo.py +0 -0
  68. {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretographql.py +0 -0
  69. {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretoiceberg.py +0 -0
  70. {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretojava.py +0 -0
  71. {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretojs.py +0 -0
  72. {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretojsons.py +0 -0
  73. {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretokusto.py +0 -0
  74. {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretomd.py +0 -0
  75. {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretoproto.py +0 -0
  76. {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretopython.py +0 -0
  77. {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretorust.py +0 -0
  78. {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretots.py +0 -0
  79. {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretoxsd.py +0 -0
  80. {structurize-3.3.0 → structurize-3.3.1}/avrotize/xmltoschema.py +0 -0
  81. {structurize-3.3.0 → structurize-3.3.1}/avrotize/xsdtoavro.py +0 -0
  82. {structurize-3.3.0 → structurize-3.3.1}/build.ps1 +0 -0
  83. {structurize-3.3.0 → structurize-3.3.1}/build.sh +0 -0
  84. {structurize-3.3.0 → structurize-3.3.1}/pyproject.toml +0 -0
  85. {structurize-3.3.0 → structurize-3.3.1}/setup.cfg +0 -0
  86. {structurize-3.3.0 → structurize-3.3.1}/structurize.egg-info/dependency_links.txt +0 -0
  87. {structurize-3.3.0 → structurize-3.3.1}/structurize.egg-info/entry_points.txt +0 -0
  88. {structurize-3.3.0 → structurize-3.3.1}/structurize.egg-info/requires.txt +0 -0
  89. {structurize-3.3.0 → structurize-3.3.1}/structurize.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: structurize
3
- Version: 3.3.0
3
+ Version: 3.3.1
4
4
  Summary: Tools to convert from and to JSON Structure from various other schema languages.
5
5
  Author-email: Clemens Vasters <clemensv@microsoft.com>
6
6
  Classifier: Programming Language :: Python :: 3
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '3.3.0'
32
- __version_tuple__ = version_tuple = (3, 3, 0)
31
+ __version__ = version = '3.3.1'
32
+ __version_tuple__ = version_tuple = (3, 3, 1)
33
33
 
34
- __commit_id__ = commit_id = 'g09d8d822a'
34
+ __commit_id__ = commit_id = 'g670e64099'
@@ -214,6 +214,13 @@ def _detect_discriminators(
214
214
  if len(values) < 2:
215
215
  continue
216
216
 
217
+ # Skip boolean-like string values - these are flags, not discriminators
218
+ # A field with only "true"/"false" (or similar) values is not a type discriminator
219
+ normalized_values = {v.lower() if isinstance(v, str) else str(v).lower() for v in values}
220
+ boolean_values = {'true', 'false', 'yes', 'no', '0', '1'}
221
+ if normalized_values <= boolean_values:
222
+ continue
223
+
217
224
  # Single cluster with multiple values - check if values create distinct groups
218
225
  if len(clusters) == 1:
219
226
  value_to_docs: Dict[str, List[DocumentInfo]] = defaultdict(list)
@@ -764,7 +764,10 @@
764
764
  "avro_namespace": "args.namespace",
765
765
  "avro_schema_file": "output_file_path",
766
766
  "emit_cloudevents": "args.emit_cloudevents",
767
- "emit_cloudevents_xregistry": "args.emit_xregistry"
767
+ "emit_cloudevents_xregistry": "args.emit_xregistry",
768
+ "sample_size": "args.sample_size",
769
+ "infer_choices": "args.infer_choices",
770
+ "choice_depth": "args.choice_depth"
768
771
  }
769
772
  },
770
773
  "extensions": [
@@ -819,6 +822,26 @@
819
822
  "type": "bool",
820
823
  "help": "Emit an xRegistry manifest with CloudEvents declarations for each table instead of a single Avrotize schema",
821
824
  "required": false
825
+ },
826
+ {
827
+ "name": "--sample-size",
828
+ "type": "int",
829
+ "help": "Maximum number of records to sample for dynamic field inference (0 = all)",
830
+ "default": 100,
831
+ "required": false
832
+ },
833
+ {
834
+ "name": "--infer-choices",
835
+ "type": "bool",
836
+ "help": "Detect discriminated unions in dynamic fields and emit as Avro unions with discriminator defaults",
837
+ "required": false
838
+ },
839
+ {
840
+ "name": "--choice-depth",
841
+ "type": "int",
842
+ "help": "Maximum nesting depth for recursive choice inference in dynamic fields (1 = root only)",
843
+ "default": 1,
844
+ "required": false
822
845
  }
823
846
  ],
824
847
  "suggested_output_file_path": "{kusto_database}.avsc",
@@ -846,6 +869,131 @@
846
869
  ],
847
870
  "skip_input_file_handling": true
848
871
  },
872
+ {
873
+ "command": "k2s",
874
+ "description": "Convert Kusto schema to JSON Structure schema",
875
+ "group": "1_Schemas",
876
+ "function": {
877
+ "name": "avrotize.kustotojstruct.convert_kusto_to_jstruct",
878
+ "args": {
879
+ "kusto_uri": "args.kusto_uri",
880
+ "kusto_database": "args.kusto_database",
881
+ "table_name": "args.table_name",
882
+ "base_id": "args.base_id",
883
+ "jstruct_schema_file": "output_file_path",
884
+ "emit_cloudevents": "args.emit_cloudevents",
885
+ "emit_cloudevents_xregistry": "args.emit_xregistry",
886
+ "sample_size": "args.sample_size",
887
+ "infer_choices": "args.infer_choices",
888
+ "choice_depth": "args.choice_depth",
889
+ "infer_enums": "args.infer_enums"
890
+ }
891
+ },
892
+ "extensions": [
893
+ ".kusto"
894
+ ],
895
+ "args": [
896
+ {
897
+ "name": "input",
898
+ "type": "str",
899
+ "nargs": "?",
900
+ "help": "Kusto file",
901
+ "required": false
902
+ },
903
+ {
904
+ "name": "--out",
905
+ "type": "str",
906
+ "help": "Path to the JSON Structure schema file",
907
+ "required": false
908
+ },
909
+ {
910
+ "name": "--kusto-uri",
911
+ "type": "str",
912
+ "help": "Kusto Cluster URI",
913
+ "required": false
914
+ },
915
+ {
916
+ "name": "--kusto-database",
917
+ "type": "str",
918
+ "help": "Kusto database",
919
+ "required": false
920
+ },
921
+ {
922
+ "name": "--table-name",
923
+ "type": "str",
924
+ "help": "Kusto table name",
925
+ "required": false
926
+ },
927
+ {
928
+ "name": "--base-id",
929
+ "type": "str",
930
+ "help": "Base URI for $id generation",
931
+ "required": false
932
+ },
933
+ {
934
+ "name": "--emit-cloudevents",
935
+ "type": "bool",
936
+ "help": "Emit CloudEvents declarations for each table",
937
+ "required": false
938
+ },
939
+ {
940
+ "name": "--emit-xregistry",
941
+ "type": "bool",
942
+ "help": "Emit an xRegistry manifest with CloudEvents declarations for each table instead of a single JSON Structure schema",
943
+ "required": false
944
+ },
945
+ {
946
+ "name": "--sample-size",
947
+ "type": "int",
948
+ "help": "Maximum number of records to sample for dynamic field inference (0 = all)",
949
+ "default": 100,
950
+ "required": false
951
+ },
952
+ {
953
+ "name": "--infer-choices",
954
+ "type": "bool",
955
+ "help": "Detect discriminated unions in dynamic fields and emit as choice types with discriminator defaults",
956
+ "required": false
957
+ },
958
+ {
959
+ "name": "--choice-depth",
960
+ "type": "int",
961
+ "help": "Maximum nesting depth for recursive choice inference in dynamic fields (1 = root only)",
962
+ "default": 1,
963
+ "required": false
964
+ },
965
+ {
966
+ "name": "--infer-enums",
967
+ "type": "bool",
968
+ "help": "Detect enum types from repeated string values with low cardinality in dynamic fields",
969
+ "required": false
970
+ }
971
+ ],
972
+ "suggested_output_file_path": "{kusto_database}.jstruct.json",
973
+ "prompts": [
974
+ {
975
+ "name": "--base-id",
976
+ "message": "Enter the base URI for $id generation",
977
+ "type": "str",
978
+ "required": false
979
+ },
980
+ {
981
+ "name": "--emit-cloudevents",
982
+ "message": "Emit CloudEvents declarations for each table?",
983
+ "type": "bool",
984
+ "default": false,
985
+ "required": false
986
+ },
987
+ {
988
+ "name": "--emit-xregistry",
989
+ "message": "Emit an xRegistry manifest with CloudEvents declarations?",
990
+ "type": "bool",
991
+ "default": false,
992
+ "required": false
993
+ }
994
+ ],
995
+ "skip_input_file_handling": true
996
+ },
849
997
  {
850
998
  "command": "a2sql",
851
999
  "description": "Convert Avrotize schema to SQL schema",
@@ -1167,7 +1315,11 @@
1167
1315
  "choice_depth": "args.choice_depth"
1168
1316
  }
1169
1317
  },
1170
- "extensions": [".json", ".jsonl", ".ndjson"],
1318
+ "extensions": [
1319
+ ".json",
1320
+ ".jsonl",
1321
+ ".ndjson"
1322
+ ],
1171
1323
  "args": [
1172
1324
  {
1173
1325
  "name": "input",
@@ -1252,7 +1404,11 @@
1252
1404
  "infer_enums": "args.infer_enums"
1253
1405
  }
1254
1406
  },
1255
- "extensions": [".json", ".jsonl", ".ndjson"],
1407
+ "extensions": [
1408
+ ".json",
1409
+ ".jsonl",
1410
+ ".ndjson"
1411
+ ],
1256
1412
  "args": [
1257
1413
  {
1258
1414
  "name": "input",
@@ -1343,7 +1499,9 @@
1343
1499
  "sample_size": "args.sample_size"
1344
1500
  }
1345
1501
  },
1346
- "extensions": [".xml"],
1502
+ "extensions": [
1503
+ ".xml"
1504
+ ],
1347
1505
  "args": [
1348
1506
  {
1349
1507
  "name": "input",
@@ -1411,7 +1569,9 @@
1411
1569
  "sample_size": "args.sample_size"
1412
1570
  }
1413
1571
  },
1414
- "extensions": [".xml"],
1572
+ "extensions": [
1573
+ ".xml"
1574
+ ],
1415
1575
  "args": [
1416
1576
  {
1417
1577
  "name": "input",
@@ -1480,7 +1640,10 @@
1480
1640
  "quiet": "args.quiet"
1481
1641
  }
1482
1642
  },
1483
- "extensions": [".json", ".jsonl"],
1643
+ "extensions": [
1644
+ ".json",
1645
+ ".jsonl"
1646
+ ],
1484
1647
  "args": [
1485
1648
  {
1486
1649
  "name": "input",
@@ -1667,7 +1830,10 @@
1667
1830
  "name": "--format",
1668
1831
  "type": "str",
1669
1832
  "help": "Output format: 'arrow' for binary Arrow IPC (default), 'schema' for JSON",
1670
- "choices": ["schema", "arrow"],
1833
+ "choices": [
1834
+ "schema",
1835
+ "arrow"
1836
+ ],
1671
1837
  "default": "arrow",
1672
1838
  "required": false
1673
1839
  }
@@ -1731,7 +1897,10 @@
1731
1897
  "name": "--format",
1732
1898
  "type": "str",
1733
1899
  "help": "Output format: 'arrow' for binary Arrow IPC (default), 'schema' for JSON",
1734
- "choices": ["schema", "arrow"],
1900
+ "choices": [
1901
+ "schema",
1902
+ "arrow"
1903
+ ],
1735
1904
  "default": "arrow",
1736
1905
  "required": false
1737
1906
  }
@@ -2700,7 +2869,10 @@
2700
2869
  "avro_annotation": "args.avro_annotation"
2701
2870
  }
2702
2871
  },
2703
- "extensions": [".struct.json", ".json"],
2872
+ "extensions": [
2873
+ ".struct.json",
2874
+ ".json"
2875
+ ],
2704
2876
  "args": [
2705
2877
  {
2706
2878
  "name": "input",
@@ -110,7 +110,7 @@ def _load_json_values(input_files: List[str], sample_size: int) -> List[Any]:
110
110
  """Loads JSON values from files.
111
111
 
112
112
  Handles both single JSON documents and JSON Lines (JSONL) files.
113
- Arrays at the root level are flattened into individual values.
113
+ Top-level arrays are treated as single array values, not flattened.
114
114
 
115
115
  Args:
116
116
  input_files: List of file paths
@@ -134,14 +134,8 @@ def _load_json_values(input_files: List[str], sample_size: int) -> List[Any]:
134
134
  # Try parsing as a single JSON document first
135
135
  try:
136
136
  data = json.loads(content)
137
- if isinstance(data, list):
138
- # Root-level array: each element is a separate value
139
- for item in data:
140
- values.append(item)
141
- if sample_size > 0 and len(values) >= sample_size:
142
- break
143
- else:
144
- values.append(data)
137
+ # Treat any valid JSON (including arrays) as a single value
138
+ values.append(data)
145
139
  continue
146
140
  except json.JSONDecodeError:
147
141
  pass
@@ -7,6 +7,7 @@ from typing import Any, Dict, List, Tuple
7
7
  from azure.kusto.data import KustoClient, KustoConnectionStringBuilder
8
8
  from avrotize.common import get_tree_hash
9
9
  from avrotize.constants import AVRO_VERSION
10
+ from avrotize.schema_inference import AvroSchemaInferrer
10
11
 
11
12
  JsonNode = Dict[str, 'JsonNode'] | List['JsonNode'] | str | bool | int | None
12
13
 
@@ -14,7 +15,7 @@ JsonNode = Dict[str, 'JsonNode'] | List['JsonNode'] | str | bool | int | None
14
15
  class KustoToAvro:
15
16
  """ Converts Kusto table schemas to Avro schema format."""
16
17
 
17
- def __init__(self, kusto_uri, kusto_database, table_name: str | None, avro_namespace: str, avro_schema_path, emit_cloudevents: bool, emit_cloudevents_xregistry: bool, token_provider=None):
18
+ def __init__(self, kusto_uri, kusto_database, table_name: str | None, avro_namespace: str, avro_schema_path, emit_cloudevents: bool, emit_cloudevents_xregistry: bool, token_provider=None, sample_size: int = 100, infer_choices: bool = False, choice_depth: int = 1):
18
19
  """ Initializes the KustoToAvro class with the Kusto URI and database name. """
19
20
  kcsb = KustoConnectionStringBuilder.with_az_cli_authentication(kusto_uri) if not token_provider else KustoConnectionStringBuilder.with_token_provider(kusto_uri, token_provider)
20
21
  self.client = KustoClient(kcsb)
@@ -24,6 +25,9 @@ class KustoToAvro:
24
25
  self.avro_schema_path = avro_schema_path
25
26
  self.emit_xregistry = emit_cloudevents_xregistry
26
27
  self.emit_cloudevents = emit_cloudevents or emit_cloudevents_xregistry
28
+ self.sample_size = sample_size if sample_size > 0 else 100
29
+ self.infer_choices = infer_choices
30
+ self.choice_depth = choice_depth
27
31
  if self.emit_xregistry:
28
32
  if not self.avro_namespace:
29
33
  raise ValueError(
@@ -172,20 +176,19 @@ class KustoToAvro:
172
176
  type_value: The value of the type column (if any)
173
177
  """
174
178
  type_column_name = type_column['Name'] if type_column else None
175
- query = f"{table_name}"+(f' | where {type_column_name}=="{type_value}"' if type_column_name and type_value else '') + f" | project {column_name} | take 100"
179
+ query = f"{table_name}"+(f' | where {type_column_name}=="{type_value}"' if type_column_name and type_value else '') + f" | project {column_name} | take {self.sample_size}"
176
180
  rows = self.client.execute(self.kusto_database, query)
177
181
  values = [row[column_name] for row in rows.primary_results[0]]
178
182
  type_name = type_value if type_value else f"{table_name}.{column_name}"
179
- unique_types = self.consolidated_type_list(type_name, values)
180
- if len(unique_types) > 1:
181
- # Using a union of inferred types
182
- return unique_types
183
- elif len(unique_types) == 1:
184
- # Single type, no need for union
185
- return unique_types[0]
186
- else:
187
- # No values, default to string
188
- return "string"
183
+
184
+ # Use the new AvroSchemaInferrer for consistent inference
185
+ inferrer = AvroSchemaInferrer(
186
+ namespace=self.avro_namespace,
187
+ altnames_key='kql',
188
+ infer_choices=self.infer_choices,
189
+ choice_depth=self.choice_depth
190
+ )
191
+ return inferrer.infer_from_json_values(type_name, values)
189
192
 
190
193
  type_map : Dict[str, JsonNode] = {
191
194
  "int": "int",
@@ -440,7 +443,7 @@ class KustoToAvro:
440
443
  json.dump(output, avro_file, indent=4)
441
444
 
442
445
 
443
- def convert_kusto_to_avro(kusto_uri: str, kusto_database: str, table_name: str | None, avro_namespace: str, avro_schema_file: str, emit_cloudevents:bool, emit_cloudevents_xregistry: bool, token_provider=None):
446
+ def convert_kusto_to_avro(kusto_uri: str, kusto_database: str, table_name: str | None, avro_namespace: str, avro_schema_file: str, emit_cloudevents:bool, emit_cloudevents_xregistry: bool, token_provider=None, sample_size: int = 100, infer_choices: bool = False, choice_depth: int = 1):
444
447
  """ Converts Kusto table schemas to Avro schema format."""
445
448
 
446
449
  if not kusto_uri:
@@ -451,5 +454,5 @@ def convert_kusto_to_avro(kusto_uri: str, kusto_database: str, table_name: str |
451
454
  avro_namespace = kusto_database
452
455
 
453
456
  kusto_to_avro = KustoToAvro(
454
- kusto_uri, kusto_database, table_name, avro_namespace, avro_schema_file,emit_cloudevents, emit_cloudevents_xregistry, token_provider=token_provider)
457
+ kusto_uri, kusto_database, table_name, avro_namespace, avro_schema_file, emit_cloudevents, emit_cloudevents_xregistry, token_provider=token_provider, sample_size=sample_size, infer_choices=infer_choices, choice_depth=choice_depth)
455
458
  return kusto_to_avro.process_all_tables()
@@ -0,0 +1,247 @@
1
+ """ Converts Kusto table schemas to JSON Structure schema format. """
2
+
3
+ import os
4
+ import json
5
+ from typing import Dict, List
6
+ from azure.kusto.data import KustoClient, KustoConnectionStringBuilder
7
+ from avrotize.schema_inference import JsonStructureSchemaInferrer
8
+
9
+ JsonNode = Dict[str, 'JsonNode'] | List['JsonNode'] | str | bool | int | None
10
+
11
+
12
+ class KustoToJsonStructure:
13
+ """ Converts Kusto table schemas to JSON Structure schema format."""
14
+
15
+ def __init__(self, kusto_uri, kusto_database, table_name: str | None, base_id: str, jstruct_schema_path, emit_cloudevents: bool, emit_cloudevents_xregistry: bool, token_provider=None, sample_size: int = 100, infer_choices: bool = False, choice_depth: int = 1, infer_enums: bool = False):
16
+ """ Initializes the KustoToJsonStructure class with the Kusto URI and database name. """
17
+ kcsb = KustoConnectionStringBuilder.with_az_cli_authentication(kusto_uri) if not token_provider else KustoConnectionStringBuilder.with_token_provider(kusto_uri, token_provider)
18
+ self.client = KustoClient(kcsb)
19
+ self.kusto_database = kusto_database
20
+ self.single_table_name = table_name
21
+ self.base_id = base_id if base_id else f"https://{kusto_database}.example.com/"
22
+ self.jstruct_schema_path = jstruct_schema_path
23
+ self.emit_xregistry = emit_cloudevents_xregistry
24
+ self.emit_cloudevents = emit_cloudevents or emit_cloudevents_xregistry
25
+ self.sample_size = sample_size if sample_size > 0 else 100
26
+ self.infer_choices = infer_choices
27
+ self.choice_depth = choice_depth
28
+ self.infer_enums = infer_enums
29
+ if self.emit_xregistry:
30
+ if not self.base_id:
31
+ raise ValueError(
32
+ "The base_id must be specified when emit_cloudevents_xregistry is True")
33
+ self.generated_types: List[str] = []
34
+
35
+ def fetch_table_schema_and_docs(self, table_name: str):
36
+ """ Fetches the schema and docstrings for a given table."""
37
+ query = f".show table {table_name} schema as json"
38
+ response = self.client.execute(self.kusto_database, query)
39
+ schema_json = response.primary_results[0][0]['Schema']
40
+ schema = json.loads(schema_json)
41
+ return schema
42
+
43
+ def infer_dynamic_schema(self, table_name: str, column_name: str, type_column: dict | None, type_value: str | None) -> JsonNode:
44
+ """
45
+ Infers the schema for a dynamic column. If a type column is provided, it will infer the schema based
46
+ on constraining the result set by the type column.
47
+
48
+ Args:
49
+ table_name: The name of the table.
50
+ column_name: The name of the column.
51
+ type_column: The type column (if any)
52
+ type_value: The value of the type column (if any)
53
+ """
54
+ type_column_name = type_column['Name'] if type_column else None
55
+ query = f"{table_name}"+(f' | where {type_column_name}=="{type_value}"' if type_column_name and type_value else '') + f" | project {column_name} | take {self.sample_size}"
56
+ rows = self.client.execute(self.kusto_database, query)
57
+ values = [row[column_name] for row in rows.primary_results[0]]
58
+ type_name = type_value if type_value else f"{table_name}.{column_name}"
59
+
60
+ # Use the JsonStructureSchemaInferrer for consistent inference
61
+ inferrer = JsonStructureSchemaInferrer(
62
+ base_id=self.base_id,
63
+ infer_choices=self.infer_choices,
64
+ choice_depth=self.choice_depth,
65
+ infer_enums=self.infer_enums
66
+ )
67
+ return inferrer.infer_from_json_values(type_name, values)
68
+
69
+ type_map : Dict[str, JsonNode] = {
70
+ "int": "int32",
71
+ "long": "int64",
72
+ "string": "string",
73
+ "real": "double",
74
+ "bool": "boolean",
75
+ "datetime": "datetime",
76
+ "timespan": "duration",
77
+ "decimal": "decimal",
78
+ "dynamic": "object"
79
+ }
80
+
81
+ def map_kusto_type_to_jstruct_type(self, kusto_type, table_name, column_name, type_column: dict | None, type_value: str | None) -> JsonNode:
82
+ """ Maps Kusto types to JSON Structure types."""
83
+ if kusto_type == "dynamic":
84
+ return self.infer_dynamic_schema(table_name, column_name, type_column, type_value)
85
+ return self.type_map.get(kusto_type, "string")
86
+
87
+ def kusto_to_jstruct_schema(self, kusto_schema: dict, table_name: str) -> JsonNode:
88
+ """ Converts a Kusto schema to JSON Structure schema."""
89
+ column_names = set([column['Name'].lstrip('_')
90
+ for column in kusto_schema['OrderedColumns']])
91
+ type_values: List[str|None] = []
92
+ type_column: Dict[str, JsonNode] = {}
93
+ is_cloudevent = False
94
+ if self.emit_cloudevents:
95
+ is_cloudevent = 'type' in column_names and 'source' in column_names and 'data' in column_names and 'id' in column_names
96
+ if is_cloudevent:
97
+ type_column = next(
98
+ (column for column in kusto_schema['OrderedColumns'] if column['Name'].lstrip('_') == 'type'), {})
99
+ type_sampling_query = f"{table_name} | distinct {type_column['Name']}"
100
+ type_sampling_rows = self.client.execute(
101
+ self.kusto_database, type_sampling_query)
102
+ type_values.extend([row[type_column['Name']]
103
+ for row in type_sampling_rows.primary_results[0]])
104
+
105
+ if len(type_values) == 0:
106
+ type_values.append(None)
107
+
108
+ schemas: List[JsonNode] = []
109
+ for type_value in type_values:
110
+ schema: JsonNode = {}
111
+ properties: Dict[str, JsonNode] = {}
112
+ type_name = type_value if type_value and isinstance(type_value, str) else table_name
113
+
114
+ if is_cloudevent:
115
+ # get just the 'data' column and infer the schema
116
+ column = next(col for col in kusto_schema['OrderedColumns'] if col['Name'].lstrip('_') == 'data')
117
+ data_schemas: JsonNode = self.map_kusto_type_to_jstruct_type(
118
+ column['CslType'], table_name, column['Name'], type_column, type_value)
119
+ if isinstance(data_schemas, dict):
120
+ data_schemas = [data_schemas]
121
+ if isinstance(data_schemas, list):
122
+ for schema in data_schemas:
123
+ if not isinstance(schema, dict) or "type" not in schema or schema["type"] != "object":
124
+ schema = self.wrap_schema_in_root_record(schema, type_name)
125
+ self.apply_schema_attributes(schema, kusto_schema, table_name, type_value)
126
+ schemas.append(schema)
127
+ else:
128
+ for column in kusto_schema['OrderedColumns']:
129
+ jstruct_type = self.map_kusto_type_to_jstruct_type(
130
+ column['CslType'], table_name, column['Name'], type_column, type_value)
131
+
132
+ # For dynamic columns, jstruct_type is a full schema object
133
+ # For static columns, it's a type string
134
+ if isinstance(jstruct_type, dict):
135
+ # Full schema object from inference - use directly as property schema
136
+ prop = jstruct_type
137
+ else:
138
+ prop = {"type": jstruct_type}
139
+
140
+ doc: JsonNode = column.get('DocString', '')
141
+ if doc:
142
+ prop["description"] = doc
143
+ properties[column['Name']] = prop
144
+
145
+ schema = {
146
+ "type": "object",
147
+ "properties": properties
148
+ }
149
+ self.apply_schema_attributes(schema, kusto_schema, table_name, type_value)
150
+ schemas.append(schema)
151
+
152
+ return schemas if len(schemas) > 1 else schemas[0]
153
+
154
+
155
+ def wrap_schema_in_root_record(self, schema: JsonNode, type_name: str):
156
+ """ Wraps a schema in a root object."""
157
+ # If schema is already a complete type object, use it directly in properties
158
+ # Otherwise treat it as a type string
159
+ if isinstance(schema, dict) and "type" in schema:
160
+ data_prop = schema
161
+ else:
162
+ data_prop = {"type": schema}
163
+
164
+ data_prop["root"] = True
165
+
166
+ record: Dict[str, JsonNode] = {
167
+ "type": "object",
168
+ "properties": {
169
+ "data": data_prop
170
+ }
171
+ }
172
+ return record
173
+
174
+ def apply_schema_attributes(self, schema, kusto_schema, table_name, type_value):
175
+ """ Applies schema attributes to the schema."""
176
+ if isinstance(schema, dict):
177
+ type_name = type_value if type_value and isinstance(type_value, str) else table_name
178
+ schema["$id"] = f"{self.base_id}{type_name}"
179
+
180
+ # Add description from table doc string if available
181
+ if 'DocString' in kusto_schema:
182
+ schema["description"] = kusto_schema['DocString']
183
+
184
+ def fetch_all_table_names(self) -> List[str]:
185
+ """ Fetches all table names from the Kusto database."""
186
+ query = ".show tables"
187
+ rows = self.client.execute(self.kusto_database, query)
188
+ table_names = [row['TableName'] for row in rows.primary_results[0]]
189
+ return table_names
190
+
191
+ def process_all_tables(self):
192
+ """ Processes all tables in the Kusto database."""
193
+ if self.single_table_name:
194
+ table_names = [self.single_table_name]
195
+ else:
196
+ table_names = self.fetch_all_table_names()
197
+
198
+ all_schemas = []
199
+ for table_name in table_names:
200
+ kusto_schema = self.fetch_table_schema_and_docs(table_name)
201
+ jstruct_schema = self.kusto_to_jstruct_schema(kusto_schema, table_name)
202
+
203
+ if isinstance(jstruct_schema, list):
204
+ all_schemas.extend(jstruct_schema)
205
+ else:
206
+ all_schemas.append(jstruct_schema)
207
+
208
+ # Write output
209
+ output_dir = os.path.dirname(self.jstruct_schema_path)
210
+ if output_dir and not os.path.exists(output_dir):
211
+ os.makedirs(output_dir)
212
+
213
+ if self.emit_xregistry:
214
+ # Create xRegistry manifest
215
+ output = {
216
+ "$schema": "https://cloudevents.io/schemas/registry",
217
+ "specversion": "0.5-wip",
218
+ "endpoints": {},
219
+ "messagegroups": {},
220
+ "schemagroups": {
221
+ f"{self.kusto_database}": {
222
+ "schemas": {schema.get("$id", f"schema_{i}"): {"format": "JSONStructure/1.0", "schema": schema}
223
+ for i, schema in enumerate(all_schemas)}
224
+ }
225
+ }
226
+ }
227
+ else:
228
+ # Single schema or array of schemas
229
+ output = all_schemas if len(all_schemas) > 1 else all_schemas[0] if all_schemas else {}
230
+
231
+ with open(self.jstruct_schema_path, 'w', encoding='utf-8') as jstruct_file:
232
+ json.dump(output, jstruct_file, indent=4)
233
+
234
+
235
+ def convert_kusto_to_jstruct(kusto_uri: str, kusto_database: str, table_name: str | None, base_id: str, jstruct_schema_file: str, emit_cloudevents: bool, emit_cloudevents_xregistry: bool, token_provider=None, sample_size: int = 100, infer_choices: bool = False, choice_depth: int = 1, infer_enums: bool = False):
236
+ """ Converts Kusto table schemas to JSON Structure schema format."""
237
+
238
+ if not kusto_uri:
239
+ raise ValueError("kusto_uri is required")
240
+ if not kusto_database:
241
+ raise ValueError("kusto_database is required")
242
+ if not base_id:
243
+ base_id = f"https://{kusto_database}.example.com/"
244
+
245
+ kusto_to_jstruct = KustoToJsonStructure(
246
+ kusto_uri, kusto_database, table_name, base_id, jstruct_schema_file, emit_cloudevents, emit_cloudevents_xregistry, token_provider=token_provider, sample_size=sample_size, infer_choices=infer_choices, choice_depth=choice_depth, infer_enums=infer_enums)
247
+ return kusto_to_jstruct.process_all_tables()