structurize 3.3.0__tar.gz → 3.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {structurize-3.3.0/structurize.egg-info → structurize-3.3.1}/PKG-INFO +1 -1
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/_version.py +3 -3
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/choice_inference.py +7 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/commands.json +181 -9
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/jsontoschema.py +3 -9
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/kustotoavro.py +17 -14
- structurize-3.3.1/avrotize/kustotojstruct.py +247 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/schema_inference.py +206 -13
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/validate.py +11 -2
- {structurize-3.3.0 → structurize-3.3.1/structurize.egg-info}/PKG-INFO +1 -1
- {structurize-3.3.0 → structurize-3.3.1}/structurize.egg-info/SOURCES.txt +2 -0
- {structurize-3.3.0 → structurize-3.3.1}/.gitignore +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/LICENSE +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/MANIFEST.in +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/README.md +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/__init__.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/__main__.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/asn1toavro.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotize.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotocpp.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotocsharp.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotocsv.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotodatapackage.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotodb.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotogo.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotographql.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotoiceberg.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotojava.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotojs.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotojsons.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotojstruct.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotokusto.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotomd.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotools.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotoparquet.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotoproto.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotopython.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotorust.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotots.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrotoxsd.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/avrovalidator.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/cddltostructure.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/common.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/constants.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/csvtoavro.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/datapackagetoavro.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/dependencies/cpp/vcpkg/vcpkg.json +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/dependencies/typescript/node22/package.json +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/dependency_resolver.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/dependency_version.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/jsonstoavro.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/jsonstostructure.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/jstructtoavro.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/kstructtoavro.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/openapitostructure.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/parquettoavro.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/proto2parser.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/proto3parser.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/prototoavro.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/sqltoavro.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretocddl.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretocpp.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretocsharp.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretocsv.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretodatapackage.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretodb.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretogo.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretographql.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretoiceberg.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretojava.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretojs.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretojsons.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretokusto.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretomd.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretoproto.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretopython.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretorust.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretots.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/structuretoxsd.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/xmltoschema.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/avrotize/xsdtoavro.py +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/build.ps1 +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/build.sh +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/pyproject.toml +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/setup.cfg +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/structurize.egg-info/dependency_links.txt +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/structurize.egg-info/entry_points.txt +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/structurize.egg-info/requires.txt +0 -0
- {structurize-3.3.0 → structurize-3.3.1}/structurize.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: structurize
|
|
3
|
-
Version: 3.3.0
|
|
3
|
+
Version: 3.3.1
|
|
4
4
|
Summary: Tools to convert from and to JSON Structure from various other schema languages.
|
|
5
5
|
Author-email: Clemens Vasters <clemensv@microsoft.com>
|
|
6
6
|
Classifier: Programming Language :: Python :: 3
|
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '3.3.0'
|
|
32
|
-
__version_tuple__ = version_tuple = (3, 3, 0)
|
|
31
|
+
__version__ = version = '3.3.1'
|
|
32
|
+
__version_tuple__ = version_tuple = (3, 3, 1)
|
|
33
33
|
|
|
34
|
-
__commit_id__ = commit_id = '
|
|
34
|
+
__commit_id__ = commit_id = 'g670e64099'
|
|
@@ -214,6 +214,13 @@ def _detect_discriminators(
|
|
|
214
214
|
if len(values) < 2:
|
|
215
215
|
continue
|
|
216
216
|
|
|
217
|
+
# Skip boolean-like string values - these are flags, not discriminators
|
|
218
|
+
# A field with only "true"/"false" (or similar) values is not a type discriminator
|
|
219
|
+
normalized_values = {v.lower() if isinstance(v, str) else str(v).lower() for v in values}
|
|
220
|
+
boolean_values = {'true', 'false', 'yes', 'no', '0', '1'}
|
|
221
|
+
if normalized_values <= boolean_values:
|
|
222
|
+
continue
|
|
223
|
+
|
|
217
224
|
# Single cluster with multiple values - check if values create distinct groups
|
|
218
225
|
if len(clusters) == 1:
|
|
219
226
|
value_to_docs: Dict[str, List[DocumentInfo]] = defaultdict(list)
|
|
@@ -764,7 +764,10 @@
|
|
|
764
764
|
"avro_namespace": "args.namespace",
|
|
765
765
|
"avro_schema_file": "output_file_path",
|
|
766
766
|
"emit_cloudevents": "args.emit_cloudevents",
|
|
767
|
-
"emit_cloudevents_xregistry": "args.emit_xregistry"
|
|
767
|
+
"emit_cloudevents_xregistry": "args.emit_xregistry",
|
|
768
|
+
"sample_size": "args.sample_size",
|
|
769
|
+
"infer_choices": "args.infer_choices",
|
|
770
|
+
"choice_depth": "args.choice_depth"
|
|
768
771
|
}
|
|
769
772
|
},
|
|
770
773
|
"extensions": [
|
|
@@ -819,6 +822,26 @@
|
|
|
819
822
|
"type": "bool",
|
|
820
823
|
"help": "Emit an xRegistry manifest with CloudEvents declarations for each table instead of a single Avrotize schema",
|
|
821
824
|
"required": false
|
|
825
|
+
},
|
|
826
|
+
{
|
|
827
|
+
"name": "--sample-size",
|
|
828
|
+
"type": "int",
|
|
829
|
+
"help": "Maximum number of records to sample for dynamic field inference (0 = all)",
|
|
830
|
+
"default": 100,
|
|
831
|
+
"required": false
|
|
832
|
+
},
|
|
833
|
+
{
|
|
834
|
+
"name": "--infer-choices",
|
|
835
|
+
"type": "bool",
|
|
836
|
+
"help": "Detect discriminated unions in dynamic fields and emit as Avro unions with discriminator defaults",
|
|
837
|
+
"required": false
|
|
838
|
+
},
|
|
839
|
+
{
|
|
840
|
+
"name": "--choice-depth",
|
|
841
|
+
"type": "int",
|
|
842
|
+
"help": "Maximum nesting depth for recursive choice inference in dynamic fields (1 = root only)",
|
|
843
|
+
"default": 1,
|
|
844
|
+
"required": false
|
|
822
845
|
}
|
|
823
846
|
],
|
|
824
847
|
"suggested_output_file_path": "{kusto_database}.avsc",
|
|
@@ -846,6 +869,131 @@
|
|
|
846
869
|
],
|
|
847
870
|
"skip_input_file_handling": true
|
|
848
871
|
},
|
|
872
|
+
{
|
|
873
|
+
"command": "k2s",
|
|
874
|
+
"description": "Convert Kusto schema to JSON Structure schema",
|
|
875
|
+
"group": "1_Schemas",
|
|
876
|
+
"function": {
|
|
877
|
+
"name": "avrotize.kustotojstruct.convert_kusto_to_jstruct",
|
|
878
|
+
"args": {
|
|
879
|
+
"kusto_uri": "args.kusto_uri",
|
|
880
|
+
"kusto_database": "args.kusto_database",
|
|
881
|
+
"table_name": "args.table_name",
|
|
882
|
+
"base_id": "args.base_id",
|
|
883
|
+
"jstruct_schema_file": "output_file_path",
|
|
884
|
+
"emit_cloudevents": "args.emit_cloudevents",
|
|
885
|
+
"emit_cloudevents_xregistry": "args.emit_xregistry",
|
|
886
|
+
"sample_size": "args.sample_size",
|
|
887
|
+
"infer_choices": "args.infer_choices",
|
|
888
|
+
"choice_depth": "args.choice_depth",
|
|
889
|
+
"infer_enums": "args.infer_enums"
|
|
890
|
+
}
|
|
891
|
+
},
|
|
892
|
+
"extensions": [
|
|
893
|
+
".kusto"
|
|
894
|
+
],
|
|
895
|
+
"args": [
|
|
896
|
+
{
|
|
897
|
+
"name": "input",
|
|
898
|
+
"type": "str",
|
|
899
|
+
"nargs": "?",
|
|
900
|
+
"help": "Kusto file",
|
|
901
|
+
"required": false
|
|
902
|
+
},
|
|
903
|
+
{
|
|
904
|
+
"name": "--out",
|
|
905
|
+
"type": "str",
|
|
906
|
+
"help": "Path to the JSON Structure schema file",
|
|
907
|
+
"required": false
|
|
908
|
+
},
|
|
909
|
+
{
|
|
910
|
+
"name": "--kusto-uri",
|
|
911
|
+
"type": "str",
|
|
912
|
+
"help": "Kusto Cluster URI",
|
|
913
|
+
"required": false
|
|
914
|
+
},
|
|
915
|
+
{
|
|
916
|
+
"name": "--kusto-database",
|
|
917
|
+
"type": "str",
|
|
918
|
+
"help": "Kusto database",
|
|
919
|
+
"required": false
|
|
920
|
+
},
|
|
921
|
+
{
|
|
922
|
+
"name": "--table-name",
|
|
923
|
+
"type": "str",
|
|
924
|
+
"help": "Kusto table name",
|
|
925
|
+
"required": false
|
|
926
|
+
},
|
|
927
|
+
{
|
|
928
|
+
"name": "--base-id",
|
|
929
|
+
"type": "str",
|
|
930
|
+
"help": "Base URI for $id generation",
|
|
931
|
+
"required": false
|
|
932
|
+
},
|
|
933
|
+
{
|
|
934
|
+
"name": "--emit-cloudevents",
|
|
935
|
+
"type": "bool",
|
|
936
|
+
"help": "Emit CloudEvents declarations for each table",
|
|
937
|
+
"required": false
|
|
938
|
+
},
|
|
939
|
+
{
|
|
940
|
+
"name": "--emit-xregistry",
|
|
941
|
+
"type": "bool",
|
|
942
|
+
"help": "Emit an xRegistry manifest with CloudEvents declarations for each table instead of a single JSON Structure schema",
|
|
943
|
+
"required": false
|
|
944
|
+
},
|
|
945
|
+
{
|
|
946
|
+
"name": "--sample-size",
|
|
947
|
+
"type": "int",
|
|
948
|
+
"help": "Maximum number of records to sample for dynamic field inference (0 = all)",
|
|
949
|
+
"default": 100,
|
|
950
|
+
"required": false
|
|
951
|
+
},
|
|
952
|
+
{
|
|
953
|
+
"name": "--infer-choices",
|
|
954
|
+
"type": "bool",
|
|
955
|
+
"help": "Detect discriminated unions in dynamic fields and emit as choice types with discriminator defaults",
|
|
956
|
+
"required": false
|
|
957
|
+
},
|
|
958
|
+
{
|
|
959
|
+
"name": "--choice-depth",
|
|
960
|
+
"type": "int",
|
|
961
|
+
"help": "Maximum nesting depth for recursive choice inference in dynamic fields (1 = root only)",
|
|
962
|
+
"default": 1,
|
|
963
|
+
"required": false
|
|
964
|
+
},
|
|
965
|
+
{
|
|
966
|
+
"name": "--infer-enums",
|
|
967
|
+
"type": "bool",
|
|
968
|
+
"help": "Detect enum types from repeated string values with low cardinality in dynamic fields",
|
|
969
|
+
"required": false
|
|
970
|
+
}
|
|
971
|
+
],
|
|
972
|
+
"suggested_output_file_path": "{kusto_database}.jstruct.json",
|
|
973
|
+
"prompts": [
|
|
974
|
+
{
|
|
975
|
+
"name": "--base-id",
|
|
976
|
+
"message": "Enter the base URI for $id generation",
|
|
977
|
+
"type": "str",
|
|
978
|
+
"required": false
|
|
979
|
+
},
|
|
980
|
+
{
|
|
981
|
+
"name": "--emit-cloudevents",
|
|
982
|
+
"message": "Emit CloudEvents declarations for each table?",
|
|
983
|
+
"type": "bool",
|
|
984
|
+
"default": false,
|
|
985
|
+
"required": false
|
|
986
|
+
},
|
|
987
|
+
{
|
|
988
|
+
"name": "--emit-xregistry",
|
|
989
|
+
"message": "Emit an xRegistry manifest with CloudEvents declarations?",
|
|
990
|
+
"type": "bool",
|
|
991
|
+
"default": false,
|
|
992
|
+
"required": false
|
|
993
|
+
}
|
|
994
|
+
],
|
|
995
|
+
"skip_input_file_handling": true
|
|
996
|
+
},
|
|
849
997
|
{
|
|
850
998
|
"command": "a2sql",
|
|
851
999
|
"description": "Convert Avrotize schema to SQL schema",
|
|
@@ -1167,7 +1315,11 @@
|
|
|
1167
1315
|
"choice_depth": "args.choice_depth"
|
|
1168
1316
|
}
|
|
1169
1317
|
},
|
|
1170
|
-
"extensions": [
|
|
1318
|
+
"extensions": [
|
|
1319
|
+
".json",
|
|
1320
|
+
".jsonl",
|
|
1321
|
+
".ndjson"
|
|
1322
|
+
],
|
|
1171
1323
|
"args": [
|
|
1172
1324
|
{
|
|
1173
1325
|
"name": "input",
|
|
@@ -1252,7 +1404,11 @@
|
|
|
1252
1404
|
"infer_enums": "args.infer_enums"
|
|
1253
1405
|
}
|
|
1254
1406
|
},
|
|
1255
|
-
"extensions": [
|
|
1407
|
+
"extensions": [
|
|
1408
|
+
".json",
|
|
1409
|
+
".jsonl",
|
|
1410
|
+
".ndjson"
|
|
1411
|
+
],
|
|
1256
1412
|
"args": [
|
|
1257
1413
|
{
|
|
1258
1414
|
"name": "input",
|
|
@@ -1343,7 +1499,9 @@
|
|
|
1343
1499
|
"sample_size": "args.sample_size"
|
|
1344
1500
|
}
|
|
1345
1501
|
},
|
|
1346
|
-
"extensions": [
|
|
1502
|
+
"extensions": [
|
|
1503
|
+
".xml"
|
|
1504
|
+
],
|
|
1347
1505
|
"args": [
|
|
1348
1506
|
{
|
|
1349
1507
|
"name": "input",
|
|
@@ -1411,7 +1569,9 @@
|
|
|
1411
1569
|
"sample_size": "args.sample_size"
|
|
1412
1570
|
}
|
|
1413
1571
|
},
|
|
1414
|
-
"extensions": [
|
|
1572
|
+
"extensions": [
|
|
1573
|
+
".xml"
|
|
1574
|
+
],
|
|
1415
1575
|
"args": [
|
|
1416
1576
|
{
|
|
1417
1577
|
"name": "input",
|
|
@@ -1480,7 +1640,10 @@
|
|
|
1480
1640
|
"quiet": "args.quiet"
|
|
1481
1641
|
}
|
|
1482
1642
|
},
|
|
1483
|
-
"extensions": [
|
|
1643
|
+
"extensions": [
|
|
1644
|
+
".json",
|
|
1645
|
+
".jsonl"
|
|
1646
|
+
],
|
|
1484
1647
|
"args": [
|
|
1485
1648
|
{
|
|
1486
1649
|
"name": "input",
|
|
@@ -1667,7 +1830,10 @@
|
|
|
1667
1830
|
"name": "--format",
|
|
1668
1831
|
"type": "str",
|
|
1669
1832
|
"help": "Output format: 'arrow' for binary Arrow IPC (default), 'schema' for JSON",
|
|
1670
|
-
"choices": [
|
|
1833
|
+
"choices": [
|
|
1834
|
+
"schema",
|
|
1835
|
+
"arrow"
|
|
1836
|
+
],
|
|
1671
1837
|
"default": "arrow",
|
|
1672
1838
|
"required": false
|
|
1673
1839
|
}
|
|
@@ -1731,7 +1897,10 @@
|
|
|
1731
1897
|
"name": "--format",
|
|
1732
1898
|
"type": "str",
|
|
1733
1899
|
"help": "Output format: 'arrow' for binary Arrow IPC (default), 'schema' for JSON",
|
|
1734
|
-
"choices": [
|
|
1900
|
+
"choices": [
|
|
1901
|
+
"schema",
|
|
1902
|
+
"arrow"
|
|
1903
|
+
],
|
|
1735
1904
|
"default": "arrow",
|
|
1736
1905
|
"required": false
|
|
1737
1906
|
}
|
|
@@ -2700,7 +2869,10 @@
|
|
|
2700
2869
|
"avro_annotation": "args.avro_annotation"
|
|
2701
2870
|
}
|
|
2702
2871
|
},
|
|
2703
|
-
"extensions": [
|
|
2872
|
+
"extensions": [
|
|
2873
|
+
".struct.json",
|
|
2874
|
+
".json"
|
|
2875
|
+
],
|
|
2704
2876
|
"args": [
|
|
2705
2877
|
{
|
|
2706
2878
|
"name": "input",
|
|
@@ -110,7 +110,7 @@ def _load_json_values(input_files: List[str], sample_size: int) -> List[Any]:
|
|
|
110
110
|
"""Loads JSON values from files.
|
|
111
111
|
|
|
112
112
|
Handles both single JSON documents and JSON Lines (JSONL) files.
|
|
113
|
-
|
|
113
|
+
Top-level arrays are treated as single array values, not flattened.
|
|
114
114
|
|
|
115
115
|
Args:
|
|
116
116
|
input_files: List of file paths
|
|
@@ -134,14 +134,8 @@ def _load_json_values(input_files: List[str], sample_size: int) -> List[Any]:
|
|
|
134
134
|
# Try parsing as a single JSON document first
|
|
135
135
|
try:
|
|
136
136
|
data = json.loads(content)
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
for item in data:
|
|
140
|
-
values.append(item)
|
|
141
|
-
if sample_size > 0 and len(values) >= sample_size:
|
|
142
|
-
break
|
|
143
|
-
else:
|
|
144
|
-
values.append(data)
|
|
137
|
+
# Treat any valid JSON (including arrays) as a single value
|
|
138
|
+
values.append(data)
|
|
145
139
|
continue
|
|
146
140
|
except json.JSONDecodeError:
|
|
147
141
|
pass
|
|
@@ -7,6 +7,7 @@ from typing import Any, Dict, List, Tuple
|
|
|
7
7
|
from azure.kusto.data import KustoClient, KustoConnectionStringBuilder
|
|
8
8
|
from avrotize.common import get_tree_hash
|
|
9
9
|
from avrotize.constants import AVRO_VERSION
|
|
10
|
+
from avrotize.schema_inference import AvroSchemaInferrer
|
|
10
11
|
|
|
11
12
|
JsonNode = Dict[str, 'JsonNode'] | List['JsonNode'] | str | bool | int | None
|
|
12
13
|
|
|
@@ -14,7 +15,7 @@ JsonNode = Dict[str, 'JsonNode'] | List['JsonNode'] | str | bool | int | None
|
|
|
14
15
|
class KustoToAvro:
|
|
15
16
|
""" Converts Kusto table schemas to Avro schema format."""
|
|
16
17
|
|
|
17
|
-
def __init__(self, kusto_uri, kusto_database, table_name: str | None, avro_namespace: str, avro_schema_path, emit_cloudevents: bool, emit_cloudevents_xregistry: bool, token_provider=None):
|
|
18
|
+
def __init__(self, kusto_uri, kusto_database, table_name: str | None, avro_namespace: str, avro_schema_path, emit_cloudevents: bool, emit_cloudevents_xregistry: bool, token_provider=None, sample_size: int = 100, infer_choices: bool = False, choice_depth: int = 1):
|
|
18
19
|
""" Initializes the KustoToAvro class with the Kusto URI and database name. """
|
|
19
20
|
kcsb = KustoConnectionStringBuilder.with_az_cli_authentication(kusto_uri) if not token_provider else KustoConnectionStringBuilder.with_token_provider(kusto_uri, token_provider)
|
|
20
21
|
self.client = KustoClient(kcsb)
|
|
@@ -24,6 +25,9 @@ class KustoToAvro:
|
|
|
24
25
|
self.avro_schema_path = avro_schema_path
|
|
25
26
|
self.emit_xregistry = emit_cloudevents_xregistry
|
|
26
27
|
self.emit_cloudevents = emit_cloudevents or emit_cloudevents_xregistry
|
|
28
|
+
self.sample_size = sample_size if sample_size > 0 else 100
|
|
29
|
+
self.infer_choices = infer_choices
|
|
30
|
+
self.choice_depth = choice_depth
|
|
27
31
|
if self.emit_xregistry:
|
|
28
32
|
if not self.avro_namespace:
|
|
29
33
|
raise ValueError(
|
|
@@ -172,20 +176,19 @@ class KustoToAvro:
|
|
|
172
176
|
type_value: The value of the type column (if any)
|
|
173
177
|
"""
|
|
174
178
|
type_column_name = type_column['Name'] if type_column else None
|
|
175
|
-
query = f"{table_name}"+(f' | where {type_column_name}=="{type_value}"' if type_column_name and type_value else '') + f" | project {column_name} | take
|
|
179
|
+
query = f"{table_name}"+(f' | where {type_column_name}=="{type_value}"' if type_column_name and type_value else '') + f" | project {column_name} | take {self.sample_size}"
|
|
176
180
|
rows = self.client.execute(self.kusto_database, query)
|
|
177
181
|
values = [row[column_name] for row in rows.primary_results[0]]
|
|
178
182
|
type_name = type_value if type_value else f"{table_name}.{column_name}"
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
return "string"
|
|
183
|
+
|
|
184
|
+
# Use the new AvroSchemaInferrer for consistent inference
|
|
185
|
+
inferrer = AvroSchemaInferrer(
|
|
186
|
+
namespace=self.avro_namespace,
|
|
187
|
+
altnames_key='kql',
|
|
188
|
+
infer_choices=self.infer_choices,
|
|
189
|
+
choice_depth=self.choice_depth
|
|
190
|
+
)
|
|
191
|
+
return inferrer.infer_from_json_values(type_name, values)
|
|
189
192
|
|
|
190
193
|
type_map : Dict[str, JsonNode] = {
|
|
191
194
|
"int": "int",
|
|
@@ -440,7 +443,7 @@ class KustoToAvro:
|
|
|
440
443
|
json.dump(output, avro_file, indent=4)
|
|
441
444
|
|
|
442
445
|
|
|
443
|
-
def convert_kusto_to_avro(kusto_uri: str, kusto_database: str, table_name: str | None, avro_namespace: str, avro_schema_file: str, emit_cloudevents:bool, emit_cloudevents_xregistry: bool, token_provider=None):
|
|
446
|
+
def convert_kusto_to_avro(kusto_uri: str, kusto_database: str, table_name: str | None, avro_namespace: str, avro_schema_file: str, emit_cloudevents:bool, emit_cloudevents_xregistry: bool, token_provider=None, sample_size: int = 100, infer_choices: bool = False, choice_depth: int = 1):
|
|
444
447
|
""" Converts Kusto table schemas to Avro schema format."""
|
|
445
448
|
|
|
446
449
|
if not kusto_uri:
|
|
@@ -451,5 +454,5 @@ def convert_kusto_to_avro(kusto_uri: str, kusto_database: str, table_name: str |
|
|
|
451
454
|
avro_namespace = kusto_database
|
|
452
455
|
|
|
453
456
|
kusto_to_avro = KustoToAvro(
|
|
454
|
-
kusto_uri, kusto_database, table_name, avro_namespace, avro_schema_file,emit_cloudevents, emit_cloudevents_xregistry, token_provider=token_provider)
|
|
457
|
+
kusto_uri, kusto_database, table_name, avro_namespace, avro_schema_file, emit_cloudevents, emit_cloudevents_xregistry, token_provider=token_provider, sample_size=sample_size, infer_choices=infer_choices, choice_depth=choice_depth)
|
|
455
458
|
return kusto_to_avro.process_all_tables()
|
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
""" Converts Kusto table schemas to JSON Structure schema format. """
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import json
|
|
5
|
+
from typing import Dict, List
|
|
6
|
+
from azure.kusto.data import KustoClient, KustoConnectionStringBuilder
|
|
7
|
+
from avrotize.schema_inference import JsonStructureSchemaInferrer
|
|
8
|
+
|
|
9
|
+
JsonNode = Dict[str, 'JsonNode'] | List['JsonNode'] | str | bool | int | None
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class KustoToJsonStructure:
|
|
13
|
+
""" Converts Kusto table schemas to JSON Structure schema format."""
|
|
14
|
+
|
|
15
|
+
def __init__(self, kusto_uri, kusto_database, table_name: str | None, base_id: str, jstruct_schema_path, emit_cloudevents: bool, emit_cloudevents_xregistry: bool, token_provider=None, sample_size: int = 100, infer_choices: bool = False, choice_depth: int = 1, infer_enums: bool = False):
|
|
16
|
+
""" Initializes the KustoToJsonStructure class with the Kusto URI and database name. """
|
|
17
|
+
kcsb = KustoConnectionStringBuilder.with_az_cli_authentication(kusto_uri) if not token_provider else KustoConnectionStringBuilder.with_token_provider(kusto_uri, token_provider)
|
|
18
|
+
self.client = KustoClient(kcsb)
|
|
19
|
+
self.kusto_database = kusto_database
|
|
20
|
+
self.single_table_name = table_name
|
|
21
|
+
self.base_id = base_id if base_id else f"https://{kusto_database}.example.com/"
|
|
22
|
+
self.jstruct_schema_path = jstruct_schema_path
|
|
23
|
+
self.emit_xregistry = emit_cloudevents_xregistry
|
|
24
|
+
self.emit_cloudevents = emit_cloudevents or emit_cloudevents_xregistry
|
|
25
|
+
self.sample_size = sample_size if sample_size > 0 else 100
|
|
26
|
+
self.infer_choices = infer_choices
|
|
27
|
+
self.choice_depth = choice_depth
|
|
28
|
+
self.infer_enums = infer_enums
|
|
29
|
+
if self.emit_xregistry:
|
|
30
|
+
if not self.base_id:
|
|
31
|
+
raise ValueError(
|
|
32
|
+
"The base_id must be specified when emit_cloudevents_xregistry is True")
|
|
33
|
+
self.generated_types: List[str] = []
|
|
34
|
+
|
|
35
|
+
def fetch_table_schema_and_docs(self, table_name: str):
|
|
36
|
+
""" Fetches the schema and docstrings for a given table."""
|
|
37
|
+
query = f".show table {table_name} schema as json"
|
|
38
|
+
response = self.client.execute(self.kusto_database, query)
|
|
39
|
+
schema_json = response.primary_results[0][0]['Schema']
|
|
40
|
+
schema = json.loads(schema_json)
|
|
41
|
+
return schema
|
|
42
|
+
|
|
43
|
+
def infer_dynamic_schema(self, table_name: str, column_name: str, type_column: dict | None, type_value: str | None) -> JsonNode:
|
|
44
|
+
"""
|
|
45
|
+
Infers the schema for a dynamic column. If a type column is provided, it will infer the schema based
|
|
46
|
+
on constraining the result set by the type column.
|
|
47
|
+
|
|
48
|
+
Args:
|
|
49
|
+
table_name: The name of the table.
|
|
50
|
+
column_name: The name of the column.
|
|
51
|
+
type_column: The type column (if any)
|
|
52
|
+
type_value: The value of the type column (if any)
|
|
53
|
+
"""
|
|
54
|
+
type_column_name = type_column['Name'] if type_column else None
|
|
55
|
+
query = f"{table_name}"+(f' | where {type_column_name}=="{type_value}"' if type_column_name and type_value else '') + f" | project {column_name} | take {self.sample_size}"
|
|
56
|
+
rows = self.client.execute(self.kusto_database, query)
|
|
57
|
+
values = [row[column_name] for row in rows.primary_results[0]]
|
|
58
|
+
type_name = type_value if type_value else f"{table_name}.{column_name}"
|
|
59
|
+
|
|
60
|
+
# Use the JsonStructureSchemaInferrer for consistent inference
|
|
61
|
+
inferrer = JsonStructureSchemaInferrer(
|
|
62
|
+
base_id=self.base_id,
|
|
63
|
+
infer_choices=self.infer_choices,
|
|
64
|
+
choice_depth=self.choice_depth,
|
|
65
|
+
infer_enums=self.infer_enums
|
|
66
|
+
)
|
|
67
|
+
return inferrer.infer_from_json_values(type_name, values)
|
|
68
|
+
|
|
69
|
+
type_map : Dict[str, JsonNode] = {
|
|
70
|
+
"int": "int32",
|
|
71
|
+
"long": "int64",
|
|
72
|
+
"string": "string",
|
|
73
|
+
"real": "double",
|
|
74
|
+
"bool": "boolean",
|
|
75
|
+
"datetime": "datetime",
|
|
76
|
+
"timespan": "duration",
|
|
77
|
+
"decimal": "decimal",
|
|
78
|
+
"dynamic": "object"
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
def map_kusto_type_to_jstruct_type(self, kusto_type, table_name, column_name, type_column: dict | None, type_value: str | None) -> JsonNode:
|
|
82
|
+
""" Maps Kusto types to JSON Structure types."""
|
|
83
|
+
if kusto_type == "dynamic":
|
|
84
|
+
return self.infer_dynamic_schema(table_name, column_name, type_column, type_value)
|
|
85
|
+
return self.type_map.get(kusto_type, "string")
|
|
86
|
+
|
|
87
|
+
def kusto_to_jstruct_schema(self, kusto_schema: dict, table_name: str) -> JsonNode:
|
|
88
|
+
""" Converts a Kusto schema to JSON Structure schema."""
|
|
89
|
+
column_names = set([column['Name'].lstrip('_')
|
|
90
|
+
for column in kusto_schema['OrderedColumns']])
|
|
91
|
+
type_values: List[str|None] = []
|
|
92
|
+
type_column: Dict[str, JsonNode] = {}
|
|
93
|
+
is_cloudevent = False
|
|
94
|
+
if self.emit_cloudevents:
|
|
95
|
+
is_cloudevent = 'type' in column_names and 'source' in column_names and 'data' in column_names and 'id' in column_names
|
|
96
|
+
if is_cloudevent:
|
|
97
|
+
type_column = next(
|
|
98
|
+
(column for column in kusto_schema['OrderedColumns'] if column['Name'].lstrip('_') == 'type'), {})
|
|
99
|
+
type_sampling_query = f"{table_name} | distinct {type_column['Name']}"
|
|
100
|
+
type_sampling_rows = self.client.execute(
|
|
101
|
+
self.kusto_database, type_sampling_query)
|
|
102
|
+
type_values.extend([row[type_column['Name']]
|
|
103
|
+
for row in type_sampling_rows.primary_results[0]])
|
|
104
|
+
|
|
105
|
+
if len(type_values) == 0:
|
|
106
|
+
type_values.append(None)
|
|
107
|
+
|
|
108
|
+
schemas: List[JsonNode] = []
|
|
109
|
+
for type_value in type_values:
|
|
110
|
+
schema: JsonNode = {}
|
|
111
|
+
properties: Dict[str, JsonNode] = {}
|
|
112
|
+
type_name = type_value if type_value and isinstance(type_value, str) else table_name
|
|
113
|
+
|
|
114
|
+
if is_cloudevent:
|
|
115
|
+
# get just the 'data' column and infer the schema
|
|
116
|
+
column = next(col for col in kusto_schema['OrderedColumns'] if col['Name'].lstrip('_') == 'data')
|
|
117
|
+
data_schemas: JsonNode = self.map_kusto_type_to_jstruct_type(
|
|
118
|
+
column['CslType'], table_name, column['Name'], type_column, type_value)
|
|
119
|
+
if isinstance(data_schemas, dict):
|
|
120
|
+
data_schemas = [data_schemas]
|
|
121
|
+
if isinstance(data_schemas, list):
|
|
122
|
+
for schema in data_schemas:
|
|
123
|
+
if not isinstance(schema, dict) or "type" not in schema or schema["type"] != "object":
|
|
124
|
+
schema = self.wrap_schema_in_root_record(schema, type_name)
|
|
125
|
+
self.apply_schema_attributes(schema, kusto_schema, table_name, type_value)
|
|
126
|
+
schemas.append(schema)
|
|
127
|
+
else:
|
|
128
|
+
for column in kusto_schema['OrderedColumns']:
|
|
129
|
+
jstruct_type = self.map_kusto_type_to_jstruct_type(
|
|
130
|
+
column['CslType'], table_name, column['Name'], type_column, type_value)
|
|
131
|
+
|
|
132
|
+
# For dynamic columns, jstruct_type is a full schema object
|
|
133
|
+
# For static columns, it's a type string
|
|
134
|
+
if isinstance(jstruct_type, dict):
|
|
135
|
+
# Full schema object from inference - use directly as property schema
|
|
136
|
+
prop = jstruct_type
|
|
137
|
+
else:
|
|
138
|
+
prop = {"type": jstruct_type}
|
|
139
|
+
|
|
140
|
+
doc: JsonNode = column.get('DocString', '')
|
|
141
|
+
if doc:
|
|
142
|
+
prop["description"] = doc
|
|
143
|
+
properties[column['Name']] = prop
|
|
144
|
+
|
|
145
|
+
schema = {
|
|
146
|
+
"type": "object",
|
|
147
|
+
"properties": properties
|
|
148
|
+
}
|
|
149
|
+
self.apply_schema_attributes(schema, kusto_schema, table_name, type_value)
|
|
150
|
+
schemas.append(schema)
|
|
151
|
+
|
|
152
|
+
return schemas if len(schemas) > 1 else schemas[0]
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def wrap_schema_in_root_record(self, schema: JsonNode, type_name: str):
|
|
156
|
+
""" Wraps a schema in a root object."""
|
|
157
|
+
# If schema is already a complete type object, use it directly in properties
|
|
158
|
+
# Otherwise treat it as a type string
|
|
159
|
+
if isinstance(schema, dict) and "type" in schema:
|
|
160
|
+
data_prop = schema
|
|
161
|
+
else:
|
|
162
|
+
data_prop = {"type": schema}
|
|
163
|
+
|
|
164
|
+
data_prop["root"] = True
|
|
165
|
+
|
|
166
|
+
record: Dict[str, JsonNode] = {
|
|
167
|
+
"type": "object",
|
|
168
|
+
"properties": {
|
|
169
|
+
"data": data_prop
|
|
170
|
+
}
|
|
171
|
+
}
|
|
172
|
+
return record
|
|
173
|
+
|
|
174
|
+
def apply_schema_attributes(self, schema, kusto_schema, table_name, type_value):
|
|
175
|
+
""" Applies schema attributes to the schema."""
|
|
176
|
+
if isinstance(schema, dict):
|
|
177
|
+
type_name = type_value if type_value and isinstance(type_value, str) else table_name
|
|
178
|
+
schema["$id"] = f"{self.base_id}{type_name}"
|
|
179
|
+
|
|
180
|
+
# Add description from table doc string if available
|
|
181
|
+
if 'DocString' in kusto_schema:
|
|
182
|
+
schema["description"] = kusto_schema['DocString']
|
|
183
|
+
|
|
184
|
+
def fetch_all_table_names(self) -> List[str]:
|
|
185
|
+
""" Fetches all table names from the Kusto database."""
|
|
186
|
+
query = ".show tables"
|
|
187
|
+
rows = self.client.execute(self.kusto_database, query)
|
|
188
|
+
table_names = [row['TableName'] for row in rows.primary_results[0]]
|
|
189
|
+
return table_names
|
|
190
|
+
|
|
191
|
+
def process_all_tables(self):
|
|
192
|
+
""" Processes all tables in the Kusto database."""
|
|
193
|
+
if self.single_table_name:
|
|
194
|
+
table_names = [self.single_table_name]
|
|
195
|
+
else:
|
|
196
|
+
table_names = self.fetch_all_table_names()
|
|
197
|
+
|
|
198
|
+
all_schemas = []
|
|
199
|
+
for table_name in table_names:
|
|
200
|
+
kusto_schema = self.fetch_table_schema_and_docs(table_name)
|
|
201
|
+
jstruct_schema = self.kusto_to_jstruct_schema(kusto_schema, table_name)
|
|
202
|
+
|
|
203
|
+
if isinstance(jstruct_schema, list):
|
|
204
|
+
all_schemas.extend(jstruct_schema)
|
|
205
|
+
else:
|
|
206
|
+
all_schemas.append(jstruct_schema)
|
|
207
|
+
|
|
208
|
+
# Write output
|
|
209
|
+
output_dir = os.path.dirname(self.jstruct_schema_path)
|
|
210
|
+
if output_dir and not os.path.exists(output_dir):
|
|
211
|
+
os.makedirs(output_dir)
|
|
212
|
+
|
|
213
|
+
if self.emit_xregistry:
|
|
214
|
+
# Create xRegistry manifest
|
|
215
|
+
output = {
|
|
216
|
+
"$schema": "https://cloudevents.io/schemas/registry",
|
|
217
|
+
"specversion": "0.5-wip",
|
|
218
|
+
"endpoints": {},
|
|
219
|
+
"messagegroups": {},
|
|
220
|
+
"schemagroups": {
|
|
221
|
+
f"{self.kusto_database}": {
|
|
222
|
+
"schemas": {schema.get("$id", f"schema_{i}"): {"format": "JSONStructure/1.0", "schema": schema}
|
|
223
|
+
for i, schema in enumerate(all_schemas)}
|
|
224
|
+
}
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
else:
|
|
228
|
+
# Single schema or array of schemas
|
|
229
|
+
output = all_schemas if len(all_schemas) > 1 else all_schemas[0] if all_schemas else {}
|
|
230
|
+
|
|
231
|
+
with open(self.jstruct_schema_path, 'w', encoding='utf-8') as jstruct_file:
|
|
232
|
+
json.dump(output, jstruct_file, indent=4)
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def convert_kusto_to_jstruct(kusto_uri: str, kusto_database: str, table_name: str | None, base_id: str, jstruct_schema_file: str, emit_cloudevents: bool, emit_cloudevents_xregistry: bool, token_provider=None, sample_size: int = 100, infer_choices: bool = False, choice_depth: int = 1, infer_enums: bool = False):
|
|
236
|
+
""" Converts Kusto table schemas to JSON Structure schema format."""
|
|
237
|
+
|
|
238
|
+
if not kusto_uri:
|
|
239
|
+
raise ValueError("kusto_uri is required")
|
|
240
|
+
if not kusto_database:
|
|
241
|
+
raise ValueError("kusto_database is required")
|
|
242
|
+
if not base_id:
|
|
243
|
+
base_id = f"https://{kusto_database}.example.com/"
|
|
244
|
+
|
|
245
|
+
kusto_to_jstruct = KustoToJsonStructure(
|
|
246
|
+
kusto_uri, kusto_database, table_name, base_id, jstruct_schema_file, emit_cloudevents, emit_cloudevents_xregistry, token_provider=token_provider, sample_size=sample_size, infer_choices=infer_choices, choice_depth=choice_depth, infer_enums=infer_enums)
|
|
247
|
+
return kusto_to_jstruct.process_all_tables()
|