structurize 3.5.5__tar.gz → 3.5.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {structurize-3.5.5/structurize.egg-info → structurize-3.5.7}/PKG-INFO +1 -1
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/_version.py +3 -3
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/avrotojstruct.py +4 -0
- structurize-3.5.7/avrotize/avrototsml.py +349 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/commands.json +180 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/jstructtoavro.py +5 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/sqltoavro.py +98 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/structuretopython.py +56 -5
- structurize-3.5.7/avrotize/structuretotsml.py +34 -0
- structurize-3.5.7/avrotize/tmslvalidate.py +317 -0
- {structurize-3.5.5 → structurize-3.5.7/structurize.egg-info}/PKG-INFO +1 -1
- {structurize-3.5.5 → structurize-3.5.7}/structurize.egg-info/SOURCES.txt +6 -0
- {structurize-3.5.5 → structurize-3.5.7}/.gitignore +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/LICENSE +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/MANIFEST.in +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/README.md +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/__init__.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/__main__.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/asn1toavro.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/avrotize.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/avrotocpp.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/avrotocsharp.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/avrotocsv.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/avrotodatapackage.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/avrotodb.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/avrotogo.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/avrotographql.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/avrotoiceberg.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/avrotojava.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/avrotojs.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/avrotojsons.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/avrotokusto.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/avrotomd.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/avrotools.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/avrotoparquet.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/avrotoproto.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/avrotopython.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/avrotorust.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/avrotots.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/avrotoxsd.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/avrovalidator.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/cddltostructure.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/choice_inference.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/common.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/constants.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/csvtoavro.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/datapackagetoavro.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/dependencies/cpp/vcpkg/vcpkg.json +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/dependencies/typescript/node22/package.json +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/dependency_resolver.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/dependency_version.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/jsonstoavro.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/jsonstostructure.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/jsontoschema.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/kstructtoavro.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/kustotoavro.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/kustotojstruct.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/mcp_server.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/openapitostructure.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/parquettoavro.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/proto2parser.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/proto3parser.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/prototoavro.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/schema_inference.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/structuretocddl.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/structuretocpp.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/structuretocsharp.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/structuretocsv.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/structuretodatapackage.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/structuretodb.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/structuretogo.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/structuretographql.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/structuretoiceberg.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/structuretojava.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/structuretojs.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/structuretojsons.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/structuretokusto.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/structuretomd.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/structuretoproto.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/structuretorust.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/structuretots.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/structuretoxsd.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/validate.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/xmltoschema.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/avrotize/xsdtoavro.py +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/build.ps1 +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/build.sh +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/pyproject.toml +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/setup.cfg +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/structurize.egg-info/dependency_links.txt +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/structurize.egg-info/entry_points.txt +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/structurize.egg-info/requires.txt +0 -0
- {structurize-3.5.5 → structurize-3.5.7}/structurize.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: structurize
|
|
3
|
-
Version: 3.5.5
|
|
3
|
+
Version: 3.5.7
|
|
4
4
|
Summary: Tools to convert from and to JSON Structure from various other schema languages.
|
|
5
5
|
Author-email: Clemens Vasters <clemensv@microsoft.com>
|
|
6
6
|
Classifier: Programming Language :: Python :: 3
|
|
@@ -18,7 +18,7 @@ version_tuple: tuple[int | str, ...]
|
|
|
18
18
|
commit_id: str | None
|
|
19
19
|
__commit_id__: str | None
|
|
20
20
|
|
|
21
|
-
__version__ = version = '3.5.5'
|
|
22
|
-
__version_tuple__ = version_tuple = (3, 5, 5)
|
|
21
|
+
__version__ = version = '3.5.7'
|
|
22
|
+
__version_tuple__ = version_tuple = (3, 5, 7)
|
|
23
23
|
|
|
24
|
-
__commit_id__ = commit_id = '
|
|
24
|
+
__commit_id__ = commit_id = 'gaf486f61f'
|
|
@@ -132,6 +132,10 @@ class AvroToJsonStructure:
|
|
|
132
132
|
props = {"name": name, "type": "object", "properties": {}, "required": []}
|
|
133
133
|
if "doc" in avro_schema:
|
|
134
134
|
props["description"] = avro_schema["doc"]
|
|
135
|
+
if isinstance(avro_schema.get("unique"), list):
|
|
136
|
+
props["x-avrotize-unique"] = avro_schema["unique"]
|
|
137
|
+
if isinstance(avro_schema.get("foreignKeys"), list):
|
|
138
|
+
props["x-avrotize-foreignKeys"] = avro_schema["foreignKeys"]
|
|
135
139
|
|
|
136
140
|
# Namespace for resolving field types within this record
|
|
137
141
|
record_fields_namespace = avro_schema.get("namespace", namespace)
|
|
@@ -0,0 +1,349 @@
|
|
|
1
|
+
"""Convert an Avro schema to a Tabular Model Scripting Language (TMSL) schema."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import sys
|
|
5
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
6
|
+
|
|
7
|
+
JsonNode = Dict[str, "JsonNode"] | List["JsonNode"] | str | bool | int | float | None
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class AvroToTmslConverter:
    """Convert Avro schema documents to TMSL ``createOrReplace`` documents.

    Every selected Avro record becomes one TMSL table whose columns are
    derived from the record's fields. ``unique`` metadata on a record marks
    key columns and ``foreignKeys`` metadata is turned into relationships.
    Named Avro types are cached so that fields referring to a type by name
    can still be mapped to a column data type.
    """

    # Logical types that are emitted as TMSL ``dateTime`` columns. Kept as a
    # tuple so membership tests stay safe for unhashable (malformed) values.
    _DATETIME_LOGICAL_TYPES = (
        "timestamp-millis",
        "timestamp-micros",
        "local-timestamp-millis",
        "local-timestamp-micros",
        "date",
        "time-millis",
        "time-micros",
    )

    # Avro primitive type name -> (TMSL data type, nullable).
    _PRIMITIVE_TYPES = {
        "boolean": ("boolean", False),
        "int": ("int64", False),
        "long": ("int64", False),
        "float": ("double", False),
        "double": ("double", False),
        "bytes": ("binary", False),
        "string": ("string", False),
        "null": ("string", True),
    }

    def __init__(self: "AvroToTmslConverter") -> None:
        """Create a converter with an empty named-type cache."""
        # Maps both the fully-qualified and the short name of every named
        # Avro type seen by cache_named_types() to its definition.
        self.named_type_cache: Dict[str, Dict[str, Any]] = {}

    def get_fullname(self, namespace: str, name: str) -> str:
        """Return ``namespace.name``, or just ``name`` when namespace is empty."""
        return f"{namespace}.{name}" if namespace else name

    def cache_named_types(self, avro_type: "JsonNode", namespace: str = "") -> None:
        """Record every named type reachable from ``avro_type`` in the cache.

        Args:
            avro_type: An Avro schema node (union list, type object, or name).
            namespace: Namespace inherited from the enclosing definition.
        """
        if isinstance(avro_type, list):
            for item in avro_type:
                self.cache_named_types(item, namespace)
            return

        if not isinstance(avro_type, dict):
            return

        current_namespace = str(avro_type.get("namespace", namespace))
        type_name = avro_type.get("name")
        if isinstance(type_name, str):
            # Register under both names so references resolve either way.
            self.named_type_cache[self.get_fullname(current_namespace, type_name)] = avro_type
            self.named_type_cache[type_name] = avro_type

        avro_kind = avro_type.get("type")
        if avro_kind == "record":
            for field in avro_type.get("fields", []):
                if isinstance(field, dict) and "type" in field:
                    self.cache_named_types(field["type"], current_namespace)
        elif avro_kind == "array":
            self.cache_named_types(avro_type.get("items"), current_namespace)
        elif avro_kind == "map":
            self.cache_named_types(avro_type.get("values"), current_namespace)
        elif isinstance(avro_kind, (dict, list)):
            # Wrapper such as {"type": {"type": "enum", "name": ...}}: the
            # nested definition may declare named types that are referenced
            # by name elsewhere, so it has to be visited as well.
            self.cache_named_types(avro_kind, current_namespace)

    def map_avro_type_to_tmsl(self, avro_type: "JsonNode") -> Tuple[str, bool]:
        """Map an Avro type node to ``(TMSL data type, nullable)``.

        Unions with exactly one non-null branch map to that branch's type;
        unions with several non-null branches map to a nullable ``variant``.
        Unknown type names map to ``variant``.
        """
        if isinstance(avro_type, list):
            non_null_types = [item for item in avro_type if item != "null"]
            if not non_null_types:
                return "string", True
            if len(non_null_types) > 1:
                return "variant", True
            # Nullable only if a "null" branch was actually removed.
            nullable = len(non_null_types) != len(avro_type)
            mapped_type, _ = self.map_avro_type_to_tmsl(non_null_types[0])
            return mapped_type, nullable

        if isinstance(avro_type, dict):
            avro_kind = avro_type.get("type")

            if avro_kind == "record" or avro_kind in ("array", "map"):
                return "variant", False
            if avro_kind == "enum":
                return "string", False

            # Logical types are checked before "fixed" because the decimal
            # logical type may annotate a fixed as well as bytes.
            logical_type = avro_type.get("logicalType")
            if logical_type in self._DATETIME_LOGICAL_TYPES:
                return "dateTime", False
            if logical_type == "decimal":
                return "decimal", False

            if avro_kind == "fixed":
                return "binary", False

            if isinstance(avro_kind, (str, dict, list)):
                return self.map_avro_type_to_tmsl(avro_kind)
            return "string", False

        if isinstance(avro_type, str):
            primitive = self._PRIMITIVE_TYPES.get(avro_type)
            if primitive is not None:
                return primitive

            referenced = self.named_type_cache.get(avro_type)
            if referenced is not None:
                return self.map_avro_type_to_tmsl(referenced)

            return "variant", False

        return "string", False

    def resolve_root_record(self, schema: "JsonNode", avro_record_type: Optional[str]) -> Dict[str, Any]:
        """Pick the single root record from a schema document.

        For a schema object the object itself must be a record. For a schema
        list the record named ``avro_record_type`` (short or fully-qualified)
        is returned, or the first record when no name is given. On any
        failure a message is printed and the process exits with status 1.
        """
        if isinstance(schema, dict):
            if schema.get("type") != "record":
                print("Expected an Avro schema with a root type of 'record'")
                sys.exit(1)
            return schema

        if isinstance(schema, list):
            if avro_record_type:
                for candidate in schema:
                    if not isinstance(candidate, dict) or candidate.get("type") != "record":
                        continue
                    record_name = str(candidate.get("name", ""))
                    namespace = str(candidate.get("namespace", ""))
                    fullname = self.get_fullname(namespace, record_name)
                    if avro_record_type in [record_name, fullname]:
                        return candidate
                print(f"No top-level record type {avro_record_type} found in the Avro schema")
                sys.exit(1)

            for candidate in schema:
                if isinstance(candidate, dict) and candidate.get("type") == "record":
                    return candidate

            print("Expected at least one Avro 'record' schema in the schema list")
            sys.exit(1)

        print("Expected an Avro schema as a JSON object or a list of schema records")
        sys.exit(1)

    def resolve_records(self, schema: "JsonNode", avro_record_type: Optional[str]) -> List[Dict[str, Any]]:
        """Return the record schemas that should become tables.

        A schema object or an explicit ``avro_record_type`` yields exactly
        one record; a schema list without a record type yields every
        top-level record. Prints a message and exits with status 1 when no
        record can be found.
        """
        if isinstance(schema, dict):
            return [self.resolve_root_record(schema, avro_record_type)]

        if isinstance(schema, list):
            if avro_record_type:
                return [self.resolve_root_record(schema, avro_record_type)]

            records = [item for item in schema if isinstance(item, dict) and item.get("type") == "record"]
            if records:
                return records

        print("Expected one or more Avro 'record' schemas")
        sys.exit(1)

    def build_table(self, record: Dict[str, Any], emit_cloudevents_columns: bool) -> Dict[str, Any]:
        """Build a TMSL table object from an Avro record.

        Args:
            record: The Avro record definition.
            emit_cloudevents_columns: When true, five nullable ``___``-prefixed
                CloudEvents columns are appended after the field columns.

        Returns:
            A dict with the table ``name`` and its ``columns`` list.
        """
        table_name = str(record.get("name", "Table"))

        # Only a list is accepted for "unique": iterating a string would yield
        # single characters and flag unrelated columns as keys.
        unique = record.get("unique")
        unique_columns = (
            {column for column in unique if isinstance(column, str)}
            if isinstance(unique, list)
            else set()
        )

        fields = record.get("fields")
        if not isinstance(fields, list):
            fields = []

        columns: List[Dict[str, Any]] = []
        for field in fields:
            if not isinstance(field, dict):
                continue
            field_name = str(field.get("name", ""))
            if not field_name:
                continue
            data_type, nullable = self.map_avro_type_to_tmsl(field.get("type"))
            column: Dict[str, Any] = {
                "name": field_name,
                "dataType": data_type,
                "sourceColumn": field_name,
            }
            if field_name in unique_columns:
                column["isKey"] = True
            if nullable:
                column["isNullable"] = True
            columns.append(column)

        if emit_cloudevents_columns:
            columns.extend([
                {"name": "___type", "dataType": "string", "sourceColumn": "___type", "isNullable": True},
                {"name": "___source", "dataType": "string", "sourceColumn": "___source", "isNullable": True},
                {"name": "___id", "dataType": "string", "sourceColumn": "___id", "isNullable": True},
                {"name": "___time", "dataType": "dateTime", "sourceColumn": "___time", "isNullable": True},
                {"name": "___subject", "dataType": "string", "sourceColumn": "___subject", "isNullable": True},
            ])

        return {
            "name": table_name,
            "columns": columns,
        }

    def _record_sql_identifier(self, record: Dict[str, Any]) -> "str | None":
        """Return the non-empty ``altnames["sql"]`` string of a record, if any."""
        altnames = record.get("altnames")
        if isinstance(altnames, dict):
            sql_name = altnames.get("sql")
            if isinstance(sql_name, str) and sql_name:
                return sql_name
        return None

    def build_relationships(self, records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Build TMSL relationships from the records' ``foreignKeys`` metadata.

        The target table is looked up first through ``referencedTableSql``
        (matched against the records' SQL alternate names) and then through
        ``referencedTable`` (matched against record names). Foreign keys
        whose target is not among ``records`` are skipped. One relationship
        is emitted per column pair; duplicate names are emitted only once.
        """
        table_names = {
            record.get("name") for record in records if isinstance(record.get("name"), str)
        }

        table_by_sql: Dict[str, str] = {}
        for record in records:
            record_name = record.get("name")
            if not isinstance(record_name, str) or not record_name:
                continue
            sql_identifier = self._record_sql_identifier(record)
            if sql_identifier:
                table_by_sql[sql_identifier] = record_name

        relationships: List[Dict[str, Any]] = []
        relationship_names: set = set()

        for record in records:
            from_table = record.get("name")
            if not isinstance(from_table, str) or not from_table:
                continue

            foreign_keys = record.get("foreignKeys")
            if not isinstance(foreign_keys, list):
                continue

            for fk in foreign_keys:
                if not isinstance(fk, dict):
                    continue

                columns = fk.get("columns")
                referenced_columns = fk.get("referencedColumns")
                if not isinstance(columns, list) or not isinstance(referenced_columns, list):
                    continue
                # Column lists must pair up one-to-one and be non-empty.
                if len(columns) != len(referenced_columns) or len(columns) == 0:
                    continue

                target_table = None
                referenced_table_sql = fk.get("referencedTableSql")
                if isinstance(referenced_table_sql, str):
                    target_table = table_by_sql.get(referenced_table_sql)

                if not target_table:
                    referenced_table = fk.get("referencedTable")
                    if isinstance(referenced_table, str) and referenced_table in table_names:
                        target_table = referenced_table

                if not target_table:
                    continue

                for from_column, to_column in zip(columns, referenced_columns):
                    if not isinstance(from_column, str) or not isinstance(to_column, str):
                        continue

                    relationship_name = f"{from_table}_{from_column}_to_{target_table}_{to_column}"
                    if relationship_name in relationship_names:
                        continue

                    relationships.append(
                        {
                            "name": relationship_name,
                            "fromTable": from_table,
                            "fromColumn": from_column,
                            "toTable": target_table,
                            "toColumn": to_column,
                        }
                    )
                    relationship_names.add(relationship_name)

        return relationships

    def build_tmsl_schema(
        self,
        avro_schema: "JsonNode",
        avro_record_type: Optional[str] = None,
        database_name: str = "",
        compatibility_level: int = 1605,
        emit_cloudevents_columns: bool = False,
    ) -> Dict[str, Any]:
        """Build a TMSL ``createOrReplace`` document from an Avro schema.

        Args:
            avro_schema: Parsed Avro schema (object or list of objects).
            avro_record_type: Optional name of the single record to convert.
            database_name: Database name; defaults to the first table's name.
            compatibility_level: Value written to ``compatibilityLevel``.
            emit_cloudevents_columns: Append CloudEvents columns to each table.

        Returns:
            The TMSL document as a JSON-serializable dict.
        """
        self.cache_named_types(avro_schema)
        records = self.resolve_records(avro_schema, avro_record_type)

        tables = [self.build_table(record, emit_cloudevents_columns) for record in records]
        first_table_name = str(tables[0].get("name", "Database")) if tables else "Database"
        database = database_name or first_table_name
        relationships = self.build_relationships(records)

        model: Dict[str, Any] = {
            "culture": "en-US",
            "tables": tables,
        }
        # The "relationships" key is only written when there is at least one.
        if relationships:
            model["relationships"] = relationships

        return {
            "createOrReplace": {
                "object": {"database": database},
                "database": {
                    "name": database,
                    "compatibilityLevel": compatibility_level,
                    "model": model,
                },
            }
        }

    def convert_avro_to_tmsl(
        self,
        avro_schema_path: str,
        avro_record_type: Optional[str],
        tmsl_file_path: str,
        database_name: str = "",
        compatibility_level: int = 1605,
        emit_cloudevents_columns: bool = False,
    ) -> None:
        """Read an Avro schema file and write the TMSL document as JSON.

        Prints a message and exits with status 1 when ``avro_schema_path``
        is empty. The output file is written as UTF-8 with two-space indent.
        """
        if not avro_schema_path:
            print("Please specify the avro schema file")
            sys.exit(1)

        with open(avro_schema_path, "r", encoding="utf-8") as f:
            schema = json.load(f)

        tmsl_schema = self.build_tmsl_schema(
            schema,
            avro_record_type=avro_record_type,
            database_name=database_name,
            compatibility_level=compatibility_level,
            emit_cloudevents_columns=emit_cloudevents_columns,
        )

        with open(tmsl_file_path, "w", encoding="utf-8") as f:
            json.dump(tmsl_schema, f, indent=2)
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
def convert_avro_to_tmsl(
    avro_schema_path: str,
    avro_record_type: Optional[str],
    tmsl_file_path: str,
    database_name: str = "",
    compatibility_level: int = 1605,
    emit_cloudevents_columns: bool = False,
) -> None:
    """Convert the Avro schema file at ``avro_schema_path`` to a TMSL JSON file.

    Module-level entry point: creates a fresh ``AvroToTmslConverter`` and
    forwards every argument unchanged to its ``convert_avro_to_tmsl`` method,
    which writes the result to ``tmsl_file_path``.
    """
    AvroToTmslConverter().convert_avro_to_tmsl(
        avro_schema_path=avro_schema_path,
        avro_record_type=avro_record_type,
        tmsl_file_path=tmsl_file_path,
        database_name=database_name,
        compatibility_level=compatibility_level,
        emit_cloudevents_columns=emit_cloudevents_columns,
    )
|
|
@@ -1704,6 +1704,39 @@
|
|
|
1704
1704
|
],
|
|
1705
1705
|
"prompts": []
|
|
1706
1706
|
},
|
|
1707
|
+
{
|
|
1708
|
+
"command": "validate-tmsl",
|
|
1709
|
+
"description": "Validate TMSL scripts locally against documented object structure",
|
|
1710
|
+
"group": "7_Utility",
|
|
1711
|
+
"function": {
|
|
1712
|
+
"name": "avrotize.tmslvalidate.validate_tmsl",
|
|
1713
|
+
"args": {
|
|
1714
|
+
"tmsl_file_path": "input_file_path",
|
|
1715
|
+
"quiet": "args.quiet"
|
|
1716
|
+
}
|
|
1717
|
+
},
|
|
1718
|
+
"extensions": [
|
|
1719
|
+
".tmsl.json",
|
|
1720
|
+
".json"
|
|
1721
|
+
],
|
|
1722
|
+
"args": [
|
|
1723
|
+
{
|
|
1724
|
+
"name": "input",
|
|
1725
|
+
"type": "str",
|
|
1726
|
+
"nargs": "?",
|
|
1727
|
+
"help": "Path to the TMSL JSON file (or read from stdin if omitted)",
|
|
1728
|
+
"required": false
|
|
1729
|
+
},
|
|
1730
|
+
{
|
|
1731
|
+
"name": "--quiet",
|
|
1732
|
+
"type": "bool",
|
|
1733
|
+
"help": "Suppress output. Exit code 0 if valid, 1 if invalid.",
|
|
1734
|
+
"default": false,
|
|
1735
|
+
"required": false
|
|
1736
|
+
}
|
|
1737
|
+
],
|
|
1738
|
+
"prompts": []
|
|
1739
|
+
},
|
|
1707
1740
|
{
|
|
1708
1741
|
"command": "a2mongo",
|
|
1709
1742
|
"description": "Convert Avrotize schema to MongoDB schema",
|
|
@@ -1945,6 +1978,153 @@
|
|
|
1945
1978
|
}
|
|
1946
1979
|
]
|
|
1947
1980
|
},
|
|
1981
|
+
{
|
|
1982
|
+
"command": "a2tsml",
|
|
1983
|
+
"description": "Convert Avrotize schema to Tabular Model Scripting Language (TMSL) schema",
|
|
1984
|
+
"group": "3_Datalake",
|
|
1985
|
+
"function": {
|
|
1986
|
+
"name": "avrotize.avrototsml.convert_avro_to_tmsl",
|
|
1987
|
+
"args": {
|
|
1988
|
+
"avro_schema_path": "input_file_path",
|
|
1989
|
+
"tmsl_file_path": "output_file_path",
|
|
1990
|
+
"avro_record_type": "args.record_type",
|
|
1991
|
+
"database_name": "args.database_name",
|
|
1992
|
+
"compatibility_level": "args.compatibility_level",
|
|
1993
|
+
"emit_cloudevents_columns": "args.emit_cloudevents_columns"
|
|
1994
|
+
}
|
|
1995
|
+
},
|
|
1996
|
+
"extensions": [
|
|
1997
|
+
".avsc"
|
|
1998
|
+
],
|
|
1999
|
+
"args": [
|
|
2000
|
+
{
|
|
2001
|
+
"name": "input",
|
|
2002
|
+
"type": "str",
|
|
2003
|
+
"nargs": "?",
|
|
2004
|
+
"help": "Path to the Avrotize schema file (or read from stdin if omitted)",
|
|
2005
|
+
"required": false
|
|
2006
|
+
},
|
|
2007
|
+
{
|
|
2008
|
+
"name": "--out",
|
|
2009
|
+
"type": "str",
|
|
2010
|
+
"help": "Path to the TMSL schema JSON file",
|
|
2011
|
+
"required": false
|
|
2012
|
+
},
|
|
2013
|
+
{
|
|
2014
|
+
"name": "--avsc",
|
|
2015
|
+
"type": "str",
|
|
2016
|
+
"help": "Deprecated: Path to the Avrotize schema file (for backcompat)",
|
|
2017
|
+
"required": false
|
|
2018
|
+
},
|
|
2019
|
+
{
|
|
2020
|
+
"name": "--record-type",
|
|
2021
|
+
"type": "str",
|
|
2022
|
+
"help": "Record type in the Avrotize schema",
|
|
2023
|
+
"required": false
|
|
2024
|
+
},
|
|
2025
|
+
{
|
|
2026
|
+
"name": "--database-name",
|
|
2027
|
+
"type": "str",
|
|
2028
|
+
"help": "Tabular model database name (defaults to the selected record type name)",
|
|
2029
|
+
"required": false,
|
|
2030
|
+
"default": ""
|
|
2031
|
+
},
|
|
2032
|
+
{
|
|
2033
|
+
"name": "--compatibility-level",
|
|
2034
|
+
"type": "int",
|
|
2035
|
+
"help": "Tabular model compatibility level",
|
|
2036
|
+
"required": false,
|
|
2037
|
+
"default": 1605
|
|
2038
|
+
},
|
|
2039
|
+
{
|
|
2040
|
+
"name": "--emit-cloudevents-columns",
|
|
2041
|
+
"type": "bool",
|
|
2042
|
+
"help": "Add CloudEvents columns to the TMSL table",
|
|
2043
|
+
"default": false,
|
|
2044
|
+
"required": false
|
|
2045
|
+
}
|
|
2046
|
+
],
|
|
2047
|
+
"suggested_output_file_path": "{input_file_name}.tmsl.json",
|
|
2048
|
+
"prompts": [
|
|
2049
|
+
{
|
|
2050
|
+
"name": "--emit-cloudevents-columns",
|
|
2051
|
+
"message": "Add CloudEvents columns to the TMSL schema?",
|
|
2052
|
+
"type": "bool",
|
|
2053
|
+
"default": false
|
|
2054
|
+
}
|
|
2055
|
+
]
|
|
2056
|
+
},
|
|
2057
|
+
{
|
|
2058
|
+
"command": "s2tsml",
|
|
2059
|
+
"description": "Convert JSON Structure to Tabular Model Scripting Language (TMSL) schema",
|
|
2060
|
+
"group": "3_Datalake",
|
|
2061
|
+
"function": {
|
|
2062
|
+
"name": "avrotize.structuretotsml.convert_structure_to_tmsl",
|
|
2063
|
+
"args": {
|
|
2064
|
+
"structure_schema_path": "input_file_path",
|
|
2065
|
+
"tmsl_file_path": "output_file_path",
|
|
2066
|
+
"structure_record_type": "args.record_type",
|
|
2067
|
+
"database_name": "args.database_name",
|
|
2068
|
+
"compatibility_level": "args.compatibility_level",
|
|
2069
|
+
"emit_cloudevents_columns": "args.emit_cloudevents_columns"
|
|
2070
|
+
}
|
|
2071
|
+
},
|
|
2072
|
+
"extensions": [
|
|
2073
|
+
".struct.json",
|
|
2074
|
+
".json"
|
|
2075
|
+
],
|
|
2076
|
+
"args": [
|
|
2077
|
+
{
|
|
2078
|
+
"name": "input",
|
|
2079
|
+
"type": "str",
|
|
2080
|
+
"nargs": "?",
|
|
2081
|
+
"help": "Path to the JSON Structure schema file (or read from stdin if omitted)",
|
|
2082
|
+
"required": false
|
|
2083
|
+
},
|
|
2084
|
+
{
|
|
2085
|
+
"name": "--out",
|
|
2086
|
+
"type": "str",
|
|
2087
|
+
"help": "Path to the TMSL schema JSON file",
|
|
2088
|
+
"required": false
|
|
2089
|
+
},
|
|
2090
|
+
{
|
|
2091
|
+
"name": "--record-type",
|
|
2092
|
+
"type": "str",
|
|
2093
|
+
"help": "Record type in the JSON Structure schema",
|
|
2094
|
+
"required": false
|
|
2095
|
+
},
|
|
2096
|
+
{
|
|
2097
|
+
"name": "--database-name",
|
|
2098
|
+
"type": "str",
|
|
2099
|
+
"help": "Tabular model database name (defaults to the selected record type name)",
|
|
2100
|
+
"required": false,
|
|
2101
|
+
"default": ""
|
|
2102
|
+
},
|
|
2103
|
+
{
|
|
2104
|
+
"name": "--compatibility-level",
|
|
2105
|
+
"type": "int",
|
|
2106
|
+
"help": "Tabular model compatibility level",
|
|
2107
|
+
"required": false,
|
|
2108
|
+
"default": 1605
|
|
2109
|
+
},
|
|
2110
|
+
{
|
|
2111
|
+
"name": "--emit-cloudevents-columns",
|
|
2112
|
+
"type": "bool",
|
|
2113
|
+
"help": "Add CloudEvents columns to the TMSL table",
|
|
2114
|
+
"default": false,
|
|
2115
|
+
"required": false
|
|
2116
|
+
}
|
|
2117
|
+
],
|
|
2118
|
+
"suggested_output_file_path": "{input_file_name}.tmsl.json",
|
|
2119
|
+
"prompts": [
|
|
2120
|
+
{
|
|
2121
|
+
"name": "--emit-cloudevents-columns",
|
|
2122
|
+
"message": "Add CloudEvents columns to the TMSL schema?",
|
|
2123
|
+
"type": "bool",
|
|
2124
|
+
"default": false
|
|
2125
|
+
}
|
|
2126
|
+
]
|
|
2127
|
+
},
|
|
1948
2128
|
{
|
|
1949
2129
|
"command": "pq2a",
|
|
1950
2130
|
"description": "Convert Parquet schema to Avrotize schema",
|
|
@@ -364,6 +364,11 @@ class JsonStructureToAvro:
|
|
|
364
364
|
|
|
365
365
|
if 'description' in merged_schema:
|
|
366
366
|
avro_record['doc'] = merged_schema['description']
|
|
367
|
+
|
|
368
|
+
if isinstance(merged_schema.get('x-avrotize-unique'), list):
|
|
369
|
+
avro_record['unique'] = merged_schema['x-avrotize-unique']
|
|
370
|
+
if isinstance(merged_schema.get('x-avrotize-foreignKeys'), list):
|
|
371
|
+
avro_record['foreignKeys'] = merged_schema['x-avrotize-foreignKeys']
|
|
367
372
|
|
|
368
373
|
# Convert properties to fields
|
|
369
374
|
properties = merged_schema.get('properties', {})
|