structurize 2.16.2__py3-none-any.whl → 2.16.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- avrotize/__init__.py +63 -63
- avrotize/__main__.py +5 -5
- avrotize/_version.py +34 -34
- avrotize/asn1toavro.py +160 -160
- avrotize/avrotize.py +152 -152
- avrotize/avrotocpp.py +483 -483
- avrotize/avrotocsharp.py +992 -992
- avrotize/avrotocsv.py +121 -121
- avrotize/avrotodatapackage.py +173 -173
- avrotize/avrotodb.py +1383 -1383
- avrotize/avrotogo.py +476 -476
- avrotize/avrotographql.py +197 -197
- avrotize/avrotoiceberg.py +210 -210
- avrotize/avrotojava.py +1023 -1023
- avrotize/avrotojs.py +250 -250
- avrotize/avrotojsons.py +481 -481
- avrotize/avrotojstruct.py +345 -345
- avrotize/avrotokusto.py +363 -363
- avrotize/avrotomd.py +137 -137
- avrotize/avrotools.py +168 -168
- avrotize/avrotoparquet.py +208 -208
- avrotize/avrotoproto.py +358 -358
- avrotize/avrotopython.py +622 -622
- avrotize/avrotorust.py +435 -435
- avrotize/avrotots.py +598 -598
- avrotize/avrotoxsd.py +344 -344
- avrotize/commands.json +2493 -2433
- avrotize/common.py +828 -828
- avrotize/constants.py +4 -4
- avrotize/csvtoavro.py +131 -131
- avrotize/datapackagetoavro.py +76 -76
- avrotize/dependency_resolver.py +348 -348
- avrotize/jsonstoavro.py +1698 -1698
- avrotize/jsonstostructure.py +2642 -2642
- avrotize/jstructtoavro.py +878 -878
- avrotize/kstructtoavro.py +93 -93
- avrotize/kustotoavro.py +455 -455
- avrotize/parquettoavro.py +157 -157
- avrotize/proto2parser.py +497 -497
- avrotize/proto3parser.py +402 -402
- avrotize/prototoavro.py +382 -382
- avrotize/structuretocsharp.py +2005 -2005
- avrotize/structuretojsons.py +498 -498
- avrotize/structuretopython.py +772 -772
- avrotize/structuretots.py +653 -0
- avrotize/xsdtoavro.py +413 -413
- structurize-2.16.6.dist-info/METADATA +107 -0
- structurize-2.16.6.dist-info/RECORD +52 -0
- {structurize-2.16.2.dist-info → structurize-2.16.6.dist-info}/licenses/LICENSE +200 -200
- structurize-2.16.2.dist-info/METADATA +0 -805
- structurize-2.16.2.dist-info/RECORD +0 -51
- {structurize-2.16.2.dist-info → structurize-2.16.6.dist-info}/WHEEL +0 -0
- {structurize-2.16.2.dist-info → structurize-2.16.6.dist-info}/entry_points.txt +0 -0
- {structurize-2.16.2.dist-info → structurize-2.16.6.dist-info}/top_level.txt +0 -0
avrotize/avrotokusto.py
CHANGED
|
@@ -1,364 +1,364 @@
|
|
|
1
|
-
"""Converts an Avro schema to a Kusto table schema."""
|
|
2
|
-
|
|
3
|
-
import json
|
|
4
|
-
import sys
|
|
5
|
-
from typing import Any, List
|
|
6
|
-
from avrotize.common import build_flat_type_dict, inline_avro_references, strip_first_doc
|
|
7
|
-
from azure.kusto.data import KustoClient, KustoConnectionStringBuilder, ClientRequestProperties
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
class AvroToKusto:
|
|
11
|
-
"""Converts an Avro schema to a Kusto table schema."""
|
|
12
|
-
|
|
13
|
-
def ___init___(self):
|
|
14
|
-
"""Initializes a new instance of the AvroToKusto class."""
|
|
15
|
-
pass
|
|
16
|
-
|
|
17
|
-
def convert_record_to_kusto(self, type_dict:dict, recordschema: dict, emit_cloudevents_columns: bool, emit_cloudevents_dispatch_table: bool) -> List[str]:
|
|
18
|
-
"""Converts an Avro record schema to a Kusto table schema."""
|
|
19
|
-
# Get the name and fields of the top-level record
|
|
20
|
-
table_name = recordschema["name"]
|
|
21
|
-
fields = recordschema["fields"]
|
|
22
|
-
|
|
23
|
-
# Create a StringBuilder to store the kusto statements
|
|
24
|
-
kusto = []
|
|
25
|
-
|
|
26
|
-
# Append the create table statement with the column names and types
|
|
27
|
-
kusto.append(f".create-merge table [{table_name}] (")
|
|
28
|
-
columns = []
|
|
29
|
-
for field in fields:
|
|
30
|
-
column_name = field["name"]
|
|
31
|
-
column_type = self.convert_avro_type_to_kusto_type(field["type"])
|
|
32
|
-
columns.append(f" [{column_name}]: {column_type}")
|
|
33
|
-
if emit_cloudevents_columns:
|
|
34
|
-
columns.append(" [___type]: string")
|
|
35
|
-
columns.append(" [___source]: string")
|
|
36
|
-
columns.append(" [___id]: string")
|
|
37
|
-
columns.append(" [___time]: datetime")
|
|
38
|
-
columns.append(" [___subject]: string")
|
|
39
|
-
kusto.append(",\n".join(columns))
|
|
40
|
-
kusto.append(");")
|
|
41
|
-
kusto.append("")
|
|
42
|
-
|
|
43
|
-
# Add the doc string as table metadata
|
|
44
|
-
if "doc" in recordschema:
|
|
45
|
-
doc_data = recordschema["doc"]
|
|
46
|
-
doc_data = (doc_data[:997] + "...") if len(doc_data) > 1000 else doc_data
|
|
47
|
-
doc_string = json.dumps(json.dumps({
|
|
48
|
-
"description": doc_data
|
|
49
|
-
}))
|
|
50
|
-
kusto.append(
|
|
51
|
-
f".alter table [{table_name}] docstring {doc_string};")
|
|
52
|
-
kusto.append("")
|
|
53
|
-
|
|
54
|
-
doc_string_statement = []
|
|
55
|
-
for field in fields:
|
|
56
|
-
column_name = field["name"]
|
|
57
|
-
if "doc" in field:
|
|
58
|
-
doc_data = field["doc"]
|
|
59
|
-
if len(doc_data) > 900:
|
|
60
|
-
doc_data = (doc_data[:897] + "...")
|
|
61
|
-
doc_content = {
|
|
62
|
-
"description": doc_data
|
|
63
|
-
}
|
|
64
|
-
if isinstance(field["type"], list) or isinstance(field["type"], dict):
|
|
65
|
-
inline_schema = inline_avro_references(field["type"].copy(), type_dict.copy(), '')
|
|
66
|
-
if (len(json.dumps(inline_schema)) + len(doc_data)) > 900:
|
|
67
|
-
while strip_first_doc(inline_schema):
|
|
68
|
-
if (len(json.dumps(inline_schema)) + len(doc_data)) < 900:
|
|
69
|
-
break
|
|
70
|
-
if (len(json.dumps(inline_schema)) + len(doc_data)) > 900:
|
|
71
|
-
doc_content["schema"] = '{ "doc": "Schema too large to inline. Please refer to the Avro schema for more details." }'
|
|
72
|
-
else:
|
|
73
|
-
doc_content["schema"] = inline_schema
|
|
74
|
-
else:
|
|
75
|
-
doc_content["schema"] = inline_schema
|
|
76
|
-
doc = json.dumps(json.dumps(doc_content))
|
|
77
|
-
doc_string_statement.append(f" [{column_name}]: {doc}")
|
|
78
|
-
if doc_string_statement and emit_cloudevents_columns:
|
|
79
|
-
doc_string_statement.extend([
|
|
80
|
-
" [___type] : 'Event type'",
|
|
81
|
-
" [___source]: 'Context origin/source of the event'",
|
|
82
|
-
" [___id]: 'Event identifier'",
|
|
83
|
-
" [___time]: 'Event generation time'",
|
|
84
|
-
" [___subject]: 'Context subject of the event'"
|
|
85
|
-
])
|
|
86
|
-
if doc_string_statement:
|
|
87
|
-
kusto.append(f".alter table [{table_name}] column-docstrings (")
|
|
88
|
-
kusto.append(",\n".join(doc_string_statement))
|
|
89
|
-
kusto.append(");")
|
|
90
|
-
kusto.append("")
|
|
91
|
-
|
|
92
|
-
# add the JSON mapping for the table
|
|
93
|
-
# .create-or-alter table dfl_data_events ingestion json mapping
|
|
94
|
-
kusto.append(
|
|
95
|
-
f".create-or-alter table [{table_name}] ingestion json mapping \"{table_name}_json_flat\"")
|
|
96
|
-
kusto.append("```\n[")
|
|
97
|
-
if emit_cloudevents_columns:
|
|
98
|
-
kusto.append(" {\"column\": \"___type\", \"path\": \"$.type\"},")
|
|
99
|
-
kusto.append(
|
|
100
|
-
" {\"column\": \"___source\", \"path\": \"$.source\"},")
|
|
101
|
-
kusto.append(" {\"column\": \"___id\", \"path\": \"$.id\"},")
|
|
102
|
-
kusto.append(" {\"column\": \"___time\", \"path\": \"$.time\"},")
|
|
103
|
-
kusto.append(
|
|
104
|
-
" {\"column\": \"___subject\", \"path\": \"$.subject\"},")
|
|
105
|
-
for field in fields:
|
|
106
|
-
json_name = column_name = field["name"]
|
|
107
|
-
if 'altnames' in field:
|
|
108
|
-
if 'kql' in field['altnames']:
|
|
109
|
-
column_name = field['altnames']['kql']
|
|
110
|
-
if 'json' in field['altnames']:
|
|
111
|
-
json_name = field['altnames']['json']
|
|
112
|
-
kusto.append(
|
|
113
|
-
f" {{\"column\": \"{column_name}\", \"path\": \"$.{json_name}\"}},")
|
|
114
|
-
kusto.append("]\n```\n\n")
|
|
115
|
-
|
|
116
|
-
if emit_cloudevents_columns:
|
|
117
|
-
kusto.append(
|
|
118
|
-
f".create-or-alter table [{table_name}] ingestion json mapping \"{table_name}_json_ce_structured\"")
|
|
119
|
-
kusto.append("```\n[")
|
|
120
|
-
kusto.append(" {\"column\": \"___type\", \"path\": \"$.type\"},")
|
|
121
|
-
kusto.append(
|
|
122
|
-
" {\"column\": \"___source\", \"path\": \"$.source\"},")
|
|
123
|
-
kusto.append(" {\"column\": \"___id\", \"path\": \"$.id\"},")
|
|
124
|
-
kusto.append(" {\"column\": \"___time\", \"path\": \"$.time\"},")
|
|
125
|
-
kusto.append(
|
|
126
|
-
" {\"column\": \"___subject\", \"path\": \"$.subject\"},")
|
|
127
|
-
for field in fields:
|
|
128
|
-
json_name = column_name = field["name"]
|
|
129
|
-
if 'altnames' in field:
|
|
130
|
-
if 'kql' in field['altnames']:
|
|
131
|
-
column_name = field['altnames']['kql']
|
|
132
|
-
if 'json' in field['altnames']:
|
|
133
|
-
json_name = field['altnames']['json']
|
|
134
|
-
kusto.append(
|
|
135
|
-
f" {{\"column\": \"{column_name}\", \"path\": \"$.data.{json_name}\"}},")
|
|
136
|
-
kusto.append("]\n```\n\n")
|
|
137
|
-
|
|
138
|
-
if emit_cloudevents_columns:
|
|
139
|
-
kusto.append(
|
|
140
|
-
f".drop materialized-view {table_name}Latest ifexists;")
|
|
141
|
-
kusto.append("")
|
|
142
|
-
kusto.append(
|
|
143
|
-
f".create materialized-view with (backfill=true) {table_name}Latest on table {table_name} {{")
|
|
144
|
-
kusto.append(
|
|
145
|
-
f" {table_name} | summarize arg_max(___time, *) by ___type, ___source, ___subject")
|
|
146
|
-
kusto.append("}")
|
|
147
|
-
kusto.append("")
|
|
148
|
-
|
|
149
|
-
if emit_cloudevents_dispatch_table:
|
|
150
|
-
event_type = recordschema["namespace"] + "." + \
|
|
151
|
-
recordschema["name"] if "namespace" in recordschema else recordschema["name"]
|
|
152
|
-
|
|
153
|
-
query = f"_cloudevents_dispatch | where (specversion == '1.0' and type == '{event_type}') | " + \
|
|
154
|
-
"project"
|
|
155
|
-
for field in fields:
|
|
156
|
-
column_name = field["name"]
|
|
157
|
-
if "altnames" in field and "kql" in field["altnames"]:
|
|
158
|
-
column_name = field["altnames"]["kql"]
|
|
159
|
-
column_type = self.convert_avro_type_to_kusto_type(
|
|
160
|
-
field["type"])
|
|
161
|
-
query += f"['{column_name}'] = to{column_type}(data.['{column_name}']),"
|
|
162
|
-
query += "___type = type,___source = source,___id = ['id'],___time = ['time'],___subject = subject"
|
|
163
|
-
|
|
164
|
-
# build an update policy for the table that gets triggered by updates to the dispatch table and extracts the event
|
|
165
|
-
kusto.append(f".alter table [{table_name}] policy update")
|
|
166
|
-
kusto.append("```")
|
|
167
|
-
kusto.append("[{")
|
|
168
|
-
kusto.append(" \"IsEnabled\": true,")
|
|
169
|
-
kusto.append(" \"Source\": \"_cloudevents_dispatch\",")
|
|
170
|
-
kusto.append(
|
|
171
|
-
f" \"Query\": \"{query}\",")
|
|
172
|
-
kusto.append(" \"IsTransactional\": false,")
|
|
173
|
-
kusto.append(" \"PropagateIngestionProperties\": true,")
|
|
174
|
-
kusto.append("}]")
|
|
175
|
-
kusto.append("```\n")
|
|
176
|
-
|
|
177
|
-
return kusto
|
|
178
|
-
|
|
179
|
-
def convert_avro_to_kusto_script(self, avro_schema_path, avro_record_type, emit_cloudevents_columns=False, emit_cloudevents_dispatch_table=False) -> str:
|
|
180
|
-
"""Converts an Avro schema to a Kusto table schema."""
|
|
181
|
-
if emit_cloudevents_dispatch_table:
|
|
182
|
-
emit_cloudevents_columns = True
|
|
183
|
-
schema_file = avro_schema_path
|
|
184
|
-
if not schema_file:
|
|
185
|
-
print("Please specify the avro schema file")
|
|
186
|
-
sys.exit(1)
|
|
187
|
-
with open(schema_file, "r", encoding="utf-8") as f:
|
|
188
|
-
schema_json = f.read()
|
|
189
|
-
|
|
190
|
-
# Parse the schema as a JSON object
|
|
191
|
-
schema = json.loads(schema_json)
|
|
192
|
-
|
|
193
|
-
if isinstance(schema, list):
|
|
194
|
-
if avro_record_type:
|
|
195
|
-
schema = next(
|
|
196
|
-
(x for x in schema if x["name"] == avro_record_type), None)
|
|
197
|
-
if schema is None:
|
|
198
|
-
print(
|
|
199
|
-
f"No record type {avro_record_type} found in the Avro schema")
|
|
200
|
-
sys.exit(1)
|
|
201
|
-
elif not isinstance(schema, dict) or "type" not in schema or schema["type"] != "record":
|
|
202
|
-
print(
|
|
203
|
-
"Expected a single Avro schema as a JSON object, or a list of schema records")
|
|
204
|
-
sys.exit(1)
|
|
205
|
-
|
|
206
|
-
if not isinstance(schema, list):
|
|
207
|
-
schema = [schema]
|
|
208
|
-
|
|
209
|
-
kusto_script = []
|
|
210
|
-
|
|
211
|
-
if emit_cloudevents_dispatch_table:
|
|
212
|
-
kusto_script.append(
|
|
213
|
-
".create-merge table [_cloudevents_dispatch] (")
|
|
214
|
-
kusto_script.append(" [specversion]: string,")
|
|
215
|
-
kusto_script.append(" [type]: string,")
|
|
216
|
-
kusto_script.append(" [source]: string,")
|
|
217
|
-
kusto_script.append(" [id]: string,")
|
|
218
|
-
kusto_script.append(" [time]: datetime,")
|
|
219
|
-
kusto_script.append(" [subject]: string,")
|
|
220
|
-
kusto_script.append(" [datacontenttype]: string,")
|
|
221
|
-
kusto_script.append(" [dataschema]: string,")
|
|
222
|
-
kusto_script.append(" [data]: dynamic")
|
|
223
|
-
kusto_script.append(");\n\n")
|
|
224
|
-
kusto_script.append(
|
|
225
|
-
".create-or-alter table [_cloudevents_dispatch] ingestion json mapping \"_cloudevents_dispatch_json\"")
|
|
226
|
-
kusto_script.append("```\n[")
|
|
227
|
-
kusto_script.append(
|
|
228
|
-
" {\"column\": \"specversion\", \"path\": \"$.specversion\"},")
|
|
229
|
-
kusto_script.append(
|
|
230
|
-
" {\"column\": \"type\", \"path\": \"$.type\"},")
|
|
231
|
-
kusto_script.append(
|
|
232
|
-
" {\"column\": \"source\", \"path\": \"$.source\"},")
|
|
233
|
-
kusto_script.append(" {\"column\": \"id\", \"path\": \"$.id\"},")
|
|
234
|
-
kusto_script.append(
|
|
235
|
-
" {\"column\": \"time\", \"path\": \"$.time\"},")
|
|
236
|
-
kusto_script.append(
|
|
237
|
-
" {\"column\": \"subject\", \"path\": \"$.subject\"},")
|
|
238
|
-
kusto_script.append(
|
|
239
|
-
" {\"column\": \"datacontenttype\", \"path\": \"$.datacontenttype\"},")
|
|
240
|
-
kusto_script.append(
|
|
241
|
-
" {\"column\": \"dataschema\", \"path\": \"$.dataschema\"},")
|
|
242
|
-
kusto_script.append(
|
|
243
|
-
" {\"column\": \"data\", \"path\": \"$.data\"}")
|
|
244
|
-
kusto_script.append("]\n```\n\n")
|
|
245
|
-
|
|
246
|
-
type_dict = build_flat_type_dict(schema)
|
|
247
|
-
for record in schema:
|
|
248
|
-
if not isinstance(record, dict) or "type" not in record or record["type"] != "record":
|
|
249
|
-
continue
|
|
250
|
-
kusto_script.extend(self.convert_record_to_kusto(type_dict,
|
|
251
|
-
record, emit_cloudevents_columns, emit_cloudevents_dispatch_table))
|
|
252
|
-
return "\n".join(kusto_script)
|
|
253
|
-
|
|
254
|
-
def convert_avro_to_kusto_file(self, avro_schema_path, avro_record_type, kusto_file_path, emit_cloudevents_columns=False, emit_cloudevents_dispatch_table=False):
|
|
255
|
-
"""Converts an Avro schema to a Kusto table schema."""
|
|
256
|
-
script = self.convert_avro_to_kusto_script(
|
|
257
|
-
avro_schema_path, avro_record_type, emit_cloudevents_columns, emit_cloudevents_dispatch_table)
|
|
258
|
-
with open(kusto_file_path, "w", encoding="utf-8") as kusto_file:
|
|
259
|
-
kusto_file.write(script)
|
|
260
|
-
|
|
261
|
-
def convert_avro_type_to_kusto_type(self, avro_type: str | dict | list):
|
|
262
|
-
"""Converts an Avro type to a Kusto type."""
|
|
263
|
-
if isinstance(avro_type, list):
|
|
264
|
-
# If the type is an array, then it is a union type. Look whether it's a pair of a scalar type and null:
|
|
265
|
-
itemCount = len(avro_type)
|
|
266
|
-
if itemCount > 2:
|
|
267
|
-
return "dynamic"
|
|
268
|
-
if itemCount == 1:
|
|
269
|
-
return self.convert_avro_type_to_kusto_type(avro_type[0])
|
|
270
|
-
else:
|
|
271
|
-
first = avro_type[0]
|
|
272
|
-
second = avro_type[1]
|
|
273
|
-
if isinstance(first, str) and first == "null":
|
|
274
|
-
return self.convert_avro_type_to_kusto_type(second)
|
|
275
|
-
elif isinstance(second, str) and second == "null":
|
|
276
|
-
return self.convert_avro_type_to_kusto_type(first)
|
|
277
|
-
else:
|
|
278
|
-
return "dynamic"
|
|
279
|
-
elif isinstance(avro_type, dict):
|
|
280
|
-
type_value = avro_type.get("type")
|
|
281
|
-
if type_value == "enum":
|
|
282
|
-
return "string"
|
|
283
|
-
elif type_value == "fixed":
|
|
284
|
-
return "dynamic"
|
|
285
|
-
elif type_value == "string":
|
|
286
|
-
logical_type = avro_type.get("logicalType")
|
|
287
|
-
if logical_type == "uuid":
|
|
288
|
-
return "guid"
|
|
289
|
-
return "string"
|
|
290
|
-
elif type_value == "bytes":
|
|
291
|
-
logical_type = avro_type.get("logicalType")
|
|
292
|
-
if logical_type == "decimal":
|
|
293
|
-
return "decimal"
|
|
294
|
-
return "dynamic"
|
|
295
|
-
elif type_value == "long":
|
|
296
|
-
logical_type = avro_type.get("logicalType")
|
|
297
|
-
if logical_type in ["timestamp-millis", "timestamp-micros"]:
|
|
298
|
-
return "datetime"
|
|
299
|
-
if logical_type in ["time-millis", "time-micros"]:
|
|
300
|
-
return "timespan"
|
|
301
|
-
return "long"
|
|
302
|
-
elif type_value == "int":
|
|
303
|
-
logical_type = avro_type.get("logicalType")
|
|
304
|
-
if logical_type == "date":
|
|
305
|
-
return "datetime"
|
|
306
|
-
return "int"
|
|
307
|
-
else:
|
|
308
|
-
return self.map_scalar_type(type_value)
|
|
309
|
-
elif isinstance(avro_type, str):
|
|
310
|
-
return self.map_scalar_type(avro_type)
|
|
311
|
-
|
|
312
|
-
def map_scalar_type(self, type_value: Any):
|
|
313
|
-
"""Maps an Avro scalar type to a Kusto scalar type."""
|
|
314
|
-
if type_value == "null":
|
|
315
|
-
return "dynamic"
|
|
316
|
-
elif type_value == "int":
|
|
317
|
-
return "int"
|
|
318
|
-
elif type_value == "long":
|
|
319
|
-
return "long"
|
|
320
|
-
elif type_value == "float":
|
|
321
|
-
return "real"
|
|
322
|
-
elif type_value == "double":
|
|
323
|
-
return "real"
|
|
324
|
-
elif type_value == "boolean":
|
|
325
|
-
return "bool"
|
|
326
|
-
elif type_value == "bytes":
|
|
327
|
-
return "dynamic"
|
|
328
|
-
elif type_value == "string":
|
|
329
|
-
return "string"
|
|
330
|
-
else:
|
|
331
|
-
return "dynamic"
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
def convert_avro_to_kusto_file(avro_schema_path, avro_record_type, kusto_file_path, emit_cloudevents_columns=False, emit_cloudevents_dispatch_table=False):
|
|
335
|
-
"""Converts an Avro schema to a Kusto table schema."""
|
|
336
|
-
avro_to_kusto = AvroToKusto()
|
|
337
|
-
avro_to_kusto.convert_avro_to_kusto_file(
|
|
338
|
-
avro_schema_path, avro_record_type, kusto_file_path, emit_cloudevents_columns, emit_cloudevents_dispatch_table)
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
def convert_avro_to_kusto_db(avro_schema_path, avro_record_type, kusto_uri, kusto_database, emit_cloudevents_columns=False, emit_cloudevents_dispatch_table=False, token_provider=None):
|
|
342
|
-
"""Converts an Avro schema to a Kusto table schema."""
|
|
343
|
-
avro_to_kusto = AvroToKusto()
|
|
344
|
-
script = avro_to_kusto.convert_avro_to_kusto_script(
|
|
345
|
-
avro_schema_path, avro_record_type, emit_cloudevents_columns, emit_cloudevents_dispatch_table)
|
|
346
|
-
kcsb = KustoConnectionStringBuilder.with_az_cli_authentication(
|
|
347
|
-
kusto_uri) if not token_provider else KustoConnectionStringBuilder.with_token_provider(kusto_uri, token_provider)
|
|
348
|
-
client = KustoClient(kcsb)
|
|
349
|
-
for statement in script.split("\n\n"):
|
|
350
|
-
if statement.strip():
|
|
351
|
-
try:
|
|
352
|
-
client.execute_mgmt(kusto_database, statement)
|
|
353
|
-
except Exception as e:
|
|
354
|
-
print(e)
|
|
355
|
-
sys.exit(1)
|
|
356
|
-
|
|
357
|
-
def convert_avro_to_kusto(avro_schema_path, avro_record_type, kusto_file_path, kusto_uri, kusto_database, emit_cloudevents_columns=False, emit_cloudevents_dispatch_table=False, token_provider=None):
|
|
358
|
-
"""Converts an Avro schema to a Kusto table schema."""
|
|
359
|
-
if not kusto_uri and not kusto_database:
|
|
360
|
-
convert_avro_to_kusto_file(
|
|
361
|
-
avro_schema_path, avro_record_type, kusto_file_path, emit_cloudevents_columns, emit_cloudevents_dispatch_table)
|
|
362
|
-
else:
|
|
363
|
-
convert_avro_to_kusto_db(
|
|
1
|
+
"""Converts an Avro schema to a Kusto table schema."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import sys
|
|
5
|
+
from typing import Any, List
|
|
6
|
+
from avrotize.common import build_flat_type_dict, inline_avro_references, strip_first_doc
|
|
7
|
+
from azure.kusto.data import KustoClient, KustoConnectionStringBuilder, ClientRequestProperties
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class AvroToKusto:
|
|
11
|
+
"""Converts an Avro schema to a Kusto table schema."""
|
|
12
|
+
|
|
13
|
+
def ___init___(self):
|
|
14
|
+
"""Initializes a new instance of the AvroToKusto class."""
|
|
15
|
+
pass
|
|
16
|
+
|
|
17
|
+
def convert_record_to_kusto(self, type_dict:dict, recordschema: dict, emit_cloudevents_columns: bool, emit_cloudevents_dispatch_table: bool) -> List[str]:
|
|
18
|
+
"""Converts an Avro record schema to a Kusto table schema."""
|
|
19
|
+
# Get the name and fields of the top-level record
|
|
20
|
+
table_name = recordschema["name"]
|
|
21
|
+
fields = recordschema["fields"]
|
|
22
|
+
|
|
23
|
+
# Create a StringBuilder to store the kusto statements
|
|
24
|
+
kusto = []
|
|
25
|
+
|
|
26
|
+
# Append the create table statement with the column names and types
|
|
27
|
+
kusto.append(f".create-merge table [{table_name}] (")
|
|
28
|
+
columns = []
|
|
29
|
+
for field in fields:
|
|
30
|
+
column_name = field["name"]
|
|
31
|
+
column_type = self.convert_avro_type_to_kusto_type(field["type"])
|
|
32
|
+
columns.append(f" [{column_name}]: {column_type}")
|
|
33
|
+
if emit_cloudevents_columns:
|
|
34
|
+
columns.append(" [___type]: string")
|
|
35
|
+
columns.append(" [___source]: string")
|
|
36
|
+
columns.append(" [___id]: string")
|
|
37
|
+
columns.append(" [___time]: datetime")
|
|
38
|
+
columns.append(" [___subject]: string")
|
|
39
|
+
kusto.append(",\n".join(columns))
|
|
40
|
+
kusto.append(");")
|
|
41
|
+
kusto.append("")
|
|
42
|
+
|
|
43
|
+
# Add the doc string as table metadata
|
|
44
|
+
if "doc" in recordschema:
|
|
45
|
+
doc_data = recordschema["doc"]
|
|
46
|
+
doc_data = (doc_data[:997] + "...") if len(doc_data) > 1000 else doc_data
|
|
47
|
+
doc_string = json.dumps(json.dumps({
|
|
48
|
+
"description": doc_data
|
|
49
|
+
}))
|
|
50
|
+
kusto.append(
|
|
51
|
+
f".alter table [{table_name}] docstring {doc_string};")
|
|
52
|
+
kusto.append("")
|
|
53
|
+
|
|
54
|
+
doc_string_statement = []
|
|
55
|
+
for field in fields:
|
|
56
|
+
column_name = field["name"]
|
|
57
|
+
if "doc" in field:
|
|
58
|
+
doc_data = field["doc"]
|
|
59
|
+
if len(doc_data) > 900:
|
|
60
|
+
doc_data = (doc_data[:897] + "...")
|
|
61
|
+
doc_content = {
|
|
62
|
+
"description": doc_data
|
|
63
|
+
}
|
|
64
|
+
if isinstance(field["type"], list) or isinstance(field["type"], dict):
|
|
65
|
+
inline_schema = inline_avro_references(field["type"].copy(), type_dict.copy(), '')
|
|
66
|
+
if (len(json.dumps(inline_schema)) + len(doc_data)) > 900:
|
|
67
|
+
while strip_first_doc(inline_schema):
|
|
68
|
+
if (len(json.dumps(inline_schema)) + len(doc_data)) < 900:
|
|
69
|
+
break
|
|
70
|
+
if (len(json.dumps(inline_schema)) + len(doc_data)) > 900:
|
|
71
|
+
doc_content["schema"] = '{ "doc": "Schema too large to inline. Please refer to the Avro schema for more details." }'
|
|
72
|
+
else:
|
|
73
|
+
doc_content["schema"] = inline_schema
|
|
74
|
+
else:
|
|
75
|
+
doc_content["schema"] = inline_schema
|
|
76
|
+
doc = json.dumps(json.dumps(doc_content))
|
|
77
|
+
doc_string_statement.append(f" [{column_name}]: {doc}")
|
|
78
|
+
if doc_string_statement and emit_cloudevents_columns:
|
|
79
|
+
doc_string_statement.extend([
|
|
80
|
+
" [___type] : 'Event type'",
|
|
81
|
+
" [___source]: 'Context origin/source of the event'",
|
|
82
|
+
" [___id]: 'Event identifier'",
|
|
83
|
+
" [___time]: 'Event generation time'",
|
|
84
|
+
" [___subject]: 'Context subject of the event'"
|
|
85
|
+
])
|
|
86
|
+
if doc_string_statement:
|
|
87
|
+
kusto.append(f".alter table [{table_name}] column-docstrings (")
|
|
88
|
+
kusto.append(",\n".join(doc_string_statement))
|
|
89
|
+
kusto.append(");")
|
|
90
|
+
kusto.append("")
|
|
91
|
+
|
|
92
|
+
# add the JSON mapping for the table
|
|
93
|
+
# .create-or-alter table dfl_data_events ingestion json mapping
|
|
94
|
+
kusto.append(
|
|
95
|
+
f".create-or-alter table [{table_name}] ingestion json mapping \"{table_name}_json_flat\"")
|
|
96
|
+
kusto.append("```\n[")
|
|
97
|
+
if emit_cloudevents_columns:
|
|
98
|
+
kusto.append(" {\"column\": \"___type\", \"path\": \"$.type\"},")
|
|
99
|
+
kusto.append(
|
|
100
|
+
" {\"column\": \"___source\", \"path\": \"$.source\"},")
|
|
101
|
+
kusto.append(" {\"column\": \"___id\", \"path\": \"$.id\"},")
|
|
102
|
+
kusto.append(" {\"column\": \"___time\", \"path\": \"$.time\"},")
|
|
103
|
+
kusto.append(
|
|
104
|
+
" {\"column\": \"___subject\", \"path\": \"$.subject\"},")
|
|
105
|
+
for field in fields:
|
|
106
|
+
json_name = column_name = field["name"]
|
|
107
|
+
if 'altnames' in field:
|
|
108
|
+
if 'kql' in field['altnames']:
|
|
109
|
+
column_name = field['altnames']['kql']
|
|
110
|
+
if 'json' in field['altnames']:
|
|
111
|
+
json_name = field['altnames']['json']
|
|
112
|
+
kusto.append(
|
|
113
|
+
f" {{\"column\": \"{column_name}\", \"path\": \"$.{json_name}\"}},")
|
|
114
|
+
kusto.append("]\n```\n\n")
|
|
115
|
+
|
|
116
|
+
if emit_cloudevents_columns:
|
|
117
|
+
kusto.append(
|
|
118
|
+
f".create-or-alter table [{table_name}] ingestion json mapping \"{table_name}_json_ce_structured\"")
|
|
119
|
+
kusto.append("```\n[")
|
|
120
|
+
kusto.append(" {\"column\": \"___type\", \"path\": \"$.type\"},")
|
|
121
|
+
kusto.append(
|
|
122
|
+
" {\"column\": \"___source\", \"path\": \"$.source\"},")
|
|
123
|
+
kusto.append(" {\"column\": \"___id\", \"path\": \"$.id\"},")
|
|
124
|
+
kusto.append(" {\"column\": \"___time\", \"path\": \"$.time\"},")
|
|
125
|
+
kusto.append(
|
|
126
|
+
" {\"column\": \"___subject\", \"path\": \"$.subject\"},")
|
|
127
|
+
for field in fields:
|
|
128
|
+
json_name = column_name = field["name"]
|
|
129
|
+
if 'altnames' in field:
|
|
130
|
+
if 'kql' in field['altnames']:
|
|
131
|
+
column_name = field['altnames']['kql']
|
|
132
|
+
if 'json' in field['altnames']:
|
|
133
|
+
json_name = field['altnames']['json']
|
|
134
|
+
kusto.append(
|
|
135
|
+
f" {{\"column\": \"{column_name}\", \"path\": \"$.data.{json_name}\"}},")
|
|
136
|
+
kusto.append("]\n```\n\n")
|
|
137
|
+
|
|
138
|
+
if emit_cloudevents_columns:
|
|
139
|
+
kusto.append(
|
|
140
|
+
f".drop materialized-view {table_name}Latest ifexists;")
|
|
141
|
+
kusto.append("")
|
|
142
|
+
kusto.append(
|
|
143
|
+
f".create materialized-view with (backfill=true) {table_name}Latest on table {table_name} {{")
|
|
144
|
+
kusto.append(
|
|
145
|
+
f" {table_name} | summarize arg_max(___time, *) by ___type, ___source, ___subject")
|
|
146
|
+
kusto.append("}")
|
|
147
|
+
kusto.append("")
|
|
148
|
+
|
|
149
|
+
if emit_cloudevents_dispatch_table:
|
|
150
|
+
event_type = recordschema["namespace"] + "." + \
|
|
151
|
+
recordschema["name"] if "namespace" in recordschema else recordschema["name"]
|
|
152
|
+
|
|
153
|
+
query = f"_cloudevents_dispatch | where (specversion == '1.0' and type == '{event_type}') | " + \
|
|
154
|
+
"project"
|
|
155
|
+
for field in fields:
|
|
156
|
+
column_name = field["name"]
|
|
157
|
+
if "altnames" in field and "kql" in field["altnames"]:
|
|
158
|
+
column_name = field["altnames"]["kql"]
|
|
159
|
+
column_type = self.convert_avro_type_to_kusto_type(
|
|
160
|
+
field["type"])
|
|
161
|
+
query += f"['{column_name}'] = to{column_type}(data.['{column_name}']),"
|
|
162
|
+
query += "___type = type,___source = source,___id = ['id'],___time = ['time'],___subject = subject"
|
|
163
|
+
|
|
164
|
+
# build an update policy for the table that gets triggered by updates to the dispatch table and extracts the event
|
|
165
|
+
kusto.append(f".alter table [{table_name}] policy update")
|
|
166
|
+
kusto.append("```")
|
|
167
|
+
kusto.append("[{")
|
|
168
|
+
kusto.append(" \"IsEnabled\": true,")
|
|
169
|
+
kusto.append(" \"Source\": \"_cloudevents_dispatch\",")
|
|
170
|
+
kusto.append(
|
|
171
|
+
f" \"Query\": \"{query}\",")
|
|
172
|
+
kusto.append(" \"IsTransactional\": false,")
|
|
173
|
+
kusto.append(" \"PropagateIngestionProperties\": true,")
|
|
174
|
+
kusto.append("}]")
|
|
175
|
+
kusto.append("```\n")
|
|
176
|
+
|
|
177
|
+
return kusto
|
|
178
|
+
|
|
179
|
+
def convert_avro_to_kusto_script(self, avro_schema_path, avro_record_type, emit_cloudevents_columns=False, emit_cloudevents_dispatch_table=False) -> str:
|
|
180
|
+
"""Converts an Avro schema to a Kusto table schema."""
|
|
181
|
+
if emit_cloudevents_dispatch_table:
|
|
182
|
+
emit_cloudevents_columns = True
|
|
183
|
+
schema_file = avro_schema_path
|
|
184
|
+
if not schema_file:
|
|
185
|
+
print("Please specify the avro schema file")
|
|
186
|
+
sys.exit(1)
|
|
187
|
+
with open(schema_file, "r", encoding="utf-8") as f:
|
|
188
|
+
schema_json = f.read()
|
|
189
|
+
|
|
190
|
+
# Parse the schema as a JSON object
|
|
191
|
+
schema = json.loads(schema_json)
|
|
192
|
+
|
|
193
|
+
if isinstance(schema, list):
|
|
194
|
+
if avro_record_type:
|
|
195
|
+
schema = next(
|
|
196
|
+
(x for x in schema if x["name"] == avro_record_type), None)
|
|
197
|
+
if schema is None:
|
|
198
|
+
print(
|
|
199
|
+
f"No record type {avro_record_type} found in the Avro schema")
|
|
200
|
+
sys.exit(1)
|
|
201
|
+
elif not isinstance(schema, dict) or "type" not in schema or schema["type"] != "record":
|
|
202
|
+
print(
|
|
203
|
+
"Expected a single Avro schema as a JSON object, or a list of schema records")
|
|
204
|
+
sys.exit(1)
|
|
205
|
+
|
|
206
|
+
if not isinstance(schema, list):
|
|
207
|
+
schema = [schema]
|
|
208
|
+
|
|
209
|
+
kusto_script = []
|
|
210
|
+
|
|
211
|
+
if emit_cloudevents_dispatch_table:
|
|
212
|
+
kusto_script.append(
|
|
213
|
+
".create-merge table [_cloudevents_dispatch] (")
|
|
214
|
+
kusto_script.append(" [specversion]: string,")
|
|
215
|
+
kusto_script.append(" [type]: string,")
|
|
216
|
+
kusto_script.append(" [source]: string,")
|
|
217
|
+
kusto_script.append(" [id]: string,")
|
|
218
|
+
kusto_script.append(" [time]: datetime,")
|
|
219
|
+
kusto_script.append(" [subject]: string,")
|
|
220
|
+
kusto_script.append(" [datacontenttype]: string,")
|
|
221
|
+
kusto_script.append(" [dataschema]: string,")
|
|
222
|
+
kusto_script.append(" [data]: dynamic")
|
|
223
|
+
kusto_script.append(");\n\n")
|
|
224
|
+
kusto_script.append(
|
|
225
|
+
".create-or-alter table [_cloudevents_dispatch] ingestion json mapping \"_cloudevents_dispatch_json\"")
|
|
226
|
+
kusto_script.append("```\n[")
|
|
227
|
+
kusto_script.append(
|
|
228
|
+
" {\"column\": \"specversion\", \"path\": \"$.specversion\"},")
|
|
229
|
+
kusto_script.append(
|
|
230
|
+
" {\"column\": \"type\", \"path\": \"$.type\"},")
|
|
231
|
+
kusto_script.append(
|
|
232
|
+
" {\"column\": \"source\", \"path\": \"$.source\"},")
|
|
233
|
+
kusto_script.append(" {\"column\": \"id\", \"path\": \"$.id\"},")
|
|
234
|
+
kusto_script.append(
|
|
235
|
+
" {\"column\": \"time\", \"path\": \"$.time\"},")
|
|
236
|
+
kusto_script.append(
|
|
237
|
+
" {\"column\": \"subject\", \"path\": \"$.subject\"},")
|
|
238
|
+
kusto_script.append(
|
|
239
|
+
" {\"column\": \"datacontenttype\", \"path\": \"$.datacontenttype\"},")
|
|
240
|
+
kusto_script.append(
|
|
241
|
+
" {\"column\": \"dataschema\", \"path\": \"$.dataschema\"},")
|
|
242
|
+
kusto_script.append(
|
|
243
|
+
" {\"column\": \"data\", \"path\": \"$.data\"}")
|
|
244
|
+
kusto_script.append("]\n```\n\n")
|
|
245
|
+
|
|
246
|
+
type_dict = build_flat_type_dict(schema)
|
|
247
|
+
for record in schema:
|
|
248
|
+
if not isinstance(record, dict) or "type" not in record or record["type"] != "record":
|
|
249
|
+
continue
|
|
250
|
+
kusto_script.extend(self.convert_record_to_kusto(type_dict,
|
|
251
|
+
record, emit_cloudevents_columns, emit_cloudevents_dispatch_table))
|
|
252
|
+
return "\n".join(kusto_script)
|
|
253
|
+
|
|
254
|
+
def convert_avro_to_kusto_file(self, avro_schema_path, avro_record_type, kusto_file_path, emit_cloudevents_columns=False, emit_cloudevents_dispatch_table=False):
    """Generate the Kusto script for an Avro schema and save it to a file.

    The script text is produced by ``convert_avro_to_kusto_script`` and then
    written verbatim to ``kusto_file_path`` (UTF-8, overwriting any existing
    file).

    Args:
        avro_schema_path: Forwarded to ``convert_avro_to_kusto_script``.
        avro_record_type: Forwarded to ``convert_avro_to_kusto_script``.
        kusto_file_path: Destination path for the generated script.
        emit_cloudevents_columns: Forwarded to ``convert_avro_to_kusto_script``.
        emit_cloudevents_dispatch_table: Forwarded to
            ``convert_avro_to_kusto_script``.
    """
    kusto_text = self.convert_avro_to_kusto_script(
        avro_schema_path,
        avro_record_type,
        emit_cloudevents_columns,
        emit_cloudevents_dispatch_table,
    )
    with open(kusto_file_path, mode="w", encoding="utf-8") as out:
        out.write(kusto_text)
|
|
260
|
+
|
|
261
|
+
def convert_avro_type_to_kusto_type(self, avro_type: str | dict | list):
|
|
262
|
+
"""Converts an Avro type to a Kusto type."""
|
|
263
|
+
if isinstance(avro_type, list):
|
|
264
|
+
# If the type is an array, then it is a union type. Look whether it's a pair of a scalar type and null:
|
|
265
|
+
itemCount = len(avro_type)
|
|
266
|
+
if itemCount > 2:
|
|
267
|
+
return "dynamic"
|
|
268
|
+
if itemCount == 1:
|
|
269
|
+
return self.convert_avro_type_to_kusto_type(avro_type[0])
|
|
270
|
+
else:
|
|
271
|
+
first = avro_type[0]
|
|
272
|
+
second = avro_type[1]
|
|
273
|
+
if isinstance(first, str) and first == "null":
|
|
274
|
+
return self.convert_avro_type_to_kusto_type(second)
|
|
275
|
+
elif isinstance(second, str) and second == "null":
|
|
276
|
+
return self.convert_avro_type_to_kusto_type(first)
|
|
277
|
+
else:
|
|
278
|
+
return "dynamic"
|
|
279
|
+
elif isinstance(avro_type, dict):
|
|
280
|
+
type_value = avro_type.get("type")
|
|
281
|
+
if type_value == "enum":
|
|
282
|
+
return "string"
|
|
283
|
+
elif type_value == "fixed":
|
|
284
|
+
return "dynamic"
|
|
285
|
+
elif type_value == "string":
|
|
286
|
+
logical_type = avro_type.get("logicalType")
|
|
287
|
+
if logical_type == "uuid":
|
|
288
|
+
return "guid"
|
|
289
|
+
return "string"
|
|
290
|
+
elif type_value == "bytes":
|
|
291
|
+
logical_type = avro_type.get("logicalType")
|
|
292
|
+
if logical_type == "decimal":
|
|
293
|
+
return "decimal"
|
|
294
|
+
return "dynamic"
|
|
295
|
+
elif type_value == "long":
|
|
296
|
+
logical_type = avro_type.get("logicalType")
|
|
297
|
+
if logical_type in ["timestamp-millis", "timestamp-micros"]:
|
|
298
|
+
return "datetime"
|
|
299
|
+
if logical_type in ["time-millis", "time-micros"]:
|
|
300
|
+
return "timespan"
|
|
301
|
+
return "long"
|
|
302
|
+
elif type_value == "int":
|
|
303
|
+
logical_type = avro_type.get("logicalType")
|
|
304
|
+
if logical_type == "date":
|
|
305
|
+
return "datetime"
|
|
306
|
+
return "int"
|
|
307
|
+
else:
|
|
308
|
+
return self.map_scalar_type(type_value)
|
|
309
|
+
elif isinstance(avro_type, str):
|
|
310
|
+
return self.map_scalar_type(avro_type)
|
|
311
|
+
|
|
312
|
+
def map_scalar_type(self, type_value: Any):
    """Translate an Avro primitive type name into its Kusto counterpart.

    Args:
        type_value: The Avro type name; any other value is accepted too.

    Returns:
        ``"int"``, ``"long"``, ``"real"`` (for both ``float`` and
        ``double``), ``"bool"`` or ``"string"`` for the matching Avro
        primitive, and ``"dynamic"`` for everything else, including
        ``"null"``, ``"bytes"`` and unrecognised values.
    """
    kusto_by_avro = {
        "int": "int",
        "long": "long",
        "float": "real",
        "double": "real",
        "boolean": "bool",
        "string": "string",
    }
    # Non-string values (for example a nested type dict) can be unhashable
    # and never name a primitive, so they take the fallback directly.
    if not isinstance(type_value, str):
        return "dynamic"
    return kusto_by_avro.get(type_value, "dynamic")
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def convert_avro_to_kusto_file(avro_schema_path, avro_record_type, kusto_file_path, emit_cloudevents_columns=False, emit_cloudevents_dispatch_table=False):
    """Write the Kusto script for an Avro schema to ``kusto_file_path``.

    Module-level convenience wrapper: it creates a fresh ``AvroToKusto`` and
    delegates to that object's ``convert_avro_to_kusto_file`` method, passing
    every argument through unchanged.
    """
    AvroToKusto().convert_avro_to_kusto_file(
        avro_schema_path,
        avro_record_type,
        kusto_file_path,
        emit_cloudevents_columns,
        emit_cloudevents_dispatch_table,
    )
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
def convert_avro_to_kusto_db(avro_schema_path, avro_record_type, kusto_uri, kusto_database, emit_cloudevents_columns=False, emit_cloudevents_dispatch_table=False, token_provider=None):
    """Generate the Kusto script for an Avro schema and run it on a database.

    The script from ``AvroToKusto.convert_avro_to_kusto_script`` is split on
    blank lines, and each non-blank chunk is sent to ``kusto_database`` with
    ``execute_mgmt``. The connection uses ``token_provider`` when one is
    given and Azure CLI authentication otherwise.

    The first failing statement is printed and the process exits with
    status 1.
    """
    script = AvroToKusto().convert_avro_to_kusto_script(
        avro_schema_path, avro_record_type, emit_cloudevents_columns, emit_cloudevents_dispatch_table)

    if token_provider:
        kcsb = KustoConnectionStringBuilder.with_token_provider(kusto_uri, token_provider)
    else:
        kcsb = KustoConnectionStringBuilder.with_az_cli_authentication(kusto_uri)
    client = KustoClient(kcsb)

    # Skip chunks that contain nothing but whitespace.
    pending = (chunk for chunk in script.split("\n\n") if chunk.strip())
    for statement in pending:
        try:
            client.execute_mgmt(kusto_database, statement)
        except Exception as e:
            print(e)
            sys.exit(1)
|
|
356
|
+
|
|
357
|
+
def convert_avro_to_kusto(avro_schema_path, avro_record_type, kusto_file_path, kusto_uri, kusto_database, emit_cloudevents_columns=False, emit_cloudevents_dispatch_table=False, token_provider=None):
    """Convert an Avro schema to Kusto, targeting a database or a file.

    When either ``kusto_uri`` or ``kusto_database`` is set, the work is
    handed to ``convert_avro_to_kusto_db`` (``kusto_file_path`` is not used
    in that case). When both are empty, the script is written to
    ``kusto_file_path`` through ``convert_avro_to_kusto_file``.
    """
    if kusto_uri or kusto_database:
        convert_avro_to_kusto_db(
            avro_schema_path,
            avro_record_type,
            kusto_uri,
            kusto_database,
            emit_cloudevents_columns,
            emit_cloudevents_dispatch_table,
            token_provider,
        )
        return
    convert_avro_to_kusto_file(
        avro_schema_path,
        avro_record_type,
        kusto_file_path,
        emit_cloudevents_columns,
        emit_cloudevents_dispatch_table,
    )
|