structurize-2.16.2-py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- avrotize/__init__.py +63 -0
- avrotize/__main__.py +6 -0
- avrotize/_version.py +34 -0
- avrotize/asn1toavro.py +160 -0
- avrotize/avrotize.py +152 -0
- avrotize/avrotocpp.py +483 -0
- avrotize/avrotocsharp.py +992 -0
- avrotize/avrotocsv.py +121 -0
- avrotize/avrotodatapackage.py +173 -0
- avrotize/avrotodb.py +1383 -0
- avrotize/avrotogo.py +476 -0
- avrotize/avrotographql.py +197 -0
- avrotize/avrotoiceberg.py +210 -0
- avrotize/avrotojava.py +1023 -0
- avrotize/avrotojs.py +250 -0
- avrotize/avrotojsons.py +481 -0
- avrotize/avrotojstruct.py +345 -0
- avrotize/avrotokusto.py +364 -0
- avrotize/avrotomd.py +137 -0
- avrotize/avrotools.py +168 -0
- avrotize/avrotoparquet.py +208 -0
- avrotize/avrotoproto.py +359 -0
- avrotize/avrotopython.py +622 -0
- avrotize/avrotorust.py +435 -0
- avrotize/avrotots.py +598 -0
- avrotize/avrotoxsd.py +344 -0
- avrotize/commands.json +2433 -0
- avrotize/common.py +829 -0
- avrotize/constants.py +5 -0
- avrotize/csvtoavro.py +132 -0
- avrotize/datapackagetoavro.py +76 -0
- avrotize/dependency_resolver.py +348 -0
- avrotize/jsonstoavro.py +1698 -0
- avrotize/jsonstostructure.py +2642 -0
- avrotize/jstructtoavro.py +878 -0
- avrotize/kstructtoavro.py +93 -0
- avrotize/kustotoavro.py +455 -0
- avrotize/parquettoavro.py +157 -0
- avrotize/proto2parser.py +498 -0
- avrotize/proto3parser.py +403 -0
- avrotize/prototoavro.py +382 -0
- avrotize/structuretocsharp.py +2005 -0
- avrotize/structuretojsons.py +498 -0
- avrotize/structuretopython.py +772 -0
- avrotize/xsdtoavro.py +413 -0
- structurize-2.16.2.dist-info/METADATA +805 -0
- structurize-2.16.2.dist-info/RECORD +51 -0
- structurize-2.16.2.dist-info/WHEEL +5 -0
- structurize-2.16.2.dist-info/entry_points.txt +2 -0
- structurize-2.16.2.dist-info/licenses/LICENSE +201 -0
- structurize-2.16.2.dist-info/top_level.txt +1 -0
avrotize/avrotodb.py
ADDED
@@ -0,0 +1,1383 @@
"""
Convert Avro schema to SQL schema for various databases.
"""

import json
import sys
import os
from typing import Dict, List, cast
from avrotize.common import altname, camel

JsonNode = Dict[str,
                'JsonNode'] | List['JsonNode'] | str | bool | int | float | None


def convert_avro_to_sql(avro_schema_path, dbscript_file_path, db_dialect, emit_cloudevents_columns=False, schema_name: str = ''):
    """
    Converts an Avro schema to database schema for the specified DB dialect.

    Args:
        avro_schema_path (str): Path to the Avro schema file.
        dbscript_file_path (str): Path to the output SQL file.
        db_dialect (str): SQL/DB dialect. Supported: 'sqlserver', 'postgres', 'mysql', 'mariadb',
            'sqlite', 'oracle', 'db2', 'sqlanywhere', 'bigquery', 'snowflake',
            'redshift', 'cassandra', 'mongodb', 'dynamodb', 'elasticsearch',
            'couchdb', 'neo4j', 'firebase', 'cosmosdb', 'hbase'.
        emit_cloudevents_columns (bool): Whether to include cloud events columns.
        schema_name (str): Schema name (optional).

    Raises:
        ValueError: If the SQL dialect is unsupported.
    """
    if db_dialect not in ["sqlserver", "postgres", "mysql", "mariadb", "sqlite", "oracle", "db2",
                          "sqlanywhere", "bigquery", "snowflake", "redshift", "cassandra", "mongodb",
                          "dynamodb", "elasticsearch", "couchdb", "neo4j", "firebase", "cosmosdb", "hbase"]:
        print(f"Unsupported SQL dialect: {db_dialect}")
        sys.exit(1)

    schema_file = avro_schema_path
    if not schema_file:
        print("Please specify the avro schema file")
        sys.exit(1)
    with open(schema_file, "r", encoding="utf-8") as f:
        schema_json = f.read()

    schema_list = schema = json.loads(schema_json)

    if isinstance(schema, list):
        tables_sql = []
        for schema in schema_list:
            if not isinstance(schema, dict) or "type" not in schema or schema["type"] != "record":
                continue
            tables_sql.extend(generate_sql(
                schema, db_dialect, emit_cloudevents_columns, schema_list, schema_name))
        with open(dbscript_file_path, "w", encoding="utf-8") as sql_file:
            sql_file.write("\n".join(tables_sql))
    else:
        if not isinstance(schema, dict) or "type" not in schema or schema["type"] != "record":
            raise ValueError("Invalid Avro record schema")
        tables_sql = generate_sql(
            schema, db_dialect, emit_cloudevents_columns, schema_list, schema_name)
        with open(dbscript_file_path, "w", encoding="utf-8") as sql_file:
            sql_file.write("\n".join(tables_sql))


def generate_sql(
        schema: Dict[str, JsonNode],
        sql_dialect: str,
        emit_cloudevents_columns: bool,
        schema_list: List[Dict[str, JsonNode]],
        schema_name: str = '') -> List[str]:
    """
    Generates SQL schema statements for the given Avro schema.

    Args:
        schema (dict): Avro schema.
        sql_dialect (str): SQL dialect.
        emit_cloudevents_columns (bool): Whether to include cloud events columns.
        schema_list (list): List of all schemas.
        schema_name (str): Schema name (optional).

    Returns:
        list: List of SQL statements.
    """
    if sql_dialect in ["sqlserver", "postgres", "mysql", "mariadb", "sqlite", "oracle", "db2", "sqlanywhere",
                       "bigquery", "snowflake", "redshift"]:
        return generate_relational_sql(schema, sql_dialect, emit_cloudevents_columns, schema_list, schema_name)
    elif sql_dialect == "cassandra":
        return generate_cassandra_schema(schema, emit_cloudevents_columns, schema_name)
    else:
        raise ValueError(f"Unsupported SQL dialect: {sql_dialect}")


def generate_relational_sql(
        schema: Dict[str, JsonNode],
        sql_dialect: str,
        emit_cloudevents_columns: bool,
        schema_list: List[Dict[str, JsonNode]],
        schema_name: str = '') -> List[str]:
    """
    Generates relational SQL schema statements for the given Avro schema.

    Args:
        schema (dict): Avro schema.
        sql_dialect (str): SQL dialect.
        emit_cloudevents_columns (bool): Whether to include cloud events columns.
        schema_list (list): List of all schemas.
        schema_name (str): Schema name (optional).

    Returns:
        list: List of SQL statements.
    """
    namespace = str(schema.get("namespace", "")).replace('.', '_')
    plain_table_name = altname(
        schema, 'sql') or f"{namespace}_{schema['name']}"
    table_name = escape_name(plain_table_name, sql_dialect)
    fields: List[Dict[str, JsonNode]] = cast(
        List[Dict[str, JsonNode]], schema["fields"])
    unique_record_keys: List[str] = cast(List[str], schema.get("unique", []))

    table_comments = generate_table_comments_json(schema)
    column_comments = generate_column_comments_json(fields, schema_list)

    sql = []
    sql.append(f"CREATE TABLE {table_name} (")
    for field in fields:
        column_name = escape_name(
            altname(field, 'sql') or field["name"], sql_dialect)
        column_type = avro_type_to_sql_type(field["type"], sql_dialect)
        column_definition = f"{column_name} {column_type}"
        if field.get("unique", False):
            column_definition += f" {unique_clause(sql_dialect)}"
        if sql_dialect == "mysql" and field["name"] in column_comments:
            cmt = column_comments[str(field['name'])].replace("'", "''")
            column_definition += f" COMMENT '{cmt}'"
        sql.append(f" {column_definition},")

    if emit_cloudevents_columns:
        sql.extend([
            f" {escape_name('___type', sql_dialect)} {avro_type_to_sql_type('string', sql_dialect)} NOT NULL,",
            f" {escape_name('___source', sql_dialect)} {avro_type_to_sql_type('string', sql_dialect)} NOT NULL,",
            f" {escape_name('___id', sql_dialect)} {avro_type_to_sql_type('string', sql_dialect)} NOT NULL,",
            f" {escape_name('___time', sql_dialect)} {avro_type_to_sql_type('timestamp', sql_dialect)} NULL,",
            f" {escape_name('___subject', sql_dialect)} {avro_type_to_sql_type('string', sql_dialect)} NULL,"
        ])

    if unique_record_keys:
        unique_column_altnames = []
        if sql_dialect in ["mysql", "mariadb"]:
            for field in fields:
                if field["name"] in unique_record_keys:
                    column_type = avro_type_to_sql_type(
                        field["type"], sql_dialect)
                    if column_type in ["BLOB", "TEXT"]:
                        unique_column_altnames.append(escape_name(
                            altname(field, 'sql') + "(20)", sql_dialect))
                    else:
                        unique_column_altnames.append(
                            escape_name(altname(field, 'sql'), sql_dialect))
        else:
            unique_column_altnames = [escape_name(altname(
                field, 'sql'), sql_dialect) for field in fields if field["name"] in unique_record_keys]

        sql.append(
            f" {primary_key_clause(sql_dialect)} ({', '.join(unique_column_altnames)})")
    else:
        sql[-1] = sql[-1][:-1]  # Remove the last comma
    sql.append(");")
    sql.append("")

    if sql_dialect != "mysql":
        sql.extend(generate_table_comment_sql(
            sql_dialect, table_comments, plain_table_name))
        sql.extend(generate_column_comment_sql(
            sql_dialect, column_comments, plain_table_name))

    return sql


def generate_table_comments_json(schema: Dict[str, JsonNode]) -> Dict[str, str]:
    """
    Generates table-level comments as JSON.

    Args:
        schema (dict): Avro schema.

    Returns:
        dict: Table-level comments as JSON.
    """
    comments = {}
    if "doc" in schema:
        comments["doc"] = str(schema["doc"])
    return comments


def generate_column_comments_json(fields: List[Dict[str, JsonNode]], schema_list: List[Dict[str, JsonNode]]) -> Dict[str, str]:
    """
    Generates column-level comments as JSON.

    Args:
        fields (list): List of fields.
        schema_list (list): List of all schemas.

    Returns:
        dict: Column-level comments as JSON.
    """
    comments = {}
    for field in fields:
        column_comment = {}
        if "doc" in field:
            column_comment["doc"] = field["doc"]
        column_type = field["type"]
        if isinstance(column_type, list):
            column_type = [x for x in column_type if x != "null"]
            if len(column_type) > 1:
                column_comment["schema"] = {
                    "type": "union", "types": column_type}
                column_type = "union"
            else:
                column_type = column_type[0]
        if isinstance(column_type, dict) and column_type["type"] in ["array", "map", "record", "enum", "fixed"]:
            column_comment["schema"] = column_type
        elif isinstance(schema_list, list):
            column_schema = next(
                (x for x in schema_list if x["name"] == column_type), None)
            if column_schema:
                column_comment["schema"] = column_schema
        comments[str(field["name"])] = json.dumps(column_comment)
    return comments


def generate_table_comment_sql(dialect: str, table_comments: Dict[str, str], table_name: str) -> List[str]:
    """
    Generates SQL statements for table-level comments.

    Args:
        dialect (str): SQL dialect.
        table_comments (dict): Table-level comments as JSON.
        table_name (str): Table name.

    Returns:
        list: SQL statements for table-level comments.
    """
    comments = []
    if "doc" in table_comments:
        doc_string = table_comments["doc"].replace("'", "''")
        if dialect == "sqlserver":
            comments.append(
                f"EXEC sp_addextendedproperty 'MS_Description', '{doc_string}', 'SCHEMA', 'dbo', 'TABLE', '{table_name}';")
        elif dialect in ["postgres", "oracle"]:
            comments.append(
                f"COMMENT ON TABLE {escape_name(table_name, dialect)} IS '{doc_string}';")
        elif dialect == "sqlite":
            comments.append(
                f"-- COMMENT ON TABLE {escape_name(table_name, dialect)} IS '{doc_string}';")
        # Add more dialect-specific clauses if needed
    return comments


def generate_column_comment_sql(dialect: str, column_comments: Dict[str, str], table_name: str) -> List[str]:
    """
    Generates SQL statements for column-level comments.

    Args:
        dialect (str): SQL dialect.
        column_comments (dict): Column-level comments as JSON.
        table_name (str): Table name.

    Returns:
        list: SQL statements for column-level comments.
    """
    comments = []
    for column_name, comment in column_comments.items():
        comment_data = json.loads(comment)
        doc = comment_data.get("doc", "")
        doc = doc.replace("'", "''")
        schema = comment_data.get("schema", "")
        if dialect == "sqlserver":
            if doc:
                comments.append(
                    f"EXEC sp_addextendedproperty 'MS_Description', '{doc}', 'SCHEMA', 'dbo', 'TABLE', '{table_name}', 'COLUMN', '{column_name}';")
            if schema:
                comments.append(
                    f"EXEC sp_addextendedproperty 'MS_Schema', '{json.dumps(schema)}', 'SCHEMA', 'dbo', 'TABLE', '{table_name}', 'COLUMN', '{column_name}';")
        else:
            comment = comment.replace("'", "''")
            comments.append(
                f"COMMENT ON COLUMN {escape_name(table_name, dialect)}.{escape_name(column_name, dialect)} IS '{comment}';")
    return comments


def escape_name(name, dialect):
    """
    Escapes a name (table or column) for the given SQL dialect.

    Args:
        name (str): The name to escape.
        dialect (str): The SQL dialect.

    Returns:
        str: The escaped name.
    """
    if dialect in ["sqlserver", "sqlanywhere"]:
        return f"[{name}]"
    elif dialect in ["postgres", "sqlite", "bigquery", "snowflake", "redshift"]:
        return f'"{name}"'
    elif dialect in ["mysql", "mariadb"]:
        return f"`{name}`"
    elif dialect in ["oracle", "db2"]:
        return f'"{name.upper()}"'
    elif dialect == "cassandra":
        return f'"{name}"'
    elif dialect in ["mongodb", "dynamodb", "elasticsearch", "couchdb", "neo4j", "firebase", "cosmosdb", "hbase"]:
        return name
    else:
        return name


def avro_type_to_sql_type(avro_type, dialect):
    """
    Maps an Avro type to a SQL type for the specified dialect.

    Args:
        avro_type (str or dict): The Avro type.
        dialect (str): The SQL dialect.

    Returns:
        str: The corresponding SQL type.
    """
    avro_to_sql_type_map = {
        "sqlserver": {
            "null": "NULL",
            "boolean": "BIT",
            "int": "INT",
            "long": "BIGINT",
            "float": "FLOAT",
            "double": "FLOAT",
            "bytes": "VARBINARY(MAX)",
            "string": "NVARCHAR(512)",
            "array": "NVARCHAR(MAX)",
            "map": "NVARCHAR(MAX)",
            "record": "NVARCHAR(MAX)",
            "union": "NVARCHAR(MAX)"
        },
        "postgres": {
            "null": "NULL",
            "boolean": "BOOLEAN",
            "int": "INTEGER",
            "long": "BIGINT",
            "float": "REAL",
            "double": "DOUBLE PRECISION",
            "bytes": "BYTEA",
            "string": "VARCHAR(512)",
            "array": "JSONB",
            "map": "JSONB",
            "record": "JSONB",
            "union": "JSONB"
        },
        "mysql": {
            "null": "NULL",
            "boolean": "BOOLEAN",
            "int": "INT",
            "long": "BIGINT",
            "float": "FLOAT",
            "double": "DOUBLE",
            "bytes": "BLOB",
            "string": "VARCHAR(512)",
            "array": "JSON",
            "map": "JSON",
            "record": "JSON",
            "union": "JSON"
        },
        "mariadb": {
            "null": "NULL",
            "boolean": "BOOLEAN",
            "int": "INT",
            "long": "BIGINT",
            "float": "FLOAT",
            "double": "DOUBLE",
            "bytes": "BLOB",
            "string": "VARCHAR(512)",
            "array": "JSON",
            "map": "JSON",
            "record": "JSON",
            "union": "JSON"
        },
        "sqlite": {
            "null": "NULL",
            "boolean": "BOOLEAN",
            "int": "INTEGER",
            "long": "INTEGER",
            "float": "REAL",
            "double": "REAL",
            "bytes": "BLOB",
            "string": "VARCHAR(512)",
            "array": "TEXT",
            "map": "TEXT",
            "record": "TEXT",
            "union": "TEXT"
        },
        "oracle": {
            "null": "NULL",
            "boolean": "NUMBER(1)",
            "int": "NUMBER(10)",
            "long": "NUMBER(19)",
            "float": "FLOAT(126)",
            "double": "FLOAT(126)",
            "bytes": "BLOB",
            "string": "VARCHAR(512)",
            "array": "CLOB",
            "map": "CLOB",
            "record": "CLOB",
            "union": "CLOB"
        },
        "db2": {
            "null": "NULL",
            "boolean": "BOOLEAN",
            "int": "INTEGER",
            "long": "BIGINT",
            "float": "REAL",
            "double": "DOUBLE",
            "bytes": "BLOB",
            "string": "VARCHAR(512)",
            "array": "CLOB",
            "map": "CLOB",
            "record": "CLOB",
            "union": "CLOB"
        },
        "sqlanywhere": {
            "null": "NULL",
            "boolean": "BIT",
            "int": "INTEGER",
            "long": "BIGINT",
            "float": "FLOAT",
            "double": "FLOAT",
            "bytes": "LONG BINARY",
            "string": "VARCHAR(512)",
            "array": "LONG VARCHAR",
            "map": "LONG VARCHAR",
            "record": "LONG VARCHAR",
            "union": "LONG VARCHAR"
        },
        "bigquery": {
            "null": "NULL",
            "boolean": "BOOL",
            "int": "INT64",
            "long": "INT64",
            "float": "FLOAT64",
            "double": "FLOAT64",
            "bytes": "BYTES",
            "string": "STRING",
            "array": "STRING",
            "map": "STRING",
            "record": "STRING",
            "union": "STRING"
        },
        "snowflake": {
            "null": "NULL",
            "boolean": "BOOLEAN",
            "int": "NUMBER",
            "long": "NUMBER",
            "float": "FLOAT",
            "double": "FLOAT",
            "bytes": "BINARY",
            "string": "STRING",
            "array": "VARIANT",
            "map": "VARIANT",
            "record": "VARIANT",
            "union": "VARIANT"
        },
        "redshift": {
            "null": "NULL",
            "boolean": "BOOLEAN",
            "int": "INTEGER",
            "long": "BIGINT",
            "float": "REAL",
            "double": "DOUBLE PRECISION",
            "bytes": "VARBYTE",
            "string": "VARCHAR(256)",
            "array": "VARCHAR(65535)",
            "map": "VARCHAR(65535)",
            "record": "VARCHAR(65535)",
            "union": "VARCHAR(65535)"
        }
    }

    if isinstance(avro_type, list):
        avro_type = [x for x in avro_type if x != "null"]
        if len(avro_type) > 1:
            return avro_to_sql_type_map[dialect]["union"]
        avro_type = avro_type[0]

    if isinstance(avro_type, dict):
        avro_type = avro_type.get("type", "string")

    return avro_to_sql_type_map[dialect].get(avro_type, avro_to_sql_type_map[dialect]["string"])


def unique_clause(dialect):
    """
    Returns the UNIQUE clause for the given SQL dialect.

    Args:
        dialect (str): The SQL dialect.

    Returns:
        str: The UNIQUE clause.
    """
    return "UNIQUE"


def primary_key_clause(dialect):
    """
    Returns the PRIMARY KEY clause for the given SQL dialect.

    Args:
        dialect (str): The SQL dialect.

    Returns:
        str: The PRIMARY KEY clause.
    """
    return "PRIMARY KEY"


def generate_cassandra_schema(schema: Dict[str, JsonNode], emit_cloudevents_columns: bool, schema_name: str) -> List[str]:
    """
    Generates Cassandra schema statements for the given Avro schema.

    Args:
        schema (dict): Avro schema.
        emit_cloudevents_columns (bool): Whether to include cloud events columns.
        schema_name (str): Schema name (optional).

    Returns:
        list: List of Cassandra schema statements.
    """
    namespace = cast(str, schema.get("namespace", "")).replace(".", "_")
    table_name = altname(schema, 'sql')
    table_name = escape_name(
        f"{namespace}_{table_name}" if namespace else table_name, "cassandra")
    table_name = f"{schema_name}.{table_name}" if schema_name else table_name
    # Cassandra table name length limit
    table_name = compact_table_name(table_name, 48)
    fields: List[Dict[str, JsonNode]] = cast(
        List[Dict[str, JsonNode]], schema["fields"])
    unique_record_keys: List[str] = cast(List[str], schema.get("unique", []))

    cql = []
    cql.append(f"CREATE TABLE {table_name} (")
    for field in fields:
        column_name = escape_name(
            altname(field, 'sql') or field["name"], 'cassandra')
        column_type = convert_avro_type_to_cassandra_type(field["type"])
        cql.append(f" {column_name} {column_type},")
    if emit_cloudevents_columns:
        cql.extend([
            f" {escape_name('cloudevents_type', 'cassandra')} text,",
            f" {escape_name('cloudevents_source', 'cassandra')} text,",
            f" {escape_name('cloudevents_id', 'cassandra')} text,",
            f" {escape_name('cloudevents_time', 'cassandra')} timestamp,",
            f" {escape_name('cloudevents_subject', 'cassandra')} text,"
        ])
    if unique_record_keys:
        unique_columns = [escape_name(field_name, "cassandra")
                          for field_name in unique_record_keys]
        unique_column_altnames = [altname(
            field, 'sql') for field in fields if field["name"] in unique_record_keys]
        cql.append(f" PRIMARY KEY ({', '.join(unique_columns)})")
    elif emit_cloudevents_columns:
        cql.append(
            f" PRIMARY KEY ({escape_name('cloudevents_id', 'cassandra')})")
    else:
        all_columns = [escape_name(
            altname(field, 'sql') or field, 'cassandra') for field in fields]
        cql.append(f" PRIMARY KEY ({', '.join(all_columns)})")
    cql.append(");")
    return cql


def convert_avro_type_to_cassandra_type(avro_type):
    """
    Converts an Avro type to Cassandra type.

    Args:
        avro_type (str or dict): The Avro type.

    Returns:
        str: The corresponding Cassandra type.
    """
    avro_to_cassandra_type_map = {
        "null": "NULL",
        "boolean": "boolean",
        "int": "int",
        "long": "bigint",
        "float": "float",
        "double": "double",
        "bytes": "blob",
        "string": "text",
        "array": "text",
        "map": "text",
        "record": "text",
        "union": "text"
    }

    if isinstance(avro_type, list):
        avro_type = [x for x in avro_type if x != "null"]
        if len(avro_type) > 1:
            return avro_to_cassandra_type_map["union"]
        avro_type = avro_type[0]

    if isinstance(avro_type, dict):
        avro_type = avro_type.get("type", "string")

    return avro_to_cassandra_type_map.get(avro_type, "text")


def compact_table_name(table_name, max_length):
    """
    Compacts the table name to fit the specified maximum length.

    Args:
        table_name (str): Table name.
        max_length (int): Maximum length of the table name.

    Returns:
        str: Compacted table name.
    """
    if len(table_name) > max_length:
        # Drop vowels, one by one, seek from end
        vowels = "aeiou"
        while len(table_name) > max_length:
            count = 0
            for i in range(len(table_name) - 1, -1, -1):
                if table_name[i].lower() in vowels:
                    count += 1
                    table_name = table_name[:i] + table_name[i + 1:]
                    break
            if count == 0:
                break
    return table_name[:max_length]


def convert_avro_to_nosql(avro_schema_path, nosql_file_path, nosql_dialect, emit_cloudevents_columns=False):
    """
    Converts an Avro schema to NoSQL schema for the specified NoSQL dialect.

    Args:
        avro_schema_path (str): Path to the Avro schema file.
        nosql_file_path (str): Path to the output NoSQL schema file.
        nosql_dialect (str): NoSQL dialect (e.g., 'mongodb', 'dynamodb', 'cassandra').
        emit_cloudevents_columns (bool): Whether to include cloud events columns.

    Raises:
        ValueError: If the NoSQL dialect is unsupported.
    """
    schema_file = avro_schema_path
    if not schema_file:
        print("Please specify the avro schema file")
        sys.exit(1)
    with open(schema_file, "r", encoding="utf-8") as f:
        schema_json = f.read()

    schema_list = schema = json.loads(schema_json)
    dirname = nosql_file_path
    if not os.path.exists(dirname):
        os.makedirs(dirname, exist_ok=True)

    if isinstance(schema, list):
        for schema in schema_list:
            if not isinstance(schema, dict) or "type" not in schema or schema["type"] != "record":
                continue
            model = generate_nosql(schema, nosql_dialect,
                                   emit_cloudevents_columns, schema_list)
            file_name = os.path.join(
                nosql_file_path, get_file_name(schema, get_nosql_file_extension(nosql_dialect)))
            with open(file_name, "w", encoding="utf-8") as nosql_file:
                if isinstance(model, list):
                    nosql_file.write("\n".join(model))
                else:
                    nosql_file.write(model)
    else:
        if not isinstance(schema, dict) or "type" not in schema or schema["type"] != "record":
            raise ValueError("Invalid Avro record schema")
        model = generate_nosql(schema, nosql_dialect,
                               emit_cloudevents_columns, schema_list)
        file_name = os.path.join(
            nosql_file_path, get_file_name(schema_list, get_nosql_file_extension(nosql_dialect)))
        with open(file_name, "w", encoding="utf-8") as nosql_file:
            nosql_file.write(model)

def get_nosql_file_extension(nosql_dialect):
    """
    Returns the file extension for the given NoSQL dialect.

    Args:
        nosql_dialect (str): NoSQL dialect.

    Returns:
        str: File extension.
    """
    if nosql_dialect == "neo4j":
        return "cypher"
    else:
        return "json"


def generate_nosql(schema, nosql_dialect, emit_cloudevents_columns, schema_list):
    """
    Generates NoSQL schema statements for the given Avro schema.

    Args:
        schema (dict): Avro schema.
        nosql_dialect (str): NoSQL dialect.
        emit_cloudevents_columns (bool): Whether to include cloud events columns.
        schema_list (list): List of all schemas.

    Returns:
        list: List of NoSQL schema statements.
    """
    if nosql_dialect == "mongodb":
        return generate_mongodb_schema(schema, emit_cloudevents_columns)
    elif nosql_dialect == "dynamodb":
        return generate_dynamodb_schema(schema, emit_cloudevents_columns)
    elif nosql_dialect == "elasticsearch":
        return generate_elasticsearch_schema(schema, emit_cloudevents_columns)
    elif nosql_dialect == "couchdb":
        return generate_couchdb_schema(schema, emit_cloudevents_columns)
    elif nosql_dialect == "neo4j":
        return generate_neo4j_schema(schema, emit_cloudevents_columns)
    elif nosql_dialect == "firebase":
        return generate_firebase_schema(schema, emit_cloudevents_columns)
    elif nosql_dialect == "cosmosdb":
        return generate_cosmosdb_schema(schema, emit_cloudevents_columns)
    elif nosql_dialect == "hbase":
        return generate_hbase_schema(schema, emit_cloudevents_columns)
    else:
        raise ValueError(f"Unsupported NoSQL dialect: {nosql_dialect}")


def generate_mongodb_schema(schema, emit_cloudevents_columns):
    """
    Generates MongoDB schema statements for the given Avro schema.

    Args:
        schema (dict): Avro schema.
        emit_cloudevents_columns (bool): Whether to include cloud events columns.

    Returns:
        list: List of MongoDB schema statements.
    """
    namespace = schema.get("namespace", "")
    collection_name = altname(schema, 'sql') or f"{namespace}_{schema['name']}"
    collection_name = namespace + "." + \
        collection_name if namespace else collection_name
    collection_name = '.'.join(camel(t) for t in collection_name.split('.'))
    fields = schema["fields"]
    unique_record_keys = schema.get("unique", [])

    mongodb_schema = {
        "$jsonSchema": {
            "bsonType": "object",
            "required": [],
            "properties": {}
        }
    }

    for field in fields:
        column_name = altname(field, 'sql') or field["name"]
        column_type = convert_avro_type_to_mongodb_type(field["type"])
        mongodb_schema["$jsonSchema"]["properties"][column_name] = column_type
        if "null" not in field["type"]:
            mongodb_schema["$jsonSchema"]["required"].append(column_name)
        if field.get("unique", False):
            mongodb_schema["$jsonSchema"]["properties"][column_name]["unique"] = True
        if field.get("doc", ""):
            mongodb_schema["$jsonSchema"]["properties"][column_name]["description"] = field["doc"]

    if emit_cloudevents_columns:
        mongodb_schema["$jsonSchema"]["properties"].update({
            "___type": {"bsonType": "string"},
            "___source": {"bsonType": "string"},
            "___id": {"bsonType": "string"},
            "___time": {"bsonType": "date"},
            "___subject": {"bsonType": "string"}
        })
        mongodb_schema["$jsonSchema"]["required"].extend(
            ["___type", "___source", "___id"])

    return json.dumps({collection_name: mongodb_schema}, indent=4)


def convert_avro_type_to_mongodb_type(avro_type):
    """
    Converts an Avro type to MongoDB type.

    Args:
        avro_type (str or dict): The Avro type.

    Returns:
        dict: The corresponding MongoDB type.
    """
    avro_to_mongodb_type_map = {
        "null": {"bsonType": "null"},
        "boolean": {"bsonType": "bool"},
        "int": {"bsonType": "int"},
        "long": {"bsonType": "long"},
        "float": {"bsonType": "double"},
        "double": {"bsonType": "double"},
        "bytes": {"bsonType": "binData"},
        "string": {"bsonType": "string"},
        "array": {"bsonType": "array"},
        "map": {"bsonType": "object"},
        "record": {"bsonType": "object"},
        "union": {"bsonType": "object"}
    }

    if isinstance(avro_type, list):
        avro_type = [x for x in avro_type if x != "null"]
        if len(avro_type) > 1:
            return avro_to_mongodb_type_map["union"]
        avro_type = avro_type[0]

    if isinstance(avro_type, dict):
        avro_type = avro_type.get("type", "string")

    return avro_to_mongodb_type_map.get(avro_type, {"bsonType": "string"})


def generate_dynamodb_schema(schema, emit_cloudevents_columns):
    """
    Generates DynamoDB schema statements for the given Avro schema.

    Args:
        schema (dict): Avro schema.
        emit_cloudevents_columns (bool): Whether to include cloud events columns.

    Returns:
        list: List of DynamoDB schema statements.
    """
    namespace = schema.get("namespace", "").replace('.', '_')
    table_name = altname(schema, 'sql') or f"{namespace}_{schema['name']}"
    fields = schema["fields"]
    unique_record_keys = schema.get("unique", [])

    dynamodb_schema = {
        "TableName": table_name,
        "KeySchema": [],
        "AttributeDefinitions": [],
        "ProvisionedThroughput": {
            "ReadCapacityUnits": 5,
            "WriteCapacityUnits": 5
        }
    }

    for field in fields:
        column_name = altname(field, 'sql') or field["name"]
        column_type = convert_avro_type_to_dynamodb_type(field["type"])
        dynamodb_schema["AttributeDefinitions"].append(
            {"AttributeName": column_name, "AttributeType": column_type})
        if not dynamodb_schema["KeySchema"]:
            dynamodb_schema["KeySchema"].append(
                {"AttributeName": column_name, "KeyType": "HASH"})
        if field.get("unique", False):
            dynamodb_schema["AttributeDefinitions"].append(
                {"AttributeName": column_name, "AttributeType": column_type})
            dynamodb_schema["KeySchema"].append(
                {"AttributeName": column_name, "KeyType": "RANGE"})

    if emit_cloudevents_columns:
        dynamodb_schema["AttributeDefinitions"].extend([
            {"AttributeName": "___type", "AttributeType": "S"},
            {"AttributeName": "___source", "AttributeType": "S"},
            {"AttributeName": "___id", "AttributeType": "S"},
            {"AttributeName": "___time", "AttributeType": "S"},
            {"AttributeName": "___subject", "AttributeType": "S"}
        ])
        dynamodb_schema["KeySchema"].append(
            {"AttributeName": "___id", "KeyType": "HASH"})

    return json.dumps(dynamodb_schema, indent=4)


def convert_avro_type_to_dynamodb_type(avro_type):
    """
    Converts an Avro type to DynamoDB type.

    Args:
        avro_type (str or dict): The Avro type.

    Returns:
        str: The corresponding DynamoDB type.
    """
    avro_to_dynamodb_type_map = {
        "null": "NULL",
        "boolean": "BOOL",
        "int": "N",
        "long": "N",
        "float": "N",
        "double": "N",
        "bytes": "B",
        "string": "S",
        "array": "S",
        "map": "S",
        "record": "S",
        "union": "S"
    }

    if isinstance(avro_type, list):
        avro_type = [x for x in avro_type if x != "null"]
        if len(avro_type) > 1:
            return avro_to_dynamodb_type_map["union"]
        avro_type = avro_type[0]

    if isinstance(avro_type, dict):
        avro_type = avro_type.get("type", "string")

    return avro_to_dynamodb_type_map.get(avro_type, "S")


def generate_elasticsearch_schema(schema, emit_cloudevents_columns):
    """
    Generates Elasticsearch schema statements for the given Avro schema.

    Args:
        schema (dict): Avro schema.
        emit_cloudevents_columns (bool): Whether to include cloud events columns.

    Returns:
        list: List of Elasticsearch schema statements.
    """
    namespace = schema.get("namespace", "").replace('.', '_')
    index_name = altname(schema, 'sql') or f"{namespace}_{schema['name']}"
    fields = schema["fields"]

    es_mapping = {
        "mappings": {
            "properties": {}
        }
    }

    for field in fields:
        column_name = altname(field, 'sql') or field["name"]
        column_type = convert_avro_type_to_elasticsearch_type(field["type"])
        es_mapping["mappings"]["properties"][column_name] = column_type

    if emit_cloudevents_columns:
        es_mapping["mappings"]["properties"].update({
            "___type": {"type": "keyword"},
            "___source": {"type": "keyword"},
            "___id": {"type": "keyword"},
            "___time": {"type": "date"},
            "___subject": {"type": "keyword"}
        })

    return json.dumps({index_name: es_mapping}, indent=4)


def convert_avro_type_to_elasticsearch_type(avro_type):
    """
    Converts an Avro type to Elasticsearch type.

    Args:
        avro_type (str or dict): The Avro type.

    Returns:
        dict: The corresponding Elasticsearch type.
    """
    avro_to_elasticsearch_type_map = {
        "null": {"type": "null"},
        "boolean": {"type": "boolean"},
        "int": {"type": "integer"},
        "long": {"type": "long"},
        "float": {"type": "float"},
        "double": {"type": "double"},
        "bytes": {"type": "binary"},
        "string": {"type": "text"},
        "array": {"type": "text"},
        "map": {"type": "text"},
        "record": {"type": "text"},
        "union": {"type": "text"}
    }

    if isinstance(avro_type, list):
        avro_type = [x for x in avro_type if x != "null"]
        if len(avro_type) > 1:
            return avro_to_elasticsearch_type_map["union"]
        avro_type = avro_type[0]

    if isinstance(avro_type, dict):
        avro_type = avro_type.get("type", "string")

    return avro_to_elasticsearch_type_map.get(avro_type, {"type": "text"})


def generate_couchdb_schema(schema, emit_cloudevents_columns):
    """
    Generates CouchDB schema statements for the given Avro schema.

    Args:
        schema (dict): Avro schema.
        emit_cloudevents_columns (bool): Whether to include cloud events columns.

    Returns:
        list: List of CouchDB schema statements.
    """
    namespace = schema.get("namespace", "").replace('.', '_')
    db_name = altname(schema, 'sql') or f"{namespace}_{schema['name']}"
    fields = schema["fields"]

    couchdb_schema = {
        "type": "object",
        "properties": {}
    }

    for field in fields:
        column_name = altname(field, 'sql') or field["name"]
        column_type = convert_avro_type_to_couchdb_type(field["type"])
        couchdb_schema["properties"][column_name] = column_type

    if emit_cloudevents_columns:
        couchdb_schema["properties"].update({
            "___type": {"type": "string"},
            "___source": {"type": "string"},
            "___id": {"type": "string"},
            "___time": {"type": "string"},
            "___subject": {"type": "string"}
        })

    return json.dumps({db_name: couchdb_schema}, indent=4)


def convert_avro_type_to_couchdb_type(avro_type):
    """
    Converts an Avro type to CouchDB type.

    Args:
        avro_type (str or dict): The Avro type.

    Returns:
        dict: The corresponding CouchDB type.
    """
    avro_to_couchdb_type_map = {
        "null": {"type": "null"},
        "boolean": {"type": "boolean"},
        "int": {"type": "integer"},
        "long": {"type": "integer"},
        "float": {"type": "number"},
        "double": {"type": "number"},
        "bytes": {"type": "string"},
        "string": {"type": "string"},
        "array": {"type": "string"},
        "map": {"type": "string"},
        "record": {"type": "string"},
        "union": {"type": "string"}
    }

    if isinstance(avro_type, list):
        avro_type = [x for x in avro_type if x != "null"]
        if len(avro_type) > 1:
            return avro_to_couchdb_type_map["union"]
        avro_type = avro_type[0]

    if isinstance(avro_type, dict):
        avro_type = avro_type.get("type", "string")

    return avro_to_couchdb_type_map.get(avro_type, {"type": "string"})


def generate_neo4j_schema(schema, emit_cloudevents_columns):
    """
    Generates Neo4j schema statements for the given Avro schema.

    Args:
        schema (dict): Avro schema.
        emit_cloudevents_columns (bool): Whether to include cloud events columns.

    Returns:
        list: List of Neo4j schema statements.
    """
    namespace = schema.get("namespace", "").replace('.', '_')
    label_name = altname(schema, 'sql') or f"{namespace}_{schema['name']}"
    fields = schema["fields"]

    cypher = []
    cypher.append(f"CREATE (:{label_name} {{")
    for field in fields:
        column_name = altname(field, 'sql') or field["name"]
        column_type = convert_avro_type_to_neo4j_type(field["type"])
        cypher.append(f" {column_name}: {column_type},")
    if emit_cloudevents_columns:
        cypher.extend([
            f" {escape_name('___type', 'neo4j')}: 'string',",
            f" {escape_name('___source', 'neo4j')}: 'string',",
            f" {escape_name('___id', 'neo4j')}: 'string',",
            f" {escape_name('___time', 'neo4j')}: 'datetime',",
            f" {escape_name('___subject', 'neo4j')}: 'string'"
        ])
    cypher[-1] = cypher[-1][:-1]  # Remove the last comma
    cypher.append("});")
    return cypher


def convert_avro_type_to_neo4j_type(avro_type):
    """
    Converts an Avro type to Neo4j type.

    Args:
        avro_type (str or dict): The Avro type.

    Returns:
        str: The corresponding Neo4j type.
    """
    avro_to_neo4j_type_map = {
        "null": "NULL",
        "boolean": "boolean",
        "int": "integer",
        "long": "long",
        "float": "float",
        "double": "float",
        "bytes": "string",
        "string": "string",
        "array": "string",
        "map": "string",
        "record": "string",
        "union": "string"
    }

    if isinstance(avro_type, list):
        avro_type = [x for x in avro_type if x != "null"]
        if len(avro_type) > 1:
            return avro_to_neo4j_type_map["union"]
        avro_type = avro_type[0]

    if isinstance(avro_type, dict):
        avro_type = avro_type.get("type", "string")

    return avro_to_neo4j_type_map.get(avro_type, "string")


def generate_firebase_schema(schema, emit_cloudevents_columns):
    """
    Generates Firebase schema statements for the given Avro schema.

    Args:
        schema (dict): Avro schema.
        emit_cloudevents_columns (bool): Whether to include cloud events columns.

    Returns:
        list: List of Firebase schema statements.
    """
    namespace = schema.get("namespace", "").replace('.', '_')
    collection_name = altname(schema, 'sql') or f"{namespace}_{schema['name']}"
    fields = schema["fields"]

    firebase_schema = {
        "fields": {}
    }

    for field in fields:
        column_name = altname(field, 'sql') or field["name"]
        column_type = convert_avro_type_to_firebase_type(field["type"])
        firebase_schema["fields"][column_name] = column_type

    if emit_cloudevents_columns:
        firebase_schema["fields"].update({
            "___type": {"type": "string"},
            "___source": {"type": "string"},
            "___id": {"type": "string"},
            "___time": {"type": "timestamp"},
            "___subject": {"type": "string"}
        })

    return json.dumps({collection_name: firebase_schema}, indent=4)


def convert_avro_type_to_firebase_type(avro_type):
    """
    Converts an Avro type to Firebase type.

    Args:
        avro_type (str or dict): The Avro type.

    Returns:
        dict: The corresponding Firebase type.
    """
    avro_to_firebase_type_map = {
        "null": {"type": "null"},
        "boolean": {"type": "boolean"},
        "int": {"type": "integer"},
        "long": {"type": "integer"},
        "float": {"type": "number"},
        "double": {"type": "number"},
        "bytes": {"type": "string"},
        "string": {"type": "string"},
        "array": {"type": "string"},
        "map": {"type": "string"},
        "record": {"type": "string"},
        "union": {"type": "string"}
    }

    if isinstance(avro_type, list):
        avro_type = [x for x in avro_type if x != "null"]
        if len(avro_type) > 1:
            return avro_to_firebase_type_map["union"]
        avro_type = avro_type[0]

    if isinstance(avro_type, dict):
        avro_type = avro_type.get("type", "string")

    return avro_to_firebase_type_map.get(avro_type, {"type": "string"})


def generate_cosmosdb_schema(schema, emit_cloudevents_columns):
    """
    Generates CosmosDB schema statements for the given Avro schema.

    Args:
        schema (dict): Avro schema.
        emit_cloudevents_columns (bool): Whether to include cloud events columns.

    Returns:
        list: List of CosmosDB schema statements.
    """
    namespace = schema.get("namespace", "").replace('.', '_')
    collection_name = altname(schema, 'sql') or f"{namespace}_{schema['name']}"
    fields = schema["fields"]

    cosmosdb_schema = {
        "id": collection_name,
        "partitionKey": {
            "paths": [],
            "kind": "Hash"
        },
        "uniqueKeyPolicy": {
            "uniqueKeys": []
        },
        "fields": {}
    }

    for field in fields:
        column_name = altname(field, 'sql') or field["name"]
        column_type = convert_avro_type_to_cosmosdb_type(field["type"])
        cosmosdb_schema["fields"][column_name] = column_type
        cosmosdb_schema["partitionKey"]["paths"].append(f"/{column_name}")

    if emit_cloudevents_columns:
        cosmosdb_schema["fields"].update({
            "___type": {"type": "string"},
            "___source": {"type": "string"},
            "___id": {"type": "string"},
            "___time": {"type": "string"},
            "___subject": {"type": "string"}
        })
        cosmosdb_schema["partitionKey"]["paths"].append("/___id")

    return json.dumps(cosmosdb_schema, indent=4)


def convert_avro_type_to_cosmosdb_type(avro_type):
    """
    Converts an Avro type to CosmosDB type.

    Args:
        avro_type (str or dict): The Avro type.

    Returns:
        dict: The corresponding CosmosDB type.
    """
    avro_to_cosmosdb_type_map = {
        "null": {"type": "null"},
        "boolean": {"type": "boolean"},
        "int": {"type": "number"},
        "long": {"type": "number"},
        "float": {"type": "number"},
        "double": {"type": "number"},
        "bytes": {"type": "string"},
        "string": {"type": "string"},
        "array": {"type": "string"},
        "map": {"type": "string"},
        "record": {"type": "string"},
        "union": {"type": "string"}
    }

    if isinstance(avro_type, list):
        avro_type = [x for x in avro_type if x != "null"]
        if len(avro_type) > 1:
            return avro_to_cosmosdb_type_map["union"]
        avro_type = avro_type[0]

    if isinstance(avro_type, dict):
        avro_type = avro_type.get("type", "string")

    return avro_to_cosmosdb_type_map.get(avro_type, {"type": "string"})


def generate_hbase_schema(schema, emit_cloudevents_columns):
    """
    Generates HBase schema statements for the given Avro schema.

    Args:
        schema (dict): Avro schema.
        emit_cloudevents_columns (bool): Whether to include cloud events columns.

    Returns:
        list: List of HBase schema statements.
    """
    namespace = schema.get("namespace", "").replace('.', '_')
    table_name = altname(schema, 'sql') or f"{namespace}_{schema['name']}"
    fields = schema["fields"]

    hbase_schema = {
        "table": table_name,
        "column_families": []
    }

    for field in fields:
        column_name = altname(field, 'sql') or field["name"]
        column_family = convert_avro_type_to_hbase_column_family(field["type"])
        hbase_schema["column_families"].append(
            {"name": column_name, "column_family": column_family})

    if emit_cloudevents_columns:
        hbase_schema["column_families"].extend([
            {"name": "___type", "column_family": "string"},
            {"name": "___source", "column_family": "string"},
            {"name": "___id", "column_family": "string"},
            {"name": "___time", "column_family": "string"},
            {"name": "___subject", "column_family": "string"}
        ])

    return json.dumps(hbase_schema, indent=4)


def convert_avro_type_to_hbase_column_family(avro_type):
    """
    Converts an Avro type to HBase column family.

    Args:
        avro_type (str or dict): The Avro type.

    Returns:
        str: The corresponding HBase column family.
    """
    avro_to_hbase_column_family_map = {
        "null": "string",
        "boolean": "boolean",
        "int": "integer",
        "long": "integer",
        "float": "number",
        "double": "number",
        "bytes": "string",
        "string": "string",
        "array": "string",
        "map": "string",
        "record": "string",
        "union": "string"
    }

    if isinstance(avro_type, list):
        avro_type = [x for x in avro_type if x != "null"]
        if len(avro_type) > 1:
            return avro_to_hbase_column_family_map["union"]
        avro_type = avro_type[0]

    if isinstance(avro_type, dict):
        avro_type = avro_type.get("type", "string")

    return avro_to_hbase_column_family_map.get(avro_type, "string")


def get_file_name(avro_schema: dict, extension: str) -> str:
    """
    Generates a file name based on the Avro schema.

    Args:
        avro_schema (dict): Avro schema.
        extension (str): File extension.

    Returns:
        str: Generated file name.
    """
    namespace = avro_schema.get("namespace", "").replace('.', '_')
    name = avro_schema.get("name", "")
    return (namespace + '.' + name + '.' + extension) if namespace else (name + '.' + extension)
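For orientation, a minimal usage sketch of the module's two entry points as defined above; the schema path order.avsc and the output locations are illustrative assumptions, not files shipped with the package.

    from avrotize.avrotodb import convert_avro_to_sql, convert_avro_to_nosql

    # Relational path: write a PostgreSQL DDL script for the record schema in
    # order.avsc, adding the CloudEvents tracking columns (___type, ___source,
    # ___id, ___time, ___subject). "order.avsc" and "order.sql" are assumed paths.
    convert_avro_to_sql("order.avsc", "order.sql", "postgres",
                        emit_cloudevents_columns=True)

    # NoSQL path: the second argument is treated as an output directory; one
    # schema document per record schema is written, here a MongoDB $jsonSchema
    # validator named after the record's namespace and name.
    convert_avro_to_nosql("order.avsc", "nosql_out", "mongodb")

Note that convert_avro_to_sql validates the dialect itself and exits the process via sys.exit(1) on an unsupported value, while generate_sql and generate_nosql raise ValueError instead.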