structurize-3.0.1-py3-none-any.whl → structurize-3.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- avrotize/_version.py +3 -3
- avrotize/avrotize.py +4 -0
- avrotize/avrotoiceberg.py +111 -13
- avrotize/avrotots.py +62 -7
- avrotize/avrovalidator.py +518 -0
- avrotize/commands.json +485 -2
- avrotize/dependencies/typescript/node22/package.json +1 -1
- avrotize/jsontoschema.py +151 -0
- avrotize/schema_inference.py +825 -0
- avrotize/sqltoavro.py +1159 -0
- avrotize/structuretodb.py +1 -1
- avrotize/structuretoiceberg.py +113 -13
- avrotize/validate.py +242 -0
- avrotize/xmltoschema.py +122 -0
- {structurize-3.0.1.dist-info → structurize-3.1.0.dist-info}/METADATA +1 -1
- {structurize-3.0.1.dist-info → structurize-3.1.0.dist-info}/RECORD +20 -14
- {structurize-3.0.1.dist-info → structurize-3.1.0.dist-info}/WHEEL +1 -1
- {structurize-3.0.1.dist-info → structurize-3.1.0.dist-info}/entry_points.txt +0 -0
- {structurize-3.0.1.dist-info → structurize-3.1.0.dist-info}/licenses/LICENSE +0 -0
- {structurize-3.0.1.dist-info → structurize-3.1.0.dist-info}/top_level.txt +0 -0
avrotize/sqltoavro.py
ADDED
@@ -0,0 +1,1159 @@
```python
"""Converts SQL database schemas to Avro schema format."""

import copy
import json
import os
import re
from typing import Any, Dict, List, cast
from urllib.parse import urlparse, parse_qs

from avrotize.common import avro_name
from avrotize.constants import AVRO_VERSION
from avrotize.schema_inference import AvroSchemaInferrer

JsonNode = Dict[str, 'JsonNode'] | List['JsonNode'] | str | bool | int | float | None


class SqlToAvro:
    """Converts SQL database schemas to Avro schema format."""

    def __init__(
        self,
        connection_string: str,
        database: str | None,
        table_name: str | None,
        avro_namespace: str,
        avro_schema_path: str,
        dialect: str,
        emit_cloudevents: bool,
        emit_cloudevents_xregistry: bool,
        sample_size: int = 100,
        infer_json_schema: bool = True,
        infer_xml_schema: bool = True,
        username: str | None = None,
        password: str | None = None
    ):
        """Initializes the SqlToAvro class with database connection parameters.

        Args:
            connection_string: Database connection string (e.g., postgresql://user:pass@host:port/dbname)
            database: Database name (overrides connection string if provided)
            table_name: Specific table to convert (None for all tables)
            avro_namespace: Namespace for generated Avro schemas
            avro_schema_path: Output path for the Avro schema file
            dialect: SQL dialect (postgres, mysql, sqlserver, oracle, sqlite)
            emit_cloudevents: Whether to emit CloudEvents declarations
            emit_cloudevents_xregistry: Whether to emit xRegistry manifest format
            sample_size: Number of rows to sample for JSON/XML inference
            infer_json_schema: Whether to infer schema for JSON columns
            infer_xml_schema: Whether to infer schema for XML columns
            username: Database username (overrides connection string if provided)
            password: Database password (overrides connection string if provided)
        """
        self.connection_string = connection_string
        self.dialect = dialect.lower()
        self.single_table_name = table_name
        self.avro_namespace = avro_namespace
        self.avro_schema_path = avro_schema_path
        self.emit_xregistry = emit_cloudevents_xregistry
        self.emit_cloudevents = emit_cloudevents or emit_cloudevents_xregistry
        self.sample_size = sample_size
        self.infer_json_schema = infer_json_schema
        self.infer_xml_schema = infer_xml_schema
        self.generated_types: List[str] = []

        # Schema inferrer for JSON/XML columns (use 'sql' altnames for SQL source)
        self._inferrer = AvroSchemaInferrer(namespace=avro_namespace, altnames_key='sql')

        if self.emit_xregistry and not self.avro_namespace:
            raise ValueError(
                "The avro_namespace must be specified when emit_cloudevents_xregistry is True")

        # Store credentials for connection
        self.username = username
        self.password = password

        # Parse connection string and establish connection
        self.connection = self._connect(connection_string, database)
        self.database = database or self._extract_database_from_connection_string(connection_string)

    def _extract_database_from_connection_string(self, connection_string: str) -> str:
        """Extracts database name from connection string."""
        parsed = urlparse(connection_string)
        if parsed.path:
            return parsed.path.lstrip('/')
        return ''

    def _connect(self, connection_string: str, database: str | None):
        """Establishes database connection based on dialect.

        Connection strings can include SSL/TLS and authentication options:

        PostgreSQL:
        - Standard: postgresql://user:pass@host:port/dbname
        - SSL: postgresql://user:pass@host:port/dbname?sslmode=require
        - SSL modes: disable, allow, prefer, require, verify-ca, verify-full

        MySQL:
        - Standard: mysql://user:pass@host:port/dbname
        - SSL: mysql://user:pass@host:port/dbname?ssl=true
        - SSL with cert: mysql://...?ssl_ca=/path/to/ca.pem

        SQL Server:
        - Standard: mssql://user:pass@host:port/dbname
        - Windows Auth: mssql://@host:port/dbname (no user/pass = integrated)
        - Encrypt: mssql://...?encrypt=true
        - Trust cert: mssql://...?trustServerCertificate=true

        Oracle:
        - Standard: oracle://user:pass@host:port/service_name
        - Wallet: Uses TNS names or wallet configuration
        """
        parsed = urlparse(connection_string)
        query_params = dict(parse_qs(parsed.query)) if parsed.query else {}
        # Flatten single-value lists
        query_params = {k: v[0] if len(v) == 1 else v for k, v in query_params.items()}

        if self.dialect == 'postgres':
            try:
                import psycopg2
            except ImportError:
                raise ImportError(
                    "psycopg2 is required for PostgreSQL support. "
                    "Install with: pip install psycopg2-binary"
                )
            # If separate credentials provided, use them instead of URL credentials
            if self.username is not None:
                connect_kwargs = {
                    'host': parsed.hostname or 'localhost',
                    'port': parsed.port or 5432,
                    'user': self.username,
                    'password': self.password or '',
                    'database': database or parsed.path.lstrip('/')
                }
                # Handle sslmode from query params
                if 'sslmode' in query_params:
                    connect_kwargs['sslmode'] = query_params['sslmode']
                return psycopg2.connect(**connect_kwargs)
            # psycopg2 handles the full connection string including sslmode
            return psycopg2.connect(connection_string)
        elif self.dialect == 'mysql':
            try:
                import pymysql
            except ImportError:
                raise ImportError(
                    "pymysql is required for MySQL support. "
                    "Install with: pip install pymysql"
                )

            connect_kwargs = {
                'host': parsed.hostname or 'localhost',
                'port': parsed.port or 3306,
                'user': self.username if self.username is not None else parsed.username,
                'password': self.password if self.username is not None else parsed.password,
                'database': database or parsed.path.lstrip('/')
            }

            # SSL/TLS configuration
            ssl_config = {}
            if query_params.get('ssl') in ('true', 'True', '1', True):
                ssl_config['ssl'] = True
            if 'ssl_ca' in query_params:
                ssl_config['ssl'] = {'ca': query_params['ssl_ca']}
            if 'ssl_cert' in query_params:
                ssl_config.setdefault('ssl', {})
                if isinstance(ssl_config['ssl'], dict):
                    ssl_config['ssl']['cert'] = query_params['ssl_cert']
            if 'ssl_key' in query_params:
                ssl_config.setdefault('ssl', {})
                if isinstance(ssl_config['ssl'], dict):
                    ssl_config['ssl']['key'] = query_params['ssl_key']

            if ssl_config:
                connect_kwargs.update(ssl_config)

            return pymysql.connect(**connect_kwargs)
        elif self.dialect == 'sqlserver':
            try:
                import pymssql
                use_pymssql = True
            except ImportError:
                use_pymssql = False
                try:
                    import pyodbc
                except ImportError:
                    raise ImportError(
                        "pymssql or pyodbc is required for SQL Server support. "
                        "Install with: pip install pymssql or pip install pyodbc"
                    )

            if not use_pymssql:
                # pyodbc - pass connection string directly (supports all ODBC options)
                return pyodbc.connect(connection_string)

            # pymssql - parse and build connection
            connect_kwargs = {
                'server': parsed.hostname or 'localhost',
                'port': str(parsed.port or 1433),
                'database': database or parsed.path.lstrip('/')
            }

            # Check for integrated/Windows authentication (no username)
            # Separate credentials override URL credentials
            if self.username is not None:
                connect_kwargs['user'] = self.username
                connect_kwargs['password'] = self.password or ''
            elif parsed.username:
                connect_kwargs['user'] = parsed.username
                connect_kwargs['password'] = parsed.password or ''
            # If no username, pymssql will attempt Windows auth

            # TLS/encryption options
            if query_params.get('encrypt') in ('true', 'True', '1', True):
                connect_kwargs['tds_version'] = '7.4'  # Ensures TLS
            if query_params.get('trustServerCertificate') in ('true', 'True', '1', True):
                # pymssql doesn't directly support this, but we note it
                pass

            return pymssql.connect(**connect_kwargs)
        elif self.dialect == 'oracle':
            try:
                import oracledb
            except ImportError:
                raise ImportError(
                    "oracledb is required for Oracle support. "
                    "Install with: pip install oracledb"
                )
            # oracledb supports various connection methods including wallets
            return oracledb.connect(connection_string)
        elif self.dialect == 'sqlite':
            import sqlite3
            # For SQLite, connection_string is the file path
            return sqlite3.connect(connection_string)
        else:
            raise ValueError(f"Unsupported SQL dialect: {self.dialect}")

    def close(self):
        """Closes the database connection."""
        if self.connection:
            self.connection.close()

    # -------------------------------------------------------------------------
    # Type Mapping
    # -------------------------------------------------------------------------

    # PostgreSQL type mapping
    postgres_type_map: Dict[str, JsonNode] = {
        # Numeric types
        'smallint': 'int',
        'int2': 'int',
        'integer': 'int',
        'int': 'int',
        'int4': 'int',
        'bigint': 'long',
        'int8': 'long',
        'real': 'float',
        'float4': 'float',
        'double precision': 'double',
        'float8': 'double',
        'smallserial': 'int',
        'serial': 'int',
        'bigserial': 'long',
        # Boolean
        'boolean': 'boolean',
        'bool': 'boolean',
        # Character types
        'character varying': 'string',
        'varchar': 'string',
        'character': 'string',
        'char': 'string',
        'bpchar': 'string',
        'text': 'string',
        'name': 'string',
        # Binary
        'bytea': 'bytes',
        # Date/Time types
        'date': {'type': 'int', 'logicalType': 'date'},
        'time': {'type': 'int', 'logicalType': 'time-millis'},
        'time with time zone': {'type': 'int', 'logicalType': 'time-millis'},
        'time without time zone': {'type': 'int', 'logicalType': 'time-millis'},
        'timetz': {'type': 'int', 'logicalType': 'time-millis'},
        'timestamp': {'type': 'long', 'logicalType': 'timestamp-millis'},
        'timestamp with time zone': {'type': 'long', 'logicalType': 'timestamp-millis'},
        'timestamp without time zone': {'type': 'long', 'logicalType': 'timestamp-millis'},
        'timestamptz': {'type': 'long', 'logicalType': 'timestamp-millis'},
        'interval': {'type': 'fixed', 'size': 12, 'name': 'duration', 'logicalType': 'duration'},
        # UUID
        'uuid': {'type': 'string', 'logicalType': 'uuid'},
        # JSON types (will be inferred if enabled)
        'json': 'string',
        'jsonb': 'string',
        # XML (will be inferred if enabled)
        'xml': 'string',
        # Network types
        'inet': 'string',
        'cidr': 'string',
        'macaddr': 'string',
        'macaddr8': 'string',
        # Geometric types (stored as string representation)
        'point': 'string',
        'line': 'string',
        'lseg': 'string',
        'box': 'string',
        'path': 'string',
        'polygon': 'string',
        'circle': 'string',
        # Other
        'money': {'type': 'bytes', 'logicalType': 'decimal', 'precision': 19, 'scale': 2},
        'bit': 'string',
        'bit varying': 'string',
        'varbit': 'string',
        'tsvector': 'string',
        'tsquery': 'string',
        'oid': 'long',
    }

    mysql_type_map: Dict[str, JsonNode] = {
        'tinyint': 'int',
        'smallint': 'int',
        'mediumint': 'int',
        'int': 'int',
        'integer': 'int',
        'bigint': 'long',
        'float': 'float',
        'double': 'double',
        'decimal': {'type': 'bytes', 'logicalType': 'decimal', 'precision': 38, 'scale': 10},
        'numeric': {'type': 'bytes', 'logicalType': 'decimal', 'precision': 38, 'scale': 10},
        'bit': 'boolean',
        'boolean': 'boolean',
        'bool': 'boolean',
        'char': 'string',
        'varchar': 'string',
        'tinytext': 'string',
        'text': 'string',
        'mediumtext': 'string',
        'longtext': 'string',
        'binary': 'bytes',
        'varbinary': 'bytes',
        'tinyblob': 'bytes',
        'blob': 'bytes',
        'mediumblob': 'bytes',
        'longblob': 'bytes',
        'date': {'type': 'int', 'logicalType': 'date'},
        'time': {'type': 'int', 'logicalType': 'time-millis'},
        'datetime': {'type': 'long', 'logicalType': 'timestamp-millis'},
        'timestamp': {'type': 'long', 'logicalType': 'timestamp-millis'},
        'year': 'int',
        'json': 'string',
        'enum': 'string',
        'set': 'string',
    }

    sqlserver_type_map: Dict[str, JsonNode] = {
        'bit': 'boolean',
        'tinyint': 'int',
        'smallint': 'int',
        'int': 'int',
        'bigint': 'long',
        'float': 'double',
        'real': 'float',
        'decimal': {'type': 'bytes', 'logicalType': 'decimal', 'precision': 38, 'scale': 10},
        'numeric': {'type': 'bytes', 'logicalType': 'decimal', 'precision': 38, 'scale': 10},
        'money': {'type': 'bytes', 'logicalType': 'decimal', 'precision': 19, 'scale': 4},
        'smallmoney': {'type': 'bytes', 'logicalType': 'decimal', 'precision': 10, 'scale': 4},
        'char': 'string',
        'varchar': 'string',
        'nchar': 'string',
        'nvarchar': 'string',
        'text': 'string',
        'ntext': 'string',
        'binary': 'bytes',
        'varbinary': 'bytes',
        'image': 'bytes',
        'date': {'type': 'int', 'logicalType': 'date'},
        'time': {'type': 'int', 'logicalType': 'time-millis'},
        'datetime': {'type': 'long', 'logicalType': 'timestamp-millis'},
        'datetime2': {'type': 'long', 'logicalType': 'timestamp-millis'},
        'smalldatetime': {'type': 'long', 'logicalType': 'timestamp-millis'},
        'datetimeoffset': {'type': 'long', 'logicalType': 'timestamp-millis'},
        'uniqueidentifier': {'type': 'string', 'logicalType': 'uuid'},
        'xml': 'string',
        'sql_variant': 'string',
        'hierarchyid': 'string',
        'geometry': 'bytes',
        'geography': 'bytes',
    }

    def get_type_map(self) -> Dict[str, JsonNode]:
        """Returns the type map for the current dialect."""
        if self.dialect == 'postgres':
            return self.postgres_type_map
        elif self.dialect == 'mysql':
            return self.mysql_type_map
        elif self.dialect == 'sqlserver':
            return self.sqlserver_type_map
        else:
            # Default to postgres map for now
            return self.postgres_type_map

    # -------------------------------------------------------------------------
    # Schema Extraction (PostgreSQL)
    # -------------------------------------------------------------------------

    def fetch_tables(self) -> List[Dict[str, str]]:
        """Fetches list of tables from the database."""
        cursor = self.connection.cursor()

        if self.dialect == 'postgres':
            query = """
                SELECT table_name, table_schema
                FROM information_schema.tables
                WHERE table_schema NOT IN ('pg_catalog', 'information_schema')
                AND table_type = 'BASE TABLE'
            """
            if self.single_table_name:
                query += f" AND table_name = '{self.single_table_name}'"
            query += " ORDER BY table_schema, table_name"
        elif self.dialect == 'mysql':
            query = """
                SELECT table_name, table_schema
                FROM information_schema.tables
                WHERE table_schema = DATABASE()
                AND table_type = 'BASE TABLE'
            """
            if self.single_table_name:
                query += f" AND table_name = '{self.single_table_name}'"
        elif self.dialect == 'sqlserver':
            query = """
                SELECT t.name AS table_name, s.name AS table_schema
                FROM sys.tables t
                INNER JOIN sys.schemas s ON t.schema_id = s.schema_id
                WHERE t.type = 'U'
            """
            if self.single_table_name:
                query += f" AND t.name = '{self.single_table_name}'"
            query += " ORDER BY s.name, t.name"
        else:
            raise NotImplementedError(f"fetch_tables not implemented for {self.dialect}")

        cursor.execute(query)
        tables = [{'table_name': row[0], 'table_schema': row[1]} for row in cursor.fetchall()]
        cursor.close()
        return tables

    def fetch_table_columns(self, table_name: str, table_schema: str = 'public') -> List[Dict[str, Any]]:
        """Fetches column information for a table."""
        cursor = self.connection.cursor()

        if self.dialect == 'postgres':
            query = """
                SELECT
                    c.column_name,
                    c.data_type,
                    c.udt_name,
                    c.is_nullable,
                    c.column_default,
                    c.character_maximum_length,
                    c.numeric_precision,
                    c.numeric_scale,
                    c.ordinal_position,
                    col_description(
                        (quote_ident(c.table_schema) || '.' || quote_ident(c.table_name))::regclass::oid,
                        c.ordinal_position
                    ) as column_comment
                FROM information_schema.columns c
                WHERE c.table_name = %s AND c.table_schema = %s
                ORDER BY c.ordinal_position
            """
            cursor.execute(query, (table_name, table_schema))
        elif self.dialect == 'mysql':
            query = """
                SELECT
                    column_name,
                    data_type,
                    column_type,
                    is_nullable,
                    column_default,
                    character_maximum_length,
                    numeric_precision,
                    numeric_scale,
                    ordinal_position,
                    column_comment
                FROM information_schema.columns
                WHERE table_name = %s AND table_schema = %s
                ORDER BY ordinal_position
            """
            cursor.execute(query, (table_name, table_schema))
        elif self.dialect == 'sqlserver':
            query = """
                SELECT
                    c.name AS column_name,
                    t.name AS data_type,
                    t.name AS udt_name,
                    CASE WHEN c.is_nullable = 1 THEN 'YES' ELSE 'NO' END AS is_nullable,
                    dc.definition AS column_default,
                    c.max_length AS character_maximum_length,
                    c.precision AS numeric_precision,
                    c.scale AS numeric_scale,
                    c.column_id AS ordinal_position,
                    ep.value AS column_comment
                FROM sys.columns c
                INNER JOIN sys.types t ON c.user_type_id = t.user_type_id
                INNER JOIN sys.tables tb ON c.object_id = tb.object_id
                INNER JOIN sys.schemas s ON tb.schema_id = s.schema_id
                LEFT JOIN sys.default_constraints dc ON c.default_object_id = dc.object_id
                LEFT JOIN sys.extended_properties ep
                    ON ep.major_id = c.object_id AND ep.minor_id = c.column_id AND ep.name = 'MS_Description'
                WHERE tb.name = %s AND s.name = %s
                ORDER BY c.column_id
            """
            cursor.execute(query, (table_name, table_schema))
        else:
            raise NotImplementedError(f"fetch_table_columns not implemented for {self.dialect}")

        columns = []
        for row in cursor.fetchall():
            columns.append({
                'column_name': row[0],
                'data_type': row[1],
                'udt_name': row[2],
                'is_nullable': row[3] == 'YES',
                'column_default': row[4],
                'character_maximum_length': row[5],
                'numeric_precision': row[6],
                'numeric_scale': row[7],
                'ordinal_position': row[8],
                'column_comment': row[9] if len(row) > 9 else None
            })
        cursor.close()
        return columns

    def fetch_primary_keys(self, table_name: str, table_schema: str = 'public') -> List[str]:
        """Fetches primary key columns for a table."""
        cursor = self.connection.cursor()

        if self.dialect == 'postgres':
            query = """
                SELECT kcu.column_name
                FROM information_schema.table_constraints tc
                JOIN information_schema.key_column_usage kcu
                    ON tc.constraint_name = kcu.constraint_name
                    AND tc.table_schema = kcu.table_schema
                WHERE tc.table_name = %s
                AND tc.table_schema = %s
                AND tc.constraint_type = 'PRIMARY KEY'
                ORDER BY kcu.ordinal_position
            """
            cursor.execute(query, (table_name, table_schema))
        elif self.dialect == 'mysql':
            query = """
                SELECT column_name
                FROM information_schema.key_column_usage
                WHERE table_name = %s
                AND table_schema = %s
                AND constraint_name = 'PRIMARY'
                ORDER BY ordinal_position
            """
            cursor.execute(query, (table_name, table_schema))
        elif self.dialect == 'sqlserver':
            query = """
                SELECT c.name AS column_name
                FROM sys.indexes i
                INNER JOIN sys.index_columns ic ON i.object_id = ic.object_id AND i.index_id = ic.index_id
                INNER JOIN sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id
                INNER JOIN sys.tables t ON i.object_id = t.object_id
                INNER JOIN sys.schemas s ON t.schema_id = s.schema_id
                WHERE i.is_primary_key = 1 AND t.name = %s AND s.name = %s
                ORDER BY ic.key_ordinal
            """
            cursor.execute(query, (table_name, table_schema))
        else:
            cursor.close()
            return []

        pk_columns = [row[0] for row in cursor.fetchall()]
        cursor.close()
        return pk_columns

    def fetch_table_comment(self, table_name: str, table_schema: str = 'public') -> str | None:
        """Fetches table comment/description."""
        cursor = self.connection.cursor()

        if self.dialect == 'postgres':
            query = """
                SELECT obj_description(
                    (quote_ident(%s) || '.' || quote_ident(%s))::regclass::oid,
                    'pg_class'
                )
            """
            cursor.execute(query, (table_schema, table_name))
            result = cursor.fetchone()
            cursor.close()
            return result[0] if result else None
        elif self.dialect == 'mysql':
            query = """
                SELECT table_comment
                FROM information_schema.tables
                WHERE table_name = %s AND table_schema = %s
            """
            cursor.execute(query, (table_name, table_schema))
            result = cursor.fetchone()
            cursor.close()
            return result[0] if result and result[0] else None
        elif self.dialect == 'sqlserver':
            query = """
                SELECT ep.value
                FROM sys.extended_properties ep
                INNER JOIN sys.tables t ON ep.major_id = t.object_id
                INNER JOIN sys.schemas s ON t.schema_id = s.schema_id
                WHERE ep.minor_id = 0 AND ep.name = 'MS_Description'
                AND t.name = %s AND s.name = %s
            """
            cursor.execute(query, (table_name, table_schema))
            result = cursor.fetchone()
            cursor.close()
            return result[0] if result and result[0] else None

        cursor.close()
        return None

    # -------------------------------------------------------------------------
    # JSON/XML Schema Inference (delegated to shared AvroSchemaInferrer)
    # -------------------------------------------------------------------------

    def infer_json_column_schema(
        self,
        table_name: str,
        table_schema: str,
        column_name: str,
        type_column: str | None = None,
        type_value: str | None = None
    ) -> JsonNode:
        """Infers Avro schema for a JSON/JSONB column by sampling data."""
        cursor = self.connection.cursor()

        if self.dialect == 'postgres':
            if type_column and type_value:
                query = f"""
                    SELECT "{column_name}"::text
                    FROM "{table_schema}"."{table_name}"
                    WHERE "{type_column}" = %s AND "{column_name}" IS NOT NULL
                    LIMIT {self.sample_size}
                """
                cursor.execute(query, (type_value,))
            else:
                query = f"""
                    SELECT "{column_name}"::text
                    FROM "{table_schema}"."{table_name}"
                    WHERE "{column_name}" IS NOT NULL
                    LIMIT {self.sample_size}
                """
                cursor.execute(query)
        elif self.dialect == 'mysql':
            if type_column and type_value:
                query = f"""
                    SELECT `{column_name}`
                    FROM `{table_schema}`.`{table_name}`
                    WHERE `{type_column}` = %s AND `{column_name}` IS NOT NULL
                    LIMIT {self.sample_size}
                """
                cursor.execute(query, (type_value,))
            else:
                query = f"""
                    SELECT `{column_name}`
                    FROM `{table_schema}`.`{table_name}`
                    WHERE `{column_name}` IS NOT NULL
                    LIMIT {self.sample_size}
                """
                cursor.execute(query)
        else:
            cursor.close()
            return "string"

        values = []
        for row in cursor.fetchall():
            if row[0]:
                try:
                    parsed = json.loads(row[0]) if isinstance(row[0], str) else row[0]
                    values.append(parsed)
                except (json.JSONDecodeError, TypeError):
                    pass
        cursor.close()

        if not values:
            return "string"

        type_name = type_value if type_value else f"{table_name}.{column_name}"
        return self._inferrer.infer_from_json_values(type_name, values)

    def infer_xml_column_schema(
        self,
        table_name: str,
        table_schema: str,
        column_name: str
    ) -> JsonNode:
        """Infers Avro schema for an XML column by sampling data."""
        cursor = self.connection.cursor()

        if self.dialect == 'postgres':
            query = f"""
                SELECT "{column_name}"::text
                FROM "{table_schema}"."{table_name}"
                WHERE "{column_name}" IS NOT NULL
                LIMIT {self.sample_size}
            """
            cursor.execute(query)
        else:
            cursor.close()
            return "string"

        xml_strings: List[str] = []
        for row in cursor.fetchall():
            if row[0]:
                xml_strings.append(row[0])
        cursor.close()

        if not xml_strings:
            return "string"

        type_name = f"{table_name}.{column_name}"
        return self._inferrer.infer_from_xml_values(type_name, xml_strings)

    # -------------------------------------------------------------------------
    # Type Conversion
    # -------------------------------------------------------------------------

    def map_sql_type_to_avro_type(
        self,
        column: Dict[str, Any],
        table_name: str,
        table_schema: str,
        type_column: str | None = None,
        type_value: str | None = None
    ) -> JsonNode:
        """Maps a SQL column type to Avro type."""
        data_type = column['data_type'].lower()
        udt_name = (column.get('udt_name') or '').lower()

        # Check for JSON types that need inference
        if self.infer_json_schema and data_type in ('json', 'jsonb'):
            inferred = self.infer_json_column_schema(
                table_name, table_schema, column['column_name'], type_column, type_value
            )
            return inferred

        # Check for XML types that need inference
        if self.infer_xml_schema and data_type == 'xml':
            inferred = self.infer_xml_column_schema(
                table_name, table_schema, column['column_name']
            )
            return inferred

        # Handle ARRAY types
        if data_type == 'array' or (udt_name and udt_name.startswith('_')):
            element_type = udt_name[1:] if udt_name.startswith('_') else 'text'
            type_map = self.get_type_map()
            element_avro_type = type_map.get(element_type, 'string')
            return {"type": "array", "items": element_avro_type}

        # Handle NUMERIC/DECIMAL with precision
        if data_type in ('numeric', 'decimal'):
            precision = column.get('numeric_precision') or 38
            scale = column.get('numeric_scale') or 10
            return {
                "type": "bytes",
                "logicalType": "decimal",
                "precision": precision,
                "scale": scale
            }

        # Handle MySQL bit(n) where n > 1 should be array of booleans
        if self.dialect == 'mysql' and data_type == 'bit':
            # character_maximum_length holds the bit width for MySQL bit type
            bit_width = column.get('character_maximum_length') or 1
            if bit_width > 1:
                return {"type": "array", "items": "boolean"}
            # bit(1) is commonly used as boolean
            return "boolean"

        # Look up in type map
        type_map = self.get_type_map()

        # Try exact match first
        if data_type in type_map:
            return copy.deepcopy(type_map[data_type])

        # Try udt_name
        if udt_name and udt_name in type_map:
            return copy.deepcopy(type_map[udt_name])

        # Try matching without "USER-DEFINED" and composite types
        if data_type == 'user-defined':
            return "string"  # Default for user-defined types

        # Default fallback
        return "string"

    # -------------------------------------------------------------------------
    # Schema Generation
    # -------------------------------------------------------------------------

    def table_to_avro_schema(
        self,
        table_name: str,
        table_schema: str = 'public'
    ) -> JsonNode:
        """Converts a SQL table to Avro schema."""
        columns = self.fetch_table_columns(table_name, table_schema)
        primary_keys = self.fetch_primary_keys(table_name, table_schema)
        table_comment = self.fetch_table_comment(table_name, table_schema)

        # Check for CloudEvents pattern
        column_names = set(col['column_name'].lower() for col in columns)
        is_cloudevent = False
        type_values: List[str | None] = []
        type_column: str | None = None

        if self.emit_cloudevents:
            is_cloudevent = all(c in column_names for c in ['type', 'source', 'data', 'id'])
            if is_cloudevent:
                type_column = next(
                    (col['column_name'] for col in columns if col['column_name'].lower() == 'type'),
                    None
                )
                if type_column:
                    type_values = self._fetch_distinct_type_values(table_name, table_schema, type_column)

        if not type_values:
            type_values = [None]

        schemas: List[JsonNode] = []

        for type_value in type_values:
            if type_value and isinstance(type_value, str):
                type_name_name = avro_name(type_value.rsplit('.', 1)[-1])
                type_name_namespace = type_value.rsplit('.', 1)[0] if '.' in type_value else ''
                type_namespace = self.avro_namespace + ('.' if self.avro_namespace and type_name_namespace else '') + type_name_namespace
            else:
                type_name_name = avro_name(table_name)
                type_namespace = self.avro_namespace

            if is_cloudevent and type_column:
                # For CloudEvents, focus on the 'data' column
                data_column = next(
                    (col for col in columns if col['column_name'].lower() == 'data'),
                    None
                )
                if data_column:
                    data_schema = self.map_sql_type_to_avro_type(
                        data_column, table_name, table_schema, type_column, type_value
                    )
                    if isinstance(data_schema, dict):
                        data_schema = [data_schema]
                    if isinstance(data_schema, list):
                        for schema in data_schema:
                            if not isinstance(schema, dict) or schema.get("type") != "record":
                                schema = self._wrap_schema_in_root_record(schema, type_name_name, type_namespace)
                            if self.emit_xregistry:
                                ce_attribs: Dict[str, JsonNode] = {}
                                for col in columns:
                                    if col['column_name'].lower() != 'data':
                                        ce_attribs[col['column_name'].lower()] = "string"
                                if isinstance(schema, dict):
                                    schema["ce_attribs"] = ce_attribs
                            self._apply_schema_attributes(schema, table_name, table_schema, type_value, type_namespace, table_comment)
                            schemas.append(schema)
            else:
                # Normal table conversion
                fields: List[JsonNode] = []
                for column in columns:
                    avro_type = self.map_sql_type_to_avro_type(
                        column, table_name, table_schema, type_column, type_value
                    )

                    # Make nullable if column allows NULL
                    if column['is_nullable'] and avro_type != "null":
                        if isinstance(avro_type, list):
                            if "null" not in avro_type:
                                avro_type = ["null"] + avro_type
                        else:
                            avro_type = ["null", avro_type]

                    field: Dict[str, JsonNode] = {
                        "name": avro_name(column['column_name']),
                        "type": avro_type
                    }

                    # Add original name as altname if different
                    if avro_name(column['column_name']) != column['column_name']:
                        field["altnames"] = {"sql": column['column_name']}

                    # Add column comment as doc
                    if column.get('column_comment'):
                        field["doc"] = column['column_comment']

                    fields.append(field)

                schema: Dict[str, JsonNode] = {
                    "type": "record",
                    "name": type_name_name,
                    "fields": fields
                }

                # Add primary keys as 'unique' annotation
                if primary_keys:
                    schema["unique"] = [avro_name(pk) for pk in primary_keys]

                self._apply_schema_attributes(schema, table_name, table_schema, type_value, type_namespace, table_comment)
                schemas.append(schema)

        return schemas if len(schemas) > 1 else schemas[0]

    def _fetch_distinct_type_values(self, table_name: str, table_schema: str, type_column: str) -> List[str]:
        """Fetches distinct values from a type discriminator column."""
        cursor = self.connection.cursor()

        if self.dialect == 'postgres':
            query = f"""
                SELECT DISTINCT "{type_column}"
                FROM "{table_schema}"."{table_name}"
                WHERE "{type_column}" IS NOT NULL
                LIMIT 1000
            """
        elif self.dialect == 'mysql':
            query = f"""
                SELECT DISTINCT `{type_column}`
                FROM `{table_schema}`.`{table_name}`
                WHERE `{type_column}` IS NOT NULL
                LIMIT 1000
            """
        else:
            cursor.close()
            return []

        cursor.execute(query)
        values = [row[0] for row in cursor.fetchall() if row[0]]
        cursor.close()
        return values

    def _wrap_schema_in_root_record(self, schema: JsonNode, type_name: str, type_namespace: str) -> Dict[str, JsonNode]:
        """Wraps a schema in a root record."""
        record: Dict[str, JsonNode] = {
            "type": "record",
            "name": type_name,
            "fields": [
                {
                    "name": "data",
                    "type": schema,
                    "root": True
                }
            ]
        }
        if type_namespace:
            record["namespace"] = type_namespace
        return record

    def _apply_schema_attributes(
        self,
        schema: JsonNode,
        table_name: str,
        table_schema: str,
        type_value: str | None,
        type_namespace: str,
        table_comment: str | None
    ):
        """Applies schema attributes to the schema."""
        if isinstance(schema, dict):
            schema["altnames"] = {"sql": f"{table_schema}.{table_name}" if table_schema != 'public' else table_name}
            if self.emit_cloudevents and type_value:
                schema["ce_type"] = type_value
            if type_namespace:
                schema["namespace"] = type_namespace
            if table_comment:
                schema["doc"] = table_comment

    def make_type_names_unique(self, item_types: list):
        """Makes the type names unique (following k2a pattern)."""
        for item in item_types:
            if isinstance(item, dict) and item.get("type") == "array":
                if isinstance(item.get("items"), dict) and item["items"].get("type") == "record":
                    self.make_type_names_unique([item["items"]])
                elif isinstance(item.get("items"), list):
                    self.make_type_names_unique(item["items"])
            if isinstance(item, dict) and item.get("type") == "map":
                if isinstance(item.get("values"), dict) and item["values"].get("type") == "record":
                    self.make_type_names_unique([item["values"]])
                elif isinstance(item.get("values"), list):
                    self.make_type_names_unique(item["values"])
            elif isinstance(item, dict) and item.get("type") == "record":
                namespace = item.get("namespace", '')
                type_name = base_name = item["name"]
                record_name = f"{namespace}.{type_name}" if namespace else type_name
                if record_name in self.generated_types:
                    i = 0
                    while record_name in self.generated_types:
                        i += 1
                        type_name = f"{base_name}{i}"
                        record_name = f"{namespace}.{type_name}" if namespace else type_name
                    self.generated_types.append(record_name)
                else:
                    self.generated_types.append(record_name)
                item["name"] = type_name
                for field in item.get("fields", []):
                    if isinstance(field.get("type"), dict):
                        if field["type"].get("type") in ["record", "array", "map"]:
                            self.make_type_names_unique([field["type"]])
                    elif isinstance(field.get("type"), list):
                        self.make_type_names_unique(field["type"])

    # -------------------------------------------------------------------------
    # Main Processing
    # -------------------------------------------------------------------------

    def process_all_tables(self):
        """Processes all tables in the database and generates Avro schema."""
        union_schema: List[JsonNode] = []
        tables = self.fetch_tables()

        for table_info in tables:
            table_name = table_info['table_name']
            table_schema = table_info['table_schema']
            print(f"Processing table: {table_schema}.{table_name}")

            avro_schema = self.table_to_avro_schema(table_name, table_schema)
            if isinstance(avro_schema, list):
                union_schema.extend(avro_schema)
            else:
                union_schema.append(avro_schema)

        output = None
        if self.emit_xregistry:
            xregistry_messages = {}
            xregistry_schemas = {}
            groupname = self.avro_namespace
            for schema in union_schema:
                self.generated_types = []
                self.make_type_names_unique([schema])
                ce_attribs: Dict[str, JsonNode] = {}
                if isinstance(schema, dict) and "ce_attribs" in schema:
                    ce_attribs = cast(Dict[str, JsonNode], schema.get("ce_attribs", {}))
                    del schema["ce_attribs"]
                if isinstance(schema, dict):
                    schemaid = schema.get('ce_type', f"{self.avro_namespace}.{schema['name']}")
                    schema_name = str(schemaid).rsplit('.', 1)[-1]
                    xregistry_schemas[schemaid] = {
                        "id": schemaid,
                        "name": schema_name,
                        "format": f"Avro/{AVRO_VERSION}",
                        "defaultversionid": "1",
                        "versions": {
                            "1": {
                                "id": "1",
                                "format": f"Avro/{AVRO_VERSION}",
                                "schema": schema
                            }
                        }
                    }
                    xregistry_messages[schemaid] = {
                        "id": schemaid,
                        "name": schema_name,
                        "envelope": "CloudEvents/1.0",
                        "envelopemetadata": {
                            "type": {"value": schemaid},
                            "source": {"value": "{source}"},
                        },
                        "schemaformat": f"Avro/{AVRO_VERSION}",
                        "schemauri": f"#/schemagroups/{groupname}/schemas/{schemaid}"
                    }
                    for key, value in ce_attribs.items():
                        if key in ("type", "source", "id", "specversion"):
                            continue
                        xregistry_messages[schemaid]["envelopemetadata"][key] = {
                            "type": value,
                            "required": True
                        }
            output = {
                "messagegroups": {
                    groupname: {
                        "id": groupname,
                        "messages": xregistry_messages
                    }
                },
                "schemagroups": {
                    groupname: {
                        "id": groupname,
                        "schemas": xregistry_schemas
                    }
                }
            }
        else:
            self.generated_types = []
            self.make_type_names_unique(union_schema)
            output = union_schema if len(union_schema) > 1 else union_schema[0] if union_schema else []

        # Create the output directory if needed
        base_dir = os.path.dirname(self.avro_schema_path)
        if base_dir and not os.path.exists(base_dir):
            os.makedirs(base_dir)

        with open(self.avro_schema_path, 'w', encoding='utf-8') as avro_file:
            json.dump(output, avro_file, indent=4)

        self.close()


def convert_sql_to_avro(
    connection_string: str,
    avro_schema_file: str,
    dialect: str = 'postgres',
    database: str | None = None,
    table_name: str | None = None,
    avro_namespace: str | None = None,
    emit_cloudevents: bool = False,
    emit_cloudevents_xregistry: bool = False,
    sample_size: int = 100,
    infer_json_schema: bool = True,
    infer_xml_schema: bool = True,
    username: str | None = None,
    password: str | None = None
):
    """Converts SQL database schemas to Avro schema format.

    Args:
        connection_string: Database connection string
        avro_schema_file: Output path for the Avro schema file
        dialect: SQL dialect (postgres, mysql, sqlserver, oracle, sqlite)
        database: Database name (overrides connection string if provided)
        table_name: Specific table to convert (None for all tables)
        avro_namespace: Namespace for generated Avro schemas
        emit_cloudevents: Whether to emit CloudEvents declarations
        emit_cloudevents_xregistry: Whether to emit xRegistry manifest format
        sample_size: Number of rows to sample for JSON/XML inference
        infer_json_schema: Whether to infer schema for JSON columns
        infer_xml_schema: Whether to infer schema for XML columns
        username: Database username (overrides connection string credentials)
        password: Database password (overrides connection string credentials)
    """
    if not connection_string:
        raise ValueError("connection_string is required")

    if not avro_namespace:
        avro_namespace = database or 'database'

    converter = SqlToAvro(
        connection_string=connection_string,
        database=database,
        table_name=table_name,
        avro_namespace=avro_namespace,
        avro_schema_path=avro_schema_file,
        dialect=dialect,
        emit_cloudevents=emit_cloudevents,
        emit_cloudevents_xregistry=emit_cloudevents_xregistry,
        sample_size=sample_size,
        infer_json_schema=infer_json_schema,
        infer_xml_schema=infer_xml_schema,
        username=username,
        password=password
    )

    return converter.process_all_tables()
```
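For orientation, a minimal usage sketch of the module's public entry point, `convert_sql_to_avro`. The connection string, namespace, and output path below are illustrative placeholders, not examples from the package's documentation:

```python
# Hypothetical example: convert all tables of a local PostgreSQL database
# into a single Avro schema file. All values are placeholders.
from avrotize.sqltoavro import convert_sql_to_avro

convert_sql_to_avro(
    connection_string="postgresql://user:pass@localhost:5432/shop",
    avro_schema_file="shop.avsc",
    dialect="postgres",
    avro_namespace="com.example.shop",
)
```

Passing `emit_cloudevents_xregistry=True` instead writes an xRegistry manifest with `messagegroups` and `schemagroups` sections, as assembled in `process_all_tables`.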