structurize 3.0.2__py3-none-any.whl → 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
avrotize/sqltoavro.py ADDED
@@ -0,0 +1,1159 @@
+ """Converts SQL database schemas to Avro schema format."""
+
+ import copy
+ import json
+ import os
+ from typing import Any, Dict, List, cast
+ from urllib.parse import urlparse, parse_qs
+
+ from avrotize.common import avro_name
+ from avrotize.constants import AVRO_VERSION
+ from avrotize.schema_inference import AvroSchemaInferrer
+
+ JsonNode = Dict[str, 'JsonNode'] | List['JsonNode'] | str | bool | int | float | None
+
+
+ class SqlToAvro:
+     """Converts SQL database schemas to Avro schema format."""
+
+     def __init__(
+         self,
+         connection_string: str,
+         database: str | None,
+         table_name: str | None,
+         avro_namespace: str,
+         avro_schema_path: str,
+         dialect: str,
+         emit_cloudevents: bool,
+         emit_cloudevents_xregistry: bool,
+         sample_size: int = 100,
+         infer_json_schema: bool = True,
+         infer_xml_schema: bool = True,
+         username: str | None = None,
+         password: str | None = None
+     ):
+         """Initializes the SqlToAvro class with database connection parameters.
+
+         Args:
+             connection_string: Database connection string (e.g., postgresql://user:pass@host:port/dbname)
+             database: Database name (overrides connection string if provided)
+             table_name: Specific table to convert (None for all tables)
+             avro_namespace: Namespace for generated Avro schemas
+             avro_schema_path: Output path for the Avro schema file
+             dialect: SQL dialect (postgres, mysql, sqlserver, oracle, sqlite)
+             emit_cloudevents: Whether to emit CloudEvents declarations
+             emit_cloudevents_xregistry: Whether to emit xRegistry manifest format
+             sample_size: Number of rows to sample for JSON/XML inference
+             infer_json_schema: Whether to infer schema for JSON columns
+             infer_xml_schema: Whether to infer schema for XML columns
+             username: Database username (overrides connection string if provided)
+             password: Database password (overrides connection string if provided)
+         """
+         self.connection_string = connection_string
+         self.dialect = dialect.lower()
+         self.single_table_name = table_name
+         self.avro_namespace = avro_namespace
+         self.avro_schema_path = avro_schema_path
+         self.emit_xregistry = emit_cloudevents_xregistry
+         self.emit_cloudevents = emit_cloudevents or emit_cloudevents_xregistry
+         self.sample_size = sample_size
+         self.infer_json_schema = infer_json_schema
+         self.infer_xml_schema = infer_xml_schema
+         self.generated_types: List[str] = []
+
+         # Schema inferrer for JSON/XML columns (use 'sql' altnames for SQL source)
+         self._inferrer = AvroSchemaInferrer(namespace=avro_namespace, altnames_key='sql')
+
+         if self.emit_xregistry and not self.avro_namespace:
+             raise ValueError(
+                 "The avro_namespace must be specified when emit_cloudevents_xregistry is True")
+
+         # Store credentials for connection
+         self.username = username
+         self.password = password
+
+         # Parse connection string and establish connection
+         self.connection = self._connect(connection_string, database)
+         self.database = database or self._extract_database_from_connection_string(connection_string)
+
+     def _extract_database_from_connection_string(self, connection_string: str) -> str:
+         """Extracts database name from connection string."""
+         parsed = urlparse(connection_string)
+         if parsed.path:
+             return parsed.path.lstrip('/')
+         return ''
+
+     def _connect(self, connection_string: str, database: str | None):
+         """Establishes database connection based on dialect.
+
+         Connection strings can include SSL/TLS and authentication options:
+
+         PostgreSQL:
+         - Standard: postgresql://user:pass@host:port/dbname
+         - SSL: postgresql://user:pass@host:port/dbname?sslmode=require
+         - SSL modes: disable, allow, prefer, require, verify-ca, verify-full
+
+         MySQL:
+         - Standard: mysql://user:pass@host:port/dbname
+         - SSL: mysql://user:pass@host:port/dbname?ssl=true
+         - SSL with cert: mysql://...?ssl_ca=/path/to/ca.pem
+
+         SQL Server:
+         - Standard: mssql://user:pass@host:port/dbname
+         - Windows Auth: mssql://@host:port/dbname (no user/pass = integrated)
+         - Encrypt: mssql://...?encrypt=true
+         - Trust cert: mssql://...?trustServerCertificate=true
+
+         Oracle:
+         - Standard: oracle://user:pass@host:port/service_name
+         - Wallet: Uses TNS names or wallet configuration
+         """
+         parsed = urlparse(connection_string)
+         query_params = dict(parse_qs(parsed.query)) if parsed.query else {}
+         # Flatten single-value lists
+         query_params = {k: v[0] if len(v) == 1 else v for k, v in query_params.items()}
+
+         if self.dialect == 'postgres':
+             try:
+                 import psycopg2
+             except ImportError:
+                 raise ImportError(
+                     "psycopg2 is required for PostgreSQL support. "
+                     "Install with: pip install psycopg2-binary"
+                 )
+             # If separate credentials are provided, use them instead of URL credentials
+             if self.username is not None:
+                 connect_kwargs = {
+                     'host': parsed.hostname or 'localhost',
+                     'port': parsed.port or 5432,
+                     'user': self.username,
+                     'password': self.password or '',
+                     'database': database or parsed.path.lstrip('/')
+                 }
+                 # Handle sslmode from query params
+                 if 'sslmode' in query_params:
+                     connect_kwargs['sslmode'] = query_params['sslmode']
+                 return psycopg2.connect(**connect_kwargs)
+             # psycopg2 handles the full connection string including sslmode
+             return psycopg2.connect(connection_string)
+         elif self.dialect == 'mysql':
+             try:
+                 import pymysql
+             except ImportError:
+                 raise ImportError(
+                     "pymysql is required for MySQL support. "
+                     "Install with: pip install pymysql"
+                 )
+
+             connect_kwargs = {
+                 'host': parsed.hostname or 'localhost',
+                 'port': parsed.port or 3306,
+                 'user': self.username if self.username is not None else parsed.username,
+                 'password': (self.password or '') if self.username is not None else (parsed.password or ''),
+                 'database': database or parsed.path.lstrip('/')
+             }
+
+             # SSL/TLS configuration
+             ssl_config = {}
+             if query_params.get('ssl') in ('true', 'True', '1'):
+                 ssl_config['ssl'] = True
+             if 'ssl_ca' in query_params:
+                 ssl_config['ssl'] = {'ca': query_params['ssl_ca']}
+             if 'ssl_cert' in query_params:
+                 ssl_config.setdefault('ssl', {})
+                 if isinstance(ssl_config['ssl'], dict):
+                     ssl_config['ssl']['cert'] = query_params['ssl_cert']
+             if 'ssl_key' in query_params:
+                 ssl_config.setdefault('ssl', {})
+                 if isinstance(ssl_config['ssl'], dict):
+                     ssl_config['ssl']['key'] = query_params['ssl_key']
+
+             if ssl_config:
+                 connect_kwargs.update(ssl_config)
+
+             return pymysql.connect(**connect_kwargs)
+         elif self.dialect == 'sqlserver':
+             try:
+                 import pymssql
+                 use_pymssql = True
+             except ImportError:
+                 use_pymssql = False
+                 try:
+                     import pyodbc
+                 except ImportError:
+                     raise ImportError(
+                         "pymssql or pyodbc is required for SQL Server support. "
+                         "Install with: pip install pymssql or pip install pyodbc"
+                     )
+
+             if not use_pymssql:
+                 # pyodbc: pass the connection string through directly (supports all ODBC options).
+                 # Note: the metadata queries in this module use %s placeholders, which assume
+                 # a format-paramstyle driver such as pymssql; pyodbc expects qmark (?) placeholders.
+                 return pyodbc.connect(connection_string)
+
+             # pymssql: parse the URL and build connection arguments
+             connect_kwargs = {
+                 'server': parsed.hostname or 'localhost',
+                 'port': str(parsed.port or 1433),
+                 'database': database or parsed.path.lstrip('/')
+             }
+
+             # Check for integrated/Windows authentication (no username);
+             # separate credentials override URL credentials
+             if self.username is not None:
+                 connect_kwargs['user'] = self.username
+                 connect_kwargs['password'] = self.password or ''
+             elif parsed.username:
+                 connect_kwargs['user'] = parsed.username
+                 connect_kwargs['password'] = parsed.password or ''
+             # If no username is given, pymssql will attempt Windows auth
+
+             # TLS/encryption options
+             if query_params.get('encrypt') in ('true', 'True', '1'):
+                 connect_kwargs['tds_version'] = '7.4'  # TDS 7.4 supports encrypted connections
+             if query_params.get('trustServerCertificate') in ('true', 'True', '1'):
+                 # pymssql doesn't directly support this option; accepted for compatibility
+                 pass
+
+             return pymssql.connect(**connect_kwargs)
+         elif self.dialect == 'oracle':
+             try:
+                 import oracledb
+             except ImportError:
+                 raise ImportError(
+                     "oracledb is required for Oracle support. "
+                     "Install with: pip install oracledb"
+                 )
+             # oracledb supports various connection methods including wallets
+             return oracledb.connect(connection_string)
+         elif self.dialect == 'sqlite':
+             import sqlite3
+             # For SQLite, connection_string is the file path
+             return sqlite3.connect(connection_string)
+         else:
+             raise ValueError(f"Unsupported SQL dialect: {self.dialect}")
+
+     def close(self):
+         """Closes the database connection."""
+         if self.connection:
+             self.connection.close()
+
+     # -------------------------------------------------------------------------
+     # Type Mapping
+     # -------------------------------------------------------------------------
+
+     # PostgreSQL type mapping
+     postgres_type_map: Dict[str, JsonNode] = {
+         # Numeric types
+         'smallint': 'int',
+         'int2': 'int',
+         'integer': 'int',
+         'int': 'int',
+         'int4': 'int',
+         'bigint': 'long',
+         'int8': 'long',
+         'real': 'float',
+         'float4': 'float',
+         'double precision': 'double',
+         'float8': 'double',
+         'smallserial': 'int',
+         'serial': 'int',
+         'bigserial': 'long',
+         # Boolean
+         'boolean': 'boolean',
+         'bool': 'boolean',
+         # Character types
+         'character varying': 'string',
+         'varchar': 'string',
+         'character': 'string',
+         'char': 'string',
+         'bpchar': 'string',
+         'text': 'string',
+         'name': 'string',
+         # Binary
+         'bytea': 'bytes',
+         # Date/Time types
+         'date': {'type': 'int', 'logicalType': 'date'},
+         'time': {'type': 'int', 'logicalType': 'time-millis'},
+         'time with time zone': {'type': 'int', 'logicalType': 'time-millis'},
+         'time without time zone': {'type': 'int', 'logicalType': 'time-millis'},
+         'timetz': {'type': 'int', 'logicalType': 'time-millis'},
+         'timestamp': {'type': 'long', 'logicalType': 'timestamp-millis'},
+         'timestamp with time zone': {'type': 'long', 'logicalType': 'timestamp-millis'},
+         'timestamp without time zone': {'type': 'long', 'logicalType': 'timestamp-millis'},
+         'timestamptz': {'type': 'long', 'logicalType': 'timestamp-millis'},
+         'interval': {'type': 'fixed', 'size': 12, 'name': 'duration', 'logicalType': 'duration'},
+         # UUID
+         'uuid': {'type': 'string', 'logicalType': 'uuid'},
+         # JSON types (will be inferred if enabled)
+         'json': 'string',
+         'jsonb': 'string',
+         # XML (will be inferred if enabled)
+         'xml': 'string',
+         # Network types
+         'inet': 'string',
+         'cidr': 'string',
+         'macaddr': 'string',
+         'macaddr8': 'string',
+         # Geometric types (stored as string representation)
+         'point': 'string',
+         'line': 'string',
+         'lseg': 'string',
+         'box': 'string',
+         'path': 'string',
+         'polygon': 'string',
+         'circle': 'string',
+         # Other
+         'money': {'type': 'bytes', 'logicalType': 'decimal', 'precision': 19, 'scale': 2},
+         'bit': 'string',
+         'bit varying': 'string',
+         'varbit': 'string',
+         'tsvector': 'string',
+         'tsquery': 'string',
+         'oid': 'long',
+     }
+
+     mysql_type_map: Dict[str, JsonNode] = {
+         'tinyint': 'int',
+         'smallint': 'int',
+         'mediumint': 'int',
+         'int': 'int',
+         'integer': 'int',
+         'bigint': 'long',
+         'float': 'float',
+         'double': 'double',
+         'decimal': {'type': 'bytes', 'logicalType': 'decimal', 'precision': 38, 'scale': 10},
+         'numeric': {'type': 'bytes', 'logicalType': 'decimal', 'precision': 38, 'scale': 10},
+         'bit': 'boolean',
+         'boolean': 'boolean',
+         'bool': 'boolean',
+         'char': 'string',
+         'varchar': 'string',
+         'tinytext': 'string',
+         'text': 'string',
+         'mediumtext': 'string',
+         'longtext': 'string',
+         'binary': 'bytes',
+         'varbinary': 'bytes',
+         'tinyblob': 'bytes',
+         'blob': 'bytes',
+         'mediumblob': 'bytes',
+         'longblob': 'bytes',
+         'date': {'type': 'int', 'logicalType': 'date'},
+         'time': {'type': 'int', 'logicalType': 'time-millis'},
+         'datetime': {'type': 'long', 'logicalType': 'timestamp-millis'},
+         'timestamp': {'type': 'long', 'logicalType': 'timestamp-millis'},
+         'year': 'int',
+         'json': 'string',
+         'enum': 'string',
+         'set': 'string',
+     }
+
+     sqlserver_type_map: Dict[str, JsonNode] = {
+         'bit': 'boolean',
+         'tinyint': 'int',
+         'smallint': 'int',
+         'int': 'int',
+         'bigint': 'long',
+         'float': 'double',
+         'real': 'float',
+         'decimal': {'type': 'bytes', 'logicalType': 'decimal', 'precision': 38, 'scale': 10},
+         'numeric': {'type': 'bytes', 'logicalType': 'decimal', 'precision': 38, 'scale': 10},
+         'money': {'type': 'bytes', 'logicalType': 'decimal', 'precision': 19, 'scale': 4},
+         'smallmoney': {'type': 'bytes', 'logicalType': 'decimal', 'precision': 10, 'scale': 4},
+         'char': 'string',
+         'varchar': 'string',
+         'nchar': 'string',
+         'nvarchar': 'string',
+         'text': 'string',
+         'ntext': 'string',
+         'binary': 'bytes',
+         'varbinary': 'bytes',
+         'image': 'bytes',
+         'date': {'type': 'int', 'logicalType': 'date'},
+         'time': {'type': 'int', 'logicalType': 'time-millis'},
+         'datetime': {'type': 'long', 'logicalType': 'timestamp-millis'},
+         'datetime2': {'type': 'long', 'logicalType': 'timestamp-millis'},
+         'smalldatetime': {'type': 'long', 'logicalType': 'timestamp-millis'},
+         'datetimeoffset': {'type': 'long', 'logicalType': 'timestamp-millis'},
+         'uniqueidentifier': {'type': 'string', 'logicalType': 'uuid'},
+         'xml': 'string',
+         'sql_variant': 'string',
+         'hierarchyid': 'string',
+         'geometry': 'bytes',
+         'geography': 'bytes',
+     }
+
+     def get_type_map(self) -> Dict[str, JsonNode]:
+         """Returns the type map for the current dialect."""
+         if self.dialect == 'postgres':
+             return self.postgres_type_map
+         elif self.dialect == 'mysql':
+             return self.mysql_type_map
+         elif self.dialect == 'sqlserver':
+             return self.sqlserver_type_map
+         else:
+             # Default to postgres map for now
+             return self.postgres_type_map
+
+     # -------------------------------------------------------------------------
+     # Schema Extraction (PostgreSQL, MySQL, SQL Server)
+     # -------------------------------------------------------------------------
+
+     def fetch_tables(self) -> List[Dict[str, str]]:
+         """Fetches the list of tables from the database."""
+         cursor = self.connection.cursor()
+
+         # The optional single-table filter is passed as a query parameter
+         # rather than interpolated into the SQL text.
+         params: List[str] = []
+         if self.dialect == 'postgres':
+             query = """
+                 SELECT table_name, table_schema
+                 FROM information_schema.tables
+                 WHERE table_schema NOT IN ('pg_catalog', 'information_schema')
+                 AND table_type = 'BASE TABLE'
+             """
+             if self.single_table_name:
+                 query += " AND table_name = %s"
+                 params.append(self.single_table_name)
+             query += " ORDER BY table_schema, table_name"
+         elif self.dialect == 'mysql':
+             query = """
+                 SELECT table_name, table_schema
+                 FROM information_schema.tables
+                 WHERE table_schema = DATABASE()
+                 AND table_type = 'BASE TABLE'
+             """
+             if self.single_table_name:
+                 query += " AND table_name = %s"
+                 params.append(self.single_table_name)
+         elif self.dialect == 'sqlserver':
+             query = """
+                 SELECT t.name AS table_name, s.name AS table_schema
+                 FROM sys.tables t
+                 INNER JOIN sys.schemas s ON t.schema_id = s.schema_id
+                 WHERE t.type = 'U'
+             """
+             if self.single_table_name:
+                 query += " AND t.name = %s"
+                 params.append(self.single_table_name)
+             query += " ORDER BY s.name, t.name"
+         else:
+             raise NotImplementedError(f"fetch_tables not implemented for {self.dialect}")
+
+         cursor.execute(query, tuple(params) or None)
+         tables = [{'table_name': row[0], 'table_schema': row[1]} for row in cursor.fetchall()]
+         cursor.close()
+         return tables
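+
+     # Illustrative result shape (hypothetical table names):
+     #     [{'table_name': 'customers', 'table_schema': 'public'},
+     #      {'table_name': 'orders', 'table_schema': 'sales'}]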
+
+     def fetch_table_columns(self, table_name: str, table_schema: str = 'public') -> List[Dict[str, Any]]:
+         """Fetches column information for a table."""
+         cursor = self.connection.cursor()
+
+         if self.dialect == 'postgres':
+             query = """
+                 SELECT
+                     c.column_name,
+                     c.data_type,
+                     c.udt_name,
+                     c.is_nullable,
+                     c.column_default,
+                     c.character_maximum_length,
+                     c.numeric_precision,
+                     c.numeric_scale,
+                     c.ordinal_position,
+                     col_description(
+                         (quote_ident(c.table_schema) || '.' || quote_ident(c.table_name))::regclass::oid,
+                         c.ordinal_position
+                     ) as column_comment
+                 FROM information_schema.columns c
+                 WHERE c.table_name = %s AND c.table_schema = %s
+                 ORDER BY c.ordinal_position
+             """
+             cursor.execute(query, (table_name, table_schema))
+         elif self.dialect == 'mysql':
+             query = """
+                 SELECT
+                     column_name,
+                     data_type,
+                     column_type,
+                     is_nullable,
+                     column_default,
+                     character_maximum_length,
+                     numeric_precision,
+                     numeric_scale,
+                     ordinal_position,
+                     column_comment
+                 FROM information_schema.columns
+                 WHERE table_name = %s AND table_schema = %s
+                 ORDER BY ordinal_position
+             """
+             cursor.execute(query, (table_name, table_schema))
+         elif self.dialect == 'sqlserver':
+             query = """
+                 SELECT
+                     c.name AS column_name,
+                     t.name AS data_type,
+                     t.name AS udt_name,
+                     CASE WHEN c.is_nullable = 1 THEN 'YES' ELSE 'NO' END AS is_nullable,
+                     dc.definition AS column_default,
+                     c.max_length AS character_maximum_length,
+                     c.precision AS numeric_precision,
+                     c.scale AS numeric_scale,
+                     c.column_id AS ordinal_position,
+                     ep.value AS column_comment
+                 FROM sys.columns c
+                 INNER JOIN sys.types t ON c.user_type_id = t.user_type_id
+                 INNER JOIN sys.tables tb ON c.object_id = tb.object_id
+                 INNER JOIN sys.schemas s ON tb.schema_id = s.schema_id
+                 LEFT JOIN sys.default_constraints dc ON c.default_object_id = dc.object_id
+                 LEFT JOIN sys.extended_properties ep
+                     ON ep.major_id = c.object_id AND ep.minor_id = c.column_id AND ep.name = 'MS_Description'
+                 WHERE tb.name = %s AND s.name = %s
+                 ORDER BY c.column_id
+             """
+             cursor.execute(query, (table_name, table_schema))
+         else:
+             raise NotImplementedError(f"fetch_table_columns not implemented for {self.dialect}")
+
+         columns = []
+         for row in cursor.fetchall():
+             columns.append({
+                 'column_name': row[0],
+                 'data_type': row[1],
+                 'udt_name': row[2],
+                 'is_nullable': row[3] == 'YES',
+                 'column_default': row[4],
+                 'character_maximum_length': row[5],
+                 'numeric_precision': row[6],
+                 'numeric_scale': row[7],
+                 'ordinal_position': row[8],
+                 'column_comment': row[9] if len(row) > 9 else None
+             })
+         cursor.close()
+         return columns
+
+     def fetch_primary_keys(self, table_name: str, table_schema: str = 'public') -> List[str]:
+         """Fetches primary key columns for a table."""
+         cursor = self.connection.cursor()
+
+         if self.dialect == 'postgres':
+             query = """
+                 SELECT kcu.column_name
+                 FROM information_schema.table_constraints tc
+                 JOIN information_schema.key_column_usage kcu
+                     ON tc.constraint_name = kcu.constraint_name
+                     AND tc.table_schema = kcu.table_schema
+                 WHERE tc.table_name = %s
+                     AND tc.table_schema = %s
+                     AND tc.constraint_type = 'PRIMARY KEY'
+                 ORDER BY kcu.ordinal_position
+             """
+             cursor.execute(query, (table_name, table_schema))
+         elif self.dialect == 'mysql':
+             query = """
+                 SELECT column_name
+                 FROM information_schema.key_column_usage
+                 WHERE table_name = %s
+                     AND table_schema = %s
+                     AND constraint_name = 'PRIMARY'
+                 ORDER BY ordinal_position
+             """
+             cursor.execute(query, (table_name, table_schema))
+         elif self.dialect == 'sqlserver':
+             query = """
+                 SELECT c.name AS column_name
+                 FROM sys.indexes i
+                 INNER JOIN sys.index_columns ic ON i.object_id = ic.object_id AND i.index_id = ic.index_id
+                 INNER JOIN sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id
+                 INNER JOIN sys.tables t ON i.object_id = t.object_id
+                 INNER JOIN sys.schemas s ON t.schema_id = s.schema_id
+                 WHERE i.is_primary_key = 1 AND t.name = %s AND s.name = %s
+                 ORDER BY ic.key_ordinal
+             """
+             cursor.execute(query, (table_name, table_schema))
+         else:
+             cursor.close()
+             return []
+
+         pk_columns = [row[0] for row in cursor.fetchall()]
+         cursor.close()
+         return pk_columns
+
+     def fetch_table_comment(self, table_name: str, table_schema: str = 'public') -> str | None:
+         """Fetches table comment/description."""
+         cursor = self.connection.cursor()
+
+         if self.dialect == 'postgres':
+             query = """
+                 SELECT obj_description(
+                     (quote_ident(%s) || '.' || quote_ident(%s))::regclass::oid,
+                     'pg_class'
+                 )
+             """
+             cursor.execute(query, (table_schema, table_name))
+             result = cursor.fetchone()
+             cursor.close()
+             return result[0] if result else None
+         elif self.dialect == 'mysql':
+             query = """
+                 SELECT table_comment
+                 FROM information_schema.tables
+                 WHERE table_name = %s AND table_schema = %s
+             """
+             cursor.execute(query, (table_name, table_schema))
+             result = cursor.fetchone()
+             cursor.close()
+             return result[0] if result and result[0] else None
+         elif self.dialect == 'sqlserver':
+             query = """
+                 SELECT ep.value
+                 FROM sys.extended_properties ep
+                 INNER JOIN sys.tables t ON ep.major_id = t.object_id
+                 INNER JOIN sys.schemas s ON t.schema_id = s.schema_id
+                 WHERE ep.minor_id = 0 AND ep.name = 'MS_Description'
+                     AND t.name = %s AND s.name = %s
+             """
+             cursor.execute(query, (table_name, table_schema))
+             result = cursor.fetchone()
+             cursor.close()
+             return result[0] if result and result[0] else None
+
+         cursor.close()
+         return None
+
+     # -------------------------------------------------------------------------
+     # JSON/XML Schema Inference (delegated to shared AvroSchemaInferrer)
+     # -------------------------------------------------------------------------
+
+     def infer_json_column_schema(
+         self,
+         table_name: str,
+         table_schema: str,
+         column_name: str,
+         type_column: str | None = None,
+         type_value: str | None = None
+     ) -> JsonNode:
+         """Infers Avro schema for a JSON/JSONB column by sampling data."""
+         cursor = self.connection.cursor()
+
+         if self.dialect == 'postgres':
+             if type_column and type_value:
+                 query = f"""
+                     SELECT "{column_name}"::text
+                     FROM "{table_schema}"."{table_name}"
+                     WHERE "{type_column}" = %s AND "{column_name}" IS NOT NULL
+                     LIMIT {self.sample_size}
+                 """
+                 cursor.execute(query, (type_value,))
+             else:
+                 query = f"""
+                     SELECT "{column_name}"::text
+                     FROM "{table_schema}"."{table_name}"
+                     WHERE "{column_name}" IS NOT NULL
+                     LIMIT {self.sample_size}
+                 """
+                 cursor.execute(query)
+         elif self.dialect == 'mysql':
+             if type_column and type_value:
+                 query = f"""
+                     SELECT `{column_name}`
+                     FROM `{table_schema}`.`{table_name}`
+                     WHERE `{type_column}` = %s AND `{column_name}` IS NOT NULL
+                     LIMIT {self.sample_size}
+                 """
+                 cursor.execute(query, (type_value,))
+             else:
+                 query = f"""
+                     SELECT `{column_name}`
+                     FROM `{table_schema}`.`{table_name}`
+                     WHERE `{column_name}` IS NOT NULL
+                     LIMIT {self.sample_size}
+                 """
+                 cursor.execute(query)
+         else:
+             cursor.close()
+             return "string"
+
+         values = []
+         for row in cursor.fetchall():
+             if row[0]:
+                 try:
+                     parsed = json.loads(row[0]) if isinstance(row[0], str) else row[0]
+                     values.append(parsed)
+                 except (json.JSONDecodeError, TypeError):
+                     pass
+         cursor.close()
+
+         if not values:
+             return "string"
+
+         type_name = type_value if type_value else f"{table_name}.{column_name}"
+         return self._inferrer.infer_from_json_values(type_name, values)
+
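+     # Illustrative example (hypothetical data; the exact result shape depends
+     # on AvroSchemaInferrer): sampling a JSONB column whose rows hold
+     #     {"sku": "A-1", "qty": 2} and {"sku": "B-7", "qty": 1, "gift": true}
+     # lets the inferrer merge the samples into a single record schema with
+     # "sku" and "qty" fields plus an optional "gift", instead of plain "string".
+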
+     def infer_xml_column_schema(
+         self,
+         table_name: str,
+         table_schema: str,
+         column_name: str
+     ) -> JsonNode:
+         """Infers Avro schema for an XML column by sampling data."""
+         cursor = self.connection.cursor()
+
+         if self.dialect == 'postgres':
+             query = f"""
+                 SELECT "{column_name}"::text
+                 FROM "{table_schema}"."{table_name}"
+                 WHERE "{column_name}" IS NOT NULL
+                 LIMIT {self.sample_size}
+             """
+             cursor.execute(query)
+         else:
+             cursor.close()
+             return "string"
+
+         xml_strings: List[str] = []
+         for row in cursor.fetchall():
+             if row[0]:
+                 xml_strings.append(row[0])
+         cursor.close()
+
+         if not xml_strings:
+             return "string"
+
+         type_name = f"{table_name}.{column_name}"
+         return self._inferrer.infer_from_xml_values(type_name, xml_strings)
+
+     # -------------------------------------------------------------------------
+     # Type Conversion
+     # -------------------------------------------------------------------------
+
+     def map_sql_type_to_avro_type(
+         self,
+         column: Dict[str, Any],
+         table_name: str,
+         table_schema: str,
+         type_column: str | None = None,
+         type_value: str | None = None
+     ) -> JsonNode:
+         """Maps a SQL column type to an Avro type."""
+         data_type = column['data_type'].lower()
+         udt_name = (column.get('udt_name') or '').lower()
+
+         # JSON types are inferred from sampled data when enabled
+         if self.infer_json_schema and data_type in ('json', 'jsonb'):
+             inferred = self.infer_json_column_schema(
+                 table_name, table_schema, column['column_name'], type_column, type_value
+             )
+             return inferred
+
+         # XML types are inferred from sampled data when enabled
+         if self.infer_xml_schema and data_type == 'xml':
+             inferred = self.infer_xml_column_schema(
+                 table_name, table_schema, column['column_name']
+             )
+             return inferred
+
+         # Handle ARRAY types (PostgreSQL reports the element type as "_<type>")
+         if data_type == 'array' or (udt_name and udt_name.startswith('_')):
+             element_type = udt_name[1:] if udt_name.startswith('_') else 'text'
+             type_map = self.get_type_map()
+             element_avro_type = type_map.get(element_type, 'string')
+             return {"type": "array", "items": element_avro_type}
+
+         # Handle NUMERIC/DECIMAL with declared precision and scale
+         if data_type in ('numeric', 'decimal'):
+             precision = column.get('numeric_precision') or 38
+             scale = column.get('numeric_scale') or 10
+             return {
+                 "type": "bytes",
+                 "logicalType": "decimal",
+                 "precision": precision,
+                 "scale": scale
+             }
+
+         # Handle MySQL bit(n): widths greater than 1 become an array of booleans
+         if self.dialect == 'mysql' and data_type == 'bit':
+             # information_schema reports the declared bit width in numeric_precision
+             bit_width = column.get('numeric_precision') or 1
+             if bit_width > 1:
+                 return {"type": "array", "items": "boolean"}
+             # bit(1) is commonly used as a boolean
+             return "boolean"
+
+         # Look up in the dialect's type map
+         type_map = self.get_type_map()
+
+         # Try an exact match on the data type first
+         if data_type in type_map:
+             return copy.deepcopy(type_map[data_type])
+
+         # Then try the underlying udt_name
+         if udt_name and udt_name in type_map:
+             return copy.deepcopy(type_map[udt_name])
+
+         # USER-DEFINED covers enums, domains, and composite types
+         if data_type == 'user-defined':
+             return "string"  # Default for user-defined types
+
+         # Default fallback
+         return "string"
+
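+     # Illustrative mappings (hypothetical columns): a PostgreSQL NUMERIC(10, 2)
+     # column maps to
+     #     {"type": "bytes", "logicalType": "decimal", "precision": 10, "scale": 2}
+     # a "timestamptz" column maps to
+     #     {"type": "long", "logicalType": "timestamp-millis"}
+     # and a "text[]" column (udt_name "_text") maps to
+     #     {"type": "array", "items": "string"}
+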
+     # -------------------------------------------------------------------------
+     # Schema Generation
+     # -------------------------------------------------------------------------
+
+     def table_to_avro_schema(
+         self,
+         table_name: str,
+         table_schema: str = 'public'
+     ) -> JsonNode:
+         """Converts a SQL table to an Avro schema."""
+         columns = self.fetch_table_columns(table_name, table_schema)
+         primary_keys = self.fetch_primary_keys(table_name, table_schema)
+         table_comment = self.fetch_table_comment(table_name, table_schema)
+
+         # Check for the CloudEvents column pattern
+         column_names = set(col['column_name'].lower() for col in columns)
+         is_cloudevent = False
+         type_values: List[str | None] = []
+         type_column: str | None = None
+
+         if self.emit_cloudevents:
+             is_cloudevent = all(c in column_names for c in ['type', 'source', 'data', 'id'])
+             if is_cloudevent:
+                 type_column = next(
+                     (col['column_name'] for col in columns if col['column_name'].lower() == 'type'),
+                     None
+                 )
+                 if type_column:
+                     type_values = self._fetch_distinct_type_values(table_name, table_schema, type_column)
+
+         if not type_values:
+             type_values = [None]
+
+         schemas: List[JsonNode] = []
+
+         for type_value in type_values:
+             if type_value and isinstance(type_value, str):
+                 type_name_name = avro_name(type_value.rsplit('.', 1)[-1])
+                 type_name_namespace = type_value.rsplit('.', 1)[0] if '.' in type_value else ''
+                 type_namespace = self.avro_namespace + ('.' if self.avro_namespace and type_name_namespace else '') + type_name_namespace
+             else:
+                 type_name_name = avro_name(table_name)
+                 type_namespace = self.avro_namespace
+
+             if is_cloudevent and type_column:
+                 # For CloudEvents, focus on the 'data' column
+                 data_column = next(
+                     (col for col in columns if col['column_name'].lower() == 'data'),
+                     None
+                 )
+                 if data_column:
+                     data_schema = self.map_sql_type_to_avro_type(
+                         data_column, table_name, table_schema, type_column, type_value
+                     )
+                     # Normalize to a list so scalar results (e.g. plain "string") are wrapped too
+                     if not isinstance(data_schema, list):
+                         data_schema = [data_schema]
+                     for schema in data_schema:
+                         if not isinstance(schema, dict) or schema.get("type") != "record":
+                             schema = self._wrap_schema_in_root_record(schema, type_name_name, type_namespace)
+                         if self.emit_xregistry:
+                             ce_attribs: Dict[str, JsonNode] = {}
+                             for col in columns:
+                                 if col['column_name'].lower() != 'data':
+                                     ce_attribs[col['column_name'].lower()] = "string"
+                             if isinstance(schema, dict):
+                                 schema["ce_attribs"] = ce_attribs
+                         self._apply_schema_attributes(schema, table_name, table_schema, type_value, type_namespace, table_comment)
+                         schemas.append(schema)
+             else:
+                 # Normal table conversion
+                 fields: List[JsonNode] = []
+                 for column in columns:
+                     avro_type = self.map_sql_type_to_avro_type(
+                         column, table_name, table_schema, type_column, type_value
+                     )
+
+                     # Make the type nullable if the column allows NULL
+                     if column['is_nullable'] and avro_type != "null":
+                         if isinstance(avro_type, list):
+                             if "null" not in avro_type:
+                                 avro_type = ["null"] + avro_type
+                         else:
+                             avro_type = ["null", avro_type]
+
+                     field: Dict[str, JsonNode] = {
+                         "name": avro_name(column['column_name']),
+                         "type": avro_type
+                     }
+
+                     # Record the original name as an altname if it differs
+                     if avro_name(column['column_name']) != column['column_name']:
+                         field["altnames"] = {"sql": column['column_name']}
+
+                     # Carry the column comment over as doc
+                     if column.get('column_comment'):
+                         field["doc"] = column['column_comment']
+
+                     fields.append(field)
+
+                 schema: Dict[str, JsonNode] = {
+                     "type": "record",
+                     "name": type_name_name,
+                     "fields": fields
+                 }
+
+                 # Record primary keys as a 'unique' annotation
+                 if primary_keys:
+                     schema["unique"] = [avro_name(pk) for pk in primary_keys]
+
+                 self._apply_schema_attributes(schema, table_name, table_schema, type_value, type_namespace, table_comment)
+                 schemas.append(schema)
+
+         # Returning the bare list also covers the (unusual) empty case
+         return schemas[0] if len(schemas) == 1 else schemas
+
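+     # Illustrative output (hypothetical table and namespace): a table
+     #     CREATE TABLE customers (id integer PRIMARY KEY, email varchar NULL);
+     # converted with avro_namespace "shop" yields, roughly,
+     #     {"type": "record", "name": "customers", "namespace": "shop",
+     #      "unique": ["id"], "altnames": {"sql": "customers"},
+     #      "fields": [{"name": "id", "type": "int"},
+     #                 {"name": "email", "type": ["null", "string"]}]}
+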
+     def _fetch_distinct_type_values(self, table_name: str, table_schema: str, type_column: str) -> List[str]:
+         """Fetches distinct values from a type discriminator column."""
+         cursor = self.connection.cursor()
+
+         if self.dialect == 'postgres':
+             query = f"""
+                 SELECT DISTINCT "{type_column}"
+                 FROM "{table_schema}"."{table_name}"
+                 WHERE "{type_column}" IS NOT NULL
+                 LIMIT 1000
+             """
+         elif self.dialect == 'mysql':
+             query = f"""
+                 SELECT DISTINCT `{type_column}`
+                 FROM `{table_schema}`.`{table_name}`
+                 WHERE `{type_column}` IS NOT NULL
+                 LIMIT 1000
+             """
+         else:
+             cursor.close()
+             return []
+
+         cursor.execute(query)
+         values = [row[0] for row in cursor.fetchall() if row[0]]
+         cursor.close()
+         return values
+
+     def _wrap_schema_in_root_record(self, schema: JsonNode, type_name: str, type_namespace: str) -> Dict[str, JsonNode]:
+         """Wraps a schema in a root record."""
+         record: Dict[str, JsonNode] = {
+             "type": "record",
+             "name": type_name,
+             "fields": [
+                 {
+                     "name": "data",
+                     "type": schema,
+                     "root": True
+                 }
+             ]
+         }
+         if type_namespace:
+             record["namespace"] = type_namespace
+         return record
+
+     def _apply_schema_attributes(
+         self,
+         schema: JsonNode,
+         table_name: str,
+         table_schema: str,
+         type_value: str | None,
+         type_namespace: str,
+         table_comment: str | None
+     ):
+         """Attaches the SQL altname, namespace, CloudEvents type, and doc to a record schema."""
+         if isinstance(schema, dict):
+             schema["altnames"] = {"sql": f"{table_schema}.{table_name}" if table_schema != 'public' else table_name}
+             if self.emit_cloudevents and type_value:
+                 schema["ce_type"] = type_value
+             if type_namespace:
+                 schema["namespace"] = type_namespace
+             if table_comment:
+                 schema["doc"] = table_comment
+
+     def make_type_names_unique(self, item_types: list):
+         """Makes record type names unique (following the k2a pattern)."""
+         for item in item_types:
+             if isinstance(item, dict) and item.get("type") == "array":
+                 if isinstance(item.get("items"), dict) and item["items"].get("type") == "record":
+                     self.make_type_names_unique([item["items"]])
+                 elif isinstance(item.get("items"), list):
+                     self.make_type_names_unique(item["items"])
+             if isinstance(item, dict) and item.get("type") == "map":
+                 if isinstance(item.get("values"), dict) and item["values"].get("type") == "record":
+                     self.make_type_names_unique([item["values"]])
+                 elif isinstance(item.get("values"), list):
+                     self.make_type_names_unique(item["values"])
+             elif isinstance(item, dict) and item.get("type") == "record":
+                 namespace = item.get("namespace", '')
+                 type_name = base_name = item["name"]
+                 record_name = f"{namespace}.{type_name}" if namespace else type_name
+                 if record_name in self.generated_types:
+                     i = 0
+                     while record_name in self.generated_types:
+                         i += 1
+                         type_name = f"{base_name}{i}"
+                         record_name = f"{namespace}.{type_name}" if namespace else type_name
+                     self.generated_types.append(record_name)
+                 else:
+                     self.generated_types.append(record_name)
+                 item["name"] = type_name
+                 for field in item.get("fields", []):
+                     if isinstance(field.get("type"), dict):
+                         if field["type"].get("type") in ["record", "array", "map"]:
+                             self.make_type_names_unique([field["type"]])
+                     elif isinstance(field.get("type"), list):
+                         self.make_type_names_unique(field["type"])
+
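+     # Illustrative behavior (hypothetical names): if two generated records are
+     # both named "Item" in namespace "shop", the first keeps "Item" and the
+     # second becomes "Item1" (then "Item2", and so on), so every fully
+     # qualified type name in the emitted schema is unique.
+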
+     # -------------------------------------------------------------------------
+     # Main Processing
+     # -------------------------------------------------------------------------
+
+     def process_all_tables(self):
+         """Processes all tables in the database and generates the Avro schema."""
+         union_schema: List[JsonNode] = []
+         tables = self.fetch_tables()
+
+         for table_info in tables:
+             table_name = table_info['table_name']
+             table_schema = table_info['table_schema']
+             print(f"Processing table: {table_schema}.{table_name}")
+
+             avro_schema = self.table_to_avro_schema(table_name, table_schema)
+             if isinstance(avro_schema, list):
+                 union_schema.extend(avro_schema)
+             else:
+                 union_schema.append(avro_schema)
+
+         output = None
+         if self.emit_xregistry:
+             xregistry_messages = {}
+             xregistry_schemas = {}
+             groupname = self.avro_namespace
+             for schema in union_schema:
+                 self.generated_types = []
+                 self.make_type_names_unique([schema])
+                 ce_attribs: Dict[str, JsonNode] = {}
+                 if isinstance(schema, dict) and "ce_attribs" in schema:
+                     ce_attribs = cast(Dict[str, JsonNode], schema.get("ce_attribs", {}))
+                     del schema["ce_attribs"]
+                 if isinstance(schema, dict):
+                     schemaid = schema.get('ce_type', f"{self.avro_namespace}.{schema['name']}")
+                     schema_name = str(schemaid).rsplit('.', 1)[-1]
+                     xregistry_schemas[schemaid] = {
+                         "id": schemaid,
+                         "name": schema_name,
+                         "format": f"Avro/{AVRO_VERSION}",
+                         "defaultversionid": "1",
+                         "versions": {
+                             "1": {
+                                 "id": "1",
+                                 "format": f"Avro/{AVRO_VERSION}",
+                                 "schema": schema
+                             }
+                         }
+                     }
+                     xregistry_messages[schemaid] = {
+                         "id": schemaid,
+                         "name": schema_name,
+                         "envelope": "CloudEvents/1.0",
+                         "envelopemetadata": {
+                             "type": {"value": schemaid},
+                             "source": {"value": "{source}"},
+                         },
+                         "schemaformat": f"Avro/{AVRO_VERSION}",
+                         "schemauri": f"#/schemagroups/{groupname}/schemas/{schemaid}"
+                     }
+                     for key, value in ce_attribs.items():
+                         if key in ("type", "source", "id", "specversion"):
+                             continue
+                         xregistry_messages[schemaid]["envelopemetadata"][key] = {
+                             "type": value,
+                             "required": True
+                         }
+             output = {
+                 "messagegroups": {
+                     groupname: {
+                         "id": groupname,
+                         "messages": xregistry_messages
+                     }
+                 },
+                 "schemagroups": {
+                     groupname: {
+                         "id": groupname,
+                         "schemas": xregistry_schemas
+                     }
+                 }
+             }
+         else:
+             self.generated_types = []
+             self.make_type_names_unique(union_schema)
+             output = union_schema if len(union_schema) > 1 else union_schema[0] if union_schema else []
+
+         # Create the output directory if needed
+         base_dir = os.path.dirname(self.avro_schema_path)
+         if base_dir and not os.path.exists(base_dir):
+             os.makedirs(base_dir)
+
+         with open(self.avro_schema_path, 'w', encoding='utf-8') as avro_file:
+             json.dump(output, avro_file, indent=4)
+
+         self.close()
+
+
+ def convert_sql_to_avro(
+     connection_string: str,
+     avro_schema_file: str,
+     dialect: str = 'postgres',
+     database: str | None = None,
+     table_name: str | None = None,
+     avro_namespace: str | None = None,
+     emit_cloudevents: bool = False,
+     emit_cloudevents_xregistry: bool = False,
+     sample_size: int = 100,
+     infer_json_schema: bool = True,
+     infer_xml_schema: bool = True,
+     username: str | None = None,
+     password: str | None = None
+ ):
+     """Converts SQL database schemas to Avro schema format.
+
+     Args:
+         connection_string: Database connection string
+         avro_schema_file: Output path for the Avro schema file
+         dialect: SQL dialect (postgres, mysql, sqlserver, oracle, sqlite)
+         database: Database name (overrides connection string if provided)
+         table_name: Specific table to convert (None for all tables)
+         avro_namespace: Namespace for generated Avro schemas
+         emit_cloudevents: Whether to emit CloudEvents declarations
+         emit_cloudevents_xregistry: Whether to emit xRegistry manifest format
+         sample_size: Number of rows to sample for JSON/XML inference
+         infer_json_schema: Whether to infer schema for JSON columns
+         infer_xml_schema: Whether to infer schema for XML columns
+         username: Database username (overrides connection string credentials)
+         password: Database password (overrides connection string credentials)
+     """
+     if not connection_string:
+         raise ValueError("connection_string is required")
+
+     if not avro_namespace:
+         avro_namespace = database or 'database'
+
+     converter = SqlToAvro(
+         connection_string=connection_string,
+         database=database,
+         table_name=table_name,
+         avro_namespace=avro_namespace,
+         avro_schema_path=avro_schema_file,
+         dialect=dialect,
+         emit_cloudevents=emit_cloudevents,
+         emit_cloudevents_xregistry=emit_cloudevents_xregistry,
+         sample_size=sample_size,
+         infer_json_schema=infer_json_schema,
+         infer_xml_schema=infer_xml_schema,
+         username=username,
+         password=password
+     )
+
+     return converter.process_all_tables()
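
A minimal usage sketch for the new module (assuming a reachable PostgreSQL instance, the psycopg2-binary driver installed, and placeholder connection details):

    from avrotize.sqltoavro import convert_sql_to_avro

    convert_sql_to_avro(
        connection_string="postgresql://user:pass@localhost:5432/shop",
        avro_schema_file="shop.avsc",
        dialect="postgres",
        avro_namespace="com.example.shop"
    )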