thoth-dbmanager 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thoth_dbmanager/ThothDbManager.py +459 -0
- thoth_dbmanager/__init__.py +136 -0
- thoth_dbmanager/adapters/__init__.py +21 -0
- thoth_dbmanager/adapters/mariadb.py +165 -0
- thoth_dbmanager/adapters/mysql.py +165 -0
- thoth_dbmanager/adapters/oracle.py +554 -0
- thoth_dbmanager/adapters/postgresql.py +444 -0
- thoth_dbmanager/adapters/sqlite.py +385 -0
- thoth_dbmanager/adapters/sqlserver.py +583 -0
- thoth_dbmanager/adapters/supabase.py +249 -0
- thoth_dbmanager/core/__init__.py +13 -0
- thoth_dbmanager/core/factory.py +272 -0
- thoth_dbmanager/core/interfaces.py +271 -0
- thoth_dbmanager/core/registry.py +220 -0
- thoth_dbmanager/documents.py +155 -0
- thoth_dbmanager/dynamic_imports.py +250 -0
- thoth_dbmanager/helpers/__init__.py +0 -0
- thoth_dbmanager/helpers/multi_db_generator.py +508 -0
- thoth_dbmanager/helpers/preprocess_values.py +159 -0
- thoth_dbmanager/helpers/schema.py +376 -0
- thoth_dbmanager/helpers/search.py +117 -0
- thoth_dbmanager/lsh/__init__.py +21 -0
- thoth_dbmanager/lsh/core.py +182 -0
- thoth_dbmanager/lsh/factory.py +76 -0
- thoth_dbmanager/lsh/manager.py +170 -0
- thoth_dbmanager/lsh/storage.py +96 -0
- thoth_dbmanager/plugins/__init__.py +23 -0
- thoth_dbmanager/plugins/mariadb.py +436 -0
- thoth_dbmanager/plugins/mysql.py +408 -0
- thoth_dbmanager/plugins/oracle.py +150 -0
- thoth_dbmanager/plugins/postgresql.py +145 -0
- thoth_dbmanager/plugins/sqlite.py +170 -0
- thoth_dbmanager/plugins/sqlserver.py +149 -0
- thoth_dbmanager/plugins/supabase.py +224 -0
- {thoth_dbmanager-0.4.0.dist-info → thoth_dbmanager-0.4.1.dist-info}/METADATA +6 -6
- thoth_dbmanager-0.4.1.dist-info/RECORD +39 -0
- thoth_dbmanager-0.4.1.dist-info/top_level.txt +1 -0
- thoth_dbmanager-0.4.0.dist-info/RECORD +0 -5
- thoth_dbmanager-0.4.0.dist-info/top_level.txt +0 -1
- {thoth_dbmanager-0.4.0.dist-info → thoth_dbmanager-0.4.1.dist-info}/WHEEL +0 -0
- {thoth_dbmanager-0.4.0.dist-info → thoth_dbmanager-0.4.1.dist-info}/licenses/LICENSE +0 -0
thoth_dbmanager/adapters/postgresql.py (new file, +444 lines):

```diff
@@ -0,0 +1,444 @@
+"""
+PostgreSQL adapter implementation.
+"""
+import logging
+from typing import Any, Dict, List, Optional, Union
+import psycopg2
+from psycopg2.extras import RealDictCursor
+from sqlalchemy import create_engine, text
+from sqlalchemy.exc import SQLAlchemyError
+
+from ..core.interfaces import DbAdapter
+from ..documents import (
+    TableDocument,
+    ColumnDocument,
+    SchemaDocument,
+    ForeignKeyDocument,
+    IndexDocument
+)
+
+logger = logging.getLogger(__name__)
+
+
+class PostgreSQLAdapter(DbAdapter):
+    """
+    PostgreSQL database adapter implementation.
+    """
+
+    def __init__(self, connection_params: Dict[str, Any]):
+        super().__init__(connection_params)
+        self.engine = None
+        self.raw_connection = None
+
+    def connect(self) -> None:
+        """Establish PostgreSQL connection"""
+        try:
+            # Create SQLAlchemy engine
+            connection_string = self._build_connection_string()
+            self.engine = create_engine(connection_string, echo=False)
+
+            # Test connection
+            with self.engine.connect() as conn:
+                conn.execute(text("SELECT 1"))
+
+            # Also create raw psycopg2 connection for specific operations
+            self.raw_connection = psycopg2.connect(**self._get_psycopg2_params())
+
+            self._initialized = True
+            logger.info("PostgreSQL connection established successfully")
+
+        except Exception as e:
+            logger.error(f"Failed to connect to PostgreSQL: {e}")
+            raise
+
+    def disconnect(self) -> None:
+        """Close PostgreSQL connection"""
+        try:
+            if self.engine:
+                self.engine.dispose()
+                self.engine = None
+
+            if self.raw_connection:
+                self.raw_connection.close()
+                self.raw_connection = None
+
+            self._initialized = False
+            logger.info("PostgreSQL connection closed")
+
+        except Exception as e:
+            logger.error(f"Error closing PostgreSQL connection: {e}")
+
+    def _build_connection_string(self) -> str:
+        """Build SQLAlchemy connection string"""
+        params = self.connection_params
+        host = params.get('host', 'localhost')
+        port = params.get('port', 5432)
+        database = params.get('database')
+        user = params.get('user')
+        password = params.get('password')
+
+        if not all([database, user, password]):
+            raise ValueError("Missing required connection parameters: database, user, password")
+
+        return f"postgresql://{user}:{password}@{host}:{port}/{database}"
+
```
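The required keys (`database`, `user`, `password`) and the `localhost:5432` defaults come straight from `_build_connection_string` above. A minimal usage sketch with hypothetical values (note that a password containing URL-special characters would need escaping, since it is interpolated directly into the URL):

```python
# Minimal lifecycle sketch; connection values are hypothetical.
from thoth_dbmanager.adapters.postgresql import PostgreSQLAdapter

adapter = PostgreSQLAdapter({
    "host": "localhost",   # optional, defaults to "localhost"
    "port": 5432,          # optional, defaults to 5432
    "database": "appdb",   # required
    "user": "app",         # required
    "password": "secret",  # required
})
adapter.connect()          # SQLAlchemy engine + raw psycopg2 connection
try:
    pass  # work with the adapter here
finally:
    adapter.disconnect()
```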
```diff
+    def _get_psycopg2_params(self) -> Dict[str, Any]:
+        """Get parameters for psycopg2 connection"""
+        return {
+            'host': self.connection_params.get('host', 'localhost'),
+            'port': self.connection_params.get('port', 5432),
+            'database': self.connection_params.get('database'),
+            'user': self.connection_params.get('user'),
+            'password': self.connection_params.get('password')
+        }
+
+    def execute_query(self, query: str, params: Optional[Dict] = None, fetch: Union[str, int] = "all", timeout: int = 60) -> Any:
+        """Execute SQL query"""
+        if not self.engine:
+            raise RuntimeError("Not connected to database")
+
+        try:
+            with self.engine.connect() as conn:
+                # Set query timeout
+                conn.execute(text(f"SET statement_timeout = {timeout * 1000}"))  # PostgreSQL uses milliseconds
+
+                # Execute query
+                if params:
+                    result = conn.execute(text(query), params)
+                else:
+                    result = conn.execute(text(query))
+
+                # Handle different fetch modes
+                if query.strip().upper().startswith(('SELECT', 'WITH')):
+                    if fetch == "all":
+                        return result.fetchall()
+                    elif fetch == "one":
+                        return result.fetchone()
+                    elif isinstance(fetch, int):
+                        return result.fetchmany(fetch)
+                    else:
+                        return result.fetchall()
+                else:
+                    # For non-SELECT queries, return rowcount
+                    conn.commit()
+                    return result.rowcount
+
+        except SQLAlchemyError as e:
+            logger.error(f"PostgreSQL query error: {e}")
+            raise
+
```
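`execute_query` decides between fetching and committing by checking whether the statement starts with `SELECT` or `WITH`; the `fetch` argument selects between `fetchall`, `fetchone`, and `fetchmany`. A sketch of the modes, continuing the `adapter` from the earlier sketch with a hypothetical `users` table:

```python
# Fetch-mode sketch; the "users" table is hypothetical.
rows = adapter.execute_query("SELECT * FROM users")                # default: fetchall()
row = adapter.execute_query("SELECT * FROM users WHERE id = :id",
                            {"id": 1}, fetch="one")                # fetchone()
some = adapter.execute_query("SELECT * FROM users", fetch=10)      # fetchmany(10)
n = adapter.execute_query("DELETE FROM users WHERE active = false")  # commit + rowcount
```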
```diff
+    def get_tables_as_documents(self) -> List[TableDocument]:
+        """Get tables as document objects"""
+        query = """
+            SELECT
+                schemaname as schema_name,
+                tablename as table_name,
+                COALESCE(obj_description(c.oid), '') as comment
+            FROM pg_tables pt
+            LEFT JOIN pg_class c ON c.relname = pt.tablename
+            LEFT JOIN pg_namespace n ON n.oid = c.relnamespace AND n.nspname = pt.schemaname
+            WHERE schemaname NOT IN ('information_schema', 'pg_catalog', 'pg_toast')
+            ORDER BY schemaname, tablename
+        """
+
+        results = self.execute_query(query)
+        documents = []
+
+        for row in results:
+            doc = TableDocument(
+                table_name=row.table_name,
+                schema_name=row.schema_name,
+                comment=row.comment or ""
+            )
+            documents.append(doc)
+
+        return documents
+
+
```
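Continuing the sketch, the returned `TableDocument` objects expose the three fields populated above:

```python
# List user tables with schema and comment (fields from TableDocument above).
for table in adapter.get_tables_as_documents():
    print(f"{table.schema_name}.{table.table_name}: {table.comment or '(no comment)'}")
```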
```diff
+    def get_columns_as_documents(self, table_name: str) -> List[ColumnDocument]:
+        """Get columns as document objects"""
+        query = """
+            SELECT
+                c.column_name,
+                c.data_type,
+                c.is_nullable,
+                c.column_default,
+                c.character_maximum_length,
+                COALESCE(pgd.description, '') as comment,
+                CASE WHEN pk.column_name IS NOT NULL THEN true ELSE false END as is_pk,
+                c.table_schema as schema_name
+            FROM information_schema.columns c
+            LEFT JOIN pg_class pgc ON pgc.relname = c.table_name
+            LEFT JOIN pg_namespace pgn ON pgn.oid = pgc.relnamespace AND pgn.nspname = c.table_schema
+            LEFT JOIN pg_description pgd ON pgd.objoid = pgc.oid AND pgd.objsubid = c.ordinal_position
+            LEFT JOIN (
+                SELECT ku.column_name, ku.table_name, ku.table_schema
+                FROM information_schema.table_constraints tc
+                JOIN information_schema.key_column_usage ku ON tc.constraint_name = ku.constraint_name
+                WHERE tc.constraint_type = 'PRIMARY KEY'
+            ) pk ON pk.column_name = c.column_name AND pk.table_name = c.table_name AND pk.table_schema = c.table_schema
+            WHERE c.table_name = :table_name
+              AND c.table_schema NOT IN ('information_schema', 'pg_catalog')
+            ORDER BY c.ordinal_position
+        """
+
+        results = self.execute_query(query, {"table_name": table_name})
+        documents = []
+
+        for row in results:
+            doc = ColumnDocument(
+                table_name=table_name,
+                column_name=row.column_name,
+                data_type=row.data_type,
+                comment=row.comment or "",
+                is_pk=row.is_pk,
+                is_nullable=row.is_nullable == 'YES',
+                default_value=row.column_default,
+                max_length=row.character_maximum_length,
+                schema_name=row.schema_name
+            )
+            documents.append(doc)
+
+        return documents
+
```
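Note that the lookup filters on `table_name` alone, so a table name that exists in several schemas returns the columns of all of them. An inspection sketch using a hypothetical `orders` table:

```python
# Column inspection sketch; "orders" is a hypothetical table.
columns = adapter.get_columns_as_documents("orders")
pk_cols = [c.column_name for c in columns if c.is_pk]
nullable = [c.column_name for c in columns if c.is_nullable]
print("primary key:", pk_cols, "| nullable:", nullable)
```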
```diff
+    def get_foreign_keys_as_documents(self) -> List[ForeignKeyDocument]:
+        """Get foreign keys as document objects"""
+        query = """
+            SELECT
+                tc.constraint_name,
+                tc.table_schema as schema_name,
+                tc.table_name as source_table,
+                kcu.column_name as source_column,
+                ccu.table_name as target_table,
+                ccu.column_name as target_column
+            FROM information_schema.table_constraints tc
+            JOIN information_schema.key_column_usage kcu ON tc.constraint_name = kcu.constraint_name
+            JOIN information_schema.constraint_column_usage ccu ON ccu.constraint_name = tc.constraint_name
+            WHERE tc.constraint_type = 'FOREIGN KEY'
+              AND tc.table_schema NOT IN ('information_schema', 'pg_catalog')
+            ORDER BY tc.table_schema, tc.table_name, kcu.ordinal_position
+        """
+
+        results = self.execute_query(query)
+        documents = []
+
+        for row in results:
+            doc = ForeignKeyDocument(
+                source_table_name=row.source_table,
+                source_column_name=row.source_column,
+                target_table_name=row.target_table,
+                target_column_name=row.target_column,
+                constraint_name=row.constraint_name,
+                schema_name=row.schema_name
+            )
+            documents.append(doc)
+
+        return documents
+
```
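One way to consume the result is to flatten the documents into join pairs, e.g. for query generation (a sketch, continuing the `adapter` from above):

```python
# Sketch: FK documents -> (source, target) column pairs.
join_pairs = [
    (f"{fk.source_table_name}.{fk.source_column_name}",
     f"{fk.target_table_name}.{fk.target_column_name}")
    for fk in adapter.get_foreign_keys_as_documents()
]
```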
```diff
+    def get_schemas_as_documents(self) -> List[SchemaDocument]:
+        """Get schemas as document objects"""
+        query = """
+            SELECT
+                schema_name,
+                schema_owner as owner,
+                COALESCE(obj_description(n.oid), '') as description
+            FROM information_schema.schemata s
+            LEFT JOIN pg_namespace n ON n.nspname = s.schema_name
+            WHERE schema_name NOT IN ('information_schema', 'pg_catalog', 'pg_toast')
+            ORDER BY schema_name
+        """
+
+        results = self.execute_query(query)
+        documents = []
+
+        for row in results:
+            doc = SchemaDocument(
+                schema_name=row.schema_name,
+                description=row.description or "",
+                owner=row.owner
+            )
+            documents.append(doc)
+
+        return documents
+
+    def get_indexes_as_documents(self, table_name: Optional[str] = None) -> List[IndexDocument]:
+        """Get indexes as document objects"""
+        base_query = """
+            SELECT
+                i.relname as index_name,
+                t.relname as table_name,
+                n.nspname as schema_name,
+                ix.indisunique as is_unique,
+                ix.indisprimary as is_primary,
+                am.amname as index_type,
+                array_agg(a.attname ORDER BY a.attnum) as columns
+            FROM pg_index ix
+            JOIN pg_class i ON i.oid = ix.indexrelid
+            JOIN pg_class t ON t.oid = ix.indrelid
+            JOIN pg_namespace n ON n.oid = t.relnamespace
+            JOIN pg_am am ON am.oid = i.relam
+            JOIN pg_attribute a ON a.attrelid = t.oid AND a.attnum = ANY(ix.indkey)
+            WHERE n.nspname NOT IN ('information_schema', 'pg_catalog', 'pg_toast')
+        """
+
+        if table_name:
+            query = base_query + " AND t.relname = :table_name"
+            params = {"table_name": table_name}
+        else:
+            query = base_query
+            params = None
+
+        query += " GROUP BY i.relname, t.relname, n.nspname, ix.indisunique, ix.indisprimary, am.amname ORDER BY t.relname, i.relname"
+
+        results = self.execute_query(query, params)
+        documents = []
+
+        for row in results:
+            doc = IndexDocument(
+                index_name=row.index_name,
+                table_name=row.table_name,
+                columns=row.columns,
+                is_unique=row.is_unique,
+                is_primary=row.is_primary,
+                index_type=row.index_type,
+                schema_name=row.schema_name
+            )
+            documents.append(doc)
+
+        return documents
+
```
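A caveat worth flagging: `array_agg(a.attname ORDER BY a.attnum)` returns the columns of a multi-column index in table order, not index-definition order, because the join on `a.attnum = ANY(ix.indkey)` discards the position within `indkey` (and expression-index entries, stored as attnum 0, are dropped entirely). If definition order matters, a variant along these lines should work (a sketch, not part of the package):

```python
# Alternative index-column query preserving indkey order (sketch).
# int2vector is binary-coercible to smallint[], hence the cast.
ordered_index_columns_sql = """
    SELECT i.relname AS index_name,
           array_agg(a.attname ORDER BY k.ord) AS columns
    FROM pg_index ix
    JOIN pg_class i ON i.oid = ix.indexrelid
    CROSS JOIN LATERAL unnest(ix.indkey::smallint[]) WITH ORDINALITY AS k(attnum, ord)
    JOIN pg_attribute a ON a.attrelid = ix.indrelid AND a.attnum = k.attnum
    GROUP BY i.relname
"""
```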
```diff
+    def get_unique_values(self) -> Dict[str, Dict[str, List[str]]]:
+        """Get unique values from the database"""
+        result = {}
+
+        # Get all tables
+        tables = self.get_tables_as_documents()
+
+        for table_doc in tables:
+            table_name = table_doc.table_name
+            schema_name = table_doc.schema_name
+            full_table_name = f"{schema_name}.{table_name}"
+
+            # Get columns for this table
+            columns = self.get_columns_as_documents(table_name)
+
+            result[table_name] = {}
+
+            for column_doc in columns:
+                column_name = column_doc.column_name
+
+                # Only get unique values for text/varchar columns to avoid large datasets
+                if column_doc.data_type in ['text', 'varchar', 'character varying', 'char', 'character']:
+                    try:
+                        query = f"""
+                            SELECT DISTINCT "{column_name}"
+                            FROM "{schema_name}"."{table_name}"
+                            WHERE "{column_name}" IS NOT NULL
+                              AND LENGTH("{column_name}") > 0
+                            ORDER BY "{column_name}"
+                            LIMIT 1000
+                        """
+
+                        values = self.execute_query(query)
+                        result[table_name][column_name] = [str(row[0]) for row in values if row[0]]
+
+                    except Exception as e:
+                        logger.warning(f"Could not get unique values for {full_table_name}.{column_name}: {e}")
+                        result[table_name][column_name] = []
+                else:
+                    result[table_name][column_name] = []
+
+        return result
+
```
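Because the outer dict is keyed by bare `table_name`, same-named tables in different schemas overwrite each other, and non-text columns come back as empty lists. A usage sketch:

```python
# Distinct-values sketch; "orders"/"status" are hypothetical names.
uniques = adapter.get_unique_values()
statuses = uniques.get("orders", {}).get("status", [])  # up to 1000 values
```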
```diff
+    def get_example_data(self, table_name: str, number_of_rows: int = 30) -> Dict[str, List[Any]]:
+        """
+        Retrieves the most frequent values for each column in the specified table.
+
+        Args:
+            table_name (str): The name of the table.
+            number_of_rows (int, optional): Maximum number of example values to return per column. Defaults to 30.
+
+        Returns:
+            Dict[str, List[Any]]: A dictionary mapping column names to lists of example values.
+        """
+        # First, get the schema name for the table
+        schema_query = """
+            SELECT table_schema
+            FROM information_schema.tables
+            WHERE table_name = :table_name
+              AND table_schema NOT IN ('information_schema', 'pg_catalog')
+            LIMIT 1
+        """
+
+        try:
+            schema_result = self.execute_query(schema_query, {"table_name": table_name})
+            if not schema_result:
+                logger.warning(f"Table {table_name} not found")
+                return {}
+
+            schema_name = schema_result[0][0]
+        except Exception as e:
+            logger.error(f"Error getting schema for table {table_name}: {e}")
+            return {}
+
+        # Get column information
+        columns_query = """
+            SELECT column_name, data_type
+            FROM information_schema.columns
+            WHERE table_name = :table_name AND table_schema = :schema_name
+            ORDER BY ordinal_position
+        """
+
+        try:
+            columns_result = self.execute_query(columns_query, {"table_name": table_name, "schema_name": schema_name})
+        except Exception as e:
+            logger.error(f"Error getting columns for table {schema_name}.{table_name}: {e}")
+            return {}
+
+        if not columns_result:
+            logger.warning(f"No columns found for table {schema_name}.{table_name}")
+            return {}
+
+        most_frequent_values: Dict[str, List[Any]] = {}
+
+        for row in columns_result:
+            column_name = row[0]
+            data_type = row[1]
+
+            # PostgreSQL uses double quotes for identifier quoting
+            quoted_column_name = f'"{column_name}"'
+            quoted_schema_name = f'"{schema_name}"'
+            quoted_table_name = f'"{table_name}"'
+
+            # Query to get most frequent values
+            query_str = f"""
+                SELECT {quoted_column_name}
+                FROM (
+                    SELECT {quoted_column_name}, COUNT(*) as _freq
+                    FROM {quoted_schema_name}.{quoted_table_name}
+                    WHERE {quoted_column_name} IS NOT NULL
+                    GROUP BY {quoted_column_name}
+                    ORDER BY _freq DESC
+                    LIMIT :num_rows
+                ) as subquery
+            """
+
+            try:
+                result = self.execute_query(query_str, {"num_rows": number_of_rows})
+                values = [row[0] for row in result]
+                most_frequent_values[column_name] = values
+            except Exception as e:
+                logger.error(f"Error fetching frequent values for {column_name} in {schema_name}.{table_name}: {e}")
+                most_frequent_values[column_name] = []
+
+        # Normalize list lengths
+        max_length = 0
+        if most_frequent_values:
+            max_length = max(len(v) for v in most_frequent_values.values()) if most_frequent_values else 0
+
+        for column_name in most_frequent_values:
+            current_len = len(most_frequent_values[column_name])
+            if current_len < max_length:
+                most_frequent_values[column_name].extend([None] * (max_length - current_len))
+
+        return most_frequent_values
```
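Since every column list is padded with `None` to the same length, the result is rectangular and can be read back as rows. A closing sketch:

```python
# Example-data sketch; "orders" is a hypothetical table.
examples = adapter.get_example_data("orders", number_of_rows=5)
for values in zip(*examples.values()):          # rectangular thanks to padding
    print(dict(zip(examples.keys(), values)))
```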