thoth-dbmanager 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thoth_dbmanager/ThothDbManager.py +459 -0
- thoth_dbmanager/__init__.py +136 -0
- thoth_dbmanager/adapters/__init__.py +21 -0
- thoth_dbmanager/adapters/mariadb.py +165 -0
- thoth_dbmanager/adapters/mysql.py +165 -0
- thoth_dbmanager/adapters/oracle.py +554 -0
- thoth_dbmanager/adapters/postgresql.py +444 -0
- thoth_dbmanager/adapters/sqlite.py +385 -0
- thoth_dbmanager/adapters/sqlserver.py +583 -0
- thoth_dbmanager/adapters/supabase.py +249 -0
- thoth_dbmanager/core/__init__.py +13 -0
- thoth_dbmanager/core/factory.py +272 -0
- thoth_dbmanager/core/interfaces.py +271 -0
- thoth_dbmanager/core/registry.py +220 -0
- thoth_dbmanager/documents.py +155 -0
- thoth_dbmanager/dynamic_imports.py +250 -0
- thoth_dbmanager/helpers/__init__.py +0 -0
- thoth_dbmanager/helpers/multi_db_generator.py +508 -0
- thoth_dbmanager/helpers/preprocess_values.py +159 -0
- thoth_dbmanager/helpers/schema.py +376 -0
- thoth_dbmanager/helpers/search.py +117 -0
- thoth_dbmanager/lsh/__init__.py +21 -0
- thoth_dbmanager/lsh/core.py +182 -0
- thoth_dbmanager/lsh/factory.py +76 -0
- thoth_dbmanager/lsh/manager.py +170 -0
- thoth_dbmanager/lsh/storage.py +96 -0
- thoth_dbmanager/plugins/__init__.py +23 -0
- thoth_dbmanager/plugins/mariadb.py +436 -0
- thoth_dbmanager/plugins/mysql.py +408 -0
- thoth_dbmanager/plugins/oracle.py +150 -0
- thoth_dbmanager/plugins/postgresql.py +145 -0
- thoth_dbmanager/plugins/sqlite.py +170 -0
- thoth_dbmanager/plugins/sqlserver.py +149 -0
- thoth_dbmanager/plugins/supabase.py +224 -0
- {thoth_dbmanager-0.4.0.dist-info → thoth_dbmanager-0.4.1.dist-info}/METADATA +6 -6
- thoth_dbmanager-0.4.1.dist-info/RECORD +39 -0
- thoth_dbmanager-0.4.1.dist-info/top_level.txt +1 -0
- thoth_dbmanager-0.4.0.dist-info/RECORD +0 -5
- thoth_dbmanager-0.4.0.dist-info/top_level.txt +0 -1
- {thoth_dbmanager-0.4.0.dist-info → thoth_dbmanager-0.4.1.dist-info}/WHEEL +0 -0
- {thoth_dbmanager-0.4.0.dist-info → thoth_dbmanager-0.4.1.dist-info}/licenses/LICENSE +0 -0
thoth_dbmanager/helpers/multi_db_generator.py (new file)
@@ -0,0 +1,508 @@

```python
import logging
import random
import re
from typing import Dict, List, Optional
from pathlib import Path

from thoth_dbmanager.helpers.schema import DatabaseSchema


class MultiDbGenerator:
    """
    Class for multi-database schema generation.
    Works with any ThothDbManager implementation.
    """

    CACHED_DB_SCHEMA = {}

    def __init__(
        self,
        dbmanager,
        tentative_schema: Optional[DatabaseSchema] = None,
        schema_with_examples: Optional[DatabaseSchema] = None,
        schema_with_descriptions: Optional[DatabaseSchema] = None,
        add_examples: bool = True):
        self.db_manager = dbmanager
        self.db_id = dbmanager.db_id if dbmanager else None
        self.add_examples = add_examples
        self.schema_structure = tentative_schema or DatabaseSchema()
        self.schema_with_examples = schema_with_examples or DatabaseSchema()
        self.schema_with_descriptions = schema_with_descriptions or DatabaseSchema()

        if dbmanager and self.db_id not in MultiDbGenerator.CACHED_DB_SCHEMA:
            self._load_schema_into_cache()
        self._initialize_schema_structure()

    def _load_schema_into_cache(self) -> None:
        """Load database schema into cache using manager methods"""
        schema_dict = self.db_manager.get_schema_dict()
        db_schema = DatabaseSchema.from_schema_dict(schema_dict)
        MultiDbGenerator.CACHED_DB_SCHEMA[self.db_id] = db_schema

        # Set primary keys
        primary_keys = self.db_manager.get_primary_keys()
        schema_with_primary_keys = {
            table_name: {col: {"primary_key": True} for col in cols}
            for table_name, cols in primary_keys.items()
        }
        db_schema.set_columns_info(schema_with_primary_keys)

        # Set foreign keys
        foreign_keys = self.db_manager.get_foreign_keys()
        schema_with_references = {
            table_name: {
                column_name: {
                    "foreign_keys": info.get("foreign_keys", []),
                    "referenced_by": info.get("referenced_by", []),
                }
                for column_name, info in columns.items()
            }
            for table_name, columns in foreign_keys.items()
        }
        db_schema.set_columns_info(schema_with_references)

    def _set_primary_keys(self, database_schema: DatabaseSchema) -> None:
        """Set primary keys using manager method"""
        primary_keys = self.db_manager.get_primary_keys()
        schema_with_primary_keys = {
            table_name: {col: {"primary_key": True} for col in cols}
            for table_name, cols in primary_keys.items()
        }
        database_schema.set_columns_info(schema_with_primary_keys)

    def _set_foreign_keys(self, database_schema: DatabaseSchema) -> None:
        """Set foreign keys using manager method"""
        foreign_keys = self.db_manager.get_foreign_keys()
        schema_with_references = {
            table_name: {
                column_name: {
                    "foreign_keys": info.get("foreign_keys", []),
                    "referenced_by": info.get("referenced_by", []),
                }
                for column_name, info in columns.items()
            }
            for table_name, columns in foreign_keys.items()
        }
        database_schema.set_columns_info(schema_with_references)

    def _initialize_schema_structure(self) -> None:
        """Initialize the schema structure with table and column info"""
        self._load_table_and_column_info()
        self._load_column_examples()
        self._load_column_descriptions()

    def _load_table_and_column_info(self) -> None:
        """Load table and column information from cached schema"""
        if self.db_id in MultiDbGenerator.CACHED_DB_SCHEMA:
            self.schema_structure = MultiDbGenerator.CACHED_DB_SCHEMA[
                self.db_id
            ].subselect_schema(self.schema_structure)
            self.schema_structure.add_info_from_schema(
                schema=MultiDbGenerator.CACHED_DB_SCHEMA[self.db_id],
                field_names=["type", "primary_key", "foreign_keys", "referenced_by"],
            )

    def _load_column_examples(self) -> None:
        """Load examples for columns in the schema"""
        self.schema_structure.add_info_from_schema(
            schema=self.schema_with_examples, field_names=["examples"]
        )

    def _load_column_descriptions(self) -> None:
        """Load descriptions for columns in the schema"""
        self.schema_structure.add_info_from_schema(
            schema=self.schema_with_descriptions,
            field_names=[
                "original_column_name",
                "column_name",
                "column_description",
                "data_format",
                "value_description",
            ],
        )

    def _extract_create_ddl_commands(self) -> Dict[str, str]:
        """Extract CREATE TABLE DDL commands for all tables

        Returns:
            Dict[str, str]: Dictionary mapping table names to their CREATE TABLE DDL statements
        """
        if hasattr(self.db_manager, 'extract_create_ddl_commands'):
            return self.db_manager.extract_create_ddl_commands()
        else:
            raise NotImplementedError("extract_create_ddl_commands method not implemented for this database manager")

    def generate_schema_string(
        self,
        include_value_description: bool = True,
        shuffle_cols: bool = True,
        shuffle_tables: bool = True,
    ) -> str:
        """
        Generates a schema string with descriptions and examples.

        Args:
            include_value_description (bool): Flag to include value descriptions.
            shuffle_cols (bool): Flag to shuffle columns within tables.
            shuffle_tables (bool): Flag to shuffle tables in the output.

        Returns:
            str: The generated schema string.
        """
        ddl_commands = self._extract_create_ddl_commands()
        schema_strings = []

        if shuffle_tables:
            ddl_tables = list(ddl_commands.keys())
            random.shuffle(ddl_tables)
            ddl_commands = {
                table_name: ddl_commands[table_name] for table_name in ddl_tables
            }

        for table_name, ddl_command in ddl_commands.items():
            ddl_command = re.sub(r"\s+", " ", ddl_command.strip())
            create_table_match = re.match(
                r'CREATE TABLE "?`?([\w -]+)`?"?\s*\((.*)\)', ddl_command, re.DOTALL
            )

            if not create_table_match:
                logging.warning(f"Could not parse DDL command for table {table_name}")
                continue

            table = create_table_match.group(1).strip()
            if table != table_name:
                logging.warning(f"Table name mismatch: {table} != {table_name}")

            column_definitions = create_table_match.group(2).strip()

            if table_name in self.schema_structure.tables:
                table_schema = self.schema_structure.tables[table_name]

                # Start building the new CREATE TABLE statement
                schema_lines = [f"CREATE TABLE {table_name}", "("]

                # Process column definitions
                definitions = self._separate_column_definitions(column_definitions)
                column_defs = []
                constraint_defs = []

                # Extract column definitions and constraints
                for definition in definitions:
                    if definition.lower().startswith("foreign key") or definition.lower().startswith("constraint"):
                        constraint_defs.append(definition)
                    else:
                        column_match = re.match(r'"?`?([\w_]+)`?"?\s+(.*)', definition)
                        if column_match:
                            column_name = column_match.group(1)
                            column_type = column_match.group(2)

                            # Remove NULL/NOT NULL constraints as requested
                            column_type = re.sub(r'\s+(?:NOT\s+)?NULL', '', column_type, flags=re.IGNORECASE)

                            # Check if this is a primary key
                            is_primary_key = "primary key" in column_type.lower()

                            # Format the column definition
                            column_def = f"\t{column_name} {column_type}"

                            # Add comments with examples and descriptions
                            if column_name in table_schema.columns:
                                column_info = table_schema.columns[column_name]
                                comment_parts = []

                                # Add examples if available
                                if hasattr(column_info, 'examples') and column_info.examples:
                                    examples = [f"`{ex}`" for ex in column_info.examples[:3]]  # Limit to 3 examples
                                    comment_parts.append(f"examples: {', '.join(examples)}")

                                # Add column name if available
                                if hasattr(column_info, 'column_name') and column_info.column_name:
                                    comment_parts.append(f"| `{column_info.column_name}`")

                                # Add column description if available
                                if hasattr(column_info, 'column_description') and column_info.column_description:
                                    comment_parts.append(f"description: {column_info.column_description}")

                                # Add value description if available and requested
                                if include_value_description and hasattr(column_info, 'value_description') and column_info.value_description:
                                    comment_parts.append(f"values: {column_info.value_description}")

                                # Add the comment to the column definition
                                if comment_parts:
                                    column_def += f" -- {' '.join(comment_parts)}"

                            column_defs.append(column_def)

                # Process foreign key constraints with references
                for column_name, column_info in table_schema.columns.items():
                    if hasattr(column_info, 'foreign_keys') and column_info.foreign_keys:
                        for ref_table, ref_column in column_info.foreign_keys:
                            # Fixed: Properly access tuple elements instead of using dictionary access
                            fk_constraint = f"\tforeign key ({column_name}) references {ref_table} ({ref_column}) on update cascade on delete cascade"
                            constraint_defs.append(fk_constraint)

                # Combine column definitions and constraints
                all_defs = column_defs + constraint_defs
                schema_lines.extend(all_defs)
                schema_lines.append(");")

                # Join all lines to form the complete CREATE TABLE statement
                schema_strings.append("\n".join(schema_lines))

        return "\n\n".join(schema_strings)

    @staticmethod
    def _separate_column_definitions(column_definitions: str) -> List[str]:
        """Separate column definitions from a CREATE TABLE statement

        Args:
            column_definitions (str): The column definitions part of a CREATE TABLE statement

        Returns:
            List[str]: List of individual column definitions
        """
        definitions = []
        current_def = ""
        paren_count = 0

        for char in column_definitions:
            if char == '(' and not current_def.strip().lower().startswith("constraint"):
                paren_count += 1
            elif char == ')' and not current_def.strip().lower().startswith("constraint"):
                paren_count -= 1

            current_def += char

            if char == ',' and paren_count == 0:
                definitions.append(current_def[:-1].strip())
                current_def = ""

        if current_def.strip():
            definitions.append(current_def.strip())

        return definitions
```
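The paren counting in `_separate_column_definitions` means only top-level commas end a definition, so a comma inside a type such as `DECIMAL(10, 2)` does not split it. An illustrative call (the input below is made up for demonstration, not taken from the package):

```python
# Hypothetical input; shows that the comma inside DECIMAL(10, 2) is preserved.
defs = MultiDbGenerator._separate_column_definitions(
    "id INTEGER PRIMARY KEY, price DECIMAL(10, 2), "
    "foreign key (id) references other_table (id)"
)
# defs == ['id INTEGER PRIMARY KEY',
#          'price DECIMAL(10, 2)',
#          'foreign key (id) references other_table (id)']
```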
```python
    # thoth_dbmanager/helpers/multi_db_generator.py (continued)

    def _is_connection(self, table_name: str, column_name: str) -> bool:
        """
        Checks if a column is a connection (primary key or foreign key).

        Args:
            table_name (str): The name of the table.
            column_name (str): The name of the column.

        Returns:
            bool: True if the column is a connection, False otherwise.
        """
        column_info = self.CACHED_DB_SCHEMA[self.db_id].get_column_info(
            table_name, column_name
        )
        if column_info is None:
            return False
        if column_info.primary_key:
            return True
        for target_table, _ in column_info.foreign_keys:
            if self.schema_structure.get_table_info(target_table):
                return True
        for target_table, _ in column_info.referenced_by:
            if self.schema_structure.get_table_info(target_table):
                return True
        for target_table_name, table_schema in self.schema_structure.tables.items():
            if table_name.lower() == target_table_name.lower():
                continue
            for target_column_name, target_column_info in table_schema.columns.items():
                if (
                    target_column_name.lower() == column_name.lower()
                    and target_column_info.primary_key
                ):
                    return True
        return False

    def _get_connections(self) -> Dict[str, List[str]]:
        """
        Retrieves connections between tables in the schema.

        Returns:
            Dict[str, List[str]]: A dictionary mapping table names to lists of connected columns.
        """
        connections = {}
        for table_name, table_schema in self.schema_structure.tables.items():
            connections[table_name] = []
            for column_name, column_info in (
                self.CACHED_DB_SCHEMA[self.db_id].tables[table_name].columns.items()
            ):
                if self._is_connection(table_name, column_name):
                    connections[table_name].append(column_name)
        return connections

    def get_schema_with_connections(self) -> Dict[str, List[str]]:
        """
        Gets schema with connections included.

        Returns:
            Dict[str, List[str]]: The schema with connections included.
        """
        schema_structure_dict = self.schema_structure.to_dict()
        connections = self._get_connections()
        for table_name, connected_columns in connections.items():
            for column_name in connected_columns:
                if column_name.lower() not in [
                    col.lower() for col in schema_structure_dict[table_name]
                ]:
                    schema_structure_dict[table_name].append(column_name)
        return schema_structure_dict

    def _get_example_column_name_description(
        self, table_name: str, column_name: str, include_value_description: bool = True
    ) -> str:
        """
        Retrieves example values and descriptions for a column.

        Args:
            table_name (str): The name of the table.
            column_name (str): The name of the column.
            include_value_description (bool): Flag to include value description.

        Returns:
            str: The example values and descriptions for the column.
        """
        example_part = ""
        name_string = ""
        description_string = ""
        value_statics_string = ""
        value_description_string = ""

        column_info = self.schema_structure.get_column_info(table_name, column_name)
        if column_info:
            if column_info.examples:
                example_part = f" Example Values: {', '.join([f'`{str(x)}`' for x in column_info.examples])}"
            if column_info.value_statics:
                value_statics_string = f" Value Statics: {column_info.value_statics}"
            if column_info.column_name:
                if (column_info.column_name.lower() != column_name.lower()) and (
                    column_info.column_name.strip() != ""
                ):
                    name_string = f"| Column Name Meaning: {column_info.column_name}"
            if column_info.column_description:
                description_string = (
                    f"| Column Description: {column_info.column_description}"
                )
            if column_info.value_description and include_value_description:
                value_description_string = (
                    f"| Value Description: {column_info.value_description}"
                )

        description_part = (
            f"{name_string} {description_string} {value_description_string}"
        )
        joint_string = (
            f" --{example_part} |{value_statics_string} {description_part}"
            if example_part and description_part
            else f" --{example_part or description_part or value_statics_string}"
        )
        if joint_string == " --":
            joint_string = ""
        return joint_string.replace("\n", " ") if joint_string else ""

    def get_column_profiles(
        self, with_keys: bool = False, with_references: bool = False
    ) -> Dict[str, Dict[str, str]]:
        """
        Retrieves profiles for columns in the schema.
        The output is a dictionary with table names as keys mapping to dictionaries with column names as keys and column profiles as values.

        Args:
            with_keys (bool): Flag to include primary keys and foreign keys.
            with_references (bool): Flag to include referenced columns.

        Returns:
            Dict[str, Dict[str, str]]: The column profiles.
        """
        column_profiles = {}
        for table_name, table_schema in self.schema_structure.tables.items():
            column_profiles[table_name] = {}
            for column_name, column_info in table_schema.columns.items():
                if with_keys or not (
                    column_info.primary_key
                    or column_info.foreign_keys
                    or column_info.referenced_by
                ):
                    column_profile = f"Table name: `{table_name}`\nOriginal column name: `{column_name}`\n"
                    if (
                        column_info.column_name.lower().strip()
                        != column_name.lower().strip()
                    ) and (column_info.column_name.strip() != ""):
                        column_profile += (
                            f"Expanded column name: `{column_info.column_name}`\n"
                        )
                    if column_info.type:
                        column_profile += f"Data type: {column_info.type}\n"
                    if column_info.column_description:
                        column_profile += (
                            f"Description: {column_info.column_description}\n"
                        )
                    if column_info.value_description:
                        column_profile += (
                            f"Value description: {column_info.value_description}\n"
                        )
                    if column_info.examples:
                        column_profile += f"Example of values in the column: {', '.join([f'`{str(x)}`' for x in column_info.examples])}\n"
                    if column_info.primary_key:
                        column_profile += "This column is a primary key.\n"
                    if with_references:
                        if column_info.foreign_keys:
                            column_profile += (
                                "This column references the following columns:\n"
                            )
                            for target_table, target_column in column_info.foreign_keys:
                                column_profile += f" Table: `{target_table}`, Column: `{target_column}`\n"
                        if column_info.referenced_by:
                            column_profile += (
                                "This column is referenced by the following columns:\n"
                            )
                            for (
                                source_table,
                                source_column,
                            ) in column_info.referenced_by:
                                column_profile += f" Table: `{source_table}`, Column: `{source_column}`\n"
                    column_profiles[table_name][column_name] = column_profile
        return column_profiles

    def validate_schema_consistency(self) -> List[str]:
        """
        Validates the consistency between the schema in the generator and the database manager.

        Returns:
            List[str]: A list of validation error messages, empty if no errors.
        """
        errors = []

        # Skip validation if no database manager is provided
        if not self.db_manager:
            return ["No database manager provided for validation"]

        # Compare schema tables with database manager tables
        db_schema_dict = self.db_manager.get_schema_dict()
        for table_name in self.schema_structure.tables:
            if table_name not in db_schema_dict:
                errors.append(f"Table '{table_name}' exists in schema but not in database manager")

        # Validate foreign key references
        for table_name, table_schema in self.schema_structure.tables.items():
            for column_name, column_info in table_schema.columns.items():
                if column_info.foreign_keys:
                    for ref_table, ref_column in column_info.foreign_keys:
                        # Check if referenced table exists
                        if ref_table not in self.schema_structure.tables:
                            errors.append(f"Foreign key in {table_name}.{column_name} references non-existent table {ref_table}")
                            continue

                        # Check if referenced column exists
                        if ref_column not in self.schema_structure.tables[ref_table].columns:
                            errors.append(f"Foreign key in {table_name}.{column_name} references non-existent column {ref_table}.{ref_column}")

        # Use the DatabaseSchema's validate_schema method if available
        if hasattr(self.schema_structure, 'validate_schema'):
            schema_errors = self.schema_structure.validate_schema()
            errors.extend(schema_errors)

        return errors
```
thoth_dbmanager/helpers/preprocess_values.py (new file)
@@ -0,0 +1,159 @@

```python
import logging
import pickle
from pathlib import Path
from typing import Dict, List, Tuple

from datasketch import MinHash, MinHashLSH
from tqdm import tqdm


def _create_minhash(signature_size: int, string: str, n_gram: int) -> MinHash:
    """
    Creates a MinHash object for a given string.

    Args:
        signature_size (int): The size of the MinHash signature.
        string (str): The input string to create the MinHash for.
        n_gram (int): The n-gram size for the MinHash.

    Returns:
        MinHash: The MinHash object for the input string.
    """
    m = MinHash(num_perm=signature_size)
    for d in [string[i : i + n_gram] for i in range(len(string) - n_gram + 1)]:
        m.update(d.encode("utf8"))
    return m


def skip_column(column_name: str, column_values: List[str]) -> bool:
    """
    Determines whether to skip processing a column based on its values.

    Args:
        column_name (str): The name of the column.
        column_values (List[str]): The list of values in the column.

    Returns:
        bool: True if the column should be skipped, False otherwise.
    """
    if "name" in column_name.lower():
        return False
    sum_of_lengths = sum(len(value) for value in column_values)
    average_length = sum_of_lengths / len(column_values)
    return (sum_of_lengths > 50000) and (average_length > 20)


def make_lsh(
    unique_values: Dict[str, Dict[str, List[str]]],
    signature_size: int,
    n_gram: int,
    threshold: float,
    verbose: bool = True,
) -> Tuple[MinHashLSH, Dict[str, Tuple[MinHash, str, str, str]]]:
    """
    Creates a MinHash Locality-Sensitive Hashing (LSH) index from unique values in a database.

    This function processes unique values from database tables and columns, creates MinHash
    signatures for each value, and builds an LSH index for efficient similarity search.

    Args:
        unique_values (Dict[str, Dict[str, List[str]]]): A nested dictionary containing unique values
            from the database. The structure is {table_name: {column_name: [values]}}.
        signature_size (int): The number of permutations to use in the MinHash signatures.
        n_gram (int): The size of n-grams to use when creating MinHash signatures.
        threshold (float): The similarity threshold for the LSH index. Values closer to 1 require
            higher similarity for matches.
        verbose (bool, optional): If True, displays a progress bar during processing. Defaults to True.

    Returns:
        Tuple[MinHashLSH, Dict[str, Tuple[MinHash, str, str, str]]]: A tuple containing:
            - MinHashLSH: The constructed LSH index.
            - Dict[str, Tuple[MinHash, str, str, str]]: A dictionary mapping unique keys to tuples
              containing (MinHash object, table name, column name, original value).

    Raises:
        Exception: If an error occurs during LSH creation, it's logged but not raised.

    Note:
        This function uses the datasketch library for MinHash and LSH operations.
    """
    lsh = MinHashLSH(threshold=threshold, num_perm=signature_size)
    minhashes: Dict[str, Tuple[MinHash, str, str, str]] = {}
    try:
        total_unique_values = sum(
            len(column_values)
            for table_values in unique_values.values()
            for column_values in table_values.values()
        )
        logging.info(f"Total unique values: {total_unique_values}")

        progress_bar = (
            tqdm(total=total_unique_values, desc="Creating LSH") if verbose else None
        )

        for table_name, table_values in unique_values.items():
            for column_name, column_values in table_values.items():
                if column_name.lower() == "doctype":
                    print("=" * 20)
                    print("Doctype found")
                    print("=" * 20)
                logging.info(
                    f"Processing {table_name} - {column_name} - {len(column_values)}"
                )

                for id, value in enumerate(column_values):
                    minhash = _create_minhash(signature_size, value, n_gram)
                    minhash_key = f"{table_name}_{column_name}_{id}"
                    minhashes[minhash_key] = (minhash, table_name, column_name, value)
                    lsh.insert(minhash_key, minhash)

                    if verbose:
                        progress_bar.update(1)

        if verbose:
            progress_bar.close()
    except Exception as e:
        logging.error(f"Error creating LSH: {e}")

    return lsh, minhashes


def make_db_lsh(db, db_directory_path, db_name, **kwargs) -> None:
    """
    Creates a MinHash LSH for the database and saves the results.

    This function maintains backward compatibility while using the new LSH architecture.

    Args:
        db: Database manager instance
        db_directory_path (str): The path to the database directory.
        db_name (str): Name of the database
        **kwargs (Any): Additional arguments for the LSH creation.
    """
    # Use the new LSH factory for database-independent creation
    from ..lsh.factory import LshFactory

    try:
        # Try using the new architecture
        LshFactory.create_lsh_from_db(db, **kwargs)
    except Exception as e:
        logging.warning(f"New LSH creation failed, falling back to old method: {e}")

        # Fallback to old method for backward compatibility
        preprocessed_path = Path(db_directory_path) / "preprocessed"
        logging.info(f"Preprocessed directory: {preprocessed_path}")
        preprocessed_path.mkdir(parents=True, exist_ok=True)

        unique_values = db.get_unique_values()
        logging.info("Unique values obtained")

        with open(preprocessed_path / f"{db_name}_unique_values.pkl", "wb") as file:
            pickle.dump(unique_values, file)
        logging.info("Saved unique values")

        lsh, minhashes = make_lsh(unique_values, **kwargs)

        with open(preprocessed_path / f"{db_name}_lsh.pkl", "wb") as file:
            pickle.dump(lsh, file)
        with open(preprocessed_path / f"{db_name}_minhashes.pkl", "wb") as file:
            pickle.dump(minhashes, file)
```