thoth-dbmanager 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thoth_dbmanager/ThothDbManager.py +459 -0
- thoth_dbmanager/__init__.py +136 -0
- thoth_dbmanager/adapters/__init__.py +21 -0
- thoth_dbmanager/adapters/mariadb.py +165 -0
- thoth_dbmanager/adapters/mysql.py +165 -0
- thoth_dbmanager/adapters/oracle.py +554 -0
- thoth_dbmanager/adapters/postgresql.py +444 -0
- thoth_dbmanager/adapters/qdrant.py +189 -0
- thoth_dbmanager/adapters/sqlite.py +385 -0
- thoth_dbmanager/adapters/sqlserver.py +583 -0
- thoth_dbmanager/adapters/supabase.py +249 -0
- thoth_dbmanager/core/__init__.py +13 -0
- thoth_dbmanager/core/factory.py +272 -0
- thoth_dbmanager/core/interfaces.py +271 -0
- thoth_dbmanager/core/registry.py +220 -0
- thoth_dbmanager/documents.py +155 -0
- thoth_dbmanager/dynamic_imports.py +250 -0
- thoth_dbmanager/helpers/__init__.py +0 -0
- thoth_dbmanager/helpers/multi_db_generator.py +508 -0
- thoth_dbmanager/helpers/preprocess_values.py +159 -0
- thoth_dbmanager/helpers/schema.py +376 -0
- thoth_dbmanager/helpers/search.py +117 -0
- thoth_dbmanager/lsh/__init__.py +21 -0
- thoth_dbmanager/lsh/core.py +182 -0
- thoth_dbmanager/lsh/factory.py +76 -0
- thoth_dbmanager/lsh/manager.py +170 -0
- thoth_dbmanager/lsh/storage.py +96 -0
- thoth_dbmanager/plugins/__init__.py +23 -0
- thoth_dbmanager/plugins/mariadb.py +436 -0
- thoth_dbmanager/plugins/mysql.py +408 -0
- thoth_dbmanager/plugins/oracle.py +150 -0
- thoth_dbmanager/plugins/postgresql.py +145 -0
- thoth_dbmanager/plugins/qdrant.py +41 -0
- thoth_dbmanager/plugins/sqlite.py +170 -0
- thoth_dbmanager/plugins/sqlserver.py +149 -0
- thoth_dbmanager/plugins/supabase.py +224 -0
- {thoth_dbmanager-0.4.0.dist-info → thoth_dbmanager-0.4.2.dist-info}/METADATA +9 -6
- thoth_dbmanager-0.4.2.dist-info/RECORD +41 -0
- thoth_dbmanager-0.4.2.dist-info/top_level.txt +1 -0
- thoth_dbmanager-0.4.0.dist-info/RECORD +0 -5
- thoth_dbmanager-0.4.0.dist-info/top_level.txt +0 -1
- {thoth_dbmanager-0.4.0.dist-info → thoth_dbmanager-0.4.2.dist-info}/WHEEL +0 -0
- {thoth_dbmanager-0.4.0.dist-info → thoth_dbmanager-0.4.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,376 @@
|
|
1
|
+
import logging
|
2
|
+
from dataclasses import dataclass, field
|
3
|
+
from typing import Any, Dict, List, Optional, Tuple
|
4
|
+
|
5
|
+
logger = logging.getLogger(__name__)
|
6
|
+
|
7
|
+
|
8
|
+
@dataclass
class ColumnInfo:
    """
    Represents metadata for a single column in a database table.

    Every field has a default, so ``ColumnInfo()`` produces an empty record
    whose fields can be filled in afterwards (see ``set_field``).

    Attributes:
        original_column_name (str): The original name of the column.
        column_name (str): The standardized name of the column.
        column_description (str): A description of the column.
        generated_comment (str): A generated comment for the column
            (presumably produced automatically elsewhere; this module only
            stores it -- TODO confirm the producer).
        value_description (str): A description of the values in the column.
        data_format (str): The format of the data in the column.
        type (str): The data type of the column.
        examples (List[str]): Example values from the column.
        primary_key (bool): Whether the column is a primary key.
        foreign_keys (List[Tuple[str, str]]): Foreign keys referencing other
            tables and columns, as ``(table_name, column_name)`` pairs.
        referenced_by (List[Tuple[str, str]]): Columns in other tables that
            reference this column (presumably the same
            ``(table_name, column_name)`` layout -- TODO confirm).
    """

    original_column_name: str = ""
    column_name: str = ""
    column_description: str = ""
    generated_comment: str = ""
    value_description: str = ""
    data_format: str = ""
    type: str = ""
    # Mutable defaults go through default_factory so instances never share a list.
    examples: List[str] = field(default_factory=list)
    primary_key: bool = False
    foreign_keys: List[Tuple[str, str]] = field(default_factory=list)
    referenced_by: List[Tuple[str, str]] = field(default_factory=list)
|
37
|
+
|
38
|
+
|
39
|
+
def set_field(column_info: ColumnInfo, field_name: str, value: Any) -> None:
    """
    Assigns ``value`` to one named field of a ColumnInfo instance.

    Only names declared as dataclass fields are accepted, so a typo in
    ``field_name`` fails loudly instead of silently creating a new attribute.

    Args:
        column_info (ColumnInfo): The ColumnInfo instance to update in place.
        field_name (str): The name of the dataclass field to assign.
        value (Any): The value to store; it is not type-checked.

    Raises:
        ValueError: If field_name is not a declared field of ColumnInfo.
    """
    declared_fields = column_info.__dataclass_fields__
    if field_name not in declared_fields:
        raise ValueError(f"{field_name} is not a valid field of ColumnInfo")
    setattr(column_info, field_name, value)
|
55
|
+
|
56
|
+
|
57
|
+
@dataclass
class TableSchema:
    """
    Represents the schema of a single table in a database.

    Attributes:
        columns (Dict[str, ColumnInfo]): A dictionary mapping column names to
            their metadata. Keys are stored exactly as provided.
    """

    # default_factory gives each TableSchema its own dict instead of a shared one.
    columns: Dict[str, ColumnInfo] = field(default_factory=dict)
|
67
|
+
|
68
|
+
|
69
|
+
def get_primary_keys(table_schema: TableSchema) -> List[str]:
    """
    Collects the names of the primary key columns of a table.

    Args:
        table_schema (TableSchema): The table schema to inspect.

    Returns:
        List[str]: The names of all columns flagged as ``primary_key``, in the
            iteration order of ``table_schema.columns``.
    """
    primary_keys: List[str] = []
    for column_name, column in table_schema.columns.items():
        if column.primary_key:
            primary_keys.append(column_name)
    return primary_keys
|
80
|
+
|
81
|
+
|
82
|
+
@dataclass
class DatabaseSchema:
    """
    Represents the schema of an entire database, consisting of multiple tables.

    Table and column names are stored exactly as provided. The ``get_*``
    lookup methods match names case-insensitively and return the stored
    spelling (or the stored object).

    Attributes:
        tables (Dict[str, TableSchema]): A dictionary mapping table names to their schemas.
    """

    tables: Dict[str, "TableSchema"] = field(default_factory=dict)

    @classmethod
    def from_table_names(cls, table_names: List[str]) -> "DatabaseSchema":
        """
        Creates a DatabaseSchema from a list of table names.

        Args:
            table_names (List[str]): The names of the tables to include in the schema.

        Returns:
            DatabaseSchema: A schema in which every table has no columns yet.
        """
        return cls(tables={name: TableSchema() for name in table_names})

    @classmethod
    def from_schema_dict(
        cls, schema_dict: Dict[str, List[str]]
    ) -> "DatabaseSchema":
        """
        Creates a DatabaseSchema from a dictionary mapping table names to lists of column names.

        Args:
            schema_dict (Dict[str, List[str]]): The schema dictionary to convert.

        Returns:
            DatabaseSchema: A schema whose columns all carry default ColumnInfo.
        """
        return cls(
            tables={
                table_name: TableSchema(
                    columns={column_name: ColumnInfo() for column_name in column_names}
                )
                for table_name, column_names in schema_dict.items()
            }
        )

    @classmethod
    def from_schema_dict_with_examples(
        cls, schema_dict_with_info: Dict[str, Dict[str, List[str]]]
    ) -> "DatabaseSchema":
        """
        Creates a DatabaseSchema from a dictionary with example values for each column.

        Args:
            schema_dict_with_info (Dict[str, Dict[str, List[str]]]): A mapping
                ``{table_name: {column_name: [example values]}}``.

        Returns:
            DatabaseSchema: A schema whose columns have ``examples`` populated.
        """
        return cls(
            tables={
                table_name: TableSchema(
                    columns={
                        column_name: ColumnInfo(examples=examples)
                        for column_name, examples in column_dict.items()
                    }
                )
                for table_name, column_dict in schema_dict_with_info.items()
            }
        )

    @classmethod
    def from_schema_dict_with_descriptions(
        cls,
        tentative_schema: Dict[str, Dict[str, Dict[str, Any]]],
        schema_dict_with_info: Dict[str, Dict[str, Dict[str, Any]]]
    ) -> "DatabaseSchema":
        """
        Creates a DatabaseSchema from a dictionary with detailed information for each column.

        Args:
            tentative_schema (Dict[str, Dict[str, Dict[str, Any]]]): The base
                schema structure. Each table entry must have a ``"columns"``
                key whose mapping keys are the column names.
            schema_dict_with_info (Dict[str, Dict[str, Dict[str, Any]]]): A
                mapping ``{table_name: {column_name: {field_name: value}}}``.
                Names are matched exactly (case-sensitive) here; tables and
                columns absent from ``tentative_schema`` are ignored.

        Returns:
            DatabaseSchema: The constructed database schema.

        Raises:
            KeyError: If a table entry in tentative_schema has no "columns" key.
            ValueError: If a field name is not a valid field of ColumnInfo.
        """
        # Convert new schema format to simplified format for initial creation
        simplified_schema = {
            table_name: list(table_info["columns"].keys())
            for table_name, table_info in tentative_schema.items()
        }

        database_schema = cls.from_schema_dict(simplified_schema)

        for table_name, table_info in schema_dict_with_info.items():
            table_schema = database_schema.tables.get(table_name)
            if table_schema is None:
                continue
            for column_name, info in table_info.items():
                column_info = table_schema.columns.get(column_name)
                if column_info is None:
                    continue
                for field_name, value in info.items():
                    set_field(column_info, field_name, value)

        return database_schema

    def get_actual_table_name(
        self, table_name: str
    ) -> Optional[str]:
        """
        Retrieves the actual table name matching the provided name, case-insensitive.

        Args:
            table_name (str): The name of the table to search for.

        Returns:
            Optional[str]: The stored spelling of the first matching table
                name, otherwise None.
        """
        table_name_lower = table_name.lower()
        return next(
            (name for name in self.tables if name.lower() == table_name_lower), None
        )

    def get_table_info(
        self, table_name: str
    ) -> Optional["TableSchema"]:
        """
        Retrieves the TableSchema object for the specified table name.

        Args:
            table_name (str): The name of the table to retrieve (case-insensitive).

        Returns:
            Optional[TableSchema]: The TableSchema if found, otherwise None.
        """
        actual_name = self.get_actual_table_name(table_name)
        if actual_name is None:
            return None
        return self.tables[actual_name]

    def get_actual_column_name(
        self, table_name: str, column_name: str
    ) -> Optional[str]:
        """
        Retrieves the actual column name matching the provided name, case-insensitive.

        Args:
            table_name (str): The name of the table containing the column.
            column_name (str): The name of the column to search for.

        Returns:
            Optional[str]: The stored spelling of the column name if both the
                table and the column are found, otherwise None.
        """
        table_info = self.get_table_info(table_name)
        if table_info is None:
            return None
        column_name_lower = column_name.lower()
        return next(
            (
                name
                for name in table_info.columns
                if name.lower() == column_name_lower
            ),
            None,
        )

    def get_column_info(
        self, table_name: str, column_name: str
    ) -> Optional["ColumnInfo"]:
        """
        Retrieves the ColumnInfo object for the specified column in a table.

        Both names are matched case-insensitively.

        Args:
            table_name (str): The name of the table containing the column.
            column_name (str): The name of the column to retrieve.

        Returns:
            Optional[ColumnInfo]: The ColumnInfo if found, otherwise None.
        """
        # Resolve the table through the case-insensitive lookup as well:
        # indexing self.tables with the caller's spelling raises KeyError when
        # only the letter case differs from the stored table name.
        table_info = self.get_table_info(table_name)
        actual_name = self.get_actual_column_name(table_name, column_name)
        if table_info is None or actual_name is None:
            return None
        return table_info.columns[actual_name]

    def set_columns_info(
        self, schema_with_info: Dict[str, Dict[str, Dict[str, Any]]]
    ) -> None:
        """
        Sets detailed information for columns in the schema.

        Tables and columns are matched case-insensitively. Unknown ones are
        skipped with a warning rather than raising.

        Args:
            schema_with_info (Dict[str, Dict[str, Dict[str, Any]]]): A mapping
                ``{table_name: {column_name: {field_name: value}}}``.

        Raises:
            ValueError: If a field name is not a valid field of ColumnInfo.
        """
        for table_name, columns_info in schema_with_info.items():
            table_info = self.get_table_info(table_name)
            if table_info is None:
                logger.warning("Table %s not found in the schema", table_name)
                continue
            for column_name, info in columns_info.items():
                actual_name = self.get_actual_column_name(table_name, column_name)
                if actual_name is None:
                    logger.warning(
                        "Column %s not found in table %s", column_name, table_name
                    )
                    continue
                schema_column_info = table_info.columns[actual_name]
                for field_name, value in info.items():
                    set_field(schema_column_info, field_name, value)

    def subselect_schema(
        self, selected_database_schema: "DatabaseSchema"
    ) -> "DatabaseSchema":
        """
        Creates a new DatabaseSchema containing only the selected tables and columns.

        Names in the result use this schema's stored spelling. The ColumnInfo
        objects are the ones from ``selected_database_schema`` (shared, not
        copied). Tables or columns that do not exist in this schema are
        dropped with a warning.

        Args:
            selected_database_schema (DatabaseSchema): The selection of tables
                and columns to keep.

        Returns:
            DatabaseSchema: The new subselected database schema.
        """
        new_schema = DatabaseSchema()
        for table_name, table_info in selected_database_schema.tables.items():
            actual_table_name = self.get_actual_table_name(table_name)
            if actual_table_name is None:
                logger.warning("Table %s not found in the schema", table_name)
                continue
            new_table_info = TableSchema()
            for column_name, column_info in table_info.columns.items():
                actual_column_name = self.get_actual_column_name(
                    table_name, column_name
                )
                if actual_column_name is None:
                    logger.warning(
                        "Column %s not found in table %s", column_name, table_name
                    )
                    continue
                new_table_info.columns[actual_column_name] = column_info
            new_schema.tables[actual_table_name] = new_table_info
        return new_schema

    def add_info_from_schema(
        self, schema: "DatabaseSchema", field_names: List[str]
    ) -> None:
        """
        Adds additional field information from another schema to the current schema.

        For every column of this schema that also exists in ``schema``
        (case-insensitive match), the listed fields are overwritten with the
        values from ``schema``. Columns without a counterpart are left as is.

        Args:
            schema (DatabaseSchema): The schema to copy information from.
            field_names (List[str]): The ColumnInfo field names to copy.

        Raises:
            AttributeError: If a name in field_names is not an attribute of
                the source ColumnInfo.
        """
        for table_name, table_info in self.tables.items():
            actual_table_name = schema.get_actual_table_name(table_name)
            if actual_table_name is None:
                continue
            source_columns = schema.tables[actual_table_name].columns
            for column_name, column_info in table_info.columns.items():
                actual_column_name = schema.get_actual_column_name(
                    table_name, column_name
                )
                if actual_column_name is None:
                    continue
                new_column_info = source_columns[actual_column_name]
                for field_name in field_names:
                    set_field(
                        column_info, field_name, getattr(new_column_info, field_name)
                    )

    def to_dict(self) -> Dict[str, List[str]]:
        """
        Converts the DatabaseSchema to a dictionary representation.

        Returns:
            Dict[str, List[str]]: A mapping of each table name to the list of
                its column names; column metadata is not included.
        """
        return {
            table_name: list(table_info.columns)
            for table_name, table_info in self.tables.items()
        }

    def validate_schema(self) -> List[str]:
        """
        Validates the schema for integrity and returns a list of validation errors.

        Currently the only check is that every foreign key points at a table
        and column that exist in this schema (case-insensitive).

        Returns:
            List[str]: A list of validation error messages, empty if no errors.
        """
        errors: List[str] = []

        # Check for foreign key references to non-existent tables/columns
        for table_name, table_info in self.tables.items():
            for column_name, column_info in table_info.columns.items():
                for fk_table, fk_column in column_info.foreign_keys:
                    if self.get_actual_table_name(fk_table) is None:
                        errors.append(
                            f"Foreign key in {table_name}.{column_name} "
                            f"references non-existent table {fk_table}"
                        )
                        # The column cannot be checked without its table.
                        continue

                    if self.get_actual_column_name(fk_table, fk_column) is None:
                        errors.append(
                            f"Foreign key in {table_name}.{column_name} "
                            f"references non-existent column {fk_table}.{fk_column}"
                        )

        return errors
|
@@ -0,0 +1,117 @@
|
|
1
|
+
import logging
|
2
|
+
import pickle
|
3
|
+
from pathlib import Path
|
4
|
+
from typing import Dict, List, Tuple
|
5
|
+
|
6
|
+
from datasketch import MinHash, MinHashLSH
|
7
|
+
|
8
|
+
from .preprocess_values import _create_minhash
|
9
|
+
|
10
|
+
|
11
|
+
### Database value similarity ###
|
12
|
+
|
13
|
+
|
14
|
+
def _jaccard_similarity(m1: MinHash, m2: MinHash) -> float:
    """
    Estimates the Jaccard similarity of two MinHash signatures.

    Args:
        m1 (MinHash): The first MinHash object.
        m2 (MinHash): The second MinHash object.

    Returns:
        float: The value of ``m1.jaccard(m2)``.
    """
    similarity = m1.jaccard(m2)
    return similarity
|
26
|
+
|
27
|
+
|
28
|
+
def load_db_lsh(
    db_directory_path: str,
) -> Tuple[MinHashLSH, Dict[str, Tuple[MinHash, str, str, str]]]:
    """
    Loads the LSH and MinHashes from the preprocessed files in the specified directory.

    The new ``LshManager`` is tried first. If it raises, or reports that
    nothing was loaded, the function falls back to reading the legacy pickle
    files ``preprocessed/<db_id>_lsh.pkl`` and
    ``preprocessed/<db_id>_minhashes.pkl``, where ``<db_id>`` is the last
    component of ``db_directory_path``.

    Args:
        db_directory_path (str): The path to the database directory.

    Returns:
        Tuple[MinHashLSH, Dict[str, Tuple[MinHash, str, str, str]]]: The LSH object and the dictionary of MinHashes.

    Raises:
        Exception: If the legacy files cannot be read or unpickled; the error
            is logged and re-raised unchanged.
    """
    db_directory = Path(db_directory_path)
    db_id = db_directory.name

    try:
        # Try using the new LSH manager first
        from ..lsh.manager import LshManager

        lsh_manager = LshManager(db_directory)
        if lsh_manager.load_lsh():
            return lsh_manager.lsh, lsh_manager.minhashes
    except Exception as e:
        # Deliberately broad: any manager failure (including an import error)
        # must not prevent the legacy loader below from running.
        logging.warning(f"New LSH manager failed, falling back to old method: {e}")

    # Fallback to old method
    preprocessed_dir = db_directory / "preprocessed"
    try:
        # SECURITY NOTE: pickle.load can execute arbitrary code contained in
        # the file. Only point this at directories whose contents are trusted.
        with open(preprocessed_dir / f"{db_id}_lsh.pkl", "rb") as file:
            lsh = pickle.load(file)
        with open(preprocessed_dir / f"{db_id}_minhashes.pkl", "rb") as file:
            minhashes = pickle.load(file)
    except Exception as e:
        logging.error(f"Error loading LSH for {db_id}: {e}")
        # Bare raise keeps the original traceback intact.
        raise
    return lsh, minhashes
|
70
|
+
|
71
|
+
|
72
|
+
def _query_lsh(
    lsh: MinHashLSH,
    minhashes: Dict[str, Tuple[MinHash, str, str, str]],
    keyword: str,
    signature_size: int = 30,
    n_gram: int = 3,
    top_n: int = 10,
) -> Dict[str, Dict[str, List[str]]]:
    """
    Queries the LSH for values similar to the keyword and returns the best matches.

    Candidates returned by the LSH are ranked by their estimated Jaccard
    similarity to the keyword. The ``top_n`` best are then grouped by table
    and column.

    Args:
        lsh (MinHashLSH): The LSH object.
        minhashes (Dict[str, Tuple[MinHash, str, str, str]]): The dictionary of
            MinHashes, mapping each LSH key to
            ``(minhash, table_name, column_name, value)``.
        keyword (str): The keyword to search for.
        signature_size (int, optional): The size of the MinHash signature.
        n_gram (int, optional): The n-gram size for the MinHash.
        top_n (int, optional): The number of top results to return.

    Returns:
        Dict[str, Dict[str, List[str]]]: A dictionary containing the top similar values.
            Example::

                {
                    'table_name1': {
                        'column_name1': ['value1', 'value2', 'value3'],
                        'column_name2': ['value4', 'value5']
                    },
                    'table_name2': {
                        'column_name3': ['value6']
                    }
                }
    """
    keyword_minhash = _create_minhash(signature_size, keyword, n_gram)

    scored_keys = []
    for candidate_key in lsh.query(keyword_minhash):
        score = _jaccard_similarity(keyword_minhash, minhashes[candidate_key][0])
        scored_keys.append((candidate_key, score))
    # Stable descending sort: ties keep the order in which the LSH returned them.
    scored_keys.sort(key=lambda pair: pair[1], reverse=True)

    grouped_values: Dict[str, Dict[str, List[str]]] = {}
    for candidate_key, _score in scored_keys[:top_n]:
        _minhash, table_name, column_name, value = minhashes[candidate_key]
        per_table = grouped_values.setdefault(table_name, {})
        per_table.setdefault(column_name, []).append(value)

    return grouped_values
|
@@ -0,0 +1,21 @@
|
|
1
|
+
"""
|
2
|
+
LSH (Locality Sensitive Hashing) module for database-independent LSH management.
|
3
|
+
"""
|
4
|
+
|
5
|
+
from .storage import LshStorageStrategy, PickleStorage
|
6
|
+
from .manager import LshManager
|
7
|
+
from .factory import LshFactory, make_db_lsh
|
8
|
+
from .core import create_minhash, skip_column, jaccard_similarity, create_lsh_index, query_lsh_index
|
9
|
+
|
10
|
+
__all__ = [
|
11
|
+
"LshStorageStrategy",
|
12
|
+
"PickleStorage",
|
13
|
+
"LshManager",
|
14
|
+
"LshFactory",
|
15
|
+
"make_db_lsh",
|
16
|
+
"create_minhash",
|
17
|
+
"skip_column",
|
18
|
+
"jaccard_similarity",
|
19
|
+
"create_lsh_index",
|
20
|
+
"query_lsh_index"
|
21
|
+
]
|
@@ -0,0 +1,182 @@
|
|
1
|
+
"""
|
2
|
+
Core LSH functionality extracted from helpers.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import logging
|
6
|
+
from typing import Dict, List, Tuple
|
7
|
+
|
8
|
+
from datasketch import MinHash, MinHashLSH
|
9
|
+
from tqdm import tqdm
|
10
|
+
|
11
|
+
|
12
|
+
def create_minhash(signature_size: int, string: str, n_gram: int) -> MinHash:
    """
    Builds a MinHash signature from the character n-grams of a string.

    Every contiguous substring of length ``n_gram`` is UTF-8 encoded and fed
    into the MinHash. A string shorter than ``n_gram`` yields no n-grams, so
    the MinHash is returned without any update.

    Args:
        signature_size (int): The size of the MinHash signature (``num_perm``).
        string (str): The input string to create the MinHash for.
        n_gram (int): The n-gram size for the MinHash.

    Returns:
        MinHash: The MinHash object for the input string.
    """
    minhash = MinHash(num_perm=signature_size)
    shingle_count = len(string) - n_gram + 1
    for start in range(shingle_count):
        shingle = string[start : start + n_gram]
        minhash.update(shingle.encode("utf8"))
    return minhash
|
28
|
+
|
29
|
+
|
30
|
+
def skip_column(column_name: str, column_values: List[str]) -> bool:
    """
    Determines whether to skip processing a column based on its values.

    A column is skipped only when its values are both numerous and long: the
    total length exceeds 50000 characters and the average length exceeds 20.
    Columns whose name contains "name" (case-insensitive) are never skipped.

    Args:
        column_name (str): The name of the column.
        column_values (List[str]): The list of values in the column.

    Returns:
        bool: True if the column should be skipped, False otherwise. An empty
            column is never skipped.
    """
    if "name" in column_name.lower():
        return False

    value_count = len(column_values)
    if value_count == 0:
        # Nothing to average over; without this guard the division below
        # raises ZeroDivisionError for an empty column.
        return False

    sum_of_lengths = sum(len(value) for value in column_values)
    average_length = sum_of_lengths / value_count
    return sum_of_lengths > 50000 and average_length > 20
|
46
|
+
|
47
|
+
|
48
|
+
def jaccard_similarity(m1: MinHash, m2: MinHash) -> float:
    """
    Estimates the Jaccard similarity of two MinHash signatures.

    Args:
        m1 (MinHash): The first MinHash object.
        m2 (MinHash): The second MinHash object.

    Returns:
        float: The value of ``m1.jaccard(m2)``.
    """
    similarity = m1.jaccard(m2)
    return similarity
|
60
|
+
|
61
|
+
|
62
|
+
def create_lsh_index(
    unique_values: Dict[str, Dict[str, List[str]]],
    signature_size: int,
    n_gram: int,
    threshold: float,
    verbose: bool = True,
) -> Tuple[MinHashLSH, Dict[str, Tuple[MinHash, str, str, str]]]:
    """
    Creates a MinHash Locality-Sensitive Hashing (LSH) index from unique values in a database.

    This function processes unique values from database tables and columns, creates MinHash
    signatures for each value, and builds an LSH index for efficient similarity search.

    Args:
        unique_values (Dict[str, Dict[str, List[str]]]): A nested dictionary containing unique values
            from the database. The structure is {table_name: {column_name: [values]}}.
        signature_size (int): The number of permutations to use in the MinHash signatures.
        n_gram (int): The size of n-grams to use when creating MinHash signatures.
        threshold (float): The similarity threshold for the LSH index. Values closer to 1 require
            higher similarity for matches.
        verbose (bool, optional): If True, displays a progress bar during processing. Defaults to True.

    Returns:
        Tuple[MinHashLSH, Dict[str, Tuple[MinHash, str, str, str]]]: A tuple containing:
            - MinHashLSH: The constructed LSH index.
            - Dict[str, Tuple[MinHash, str, str, str]]: A dictionary mapping unique keys to tuples
              containing (MinHash object, table name, column name, original value). Keys have
              the form "<table>_<column>_<position of the value in its list>".

    Note:
        This function does not raise on failure. Any exception during index
        construction is logged and the partially built index is returned, so
        callers that need a complete index should check the logs or the
        size of the returned dictionary.

        This function uses the datasketch library for MinHash and LSH operations.
    """
    lsh = MinHashLSH(threshold=threshold, num_perm=signature_size)
    minhashes: Dict[str, Tuple[MinHash, str, str, str]] = {}
    progress_bar = None
    try:
        total_unique_values = sum(
            len(column_values)
            for table_values in unique_values.values()
            for column_values in table_values.values()
        )
        logging.info(f"Total unique values: {total_unique_values}")

        if verbose:
            progress_bar = tqdm(total=total_unique_values, desc="Creating LSH")

        for table_name, table_values in unique_values.items():
            for column_name, column_values in table_values.items():
                if column_name.lower() == "doctype":
                    # Diagnostic marker kept from the original code, routed
                    # through logging so library callers' stdout stays clean.
                    logging.debug("Doctype found")
                logging.info(
                    f"Processing {table_name} - {column_name} - {len(column_values)}"
                )

                for index, value in enumerate(column_values):
                    minhash = create_minhash(signature_size, value, n_gram)
                    minhash_key = f"{table_name}_{column_name}_{index}"
                    minhashes[minhash_key] = (minhash, table_name, column_name, value)
                    lsh.insert(minhash_key, minhash)

                    if progress_bar is not None:
                        progress_bar.update(1)
    except Exception as e:
        # Best-effort by design: keep whatever was indexed before the failure.
        logging.error(f"Error creating LSH: {e}")
    finally:
        # Close the bar on the error path too, so the terminal is not left
        # with a dangling progress line.
        if progress_bar is not None:
            progress_bar.close()

    return lsh, minhashes
|
135
|
+
|
136
|
+
|
137
|
+
def query_lsh_index(
    lsh: MinHashLSH,
    minhashes: Dict[str, Tuple[MinHash, str, str, str]],
    keyword: str,
    signature_size: int = 30,
    n_gram: int = 3,
    top_n: int = 10,
) -> Dict[str, Dict[str, List[str]]]:
    """
    Queries the LSH for values similar to the keyword and returns the best matches.

    Candidates returned by the LSH are ranked by their estimated Jaccard
    similarity to the keyword. The ``top_n`` best are then grouped by table
    and column.

    Args:
        lsh (MinHashLSH): The LSH object.
        minhashes (Dict[str, Tuple[MinHash, str, str, str]]): The dictionary of
            MinHashes, mapping each LSH key to
            ``(minhash, table_name, column_name, value)``.
        keyword (str): The keyword to search for.
        signature_size (int, optional): The size of the MinHash signature.
        n_gram (int, optional): The n-gram size for the MinHash.
        top_n (int, optional): The number of top results to return.

    Returns:
        Dict[str, Dict[str, List[str]]]: A dictionary containing the top similar values.
            Example::

                {
                    'table_name1': {
                        'column_name1': ['value1', 'value2', 'value3'],
                        'column_name2': ['value4', 'value5']
                    },
                    'table_name2': {
                        'column_name3': ['value6']
                    }
                }
    """
    keyword_minhash = create_minhash(signature_size, keyword, n_gram)

    ranked = []
    for match_key in lsh.query(keyword_minhash):
        match_score = jaccard_similarity(keyword_minhash, minhashes[match_key][0])
        ranked.append((match_key, match_score))
    # Stable descending sort: ties keep the order in which the LSH returned them.
    ranked.sort(key=lambda entry: entry[1], reverse=True)

    values_by_location: Dict[str, Dict[str, List[str]]] = {}
    for match_key, _match_score in ranked[:top_n]:
        _signature, table_name, column_name, value = minhashes[match_key]
        columns_of_table = values_by_location.setdefault(table_name, {})
        columns_of_table.setdefault(column_name, []).append(value)

    return values_by_location
|