thoth-dbmanager 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. thoth_dbmanager/ThothDbManager.py +459 -0
  2. thoth_dbmanager/__init__.py +136 -0
  3. thoth_dbmanager/adapters/__init__.py +21 -0
  4. thoth_dbmanager/adapters/mariadb.py +165 -0
  5. thoth_dbmanager/adapters/mysql.py +165 -0
  6. thoth_dbmanager/adapters/oracle.py +554 -0
  7. thoth_dbmanager/adapters/postgresql.py +444 -0
  8. thoth_dbmanager/adapters/sqlite.py +385 -0
  9. thoth_dbmanager/adapters/sqlserver.py +583 -0
  10. thoth_dbmanager/adapters/supabase.py +249 -0
  11. thoth_dbmanager/core/__init__.py +13 -0
  12. thoth_dbmanager/core/factory.py +272 -0
  13. thoth_dbmanager/core/interfaces.py +271 -0
  14. thoth_dbmanager/core/registry.py +220 -0
  15. thoth_dbmanager/documents.py +155 -0
  16. thoth_dbmanager/dynamic_imports.py +250 -0
  17. thoth_dbmanager/helpers/__init__.py +0 -0
  18. thoth_dbmanager/helpers/multi_db_generator.py +508 -0
  19. thoth_dbmanager/helpers/preprocess_values.py +159 -0
  20. thoth_dbmanager/helpers/schema.py +376 -0
  21. thoth_dbmanager/helpers/search.py +117 -0
  22. thoth_dbmanager/lsh/__init__.py +21 -0
  23. thoth_dbmanager/lsh/core.py +182 -0
  24. thoth_dbmanager/lsh/factory.py +76 -0
  25. thoth_dbmanager/lsh/manager.py +170 -0
  26. thoth_dbmanager/lsh/storage.py +96 -0
  27. thoth_dbmanager/plugins/__init__.py +23 -0
  28. thoth_dbmanager/plugins/mariadb.py +436 -0
  29. thoth_dbmanager/plugins/mysql.py +408 -0
  30. thoth_dbmanager/plugins/oracle.py +150 -0
  31. thoth_dbmanager/plugins/postgresql.py +145 -0
  32. thoth_dbmanager/plugins/sqlite.py +170 -0
  33. thoth_dbmanager/plugins/sqlserver.py +149 -0
  34. thoth_dbmanager/plugins/supabase.py +224 -0
  35. {thoth_dbmanager-0.4.0.dist-info → thoth_dbmanager-0.4.1.dist-info}/METADATA +6 -6
  36. thoth_dbmanager-0.4.1.dist-info/RECORD +39 -0
  37. thoth_dbmanager-0.4.1.dist-info/top_level.txt +1 -0
  38. thoth_dbmanager-0.4.0.dist-info/RECORD +0 -5
  39. thoth_dbmanager-0.4.0.dist-info/top_level.txt +0 -1
  40. {thoth_dbmanager-0.4.0.dist-info → thoth_dbmanager-0.4.1.dist-info}/WHEEL +0 -0
  41. {thoth_dbmanager-0.4.0.dist-info → thoth_dbmanager-0.4.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,376 @@
1
+ import logging
2
+ from dataclasses import dataclass, field
3
+ from typing import Any, Dict, List, Optional, Tuple
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
+
8
@dataclass
class ColumnInfo:
    """
    Represents metadata for a single column in a database table.

    Every field has a default, so ``ColumnInfo()`` builds an empty record
    whose fields can be filled in afterwards. The list-valued fields use
    ``default_factory`` so instances never share the same list object.

    Attributes:
        original_column_name (str): The original name of the column.
        column_name (str): The standardized name of the column.
        column_description (str): A description of the column.
        generated_comment (str): A generated comment for the column
            (how it is produced is not shown in this module).
        value_description (str): A description of the values in the column.
        data_format (str): The format of the data in the column.
        type (str): The data type of the column.
        examples (List[str]): Example values from the column.
        primary_key (bool): Whether the column is a primary key.
        foreign_keys (List[Tuple[str, str]]): Foreign keys referencing other tables and columns.
        referenced_by (List[Tuple[str, str]]): Columns in other tables that reference this column.
    """

    original_column_name: str = ""
    column_name: str = ""
    column_description: str = ""
    generated_comment: str = ""
    value_description: str = ""
    data_format: str = ""
    type: str = ""
    examples: List[str] = field(default_factory=list)
    primary_key: bool = False
    foreign_keys: List[Tuple[str, str]] = field(default_factory=list)
    referenced_by: List[Tuple[str, str]] = field(default_factory=list)
37
+
38
+
39
def set_field(column_info: ColumnInfo, field_name: str, value: Any) -> None:
    """
    Assigns ``value`` to one declared dataclass field of ``column_info``.

    Args:
        column_info (ColumnInfo): The instance to modify in place.
        field_name (str): Name of the field to overwrite.
        value (Any): The new value; it is stored as-is without validation.

    Raises:
        ValueError: If ``field_name`` is not among the dataclass fields
            declared on ``column_info``.
    """
    declared_fields = column_info.__dataclass_fields__
    # Reject unknown names up front so typos never create stray attributes.
    if field_name not in declared_fields:
        raise ValueError(f"{field_name} is not a valid field of ColumnInfo")
    setattr(column_info, field_name, value)
55
+
56
+
57
@dataclass
class TableSchema:
    """
    Represents the schema of a single table in a database.

    Attributes:
        columns (Dict[str, ColumnInfo]): A dictionary mapping column names to their metadata.
            Defaults to a fresh empty dict for every instance.
    """

    columns: Dict[str, ColumnInfo] = field(default_factory=dict)
67
+
68
+
69
def get_primary_keys(table_schema: TableSchema) -> List[str]:
    """
    Collects the names of all columns flagged as primary keys.

    Args:
        table_schema (TableSchema): The table schema whose columns are inspected.

    Returns:
        List[str]: Primary key column names, in the iteration order of
        ``table_schema.columns``.
    """
    primary_keys: List[str] = []
    for column_name, column_info in table_schema.columns.items():
        if column_info.primary_key:
            primary_keys.append(column_name)
    return primary_keys
80
+
81
+
82
@dataclass
class DatabaseSchema:
    """
    Represents the schema of an entire database, consisting of multiple tables.

    The ``get_*`` lookup helpers match table and column names
    case-insensitively; the keys stored in ``tables`` keep their original case.

    Attributes:
        tables (Dict[str, TableSchema]): A dictionary mapping table names to their schemas.
    """

    tables: Dict[str, TableSchema] = field(default_factory=dict)

    @classmethod
    def from_table_names(cls, table_names: List[str]) -> "DatabaseSchema":
        """
        Creates a DatabaseSchema from a list of table names.

        Args:
            table_names (List[str]): The names of the tables to include in the schema.

        Returns:
            DatabaseSchema: A schema in which every table has no columns yet.
        """
        return cls(tables={name: TableSchema() for name in table_names})

    @classmethod
    def from_schema_dict(
        cls, schema_dict: Dict[str, List[str]]
    ) -> "DatabaseSchema":
        """
        Creates a DatabaseSchema from a dictionary mapping table names to lists of column names.

        Args:
            schema_dict (Dict[str, List[str]]): The schema dictionary to convert.

        Returns:
            DatabaseSchema: A schema whose columns all carry default ``ColumnInfo``.
        """
        return cls(
            tables={
                table_name: TableSchema(
                    columns={column_name: ColumnInfo() for column_name in column_names}
                )
                for table_name, column_names in schema_dict.items()
            }
        )

    @classmethod
    def from_schema_dict_with_examples(
        cls, schema_dict_with_info: Dict[str, Dict[str, List[str]]]
    ) -> "DatabaseSchema":
        """
        Creates a DatabaseSchema from a dictionary with example values for each column.

        Args:
            schema_dict_with_info (Dict[str, Dict[str, List[str]]]): Mapping of
                table name -> column name -> example values.

        Returns:
            DatabaseSchema: A schema whose columns have ``examples`` populated.
        """
        return cls(
            tables={
                table_name: TableSchema(
                    columns={
                        column_name: ColumnInfo(examples=column_info)
                        for column_name, column_info in column_dict.items()
                    }
                )
                for table_name, column_dict in schema_dict_with_info.items()
            }
        )

    @classmethod
    def from_schema_dict_with_descriptions(
        cls,
        tentative_schema: Dict[str, Dict[str, Dict[str, Any]]],
        schema_dict_with_info: Dict[str, Dict[str, Dict[str, Any]]]
    ) -> "DatabaseSchema":
        """
        Creates a DatabaseSchema from a dictionary with detailed information for each column.

        Args:
            tentative_schema (Dict[str, Dict[str, Dict[str, Any]]]): The base schema
                structure. Each table entry must have a ``"columns"`` key whose
                value is a mapping keyed by column name.
            schema_dict_with_info (Dict[str, Dict[str, Dict[str, Any]]]): Mapping of
                table name -> column name -> {field name: value}.

        Returns:
            DatabaseSchema: The constructed database schema.

        Raises:
            ValueError: If an info entry names a field that ``ColumnInfo`` does not have.
        """
        # Convert new schema format to simplified format for initial creation
        simplified_schema = {
            table_name: list(table_info["columns"].keys())
            for table_name, table_info in tentative_schema.items()
        }

        database_schema = cls.from_schema_dict(simplified_schema)

        # Matching here is exact (case-sensitive); tables or columns absent
        # from the tentative schema are skipped silently.
        for table_name, table_info in schema_dict_with_info.items():
            if table_name not in database_schema.tables:
                continue
            columns = database_schema.tables[table_name].columns
            for column_name, info in table_info.items():
                if column_name not in columns:
                    continue
                column_info = columns[column_name]
                for field_name, value in info.items():
                    set_field(column_info, field_name, value)

        return database_schema

    def get_actual_table_name(
        self, table_name: str
    ) -> Optional[str]:
        """
        Retrieves the actual table name matching the provided name, case-insensitive.

        Args:
            table_name (str): The name of the table to search for.

        Returns:
            Optional[str]: The stored table name if found, otherwise None.
        """
        table_name_lower = table_name.lower()
        return next(
            (name for name in self.tables if name.lower() == table_name_lower), None
        )

    def get_table_info(
        self, table_name: str
    ) -> Optional[TableSchema]:
        """
        Retrieves the TableSchema object for the specified table name (case-insensitive).

        Args:
            table_name (str): The name of the table to retrieve.

        Returns:
            Optional[TableSchema]: The TableSchema if found, otherwise None.
        """
        actual_name = self.get_actual_table_name(table_name)
        if actual_name is None:
            return None
        return self.tables[actual_name]

    def get_actual_column_name(
        self, table_name: str, column_name: str
    ) -> Optional[str]:
        """
        Retrieves the actual column name matching the provided name, case-insensitive.

        Args:
            table_name (str): The name of the table containing the column.
            column_name (str): The name of the column to search for.

        Returns:
            Optional[str]: The stored column name if found, otherwise None
            (also when the table itself is unknown).
        """
        table_info = self.get_table_info(table_name)
        if table_info is None:
            return None
        column_name_lower = column_name.lower()
        return next(
            (
                name
                for name in table_info.columns
                if name.lower() == column_name_lower
            ),
            None,
        )

    def get_column_info(
        self, table_name: str, column_name: str
    ) -> Optional[ColumnInfo]:
        """
        Retrieves the ColumnInfo object for the specified column in a table.

        Both names are matched case-insensitively.

        Args:
            table_name (str): The name of the table containing the column.
            column_name (str): The name of the column to retrieve.

        Returns:
            Optional[ColumnInfo]: The ColumnInfo if found, otherwise None.
        """
        # Resolve the table through the case-insensitive lookup; indexing
        # self.tables with the caller's spelling raises KeyError whenever the
        # case differs from the stored key.
        table_info = self.get_table_info(table_name)
        if table_info is None:
            return None
        actual_name = self.get_actual_column_name(table_name, column_name)
        if actual_name is None:
            return None
        return table_info.columns[actual_name]

    def set_columns_info(
        self, schema_with_info: Dict[str, Dict[str, Dict[str, Any]]]
    ) -> None:
        """
        Sets detailed information for columns in the schema.

        Unknown tables and columns are logged as warnings and skipped.

        Args:
            schema_with_info (Dict[str, Dict[str, Dict[str, Any]]]): Mapping of
                table name -> column name -> {field name: value}.

        Raises:
            ValueError: If a field name is not a valid ``ColumnInfo`` field.
        """
        for table_name, columns_info in schema_with_info.items():
            table_info = self.get_table_info(table_name)
            if table_info is None:
                logger.warning("Table %s not found in the schema", table_name)
                continue
            for column_name, info in columns_info.items():
                actual_name = self.get_actual_column_name(table_name, column_name)
                if actual_name is None:
                    logger.warning(
                        "Column %s not found in table %s", column_name, table_name
                    )
                    continue
                schema_column_info = table_info.columns[actual_name]
                for field_name, value in info.items():
                    set_field(schema_column_info, field_name, value)

    def subselect_schema(
        self, selected_database_schema: "DatabaseSchema"
    ) -> "DatabaseSchema":
        """
        Creates a new DatabaseSchema containing only the selected tables and columns.

        Table and column names in the result use this schema's spelling, while
        the ``ColumnInfo`` objects are the ones from ``selected_database_schema``
        (they are not copied). Names unknown to this schema are logged and skipped.

        Args:
            selected_database_schema (DatabaseSchema): The tables/columns to keep.

        Returns:
            DatabaseSchema: The new subselected database schema.
        """
        new_schema = DatabaseSchema({})
        for table_name, table_info in selected_database_schema.tables.items():
            actual_table_name = self.get_actual_table_name(table_name)
            if actual_table_name is None:
                logger.warning("Table %s not found in the schema", table_name)
                continue
            new_table_info = TableSchema()
            for column_name, column_info in table_info.columns.items():
                actual_column_name = self.get_actual_column_name(
                    table_name, column_name
                )
                if actual_column_name is None:
                    logger.warning(
                        "Column %s not found in table %s", column_name, table_name
                    )
                    continue
                new_table_info.columns[actual_column_name] = column_info
            new_schema.tables[actual_table_name] = new_table_info
        return new_schema

    def add_info_from_schema(
        self, schema: "DatabaseSchema", field_names: List[str]
    ) -> None:
        """
        Adds additional field information from another schema to the current schema.

        Tables or columns of this schema that have no case-insensitive match in
        ``schema`` are left untouched.

        Args:
            schema (DatabaseSchema): The schema to copy information from.
            field_names (List[str]): The list of field names to copy.

        Raises:
            ValueError: If a name in ``field_names`` is not a valid ``ColumnInfo`` field.
        """
        for table_name, table_info in self.tables.items():
            actual_table_name = schema.get_actual_table_name(table_name)
            if actual_table_name is None:
                continue
            source_columns = schema.tables[actual_table_name].columns
            for column_name, column_info in table_info.columns.items():
                actual_column_name = schema.get_actual_column_name(
                    table_name, column_name
                )
                if actual_column_name is None:
                    continue
                new_column_info = source_columns[actual_column_name]
                for field_name in field_names:
                    set_field(
                        column_info, field_name, getattr(new_column_info, field_name)
                    )

    def to_dict(self) -> Dict[str, List[str]]:
        """
        Converts the DatabaseSchema to a dictionary representation.

        Returns:
            Dict[str, List[str]]: Mapping of table name -> list of column names.
        """
        return {
            table_name: list(table_info.columns.keys())
            for table_name, table_info in self.tables.items()
        }

    def validate_schema(self) -> List[str]:
        """
        Validates the schema for integrity and returns a list of validation errors.

        Returns:
            List[str]: A list of validation error messages, empty if no errors.
        """
        errors = []

        # Check for foreign key references to non-existent tables/columns
        for table_name, table_info in self.tables.items():
            for column_name, column_info in table_info.columns.items():
                for fk_table, fk_column in column_info.foreign_keys:
                    if self.get_actual_table_name(fk_table) is None:
                        errors.append(f"Foreign key in {table_name}.{column_name} references non-existent table {fk_table}")
                        continue

                    if self.get_actual_column_name(fk_table, fk_column) is None:
                        errors.append(f"Foreign key in {table_name}.{column_name} references non-existent column {fk_table}.{fk_column}")

        return errors
@@ -0,0 +1,117 @@
1
+ import logging
2
+ import pickle
3
+ from pathlib import Path
4
+ from typing import Dict, List, Tuple
5
+
6
+ from datasketch import MinHash, MinHashLSH
7
+
8
+ from .preprocess_values import _create_minhash
9
+
10
+
11
+ ### Database value similarity ###
12
+
13
+
14
+ def _jaccard_similarity(m1: MinHash, m2: MinHash) -> float:
15
+ """
16
+ Computes the Jaccard similarity between two MinHash objects.
17
+
18
+ Args:
19
+ m1 (MinHash): The first MinHash object.
20
+ m2 (MinHash): The second MinHash object.
21
+
22
+ Returns:
23
+ float: The Jaccard similarity between the two MinHash objects.
24
+ """
25
+ return m1.jaccard(m2)
26
+
27
+
28
def load_db_lsh(
    db_directory_path: str,
) -> Tuple[MinHashLSH, Dict[str, Tuple[MinHash, str, str, str]]]:
    """
    Loads the LSH and MinHashes from the preprocessed files in the specified directory.

    The new ``LshManager`` is tried first. If it raises, a warning is logged;
    if it raises or reports that nothing was loaded, the legacy pickle files
    ``preprocessed/<db_id>_lsh.pkl`` and ``preprocessed/<db_id>_minhashes.pkl``
    are read instead, where ``<db_id>`` is the last component of
    ``db_directory_path``.

    Args:
        db_directory_path (str): The path to the database directory.

    Returns:
        Tuple[MinHashLSH, Dict[str, Tuple[MinHash, str, str, str]]]: The LSH object and the dictionary of MinHashes.

    Raises:
        Exception: Whatever the legacy load raised (for example
            ``FileNotFoundError``); it is logged and re-raised unchanged.
    """
    db_directory = Path(db_directory_path)
    db_id = db_directory.name

    try:
        # Try using the new LSH manager first
        from ..lsh.manager import LshManager

        lsh_manager = LshManager(db_directory)
        if lsh_manager.load_lsh():
            return lsh_manager.lsh, lsh_manager.minhashes
    except Exception as e:
        # Deliberate best-effort: any manager failure falls through to the
        # legacy on-disk layout below.
        logging.warning(f"New LSH manager failed, falling back to old method: {e}")

    # Fallback to old method
    preprocessed_dir = db_directory / "preprocessed"
    try:
        # NOTE(security): pickle.load can execute arbitrary code. These files
        # must only ever come from a trusted preprocessing run.
        with open(preprocessed_dir / f"{db_id}_lsh.pkl", "rb") as file:
            lsh = pickle.load(file)
        with open(preprocessed_dir / f"{db_id}_minhashes.pkl", "rb") as file:
            minhashes = pickle.load(file)
    except Exception as e:
        logging.error(f"Error loading LSH for {db_id}: {e}")
        # Bare raise keeps the original traceback without adding a frame.
        raise
    return lsh, minhashes
70
+
71
+
72
def _query_lsh(
    lsh: MinHashLSH,
    minhashes: Dict[str, Tuple[MinHash, str, str, str]],
    keyword: str,
    signature_size: int = 30,
    n_gram: int = 3,
    top_n: int = 10,
) -> Dict[str, Dict[str, List[str]]]:
    """
    Queries the LSH for similar values to the given keyword and returns the top results.

    Candidates returned by ``lsh.query`` are ranked by their Jaccard similarity
    to the keyword's MinHash, and the ``top_n`` best are grouped by table and
    column.

    Args:
        lsh (MinHashLSH): The LSH object.
        minhashes (Dict[str, Tuple[MinHash, str, str, str]]): The dictionary of MinHashes,
            keyed by LSH key, holding (MinHash, table name, column name, value).
        keyword (str): The keyword to search for.
        signature_size (int, optional): The size of the MinHash signature.
        n_gram (int, optional): The n-gram size for the MinHash.
        top_n (int, optional): The number of top results to return.

    Returns:
        Dict[str, Dict[str, List[str]]]: A dictionary containing the top similar values.
        Example:
            {
                'table_name1': {
                    'column_name1': ['value1', 'value2', 'value3'],
                    'column_name2': ['value4', 'value5']
                },
                'table_name2': {
                    'column_name3': ['value6']
                }
            }
    """
    query_minhash = _create_minhash(signature_size, keyword, n_gram)
    scored = [
        (key, _jaccard_similarity(query_minhash, minhashes[key][0]))
        for key in lsh.query(query_minhash)
    ]
    # sorted() is stable, so ties keep the order in which lsh.query returned them.
    top_matches = sorted(scored, key=lambda pair: pair[1], reverse=True)[:top_n]

    similar_values_trimmed: Dict[str, Dict[str, List[str]]] = {}
    for key, _score in top_matches:
        table_name, column_name, value = minhashes[key][1:]  # type: ignore
        similar_values_trimmed.setdefault(table_name, {}).setdefault(
            column_name, []
        ).append(value)

    return similar_values_trimmed
@@ -0,0 +1,21 @@
1
+ """
2
+ LSH (Locality Sensitive Hashing) module for database-independent LSH management.
3
+ """
4
+
5
+ from .storage import LshStorageStrategy, PickleStorage
6
+ from .manager import LshManager
7
+ from .factory import LshFactory, make_db_lsh
8
+ from .core import create_minhash, skip_column, jaccard_similarity, create_lsh_index, query_lsh_index
9
+
10
# Names exported by the lsh package; each one is imported above from a
# submodule (storage, manager, factory, core).
__all__ = [
    "LshStorageStrategy",
    "PickleStorage",
    "LshManager",
    "LshFactory",
    "make_db_lsh",
    "create_minhash",
    "skip_column",
    "jaccard_similarity",
    "create_lsh_index",
    "query_lsh_index"
]
@@ -0,0 +1,182 @@
1
+ """
2
+ Core LSH functionality extracted from helpers.
3
+ """
4
+
5
+ import logging
6
+ from typing import Dict, List, Tuple
7
+
8
+ from datasketch import MinHash, MinHashLSH
9
+ from tqdm import tqdm
10
+
11
+
12
def create_minhash(signature_size: int, string: str, n_gram: int) -> MinHash:
    """
    Builds a MinHash from the character n-grams of a string.

    Every contiguous substring of length ``n_gram`` is UTF-8 encoded and fed
    to the MinHash. A string shorter than ``n_gram`` contributes nothing, so
    the returned MinHash has received no updates.

    Args:
        signature_size (int): Number of permutations (``num_perm``) for the MinHash.
        string (str): The text to hash.
        n_gram (int): Length of each character n-gram.

    Returns:
        MinHash: The MinHash updated with every n-gram of ``string``.
    """
    minhash = MinHash(num_perm=signature_size)
    window_count = len(string) - n_gram + 1
    for start in range(window_count):
        shingle = string[start : start + n_gram]
        minhash.update(shingle.encode("utf8"))
    return minhash
28
+
29
+
30
def skip_column(column_name: str, column_values: List[str]) -> bool:
    """
    Determines whether to skip processing a column based on its values.

    A column is skipped when its values are both long in total (more than
    50000 characters) and long on average (more than 20 characters per
    value). Columns whose name contains "name" (case-insensitive) are never
    skipped, and neither are columns with no values.

    Args:
        column_name (str): The name of the column.
        column_values (List[str]): The list of values in the column.

    Returns:
        bool: True if the column should be skipped, False otherwise.
    """
    if "name" in column_name.lower():
        return False
    # An empty column has nothing to index; returning early also avoids a
    # ZeroDivisionError in the average below.
    if not column_values:
        return False
    sum_of_lengths = sum(len(value) for value in column_values)
    average_length = sum_of_lengths / len(column_values)
    return (sum_of_lengths > 50000) and (average_length > 20)
46
+
47
+
48
def jaccard_similarity(m1: MinHash, m2: MinHash) -> float:
    """
    Returns the Jaccard similarity of two MinHash objects.

    Thin wrapper that delegates to ``m1.jaccard(m2)``.

    Args:
        m1 (MinHash): The MinHash whose ``jaccard`` method is invoked.
        m2 (MinHash): The MinHash passed as the comparison target.

    Returns:
        float: Whatever ``m1.jaccard(m2)`` returns.
    """
    similarity = m1.jaccard(m2)
    return similarity
60
+
61
+
62
def create_lsh_index(
    unique_values: Dict[str, Dict[str, List[str]]],
    signature_size: int,
    n_gram: int,
    threshold: float,
    verbose: bool = True,
) -> Tuple[MinHashLSH, Dict[str, Tuple[MinHash, str, str, str]]]:
    """
    Creates a MinHash Locality-Sensitive Hashing (LSH) index from unique values in a database.

    This function processes unique values from database tables and columns, creates MinHash
    signatures for each value, and builds an LSH index for efficient similarity search.

    Args:
        unique_values (Dict[str, Dict[str, List[str]]]): A nested dictionary containing unique values
            from the database. The structure is {table_name: {column_name: [values]}}.
        signature_size (int): The number of permutations to use in the MinHash signatures.
        n_gram (int): The size of n-grams to use when creating MinHash signatures.
        threshold (float): The similarity threshold passed to ``MinHashLSH``.
        verbose (bool, optional): If True, displays a progress bar during processing. Defaults to True.

    Returns:
        Tuple[MinHashLSH, Dict[str, Tuple[MinHash, str, str, str]]]: A tuple containing:
            - MinHashLSH: The constructed LSH index.
            - Dict[str, Tuple[MinHash, str, str, str]]: A dictionary mapping unique keys to tuples
              containing (MinHash object, table name, column name, original value).

    Note:
        Errors raised while building the index are logged and NOT re-raised.
        The index and dictionary built up to that point are returned, so the
        result may be partial.
    """
    lsh = MinHashLSH(threshold=threshold, num_perm=signature_size)
    minhashes: Dict[str, Tuple[MinHash, str, str, str]] = {}
    progress_bar = None
    try:
        total_unique_values = sum(
            len(column_values)
            for table_values in unique_values.values()
            for column_values in table_values.values()
        )
        logging.info(f"Total unique values: {total_unique_values}")

        if verbose:
            progress_bar = tqdm(total=total_unique_values, desc="Creating LSH")

        for table_name, table_values in unique_values.items():
            for column_name, column_values in table_values.items():
                if column_name.lower() == "doctype":
                    # Formerly a print() banner; library code should not
                    # write to stdout.
                    logging.debug("Doctype found")
                logging.info(
                    f"Processing {table_name} - {column_name} - {len(column_values)}"
                )

                for value_index, value in enumerate(column_values):
                    minhash = create_minhash(signature_size, value, n_gram)
                    # NOTE(review): "_"-joined keys can collide when table or
                    # column names contain underscores (e.g. "a_b"+"c" vs
                    # "a"+"b_c"). Presumably a duplicate key makes insert()
                    # fail and land in the handler below. Confirm.
                    minhash_key = f"{table_name}_{column_name}_{value_index}"
                    minhashes[minhash_key] = (minhash, table_name, column_name, value)
                    lsh.insert(minhash_key, minhash)

                    if progress_bar is not None:
                        progress_bar.update(1)
    except Exception as e:
        logging.error(f"Error creating LSH: {e}")
    finally:
        # Close the bar on the error path too so it is never left open.
        if progress_bar is not None:
            progress_bar.close()

    return lsh, minhashes
135
+
136
+
137
def query_lsh_index(
    lsh: MinHashLSH,
    minhashes: Dict[str, Tuple[MinHash, str, str, str]],
    keyword: str,
    signature_size: int = 30,
    n_gram: int = 3,
    top_n: int = 10,
) -> Dict[str, Dict[str, List[str]]]:
    """
    Queries the LSH for similar values to the given keyword and returns the top results.

    Candidates returned by ``lsh.query`` are ranked by their Jaccard similarity
    to the keyword's MinHash, and the ``top_n`` best are grouped by table and
    column.

    Args:
        lsh (MinHashLSH): The LSH object.
        minhashes (Dict[str, Tuple[MinHash, str, str, str]]): The dictionary of MinHashes,
            keyed by LSH key, holding (MinHash, table name, column name, value).
        keyword (str): The keyword to search for.
        signature_size (int, optional): The size of the MinHash signature.
        n_gram (int, optional): The n-gram size for the MinHash.
        top_n (int, optional): The number of top results to return.

    Returns:
        Dict[str, Dict[str, List[str]]]: A dictionary containing the top similar values.
        Example:
            {
                'table_name1': {
                    'column_name1': ['value1', 'value2', 'value3'],
                    'column_name2': ['value4', 'value5']
                },
                'table_name2': {
                    'column_name3': ['value6']
                }
            }
    """
    query_minhash = create_minhash(signature_size, keyword, n_gram)
    scored = [
        (key, jaccard_similarity(query_minhash, minhashes[key][0]))
        for key in lsh.query(query_minhash)
    ]
    # sorted() is stable, so ties keep the order in which lsh.query returned them.
    top_matches = sorted(scored, key=lambda pair: pair[1], reverse=True)[:top_n]

    similar_values_trimmed: Dict[str, Dict[str, List[str]]] = {}
    for key, _score in top_matches:
        table_name, column_name, value = minhashes[key][1:]  # type: ignore
        similar_values_trimmed.setdefault(table_name, {}).setdefault(
            column_name, []
        ).append(value)

    return similar_values_trimmed