thoth-dbmanager 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. thoth_dbmanager/ThothDbManager.py +459 -0
  2. thoth_dbmanager/__init__.py +136 -0
  3. thoth_dbmanager/adapters/__init__.py +21 -0
  4. thoth_dbmanager/adapters/mariadb.py +165 -0
  5. thoth_dbmanager/adapters/mysql.py +165 -0
  6. thoth_dbmanager/adapters/oracle.py +554 -0
  7. thoth_dbmanager/adapters/postgresql.py +444 -0
  8. thoth_dbmanager/adapters/qdrant.py +189 -0
  9. thoth_dbmanager/adapters/sqlite.py +385 -0
  10. thoth_dbmanager/adapters/sqlserver.py +583 -0
  11. thoth_dbmanager/adapters/supabase.py +249 -0
  12. thoth_dbmanager/core/__init__.py +13 -0
  13. thoth_dbmanager/core/factory.py +272 -0
  14. thoth_dbmanager/core/interfaces.py +271 -0
  15. thoth_dbmanager/core/registry.py +220 -0
  16. thoth_dbmanager/documents.py +155 -0
  17. thoth_dbmanager/dynamic_imports.py +250 -0
  18. thoth_dbmanager/helpers/__init__.py +0 -0
  19. thoth_dbmanager/helpers/multi_db_generator.py +508 -0
  20. thoth_dbmanager/helpers/preprocess_values.py +159 -0
  21. thoth_dbmanager/helpers/schema.py +376 -0
  22. thoth_dbmanager/helpers/search.py +117 -0
  23. thoth_dbmanager/lsh/__init__.py +21 -0
  24. thoth_dbmanager/lsh/core.py +182 -0
  25. thoth_dbmanager/lsh/factory.py +76 -0
  26. thoth_dbmanager/lsh/manager.py +170 -0
  27. thoth_dbmanager/lsh/storage.py +96 -0
  28. thoth_dbmanager/plugins/__init__.py +23 -0
  29. thoth_dbmanager/plugins/mariadb.py +436 -0
  30. thoth_dbmanager/plugins/mysql.py +408 -0
  31. thoth_dbmanager/plugins/oracle.py +150 -0
  32. thoth_dbmanager/plugins/postgresql.py +145 -0
  33. thoth_dbmanager/plugins/qdrant.py +41 -0
  34. thoth_dbmanager/plugins/sqlite.py +170 -0
  35. thoth_dbmanager/plugins/sqlserver.py +149 -0
  36. thoth_dbmanager/plugins/supabase.py +224 -0
  37. {thoth_dbmanager-0.4.0.dist-info → thoth_dbmanager-0.4.2.dist-info}/METADATA +9 -6
  38. thoth_dbmanager-0.4.2.dist-info/RECORD +41 -0
  39. thoth_dbmanager-0.4.2.dist-info/top_level.txt +1 -0
  40. thoth_dbmanager-0.4.0.dist-info/RECORD +0 -5
  41. thoth_dbmanager-0.4.0.dist-info/top_level.txt +0 -1
  42. {thoth_dbmanager-0.4.0.dist-info → thoth_dbmanager-0.4.2.dist-info}/WHEEL +0 -0
  43. {thoth_dbmanager-0.4.0.dist-info → thoth_dbmanager-0.4.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,508 @@
1
+ import logging
2
+ import random
3
+ import re
4
+ from typing import Dict, List, Optional
5
+ from pathlib import Path
6
+
7
+ from dbmanager.helpers.schema import DatabaseSchema
8
+
9
+
10
+ class MultiDbGenerator:
11
+ """
12
+ Class for multi-database schema generation.
13
+ Works with any ThothDbManager implementation.
14
+ """
15
+
16
+ CACHED_DB_SCHEMA = {}
17
+
18
+ def __init__(
19
+ self,
20
+ dbmanager,
21
+ tentative_schema: Optional[DatabaseSchema] = None,
22
+ schema_with_examples: Optional[DatabaseSchema] = None,
23
+ schema_with_descriptions: Optional[DatabaseSchema] = None,
24
+ add_examples: bool = True):
25
+ self.db_manager = dbmanager
26
+ self.db_id = dbmanager.db_id if dbmanager else None
27
+ self.add_examples = add_examples
28
+ self.schema_structure = tentative_schema or DatabaseSchema()
29
+ self.schema_with_examples = schema_with_examples or DatabaseSchema()
30
+ self.schema_with_descriptions = schema_with_descriptions or DatabaseSchema()
31
+
32
+ if dbmanager and self.db_id not in MultiDbGenerator.CACHED_DB_SCHEMA:
33
+ self._load_schema_into_cache()
34
+ self._initialize_schema_structure()
35
+
36
+ def _load_schema_into_cache(self) -> None:
37
+ """Load database schema into cache using manager methods"""
38
+ schema_dict = self.db_manager.get_schema_dict()
39
+ db_schema = DatabaseSchema.from_schema_dict(schema_dict)
40
+ MultiDbGenerator.CACHED_DB_SCHEMA[self.db_id] = db_schema
41
+
42
+ # Set primary keys
43
+ primary_keys = self.db_manager.get_primary_keys()
44
+ schema_with_primary_keys = {
45
+ table_name: {col: {"primary_key": True} for col in cols}
46
+ for table_name, cols in primary_keys.items()
47
+ }
48
+ db_schema.set_columns_info(schema_with_primary_keys)
49
+
50
+ # Set foreign keys
51
+ foreign_keys = self.db_manager.get_foreign_keys()
52
+ schema_with_references = {
53
+ table_name: {
54
+ column_name: {
55
+ "foreign_keys": info.get("foreign_keys", []),
56
+ "referenced_by": info.get("referenced_by", []),
57
+ }
58
+ for column_name, info in columns.items()
59
+ }
60
+ for table_name, columns in foreign_keys.items()
61
+ }
62
+ db_schema.set_columns_info(schema_with_references)
63
+
64
+ def _set_primary_keys(self, database_schema: DatabaseSchema) -> None:
65
+ """Set primary keys using manager method"""
66
+ primary_keys = self.db_manager.get_primary_keys()
67
+ schema_with_primary_keys = {
68
+ table_name: {col: {"primary_key": True} for col in cols}
69
+ for table_name, cols in primary_keys.items()
70
+ }
71
+ database_schema.set_columns_info(schema_with_primary_keys)
72
+
73
+ def _set_foreign_keys(self, database_schema: DatabaseSchema) -> None:
74
+ """Set foreign keys using manager method"""
75
+ foreign_keys = self.db_manager.get_foreign_keys()
76
+ schema_with_references = {
77
+ table_name: {
78
+ column_name: {
79
+ "foreign_keys": info.get("foreign_keys", []),
80
+ "referenced_by": info.get("referenced_by", []),
81
+ }
82
+ for column_name, info in columns.items()
83
+ }
84
+ for table_name, columns in foreign_keys.items()
85
+ }
86
+ database_schema.set_columns_info(schema_with_references)
87
+
88
+ def _initialize_schema_structure(self) -> None:
89
+ """Initialize the schema structure with table and column info"""
90
+ self._load_table_and_column_info()
91
+ self._load_column_examples()
92
+ self._load_column_descriptions()
93
+
94
+ def _load_table_and_column_info(self) -> None:
95
+ """Load table and column information from cached schema"""
96
+ if self.db_id in MultiDbGenerator.CACHED_DB_SCHEMA:
97
+ self.schema_structure = MultiDbGenerator.CACHED_DB_SCHEMA[
98
+ self.db_id
99
+ ].subselect_schema(self.schema_structure)
100
+ self.schema_structure.add_info_from_schema(
101
+ schema=MultiDbGenerator.CACHED_DB_SCHEMA[self.db_id],
102
+ field_names=["type", "primary_key", "foreign_keys", "referenced_by"],
103
+ )
104
+
105
+ def _load_column_examples(self) -> None:
106
+ """Load examples for columns in the schema"""
107
+ self.schema_structure.add_info_from_schema(
108
+ schema=self.schema_with_examples, field_names=["examples"]
109
+ )
110
+
111
+ def _load_column_descriptions(self) -> None:
112
+ """Load descriptions for columns in the schema"""
113
+ self.schema_structure.add_info_from_schema(
114
+ schema=self.schema_with_descriptions,
115
+ field_names=[
116
+ "original_column_name",
117
+ "column_name",
118
+ "column_description",
119
+ "data_format",
120
+ "value_description",
121
+ ],
122
+ )
123
+
124
+ def _extract_create_ddl_commands(self) -> Dict[str, str]:
125
+ """Extract CREATE TABLE DDL commands for all tables
126
+
127
+ Returns:
128
+ Dict[str, str]: Dictionary mapping table names to their CREATE TABLE DDL statements
129
+ """
130
+ if hasattr(self.db_manager, 'extract_create_ddl_commands'):
131
+ return self.db_manager.extract_create_ddl_commands()
132
+ else:
133
+ raise NotImplementedError("extract_create_ddl_commands method not implemented for this database manager")
134
+
135
+ def generate_schema_string(
136
+ self,
137
+ include_value_description: bool = True,
138
+ shuffle_cols: bool = True,
139
+ shuffle_tables: bool = True,
140
+ ) -> str:
141
+ """
142
+ Generates a schema string with descriptions and examples.
143
+
144
+ Args:
145
+ include_value_description (bool): Flag to include value descriptions.
146
+ shuffle_cols (bool): Flag to shuffle columns within tables.
147
+ shuffle_tables (bool): Flag to shuffle tables in the output.
148
+
149
+ Returns:
150
+ str: The generated schema string.
151
+ """
152
+ ddl_commands = self._extract_create_ddl_commands()
153
+ schema_strings = []
154
+
155
+ if shuffle_tables:
156
+ ddl_tables = list(ddl_commands.keys())
157
+ random.shuffle(ddl_tables)
158
+ ddl_commands = {
159
+ table_name: ddl_commands[table_name] for table_name in ddl_tables
160
+ }
161
+
162
+ for table_name, ddl_command in ddl_commands.items():
163
+ ddl_command = re.sub(r"\s+", " ", ddl_command.strip())
164
+ create_table_match = re.match(
165
+ r'CREATE TABLE "?`?([\w -]+)`?"?\s*\((.*)\)', ddl_command, re.DOTALL
166
+ )
167
+
168
+ if not create_table_match:
169
+ logging.warning(f"Could not parse DDL command for table {table_name}")
170
+ continue
171
+
172
+ table = create_table_match.group(1).strip()
173
+ if table != table_name:
174
+ logging.warning(f"Table name mismatch: {table} != {table_name}")
175
+
176
+ column_definitions = create_table_match.group(2).strip()
177
+
178
+ if table_name in self.schema_structure.tables:
179
+ table_schema = self.schema_structure.tables[table_name]
180
+
181
+ # Start building the new CREATE TABLE statement
182
+ schema_lines = [f"CREATE TABLE {table_name}", "("]
183
+
184
+ # Process column definitions
185
+ definitions = self._separate_column_definitions(column_definitions)
186
+ column_defs = []
187
+ constraint_defs = []
188
+
189
+ # Extract column definitions and constraints
190
+ for definition in definitions:
191
+ if definition.lower().startswith("foreign key") or definition.lower().startswith("constraint"):
192
+ constraint_defs.append(definition)
193
+ else:
194
+ column_match = re.match(r'"?`?([\w_]+)`?"?\s+(.*)', definition)
195
+ if column_match:
196
+ column_name = column_match.group(1)
197
+ column_type = column_match.group(2)
198
+
199
+ # Remove NULL/NOT NULL constraints as requested
200
+ column_type = re.sub(r'\s+(?:NOT\s+)?NULL', '', column_type, flags=re.IGNORECASE)
201
+
202
+ # Check if this is a primary key
203
+ is_primary_key = "primary key" in column_type.lower()
204
+
205
+ # Format the column definition
206
+ column_def = f"\t{column_name} {column_type}"
207
+
208
+ # Add comments with examples and descriptions
209
+ if column_name in table_schema.columns:
210
+ column_info = table_schema.columns[column_name]
211
+ comment_parts = []
212
+
213
+ # Add examples if available
214
+ if hasattr(column_info, 'examples') and column_info.examples:
215
+ examples = [f"`{ex}`" for ex in column_info.examples[:3]] # Limit to 3 examples
216
+ comment_parts.append(f"examples: {', '.join(examples)}")
217
+
218
+ # Add column name if available
219
+ if hasattr(column_info, 'column_name') and column_info.column_name:
220
+ comment_parts.append(f"| `{column_info.column_name}`")
221
+
222
+ # Add column description if available
223
+ if hasattr(column_info, 'column_description') and column_info.column_description:
224
+ comment_parts.append(f"description: {column_info.column_description}")
225
+
226
+ # Add value description if available and requested
227
+ if include_value_description and hasattr(column_info, 'value_description') and column_info.value_description:
228
+ comment_parts.append(f"values: {column_info.value_description}")
229
+
230
+ # Add the comment to the column definition
231
+ if comment_parts:
232
+ column_def += f" -- {' '.join(comment_parts)}"
233
+
234
+ column_defs.append(column_def)
235
+
236
+ # Process foreign key constraints with references
237
+ for column_name, column_info in table_schema.columns.items():
238
+ if hasattr(column_info, 'foreign_keys') and column_info.foreign_keys:
239
+ for ref_table, ref_column in column_info.foreign_keys:
240
+ # Fixed: Properly access tuple elements instead of using dictionary access
241
+ fk_constraint = f"\tforeign key ({column_name}) references {ref_table} ({ref_column}) on update cascade on delete cascade"
242
+ constraint_defs.append(fk_constraint)
243
+
244
+ # Combine column definitions and constraints
245
+ all_defs = column_defs + constraint_defs
246
+ schema_lines.extend(all_defs)
247
+ schema_lines.append(");")
248
+
249
+ # Join all lines to form the complete CREATE TABLE statement
250
+ schema_strings.append("\n".join(schema_lines))
251
+
252
+ return "\n\n".join(schema_strings)
253
+
254
+ @staticmethod
255
+ def _separate_column_definitions(column_definitions: str) -> List[str]:
256
+ """Separate column definitions from a CREATE TABLE statement
257
+
258
+ Args:
259
+ column_definitions (str): The column definitions part of a CREATE TABLE statement
260
+
261
+ Returns:
262
+ List[str]: List of individual column definitions
263
+ """
264
+ definitions = []
265
+ current_def = ""
266
+ paren_count = 0
267
+
268
+ for char in column_definitions:
269
+ if char == '(' and not current_def.strip().lower().startswith("constraint"):
270
+ paren_count += 1
271
+ elif char == ')' and not current_def.strip().lower().startswith("constraint"):
272
+ paren_count -= 1
273
+
274
+ current_def += char
275
+
276
+ if char == ',' and paren_count == 0:
277
+ definitions.append(current_def[:-1].strip())
278
+ current_def = ""
279
+
280
+ if current_def.strip():
281
+ definitions.append(current_def.strip())
282
+
283
+ return definitions
284
+
285
+ def _is_connection(self, table_name: str, column_name: str) -> bool:
286
+ """
287
+ Checks if a column is a connection (primary key or foreign key).
288
+
289
+ Args:
290
+ table_name (str): The name of the table.
291
+ column_name (str): The name of the column.
292
+
293
+ Returns:
294
+ bool: True if the column is a connection, False otherwise.
295
+ """
296
+ column_info = self.CACHED_DB_SCHEMA[self.db_id].get_column_info(
297
+ table_name, column_name
298
+ )
299
+ if column_info is None:
300
+ return False
301
+ if column_info.primary_key:
302
+ return True
303
+ for target_table, _ in column_info.foreign_keys:
304
+ if self.schema_structure.get_table_info(target_table):
305
+ return True
306
+ for target_table, _ in column_info.referenced_by:
307
+ if self.schema_structure.get_table_info(target_table):
308
+ return True
309
+ for target_table_name, table_schema in self.schema_structure.tables.items():
310
+ if table_name.lower() == target_table_name.lower():
311
+ continue
312
+ for target_column_name, target_column_info in table_schema.columns.items():
313
+ if (
314
+ target_column_name.lower() == column_name.lower()
315
+ and target_column_info.primary_key
316
+ ):
317
+ return True
318
+ return False
319
+
320
+ def _get_connections(self) -> Dict[str, List[str]]:
321
+ """
322
+ Retrieves connections between tables in the schema.
323
+
324
+ Returns:
325
+ Dict[str, List[str]]: A dictionary mapping table names to lists of connected columns.
326
+ """
327
+ connections = {}
328
+ for table_name, table_schema in self.schema_structure.tables.items():
329
+ connections[table_name] = []
330
+ for column_name, column_info in (
331
+ self.CACHED_DB_SCHEMA[self.db_id].tables[table_name].columns.items()
332
+ ):
333
+ if self._is_connection(table_name, column_name):
334
+ connections[table_name].append(column_name)
335
+ return connections
336
+
337
+ def get_schema_with_connections(self) -> Dict[str, List[str]]:
338
+ """
339
+ Gets schema with connections included.
340
+
341
+ Returns:
342
+ Dict[str, List[str]]: The schema with connections included.
343
+ """
344
+ schema_structure_dict = self.schema_structure.to_dict()
345
+ connections = self._get_connections()
346
+ for table_name, connected_columns in connections.items():
347
+ for column_name in connected_columns:
348
+ if column_name.lower() not in [
349
+ col.lower() for col in schema_structure_dict[table_name]
350
+ ]:
351
+ schema_structure_dict[table_name].append(column_name)
352
+ return schema_structure_dict
353
+
354
+ def _get_example_column_name_description(
355
+ self, table_name: str, column_name: str, include_value_description: bool = True
356
+ ) -> str:
357
+ """
358
+ Retrieves example values and descriptions for a column.
359
+
360
+ Args:
361
+ table_name (str): The name of the table.
362
+ column_name (str): The name of the column.
363
+ include_value_description (bool): Flag to include value description.
364
+
365
+ Returns:
366
+ str: The example values and descriptions for the column.
367
+ """
368
+ example_part = ""
369
+ name_string = ""
370
+ description_string = ""
371
+ value_statics_string = ""
372
+ value_description_string = ""
373
+
374
+ column_info = self.schema_structure.get_column_info(table_name, column_name)
375
+ if column_info:
376
+ if column_info.examples:
377
+ example_part = f" Example Values: {', '.join([f'`{str(x)}`' for x in column_info.examples])}"
378
+ if column_info.value_statics:
379
+ value_statics_string = f" Value Statics: {column_info.value_statics}"
380
+ if column_info.column_name:
381
+ if (column_info.column_name.lower() != column_name.lower()) and (
382
+ column_info.column_name.strip() != ""
383
+ ):
384
+ name_string = f"| Column Name Meaning: {column_info.column_name}"
385
+ if column_info.column_description:
386
+ description_string = (
387
+ f"| Column Description: {column_info.column_description}"
388
+ )
389
+ if column_info.value_description and include_value_description:
390
+ value_description_string = (
391
+ f"| Value Description: {column_info.value_description}"
392
+ )
393
+
394
+ description_part = (
395
+ f"{name_string} {description_string} {value_description_string}"
396
+ )
397
+ joint_string = (
398
+ f" --{example_part} |{value_statics_string} {description_part}"
399
+ if example_part and description_part
400
+ else f" --{example_part or description_part or value_statics_string}"
401
+ )
402
+ if joint_string == " --":
403
+ joint_string = ""
404
+ return joint_string.replace("\n", " ") if joint_string else ""
405
+
406
+ def get_column_profiles(
407
+ self, with_keys: bool = False, with_references: bool = False
408
+ ) -> Dict[str, Dict[str, str]]:
409
+ """
410
+ Retrieves profiles for columns in the schema.
411
+ The output is a dictionary with table names as keys mapping to dictionaries with column names as keys and column profiles as values.
412
+
413
+ Args:
414
+ with_keys (bool): Flag to include primary keys and foreign keys.
415
+ with_references (bool): Flag to include referenced columns.
416
+
417
+ Returns:
418
+ Dict[str, Dict[str, str]]: The column profiles.
419
+ """
420
+ column_profiles = {}
421
+ for table_name, table_schema in self.schema_structure.tables.items():
422
+ column_profiles[table_name] = {}
423
+ for column_name, column_info in table_schema.columns.items():
424
+ if with_keys or not (
425
+ column_info.primary_key
426
+ or column_info.foreign_keys
427
+ or column_info.referenced_by
428
+ ):
429
+ column_profile = f"Table name: `{table_name}`\nOriginal column name: `{column_name}`\n"
430
+ if (
431
+ column_info.column_name.lower().strip()
432
+ != column_name.lower().strip()
433
+ ) and (column_info.column_name.strip() != ""):
434
+ column_profile += (
435
+ f"Expanded column name: `{column_info.column_name}`\n"
436
+ )
437
+ if column_info.type:
438
+ column_profile += f"Data type: {column_info.type}\n"
439
+ if column_info.column_description:
440
+ column_profile += (
441
+ f"Description: {column_info.column_description}\n"
442
+ )
443
+ if column_info.value_description:
444
+ column_profile += (
445
+ f"Value description: {column_info.value_description}\n"
446
+ )
447
+ if column_info.examples:
448
+ column_profile += f"Example of values in the column: {', '.join([f'`{str(x)}`' for x in column_info.examples])}\n"
449
+ if column_info.primary_key:
450
+ column_profile += "This column is a primary key.\n"
451
+ if with_references:
452
+ if column_info.foreign_keys:
453
+ column_profile += (
454
+ "This column references the following columns:\n"
455
+ )
456
+ for target_table, target_column in column_info.foreign_keys:
457
+ column_profile += f" Table: `{target_table}`, Column: `{target_column}`\n"
458
+ if column_info.referenced_by:
459
+ column_profile += (
460
+ "This column is referenced by the following columns:\n"
461
+ )
462
+ for (
463
+ source_table,
464
+ source_column,
465
+ ) in column_info.referenced_by:
466
+ column_profile += f" Table: `{source_table}`, Column: `{source_column}`\n"
467
+ column_profiles[table_name][column_name] = column_profile
468
+ return column_profiles
469
+
470
+ def validate_schema_consistency(self) -> List[str]:
471
+ """
472
+ Validates the consistency between the schema in the generator and the database manager.
473
+
474
+ Returns:
475
+ List[str]: A list of validation error messages, empty if no errors.
476
+ """
477
+ errors = []
478
+
479
+ # Skip validation if no database manager is provided
480
+ if not self.db_manager:
481
+ return ["No database manager provided for validation"]
482
+
483
+ # Compare schema tables with database manager tables
484
+ db_schema_dict = self.db_manager.get_schema_dict()
485
+ for table_name in self.schema_structure.tables:
486
+ if table_name not in db_schema_dict:
487
+ errors.append(f"Table '{table_name}' exists in schema but not in database manager")
488
+
489
+ # Validate foreign key references
490
+ for table_name, table_schema in self.schema_structure.tables.items():
491
+ for column_name, column_info in table_schema.columns.items():
492
+ if column_info.foreign_keys:
493
+ for ref_table, ref_column in column_info.foreign_keys:
494
+ # Check if referenced table exists
495
+ if ref_table not in self.schema_structure.tables:
496
+ errors.append(f"Foreign key in {table_name}.{column_name} references non-existent table {ref_table}")
497
+ continue
498
+
499
+ # Check if referenced column exists
500
+ if ref_column not in self.schema_structure.tables[ref_table].columns:
501
+ errors.append(f"Foreign key in {table_name}.{column_name} references non-existent column {ref_table}.{ref_column}")
502
+
503
+ # Use the DatabaseSchema's validate_schema method if available
504
+ if hasattr(self.schema_structure, 'validate_schema'):
505
+ schema_errors = self.schema_structure.validate_schema()
506
+ errors.extend(schema_errors)
507
+
508
+ return errors
@@ -0,0 +1,159 @@
1
+ import logging
2
+ import pickle
3
+ from pathlib import Path
4
+ from typing import Dict, List, Tuple
5
+
6
+ from datasketch import MinHash, MinHashLSH
7
+ from tqdm import tqdm
8
+
9
+
def _create_minhash(signature_size: int, string: str, n_gram: int) -> MinHash:
    """
    Build a MinHash signature from the character n-grams of a string.

    Args:
        signature_size (int): Number of permutations in the MinHash signature.
        string (str): The input string to fingerprint.
        n_gram (int): Length of the character n-grams fed into the hash.

    Returns:
        MinHash: The MinHash signature for the input string.
    """
    minhash = MinHash(num_perm=signature_size)
    # Slide an n_gram-wide window over the string and hash each window.
    for start in range(len(string) - n_gram + 1):
        minhash.update(string[start : start + n_gram].encode("utf8"))
    return minhash
def skip_column(column_name: str, column_values: List[str]) -> bool:
    """
    Determines whether to skip processing a column based on its values.

    Name-like columns are always kept; otherwise a column is skipped when its
    values are collectively large (> 50000 chars) and individually long
    (average > 20 chars), i.e. likely free text not worth indexing.

    Args:
        column_name (str): The name of the column.
        column_values (List[str]): The list of values in the column.

    Returns:
        bool: True if the column should be skipped, False otherwise.
    """
    if "name" in column_name.lower():
        return False
    # Fixed: guard against an empty value list, which previously raised
    # ZeroDivisionError when computing the average length.
    if not column_values:
        return False
    sum_of_lengths = sum(len(value) for value in column_values)
    average_length = sum_of_lengths / len(column_values)
    return (sum_of_lengths > 50000) and (average_length > 20)
def make_lsh(
    unique_values: Dict[str, Dict[str, List[str]]],
    signature_size: int,
    n_gram: int,
    threshold: float,
    verbose: bool = True,
) -> Tuple[MinHashLSH, Dict[str, Tuple[MinHash, str, str, str]]]:
    """
    Creates a MinHash Locality-Sensitive Hashing (LSH) index from unique values in a database.

    This function processes unique values from database tables and columns, creates MinHash
    signatures for each value, and builds an LSH index for efficient similarity search.

    Args:
        unique_values (Dict[str, Dict[str, List[str]]]): A nested dictionary containing unique values
            from the database. The structure is {table_name: {column_name: [values]}}.
        signature_size (int): The number of permutations to use in the MinHash signatures.
        n_gram (int): The size of n-grams to use when creating MinHash signatures.
        threshold (float): The similarity threshold for the LSH index. Values closer to 1 require
            higher similarity for matches.
        verbose (bool, optional): If True, displays a progress bar during processing. Defaults to True.

    Returns:
        Tuple[MinHashLSH, Dict[str, Tuple[MinHash, str, str, str]]]: A tuple containing:
            - MinHashLSH: The constructed LSH index.
            - Dict[str, Tuple[MinHash, str, str, str]]: A dictionary mapping unique keys to tuples
              containing (MinHash object, table name, column name, original value).

    Note:
        Errors during LSH creation are logged, not raised; the partially-built
        index is returned (deliberate best-effort behavior).
    """
    lsh = MinHashLSH(threshold=threshold, num_perm=signature_size)
    minhashes: Dict[str, Tuple[MinHash, str, str, str]] = {}
    try:
        total_unique_values = sum(
            len(column_values)
            for table_values in unique_values.values()
            for column_values in table_values.values()
        )
        logging.info(f"Total unique values: {total_unique_values}")

        progress_bar = (
            tqdm(total=total_unique_values, desc="Creating LSH") if verbose else None
        )

        for table_name, table_values in unique_values.items():
            for column_name, column_values in table_values.items():
                # Fixed: removed leftover debug print() banners for the
                # "doctype" column; route the observation through logging.
                if column_name.lower() == "doctype":
                    logging.debug("Doctype column found in table %s", table_name)
                logging.info(
                    f"Processing {table_name} - {column_name} - {len(column_values)}"
                )

                # Fixed: loop variable renamed from `id` (shadowed the builtin).
                for idx, value in enumerate(column_values):
                    minhash = _create_minhash(signature_size, value, n_gram)
                    minhash_key = f"{table_name}_{column_name}_{idx}"
                    minhashes[minhash_key] = (minhash, table_name, column_name, value)
                    lsh.insert(minhash_key, minhash)

                    if progress_bar is not None:
                        progress_bar.update(1)

        if progress_bar is not None:
            progress_bar.close()
    except Exception as e:
        logging.error(f"Error creating LSH: {e}")

    return lsh, minhashes
def make_db_lsh(db, db_directory_path, db_name, **kwargs) -> None:
    """
    Creates a MinHash LSH for the database and saves the results.

    Tries the new database-independent LSH factory first; if that fails, falls
    back to the legacy path that pickles unique values, the LSH index and the
    MinHash signatures under ``<db_directory_path>/preprocessed``.

    Args:
        db: Database manager instance
        db_directory_path (str): The path to the database directory.
        db_name (str): Name of the database
        **kwargs (Any): Additional arguments for the LSH creation.
    """
    # New architecture: database-independent LSH creation via the factory.
    from ..lsh.factory import LshFactory

    try:
        LshFactory.create_lsh_from_db(db, **kwargs)
    except Exception as e:
        logging.warning(f"New LSH creation failed, falling back to old method: {e}")

        # Legacy fallback: persist artifacts as pickles in "preprocessed/".
        preprocessed_path = Path(db_directory_path) / "preprocessed"
        logging.info(f"Preprocessed directory: {preprocessed_path}")
        preprocessed_path.mkdir(parents=True, exist_ok=True)

        unique_values = db.get_unique_values()
        logging.info("Unique values obtained")

        with (preprocessed_path / f"{db_name}_unique_values.pkl").open("wb") as fh:
            pickle.dump(unique_values, fh)
        logging.info("Saved unique values")

        lsh, minhashes = make_lsh(unique_values, **kwargs)

        with (preprocessed_path / f"{db_name}_lsh.pkl").open("wb") as fh:
            pickle.dump(lsh, fh)
        with (preprocessed_path / f"{db_name}_minhashes.pkl").open("wb") as fh:
            pickle.dump(minhashes, fh)