thoth-dbmanager 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
- thoth_dbmanager/ThothDbManager.py +459 -0
- thoth_dbmanager/__init__.py +136 -0
- thoth_dbmanager/adapters/__init__.py +21 -0
- thoth_dbmanager/adapters/mariadb.py +165 -0
- thoth_dbmanager/adapters/mysql.py +165 -0
- thoth_dbmanager/adapters/oracle.py +554 -0
- thoth_dbmanager/adapters/postgresql.py +444 -0
- thoth_dbmanager/adapters/qdrant.py +189 -0
- thoth_dbmanager/adapters/sqlite.py +385 -0
- thoth_dbmanager/adapters/sqlserver.py +583 -0
- thoth_dbmanager/adapters/supabase.py +249 -0
- thoth_dbmanager/core/__init__.py +13 -0
- thoth_dbmanager/core/factory.py +272 -0
- thoth_dbmanager/core/interfaces.py +271 -0
- thoth_dbmanager/core/registry.py +220 -0
- thoth_dbmanager/documents.py +155 -0
- thoth_dbmanager/dynamic_imports.py +250 -0
- thoth_dbmanager/helpers/__init__.py +0 -0
- thoth_dbmanager/helpers/multi_db_generator.py +508 -0
- thoth_dbmanager/helpers/preprocess_values.py +159 -0
- thoth_dbmanager/helpers/schema.py +376 -0
- thoth_dbmanager/helpers/search.py +117 -0
- thoth_dbmanager/lsh/__init__.py +21 -0
- thoth_dbmanager/lsh/core.py +182 -0
- thoth_dbmanager/lsh/factory.py +76 -0
- thoth_dbmanager/lsh/manager.py +170 -0
- thoth_dbmanager/lsh/storage.py +96 -0
- thoth_dbmanager/plugins/__init__.py +23 -0
- thoth_dbmanager/plugins/mariadb.py +436 -0
- thoth_dbmanager/plugins/mysql.py +408 -0
- thoth_dbmanager/plugins/oracle.py +150 -0
- thoth_dbmanager/plugins/postgresql.py +145 -0
- thoth_dbmanager/plugins/qdrant.py +41 -0
- thoth_dbmanager/plugins/sqlite.py +170 -0
- thoth_dbmanager/plugins/sqlserver.py +149 -0
- thoth_dbmanager/plugins/supabase.py +224 -0
- {thoth_dbmanager-0.4.0.dist-info → thoth_dbmanager-0.4.2.dist-info}/METADATA +9 -6
- thoth_dbmanager-0.4.2.dist-info/RECORD +41 -0
- thoth_dbmanager-0.4.2.dist-info/top_level.txt +1 -0
- thoth_dbmanager-0.4.0.dist-info/RECORD +0 -5
- thoth_dbmanager-0.4.0.dist-info/top_level.txt +0 -1
- {thoth_dbmanager-0.4.0.dist-info → thoth_dbmanager-0.4.2.dist-info}/WHEEL +0 -0
- {thoth_dbmanager-0.4.0.dist-info → thoth_dbmanager-0.4.2.dist-info}/licenses/LICENSE +0 -0
thoth_dbmanager/lsh/factory.py
@@ -0,0 +1,76 @@
"""
Factory for creating LSH indices from database managers.
"""

import logging
from typing import TYPE_CHECKING, Dict, List

if TYPE_CHECKING:
    from ..ThothDbManager import ThothDbManager

from .manager import LshManager


class LshFactory:
    """Factory for creating LSH indices from any database type."""

    @staticmethod
    def create_lsh_from_db(
        db_manager: "ThothDbManager",
        signature_size: int = 30,
        n_gram: int = 3,
        threshold: float = 0.5,
        verbose: bool = True,
        **kwargs
    ) -> None:
        """
        Create LSH index from any database manager type.

        This function extracts unique values from the database manager
        and creates an LSH index using the LshManager.

        Args:
            db_manager: Any ThothDbManager implementation
            signature_size: Size of MinHash signature
            n_gram: N-gram size for MinHash
            threshold: LSH similarity threshold
            verbose: Whether to show progress
            **kwargs: Additional arguments
        """
        if not db_manager.db_directory_path:
            raise ValueError("Database manager must have a valid db_directory_path")

        # Get unique values from the database
        logging.info(f"Extracting unique values from {db_manager.db_id}")
        unique_values = db_manager.get_unique_values()

        # Get or create LSH manager
        lsh_manager = db_manager.lsh_manager
        if lsh_manager is None:
            raise ValueError("Could not create LSH manager for database")

        # Create the LSH index
        lsh_manager.create_lsh(
            unique_values=unique_values,
            signature_size=signature_size,
            n_gram=n_gram,
            threshold=threshold,
            verbose=verbose,
            **kwargs
        )

        logging.info(f"LSH creation completed for {db_manager.db_id}")


def make_db_lsh(db_manager: "ThothDbManager", **kwargs) -> None:
    """
    Create LSH for any database type (maintains backward compatibility).

    This function provides backward compatibility with the existing
    make_db_lsh function signature while using the new architecture.

    Args:
        db_manager: Database manager instance
        **kwargs: LSH creation parameters
    """
    LshFactory.create_lsh_from_db(db_manager, **kwargs)
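The factory exposes make_db_lsh as the backward-compatible entry point. A minimal usage sketch (not part of the diff; the import path and the way db_manager is obtained are assumptions based on the file list above):

from thoth_dbmanager.lsh.factory import make_db_lsh

# db_manager: any ThothDbManager implementation with db_directory_path set
# (how it is constructed is outside this hunk).
make_db_lsh(
    db_manager,
    signature_size=30,  # MinHash signature size (default shown for clarity)
    n_gram=3,           # n-gram size used when hashing values
    threshold=0.5,      # LSH similarity threshold
    verbose=True,
)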
thoth_dbmanager/lsh/manager.py
@@ -0,0 +1,170 @@
"""
LSH Manager for database-independent LSH operations.
"""

import logging
import pickle
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Any

from datasketch import MinHash, MinHashLSH

from .core import create_lsh_index, query_lsh_index
from .storage import LshStorageStrategy, PickleStorage


class LshManager:
    """
    Manages LSH operations independently of database implementation.

    This class handles creation, storage, loading, and querying of LSH indices
    using a pluggable storage strategy.
    """

    def __init__(self, storage_path: Path, storage_strategy: Optional[LshStorageStrategy] = None):
        """
        Initialize the LSH manager.

        Args:
            storage_path: Base path for LSH storage (directory containing preprocessed folder)
            storage_strategy: Storage strategy to use (defaults to PickleStorage)
        """
        self.storage_path = Path(storage_path)
        self.storage_strategy = storage_strategy or PickleStorage()
        self.lsh: Optional[MinHashLSH] = None
        self.minhashes: Optional[Dict[str, Tuple[MinHash, str, str, str]]] = None

        # Determine the database ID from the path
        self.db_id = self.storage_path.name

        # Set up the preprocessed directory path
        self.preprocessed_path = self.storage_path / "preprocessed"
        self.lsh_base_path = self.preprocessed_path / f"{self.db_id}_lsh"

    def create_lsh(
        self,
        unique_values: Dict[str, Dict[str, List[str]]],
        signature_size: int = 30,
        n_gram: int = 3,
        threshold: float = 0.5,
        verbose: bool = True,
        **kwargs
    ) -> None:
        """
        Create and persist LSH index from unique values.

        Args:
            unique_values: Dictionary of unique values from database
            signature_size: Size of MinHash signature
            n_gram: N-gram size for MinHash
            threshold: LSH similarity threshold
            verbose: Whether to show progress
            **kwargs: Additional arguments
        """
        logging.info(f"Creating LSH for database: {self.db_id}")

        # Ensure preprocessed directory exists
        self.preprocessed_path.mkdir(parents=True, exist_ok=True)

        # Save unique values for reference
        unique_values_path = self.preprocessed_path / f"{self.db_id}_unique_values.pkl"
        with open(unique_values_path, "wb") as file:
            pickle.dump(unique_values, file)
        logging.info("Saved unique values")

        # Create LSH index
        lsh, minhashes = create_lsh_index(
            unique_values=unique_values,
            signature_size=signature_size,
            n_gram=n_gram,
            threshold=threshold,
            verbose=verbose
        )

        # Store LSH data using the storage strategy
        self.storage_strategy.save(lsh, minhashes, self.lsh_base_path)
        logging.info(f"LSH saved to {self.lsh_base_path}")

        # Keep in memory for immediate use
        self.lsh = lsh
        self.minhashes = minhashes

    def load_lsh(self) -> bool:
        """
        Load LSH index from storage.

        Returns:
            True if successfully loaded, False otherwise
        """
        try:
            if not self.storage_strategy.exists(self.lsh_base_path):
                logging.warning(f"LSH files not found at {self.lsh_base_path}")
                return False

            lsh_data, minhashes_data = self.storage_strategy.load(self.lsh_base_path)

            if lsh_data is None or minhashes_data is None:
                logging.error(f"Failed to load LSH data from {self.lsh_base_path}")
                return False

            self.lsh = lsh_data
            self.minhashes = minhashes_data
            logging.info(f"LSH loaded successfully for {self.db_id}")
            return True

        except Exception as e:
            logging.error(f"Error loading LSH for {self.db_id}: {e}")
            return False

    def query(
        self,
        keyword: str,
        signature_size: int = 30,
        n_gram: int = 3,
        top_n: int = 10,
        **kwargs
    ) -> Dict[str, Dict[str, List[str]]]:
        """
        Query the LSH index for similar values.

        Args:
            keyword: Search keyword
            signature_size: Size of MinHash signature
            n_gram: N-gram size for MinHash
            top_n: Number of top results to return
            **kwargs: Additional arguments

        Returns:
            Dictionary of similar values organized by table and column

        Raises:
            Exception: If LSH is not loaded or query fails
        """
        if self.lsh is None or self.minhashes is None:
            # Try to load LSH if not already loaded
            if not self.load_lsh():
                raise Exception(f"Error loading LSH for {self.db_id}")

        return query_lsh_index(
            lsh=self.lsh,
            minhashes=self.minhashes,
            keyword=keyword,
            signature_size=signature_size,
            n_gram=n_gram,
            top_n=top_n
        )

    def is_available(self) -> bool:
        """
        Check if LSH data is available (either loaded or stored).

        Returns:
            True if LSH is available, False otherwise
        """
        return (self.lsh is not None and self.minhashes is not None) or \
            self.storage_strategy.exists(self.lsh_base_path)

    def clear(self) -> None:
        """Clear loaded LSH data from memory."""
        self.lsh = None
        self.minhashes = None
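A minimal sketch of querying a previously built index through LshManager (not part of the diff; the data/my_db path is a hypothetical example of the <db_id>/preprocessed/ layout described in the docstrings):

from pathlib import Path
from thoth_dbmanager.lsh.manager import LshManager

manager = LshManager(storage_path=Path("data/my_db"))  # db_id becomes "my_db"
if manager.is_available():
    # query() lazily loads the stored index on first use, then searches it.
    matches = manager.query("example keyword", top_n=5)
    # matches: {table: {column: [similar stored values, ...]}}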
thoth_dbmanager/lsh/storage.py
@@ -0,0 +1,96 @@
"""
Storage strategies for LSH data persistence.
"""

import pickle
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Tuple, Optional


class LshStorageStrategy(ABC):
    """Abstract base class for LSH storage strategies."""

    @abstractmethod
    def save(self, lsh_data: Any, minhashes_data: Any, base_path: Path) -> None:
        """
        Save LSH data and minhashes to storage.

        Args:
            lsh_data: The LSH index data
            minhashes_data: The minhashes data
            base_path: Base path for storage (without file extension)
        """
        pass

    @abstractmethod
    def load(self, base_path: Path) -> Tuple[Optional[Any], Optional[Any]]:
        """
        Load LSH data and minhashes from storage.

        Args:
            base_path: Base path for storage (without file extension)

        Returns:
            Tuple of (lsh_data, minhashes_data) or (None, None) if not found
        """
        pass

    @abstractmethod
    def exists(self, base_path: Path) -> bool:
        """
        Check if LSH data exists in storage.

        Args:
            base_path: Base path for storage (without file extension)

        Returns:
            True if data exists, False otherwise
        """
        pass


class PickleStorage(LshStorageStrategy):
    """Pickle-based storage strategy (current implementation)."""

    def save(self, lsh_data: Any, minhashes_data: Any, base_path: Path) -> None:
        """Save LSH data using pickle format."""
        lsh_path = base_path.with_suffix('.pkl')
        minhashes_path = base_path.parent / f"{base_path.stem}_minhashes.pkl"

        # Ensure directory exists
        base_path.parent.mkdir(parents=True, exist_ok=True)

        # Save LSH data
        with open(lsh_path, 'wb') as f:
            pickle.dump(lsh_data, f)

        # Save minhashes data
        with open(minhashes_path, 'wb') as f:
            pickle.dump(minhashes_data, f)

    def load(self, base_path: Path) -> Tuple[Optional[Any], Optional[Any]]:
        """Load LSH data from pickle files."""
        lsh_path = base_path.with_suffix('.pkl')
        minhashes_path = base_path.parent / f"{base_path.stem}_minhashes.pkl"

        try:
            # Load LSH data
            with open(lsh_path, 'rb') as f:
                lsh_data = pickle.load(f)

            # Load minhashes data
            with open(minhashes_path, 'rb') as f:
                minhashes_data = pickle.load(f)

            return lsh_data, minhashes_data

        except (FileNotFoundError, pickle.PickleError):
            return None, None

    def exists(self, base_path: Path) -> bool:
        """Check if both LSH and minhashes pickle files exist."""
        lsh_path = base_path.with_suffix('.pkl')
        minhashes_path = base_path.parent / f"{base_path.stem}_minhashes.pkl"

        return lsh_path.exists() and minhashes_path.exists()
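Since persistence sits behind LshStorageStrategy, alternative backends can be swapped in. A hypothetical gzip-compressed variant (a sketch under the assumption that compressed pickles are acceptable; not part of the package) illustrates the extension point:

import gzip
import pickle
from pathlib import Path
from typing import Any, Optional, Tuple

from thoth_dbmanager.lsh.storage import LshStorageStrategy


class GzipPickleStorage(LshStorageStrategy):
    """Hypothetical strategy: gzip-compressed pickles instead of plain pickles."""

    def _paths(self, base_path: Path) -> Tuple[Path, Path]:
        # Mirror PickleStorage's naming, with a .gz extension appended.
        return (
            base_path.parent / f"{base_path.name}.pkl.gz",
            base_path.parent / f"{base_path.stem}_minhashes.pkl.gz",
        )

    def save(self, lsh_data: Any, minhashes_data: Any, base_path: Path) -> None:
        lsh_path, minhashes_path = self._paths(base_path)
        base_path.parent.mkdir(parents=True, exist_ok=True)
        with gzip.open(lsh_path, "wb") as f:
            pickle.dump(lsh_data, f)
        with gzip.open(minhashes_path, "wb") as f:
            pickle.dump(minhashes_data, f)

    def load(self, base_path: Path) -> Tuple[Optional[Any], Optional[Any]]:
        lsh_path, minhashes_path = self._paths(base_path)
        try:
            with gzip.open(lsh_path, "rb") as f:
                lsh_data = pickle.load(f)
            with gzip.open(minhashes_path, "rb") as f:
                minhashes_data = pickle.load(f)
            return lsh_data, minhashes_data
        except (OSError, pickle.PickleError):
            # OSError covers both missing files and corrupt gzip streams.
            return None, None

    def exists(self, base_path: Path) -> bool:
        lsh_path, minhashes_path = self._paths(base_path)
        return lsh_path.exists() and minhashes_path.exists()

Such a strategy would be supplied as LshManager(storage_path, storage_strategy=GzipPickleStorage()), matching the constructor shown above.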
thoth_dbmanager/plugins/__init__.py
@@ -0,0 +1,23 @@
"""
Database plugins for Thoth SQL Database Manager.
"""

# Import all plugins to ensure they are registered
from .postgresql import PostgreSQLPlugin
from .sqlite import SQLitePlugin
from .supabase import SupabasePlugin
from .mysql import MySQLPlugin
from .mariadb import MariaDBPlugin
from .sqlserver import SQLServerPlugin
from .oracle import OraclePlugin

# This ensures all plugins are registered when the module is imported
__all__ = [
    "PostgreSQLPlugin",
    "SQLitePlugin",
    "SupabasePlugin",
    "MySQLPlugin",
    "MariaDBPlugin",
    "SQLServerPlugin",
    "OraclePlugin",
]
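Importing the subpackage is itself the registration trigger, per the module's own comments. A short sketch (not part of the diff) of the side-effect-import pattern:

import thoth_dbmanager.plugins as plugins  # the import side effect registers every bundled plugin

print(plugins.__all__)
# ['PostgreSQLPlugin', 'SQLitePlugin', 'SupabasePlugin', 'MySQLPlugin',
#  'MariaDBPlugin', 'SQLServerPlugin', 'OraclePlugin']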