thoth-dbmanager 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. thoth_dbmanager/ThothDbManager.py +459 -0
  2. thoth_dbmanager/__init__.py +136 -0
  3. thoth_dbmanager/adapters/__init__.py +21 -0
  4. thoth_dbmanager/adapters/mariadb.py +165 -0
  5. thoth_dbmanager/adapters/mysql.py +165 -0
  6. thoth_dbmanager/adapters/oracle.py +554 -0
  7. thoth_dbmanager/adapters/postgresql.py +444 -0
  8. thoth_dbmanager/adapters/sqlite.py +385 -0
  9. thoth_dbmanager/adapters/sqlserver.py +583 -0
  10. thoth_dbmanager/adapters/supabase.py +249 -0
  11. thoth_dbmanager/core/__init__.py +13 -0
  12. thoth_dbmanager/core/factory.py +272 -0
  13. thoth_dbmanager/core/interfaces.py +271 -0
  14. thoth_dbmanager/core/registry.py +220 -0
  15. thoth_dbmanager/documents.py +155 -0
  16. thoth_dbmanager/dynamic_imports.py +250 -0
  17. thoth_dbmanager/helpers/__init__.py +0 -0
  18. thoth_dbmanager/helpers/multi_db_generator.py +508 -0
  19. thoth_dbmanager/helpers/preprocess_values.py +159 -0
  20. thoth_dbmanager/helpers/schema.py +376 -0
  21. thoth_dbmanager/helpers/search.py +117 -0
  22. thoth_dbmanager/lsh/__init__.py +21 -0
  23. thoth_dbmanager/lsh/core.py +182 -0
  24. thoth_dbmanager/lsh/factory.py +76 -0
  25. thoth_dbmanager/lsh/manager.py +170 -0
  26. thoth_dbmanager/lsh/storage.py +96 -0
  27. thoth_dbmanager/plugins/__init__.py +23 -0
  28. thoth_dbmanager/plugins/mariadb.py +436 -0
  29. thoth_dbmanager/plugins/mysql.py +408 -0
  30. thoth_dbmanager/plugins/oracle.py +150 -0
  31. thoth_dbmanager/plugins/postgresql.py +145 -0
  32. thoth_dbmanager/plugins/sqlite.py +170 -0
  33. thoth_dbmanager/plugins/sqlserver.py +149 -0
  34. thoth_dbmanager/plugins/supabase.py +224 -0
  35. {thoth_dbmanager-0.4.0.dist-info → thoth_dbmanager-0.4.1.dist-info}/METADATA +6 -6
  36. thoth_dbmanager-0.4.1.dist-info/RECORD +39 -0
  37. thoth_dbmanager-0.4.1.dist-info/top_level.txt +1 -0
  38. thoth_dbmanager-0.4.0.dist-info/RECORD +0 -5
  39. thoth_dbmanager-0.4.0.dist-info/top_level.txt +0 -1
  40. {thoth_dbmanager-0.4.0.dist-info → thoth_dbmanager-0.4.1.dist-info}/WHEEL +0 -0
  41. {thoth_dbmanager-0.4.0.dist-info → thoth_dbmanager-0.4.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,76 @@
1
+ """
2
+ Factory for creating LSH indices from database managers.
3
+ """
4
+
5
+ import logging
6
+ from typing import TYPE_CHECKING, Dict, List
7
+
8
+ if TYPE_CHECKING:
9
+ from ..ThothDbManager import ThothDbManager
10
+
11
+ from .manager import LshManager
12
+
13
+
14
class LshFactory:
    """Factory for creating LSH indices from any database type."""

    @staticmethod
    def create_lsh_from_db(
        db_manager: "ThothDbManager",
        signature_size: int = 30,
        n_gram: int = 3,
        threshold: float = 0.5,
        verbose: bool = True,
        **kwargs
    ) -> None:
        """
        Create an LSH index from any database manager type.

        The unique values are read from ``db_manager.get_unique_values()`` and
        handed to ``db_manager.lsh_manager.create_lsh()`` together with the
        MinHash/LSH parameters.

        Args:
            db_manager: Any ThothDbManager implementation. It must expose
                ``db_directory_path``, ``db_id``, ``lsh_manager`` and
                ``get_unique_values()``.
            signature_size: Size of MinHash signature
            n_gram: N-gram size for MinHash
            threshold: LSH similarity threshold
            verbose: Whether to show progress
            **kwargs: Additional arguments, forwarded unchanged to
                ``lsh_manager.create_lsh()``

        Raises:
            ValueError: If ``db_manager.db_directory_path`` is empty/None or
                ``db_manager.lsh_manager`` is None.
        """
        if not db_manager.db_directory_path:
            raise ValueError("Database manager must have a valid db_directory_path")

        # Check for the manager first so that a missing manager is reported
        # before any values are extracted from the database.
        lsh_manager = db_manager.lsh_manager
        if lsh_manager is None:
            raise ValueError("Could not create LSH manager for database")

        # Get unique values from the database
        logging.info(f"Extracting unique values from {db_manager.db_id}")
        unique_values = db_manager.get_unique_values()

        # Create the LSH index
        lsh_manager.create_lsh(
            unique_values=unique_values,
            signature_size=signature_size,
            n_gram=n_gram,
            threshold=threshold,
            verbose=verbose,
            **kwargs
        )

        logging.info(f"LSH creation completed for {db_manager.db_id}")
63
+
64
+
65
def make_db_lsh(db_manager: "ThothDbManager", **kwargs) -> None:
    """
    Build the LSH index for a database manager (backward-compatible entry point).

    Keeps the historical ``make_db_lsh`` call signature and delegates all of
    the work to ``LshFactory.create_lsh_from_db``.

    Args:
        db_manager: Database manager instance
        **kwargs: LSH creation parameters, passed through unchanged
    """
    build_index = LshFactory.create_lsh_from_db
    build_index(db_manager, **kwargs)
@@ -0,0 +1,170 @@
1
+ """
2
+ LSH Manager for database-independent LSH operations.
3
+ """
4
+
5
+ import logging
6
+ import pickle
7
+ from pathlib import Path
8
+ from typing import Dict, List, Tuple, Optional, Any
9
+
10
+ from datasketch import MinHash, MinHashLSH
11
+
12
+ from .core import create_lsh_index, query_lsh_index
13
+ from .storage import LshStorageStrategy, PickleStorage
14
+
15
+
16
class LshManager:
    """
    Manages LSH operations independently of database implementation.

    This class handles creation, storage, loading, and querying of LSH indices
    using a pluggable storage strategy.
    """

    def __init__(self, storage_path: Path, storage_strategy: Optional["LshStorageStrategy"] = None):
        """
        Initialize the LSH manager.

        Args:
            storage_path: Base path for LSH storage (directory containing
                the preprocessed folder). Its final path component is used
                as the database ID.
            storage_strategy: Storage strategy to use (defaults to PickleStorage)
        """
        self.storage_path = Path(storage_path)
        # Compare against None explicitly so that a caller-supplied strategy
        # is never replaced merely because it evaluates as falsy.
        self.storage_strategy = PickleStorage() if storage_strategy is None else storage_strategy
        self.lsh: Optional[MinHashLSH] = None
        self.minhashes: Optional[Dict[str, Tuple[MinHash, str, str, str]]] = None

        # Determine the database ID from the path
        self.db_id = self.storage_path.name

        # Set up the preprocessed directory path
        self.preprocessed_path = self.storage_path / "preprocessed"
        self.lsh_base_path = self.preprocessed_path / f"{self.db_id}_lsh"

    def create_lsh(
        self,
        unique_values: Dict[str, Dict[str, List[str]]],
        signature_size: int = 30,
        n_gram: int = 3,
        threshold: float = 0.5,
        verbose: bool = True,
        **kwargs
    ) -> None:
        """
        Create and persist LSH index from unique values.

        The unique values are pickled next to the index for reference, the
        index is built with ``create_lsh_index``, saved through the storage
        strategy, and finally kept in memory on this instance.

        Args:
            unique_values: Dictionary of unique values from database
            signature_size: Size of MinHash signature
            n_gram: N-gram size for MinHash
            threshold: LSH similarity threshold
            verbose: Whether to show progress
            **kwargs: Additional arguments; accepted for call compatibility
                and not used by this method
        """
        logging.info(f"Creating LSH for database: {self.db_id}")

        # Ensure preprocessed directory exists
        self.preprocessed_path.mkdir(parents=True, exist_ok=True)

        # Save unique values for reference
        unique_values_path = self.preprocessed_path / f"{self.db_id}_unique_values.pkl"
        with open(unique_values_path, "wb") as file:
            pickle.dump(unique_values, file)
        logging.info("Saved unique values")

        # Create LSH index
        lsh, minhashes = create_lsh_index(
            unique_values=unique_values,
            signature_size=signature_size,
            n_gram=n_gram,
            threshold=threshold,
            verbose=verbose
        )

        # Store LSH data using the storage strategy
        self.storage_strategy.save(lsh, minhashes, self.lsh_base_path)
        logging.info(f"LSH saved to {self.lsh_base_path}")

        # Keep in memory for immediate use (only reached when saving succeeded)
        self.lsh = lsh
        self.minhashes = minhashes

    def load_lsh(self) -> bool:
        """
        Load LSH index from storage.

        Never raises: any error from the storage strategy is logged and
        reported as ``False``.

        Returns:
            True if successfully loaded, False otherwise
        """
        try:
            if not self.storage_strategy.exists(self.lsh_base_path):
                logging.warning(f"LSH files not found at {self.lsh_base_path}")
                return False

            lsh_data, minhashes_data = self.storage_strategy.load(self.lsh_base_path)

            if lsh_data is None or minhashes_data is None:
                logging.error(f"Failed to load LSH data from {self.lsh_base_path}")
                return False

            self.lsh = lsh_data
            self.minhashes = minhashes_data
            logging.info(f"LSH loaded successfully for {self.db_id}")
            return True

        except Exception as e:
            # Deliberate catch-all boundary: callers only get a boolean, so
            # log with the traceback to keep the root cause diagnosable.
            logging.exception(f"Error loading LSH for {self.db_id}: {e}")
            return False

    def query(
        self,
        keyword: str,
        signature_size: int = 30,
        n_gram: int = 3,
        top_n: int = 10,
        **kwargs
    ) -> Dict[str, Dict[str, List[str]]]:
        """
        Query the LSH index for similar values.

        If the index is not in memory yet, it is loaded from storage first.

        Args:
            keyword: Search keyword
            signature_size: Size of MinHash signature
            n_gram: N-gram size for MinHash
            top_n: Number of top results to return
            **kwargs: Additional arguments; accepted for call compatibility
                and not used by this method

        Returns:
            Dictionary of similar values organized by table and column

        Raises:
            RuntimeError: If the LSH index is not loaded and cannot be loaded
                from storage. ``RuntimeError`` is a subclass of ``Exception``,
                so existing ``except Exception`` handlers keep working.
        """
        if self.lsh is None or self.minhashes is None:
            # Try to load LSH if not already loaded
            if not self.load_lsh():
                raise RuntimeError(f"Error loading LSH for {self.db_id}")

        return query_lsh_index(
            lsh=self.lsh,
            minhashes=self.minhashes,
            keyword=keyword,
            signature_size=signature_size,
            n_gram=n_gram,
            top_n=top_n
        )

    def is_available(self) -> bool:
        """
        Check if LSH data is available (either loaded or stored).

        Returns:
            True if LSH is available, False otherwise
        """
        if self.lsh is not None and self.minhashes is not None:
            return True
        return self.storage_strategy.exists(self.lsh_base_path)

    def clear(self) -> None:
        """Clear loaded LSH data from memory (stored files are not touched)."""
        self.lsh = None
        self.minhashes = None
@@ -0,0 +1,96 @@
1
+ """
2
+ Storage strategies for LSH data persistence.
3
+ """
4
+
5
+ import pickle
6
+ from abc import ABC, abstractmethod
7
+ from pathlib import Path
8
+ from typing import Any, Tuple, Optional
9
+
10
+
11
class LshStorageStrategy(ABC):
    """Interface that every LSH persistence backend has to implement."""

    @abstractmethod
    def save(self, lsh_data: Any, minhashes_data: Any, base_path: Path) -> None:
        """
        Persist an LSH index together with its minhashes.

        Args:
            lsh_data: The LSH index data
            minhashes_data: The minhashes data
            base_path: Storage location without a file extension
        """

    @abstractmethod
    def load(self, base_path: Path) -> Tuple[Optional[Any], Optional[Any]]:
        """
        Read a previously stored LSH index and its minhashes.

        Args:
            base_path: Storage location without a file extension

        Returns:
            ``(lsh_data, minhashes_data)``, or ``(None, None)`` when nothing
            is stored at that location
        """

    @abstractmethod
    def exists(self, base_path: Path) -> bool:
        """
        Report whether LSH data is present in storage.

        Args:
            base_path: Storage location without a file extension

        Returns:
            True if data exists, False otherwise
        """
51
+
52
+
53
class PickleStorage(LshStorageStrategy):
    """Pickle-based storage strategy (current implementation)."""

    @staticmethod
    def _resolve_paths(base_path: Path) -> Tuple[Path, Path]:
        """Return the ``(lsh, minhashes)`` pickle paths derived from ``base_path``."""
        # NOTE(review): with_suffix() and .stem treat everything after the last
        # dot of the final path component as a suffix, so a base name that
        # contains a dot has that part replaced rather than extended.
        # Confirm that base names are dot-free, or that this is acceptable.
        lsh_path = base_path.with_suffix('.pkl')
        minhashes_path = base_path.parent / f"{base_path.stem}_minhashes.pkl"
        return lsh_path, minhashes_path

    def save(self, lsh_data: Any, minhashes_data: Any, base_path: Path) -> None:
        """
        Save LSH data using pickle format.

        Args:
            lsh_data: The LSH index data
            minhashes_data: The minhashes data
            base_path: Base path for storage (without file extension); the
                parent directory is created if it does not exist
        """
        lsh_path, minhashes_path = self._resolve_paths(base_path)

        # Ensure directory exists
        base_path.parent.mkdir(parents=True, exist_ok=True)

        # Save LSH data
        with open(lsh_path, 'wb') as f:
            pickle.dump(lsh_data, f)

        # Save minhashes data
        with open(minhashes_path, 'wb') as f:
            pickle.dump(minhashes_data, f)

    def load(self, base_path: Path) -> Tuple[Optional[Any], Optional[Any]]:
        """
        Load LSH data from pickle files.

        Args:
            base_path: Base path for storage (without file extension)

        Returns:
            Tuple of (lsh_data, minhashes_data), or (None, None) if either
            file is missing, empty/truncated, or not a valid pickle
        """
        lsh_path, minhashes_path = self._resolve_paths(base_path)

        # SECURITY: unpickling can execute arbitrary code. These files must
        # only ever be ones written by this application, never untrusted input.
        try:
            # Load LSH data
            with open(lsh_path, 'rb') as f:
                lsh_data = pickle.load(f)

            # Load minhashes data
            with open(minhashes_path, 'rb') as f:
                minhashes_data = pickle.load(f)

        except (FileNotFoundError, EOFError, pickle.PickleError):
            # EOFError is what pickle raises for an empty or truncated file
            # (e.g. an interrupted save); treat it like any other unreadable
            # pickle instead of letting it escape.
            return None, None

        return lsh_data, minhashes_data

    def exists(self, base_path: Path) -> bool:
        """Check if both LSH and minhashes pickle files exist."""
        lsh_path, minhashes_path = self._resolve_paths(base_path)
        return lsh_path.exists() and minhashes_path.exists()
@@ -0,0 +1,23 @@
1
+ """
2
+ Database plugins for Thoth SQL Database Manager.
3
+ """
4
+
5
+ # Import all plugins to ensure they are registered
6
+ from .postgresql import PostgreSQLPlugin
7
+ from .sqlite import SQLitePlugin
8
+ from .supabase import SupabasePlugin
9
+ from .mysql import MySQLPlugin
10
+ from .mariadb import MariaDBPlugin
11
+ from .sqlserver import SQLServerPlugin
12
+ from .oracle import OraclePlugin
13
+
14
+ # This ensures all plugins are registered when the module is imported
15
+ __all__ = [
16
+ "PostgreSQLPlugin",
17
+ "SQLitePlugin",
18
+ "SupabasePlugin",
19
+ "MySQLPlugin",
20
+ "MariaDBPlugin",
21
+ "SQLServerPlugin",
22
+ "OraclePlugin",
23
+ ]