spatial-memory-mcp 1.9.1 (spatial_memory_mcp-1.9.1-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. spatial_memory/__init__.py +97 -0
  2. spatial_memory/__main__.py +271 -0
  3. spatial_memory/adapters/__init__.py +7 -0
  4. spatial_memory/adapters/lancedb_repository.py +880 -0
  5. spatial_memory/config.py +769 -0
  6. spatial_memory/core/__init__.py +118 -0
  7. spatial_memory/core/cache.py +317 -0
  8. spatial_memory/core/circuit_breaker.py +297 -0
  9. spatial_memory/core/connection_pool.py +220 -0
  10. spatial_memory/core/consolidation_strategies.py +401 -0
  11. spatial_memory/core/database.py +3072 -0
  12. spatial_memory/core/db_idempotency.py +242 -0
  13. spatial_memory/core/db_indexes.py +576 -0
  14. spatial_memory/core/db_migrations.py +588 -0
  15. spatial_memory/core/db_search.py +512 -0
  16. spatial_memory/core/db_versioning.py +178 -0
  17. spatial_memory/core/embeddings.py +558 -0
  18. spatial_memory/core/errors.py +317 -0
  19. spatial_memory/core/file_security.py +701 -0
  20. spatial_memory/core/filesystem.py +178 -0
  21. spatial_memory/core/health.py +289 -0
  22. spatial_memory/core/helpers.py +79 -0
  23. spatial_memory/core/import_security.py +433 -0
  24. spatial_memory/core/lifecycle_ops.py +1067 -0
  25. spatial_memory/core/logging.py +194 -0
  26. spatial_memory/core/metrics.py +192 -0
  27. spatial_memory/core/models.py +660 -0
  28. spatial_memory/core/rate_limiter.py +326 -0
  29. spatial_memory/core/response_types.py +500 -0
  30. spatial_memory/core/security.py +588 -0
  31. spatial_memory/core/spatial_ops.py +430 -0
  32. spatial_memory/core/tracing.py +300 -0
  33. spatial_memory/core/utils.py +110 -0
  34. spatial_memory/core/validation.py +406 -0
  35. spatial_memory/factory.py +444 -0
  36. spatial_memory/migrations/__init__.py +40 -0
  37. spatial_memory/ports/__init__.py +11 -0
  38. spatial_memory/ports/repositories.py +630 -0
  39. spatial_memory/py.typed +0 -0
  40. spatial_memory/server.py +1214 -0
  41. spatial_memory/services/__init__.py +70 -0
  42. spatial_memory/services/decay_manager.py +411 -0
  43. spatial_memory/services/export_import.py +1031 -0
  44. spatial_memory/services/lifecycle.py +1139 -0
  45. spatial_memory/services/memory.py +412 -0
  46. spatial_memory/services/spatial.py +1152 -0
  47. spatial_memory/services/utility.py +429 -0
  48. spatial_memory/tools/__init__.py +5 -0
  49. spatial_memory/tools/definitions.py +695 -0
  50. spatial_memory/verify.py +140 -0
  51. spatial_memory_mcp-1.9.1.dist-info/METADATA +509 -0
  52. spatial_memory_mcp-1.9.1.dist-info/RECORD +55 -0
  53. spatial_memory_mcp-1.9.1.dist-info/WHEEL +4 -0
  54. spatial_memory_mcp-1.9.1.dist-info/entry_points.txt +2 -0
  55. spatial_memory_mcp-1.9.1.dist-info/licenses/LICENSE +21 -0
spatial_memory/core/db_search.py
@@ -0,0 +1,512 @@
+ """Search operations for LanceDB database.
+
+ Provides vector search, hybrid search, and batch search functionality.
+
+ This module is part of the database.py refactoring to separate concerns:
+ - SearchManager handles all search-related operations
+ - Database class delegates to SearchManager for these operations
+ """
+
+ from __future__ import annotations
+
+ import json
+ import logging
+ from typing import TYPE_CHECKING, Any, Protocol
+
+ import numpy as np
+
+ from spatial_memory.core.errors import StorageError, ValidationError
+ from spatial_memory.core.validation import (
+     sanitize_string as _sanitize_string,
+ )
+ from spatial_memory.core.validation import (
+     validate_namespace as _validate_namespace,
+ )
+
+ if TYPE_CHECKING:
+     from lancedb.table import Table as LanceTable
+
+ logger = logging.getLogger(__name__)
+
+
+ class SearchManagerProtocol(Protocol):
+     """Protocol defining what SearchManager needs from Database.
+
+     This protocol enables loose coupling between SearchManager and Database,
+     preventing circular imports while maintaining type safety.
+     """
+
+     @property
+     def table(self) -> LanceTable:
+         """Access to the LanceDB table."""
+         ...
+
+     @property
+     def index_nprobes(self) -> int:
+         """Base nprobes for search."""
+         ...
+
+     @property
+     def index_refine_factor(self) -> int:
+         """Base refine factor for search."""
+         ...
+
+     @property
+     def vector_index_threshold(self) -> int:
+         """Row count threshold for vector index."""
+         ...
+
+     def _get_cached_row_count(self) -> int:
+         """Get cached row count."""
+         ...
+
+     @property
+     def _has_vector_index(self) -> bool | None:
+         """Whether vector index exists."""
+         ...
+
+     @property
+     def _has_fts_index(self) -> bool | None:
+         """Whether FTS index exists."""
+         ...
+
+
+ class SearchManager:
+     """Manages search operations for vector and hybrid queries.
+
+     Handles vector similarity search, batch search, and hybrid
+     search combining vector and keyword matching.
+
+     Example:
+         search_mgr = SearchManager(database)
+         results = search_mgr.vector_search(query_vector, limit=10)
+         batch_results = search_mgr.batch_vector_search_native([vec1, vec2])
+     """
+
+     def __init__(self, db: SearchManagerProtocol) -> None:
+         """Initialize the search manager.
+
+         Args:
+             db: Database instance providing table and config access.
+         """
+         self._db = db
+
+     def calculate_search_params(
+         self,
+         count: int,
+         limit: int,
+         nprobes_override: int | None = None,
+         refine_factor_override: int | None = None,
+     ) -> tuple[int, int]:
+         """Calculate optimal search parameters based on dataset size and limit.
+
+         Dynamically tunes nprobes and refine_factor for optimal recall/speed tradeoff.
+
+         Args:
+             count: Number of rows in the dataset.
+             limit: Number of results requested.
+             nprobes_override: Optional override for nprobes (uses this if provided).
+             refine_factor_override: Optional override for refine_factor.
+
+         Returns:
+             Tuple of (nprobes, refine_factor).
+
+         Scaling rules:
+             - nprobes: Base from config, scaled up for larger datasets
+                 - <100K: config value (default 20)
+                 - 100K-1M: max(config, 30)
+                 - 1M-10M: max(config, 50)
+                 - >10M: max(config, 100)
+             - refine_factor: Base from config, scaled up for small limits
+                 - limit <= 5: config value * 2
+                 - limit <= 20: config value
+                 - limit > 20: max(config // 2, 2)
+         """
+         # Calculate nprobes based on dataset size
+         if nprobes_override is not None:
+             nprobes = nprobes_override
+         else:
+             base_nprobes = self._db.index_nprobes
+             if count < 100_000:
+                 nprobes = base_nprobes
+             elif count < 1_000_000:
+                 nprobes = max(base_nprobes, 30)
+             elif count < 10_000_000:
+                 nprobes = max(base_nprobes, 50)
+             else:
+                 nprobes = max(base_nprobes, 100)
+
+         # Calculate refine_factor based on limit
+         if refine_factor_override is not None:
+             refine_factor = refine_factor_override
+         else:
+             base_refine = self._db.index_refine_factor
+             if limit <= 5:
+                 # Small limits need more refinement for accuracy
+                 refine_factor = base_refine * 2
+             elif limit <= 20:
+                 refine_factor = base_refine
+             else:
+                 # Large limits can use less refinement
+                 refine_factor = max(base_refine // 2, 2)
+
+         return nprobes, refine_factor
+
+     def vector_search(
+         self,
+         query_vector: np.ndarray,
+         limit: int = 5,
+         namespace: str | None = None,
+         min_similarity: float = 0.0,
+         nprobes: int | None = None,
+         refine_factor: int | None = None,
+         include_vector: bool = False,
+     ) -> list[dict[str, Any]]:
+         """Search for similar memories by vector with performance tuning.
+
+         Note: This method should be called through the Database class which
+         applies stale connection recovery and retry decorators.
+
+         Args:
+             query_vector: Query embedding vector.
+             limit: Maximum number of results.
+             namespace: Filter to specific namespace.
+             min_similarity: Minimum similarity threshold (0-1).
+             nprobes: Number of partitions to search (higher = better recall).
+                 Only effective when vector index exists. Defaults to dynamic calculation.
+             refine_factor: Re-rank top (refine_factor * limit) for accuracy.
+                 Defaults to dynamic calculation based on limit.
+             include_vector: Whether to include vector embeddings in results.
+                 Defaults to False to reduce response size.
+
+         Returns:
+             List of memory records with similarity scores.
+
+         Raises:
+             ValidationError: If input validation fails.
+             StorageError: If database operation fails.
+         """
+         try:
+             search = self._db.table.search(query_vector.tolist())
+
+             # Distance type for queries (cosine for semantic similarity)
+             # Note: When vector index exists, the index's metric is used
+             search = search.distance_type("cosine")
+
+             # Apply performance tuning when index exists (use cached count)
+             count = self._db._get_cached_row_count()
+             if count > self._db.vector_index_threshold and self._db._has_vector_index:
+                 # Use dynamic calculation for search params
+                 actual_nprobes, actual_refine = self.calculate_search_params(
+                     count, limit, nprobes, refine_factor
+                 )
+                 search = search.nprobes(actual_nprobes)
+                 search = search.refine_factor(actual_refine)
+
+             # Build filter with sanitized namespace
+             # prefilter=True applies namespace filter BEFORE vector search for better performance
+             if namespace:
+                 namespace = _validate_namespace(namespace)
+                 safe_ns = _sanitize_string(namespace)
+                 search = search.where(f"namespace = '{safe_ns}'", prefilter=True)
+
+             # Vector projection: exclude vector column to reduce response size
+             if not include_vector:
+                 search = search.select([
+                     "id", "content", "namespace", "metadata",
+                     "created_at", "updated_at", "last_accessed",
+                     "importance", "tags", "source", "access_count",
+                     "expires_at",
+                 ])
+
+             # Fetch extra if filtering by similarity
+             fetch_limit = limit * 2 if min_similarity > 0.0 else limit
+             results: list[dict[str, Any]] = search.limit(fetch_limit).to_list()
+
+             # Process results
+             filtered_results: list[dict[str, Any]] = []
+             for record in results:
+                 record["metadata"] = json.loads(record["metadata"]) if record["metadata"] else {}
+                 # LanceDB returns _distance, convert to similarity
+                 if "_distance" in record:
+                     # Cosine distance to similarity: 1 - distance
+                     # Clamp to [0, 1] (cosine distance can exceed 1 for unnormalized)
+                     similarity = max(0.0, min(1.0, 1 - record["_distance"]))
+                     record["similarity"] = similarity
+                     del record["_distance"]
+
+                 # Apply similarity threshold
+                 if record.get("similarity", 0) >= min_similarity:
+                     filtered_results.append(record)
+                     if len(filtered_results) >= limit:
+                         break
+
+             return filtered_results
+         except ValidationError:
+             raise
+         except Exception as e:
+             raise StorageError(f"Failed to search: {e}") from e
+
+     def batch_vector_search_native(
+         self,
+         query_vectors: list[np.ndarray],
+         limit_per_query: int = 3,
+         namespace: str | None = None,
+         min_similarity: float = 0.0,
+         include_vector: bool = False,
+     ) -> list[list[dict[str, Any]]]:
+         """Batch search for similar memories using native LanceDB batch search.
+
+         Searches for multiple query vectors in a single database operation,
+         much more efficient than individual searches. Uses LanceDB's native
+         batch search API which returns results with query_index for grouping.
+
+         Note: This method should be called through the Database class which
+         applies stale connection recovery and retry decorators.
+
+         Args:
+             query_vectors: List of query embedding vectors.
+             limit_per_query: Maximum number of results per query.
+             namespace: Filter to specific namespace (applied to all queries).
+             min_similarity: Minimum similarity threshold (0-1).
+             include_vector: Whether to include vector embeddings in results.
+
+         Returns:
+             List of result lists, one per query vector (same order as input).
+             Each result list contains memory records with similarity scores.
+
+         Raises:
+             ValidationError: If input validation fails.
+             StorageError: If database operation fails.
+         """
+         if not query_vectors:
+             return []
+
+         try:
+             # Convert all vectors to lists for LanceDB
+             vector_lists = [v.tolist() for v in query_vectors]
+
+             # LanceDB native batch search
+             search = self._db.table.search(vector_lists)
+             search = search.distance_type("cosine")
+
+             # Apply performance tuning when index exists
+             count = self._db._get_cached_row_count()
+             if count > self._db.vector_index_threshold and self._db._has_vector_index:
+                 actual_nprobes, actual_refine = self.calculate_search_params(
+                     count, limit_per_query, None, None
+                 )
+                 search = search.nprobes(actual_nprobes)
+                 search = search.refine_factor(actual_refine)
+
+             # Apply namespace filter
+             if namespace:
+                 namespace = _validate_namespace(namespace)
+                 safe_ns = _sanitize_string(namespace)
+                 search = search.where(f"namespace = '{safe_ns}'", prefilter=True)
+
+             # Vector projection
+             if not include_vector:
+                 search = search.select([
+                     "id", "content", "namespace", "metadata",
+                     "created_at", "updated_at", "last_accessed",
+                     "importance", "tags", "source", "access_count",
+                 ])
+
+             # Execute search and get results
+             # LanceDB returns results with _query_index to identify which query
+             # each result belongs to
+             # Use Arrow operations (no pandas dependency)
+             search = search.limit(limit_per_query)
+             results = search.to_arrow().to_pylist()
+
+             # Initialize result lists (one per query)
+             num_queries = len(query_vectors)
+             batch_results: list[list[dict[str, Any]]] = [[] for _ in range(num_queries)]
+
+             if not results:
+                 return batch_results
+
+             # Group results by query index
+             for record in results:
+                 query_idx = int(record.get("_query_index", 0))
+                 if query_idx >= num_queries:
+                     continue
+
+                 # Convert distance to similarity (cosine distance -> similarity)
+                 distance = record.get("_distance", 0)
+                 similarity = 1.0 - distance
+
+                 if similarity < min_similarity:
+                     continue
+
+                 # Clean up internal fields
+                 record.pop("_distance", None)
+                 record.pop("_query_index", None)
+                 record.pop("_relevance_score", None)
+
+                 # Add similarity score
+                 record["similarity"] = similarity
+
+                 # Deserialize metadata
+                 if record.get("metadata"):
+                     try:
+                         record["metadata"] = json.loads(record["metadata"])
+                     except (json.JSONDecodeError, TypeError):
+                         record["metadata"] = {}
+                 else:
+                     record["metadata"] = {}
+
+                 batch_results[query_idx].append(record)
+
+             return batch_results
+         except ValidationError:
+             raise
+         except Exception as e:
+             raise StorageError(f"Failed to batch search: {e}") from e
+
+     def hybrid_search(
+         self,
+         query: str,
+         query_vector: np.ndarray,
+         limit: int = 5,
+         namespace: str | None = None,
+         alpha: float = 0.5,
+         min_similarity: float = 0.0,
+     ) -> list[dict[str, Any]]:
+         """Hybrid search combining vector similarity and keyword matching.
+
+         Uses LinearCombinationReranker to balance vector and keyword scores
+         based on the alpha parameter.
+
+         Note: This method should be called through the Database class which
+         applies stale connection recovery and retry decorators.
+
+         Args:
+             query: Text query for full-text search.
+             query_vector: Embedding vector for semantic search.
+             limit: Number of results.
+             namespace: Filter to namespace.
+             alpha: Balance between vector (1.0) and keyword (0.0).
+                 0.5 = balanced (recommended).
+             min_similarity: Minimum similarity threshold (0.0-1.0).
+                 Results below this threshold are filtered out.
+
+         Returns:
+             List of memory records with combined scores.
+
+         Raises:
+             ValidationError: If input validation fails.
+             StorageError: If database operation fails.
+         """
+         try:
+             # Check if FTS is available
+             if not self._db._has_fts_index:
+                 logger.debug("FTS index not available, falling back to vector search")
+                 return self.vector_search(query_vector, limit=limit, namespace=namespace)
+
+             # Create hybrid search with explicit vector column specification
+             # Required when using external embeddings (not LanceDB built-in)
+             search = (
+                 self._db.table.search(query, query_type="hybrid")
+                 .vector(query_vector.tolist())
+                 .vector_column_name("vector")
+             )
+
+             # Apply alpha parameter using LinearCombinationReranker
+             # alpha=1.0 means full vector, alpha=0.0 means full FTS
+             try:
+                 from lancedb.rerankers import LinearCombinationReranker
+
+                 reranker = LinearCombinationReranker(weight=alpha)
+                 search = search.rerank(reranker)
+             except ImportError:
+                 logger.debug("LinearCombinationReranker not available, using default reranking")
+             except Exception as e:
+                 logger.debug(f"Could not apply reranker: {e}")
+
+             # Apply namespace filter
+             if namespace:
+                 namespace = _validate_namespace(namespace)
+                 safe_ns = _sanitize_string(namespace)
+                 search = search.where(f"namespace = '{safe_ns}'")
+
+             results: list[dict[str, Any]] = search.limit(limit).to_list()
+
+             # Process results - normalize scores and clean up internal columns
+             processed_results: list[dict[str, Any]] = []
+             for record in results:
+                 record["metadata"] = json.loads(record["metadata"]) if record["metadata"] else {}
+
+                 # Compute similarity from various score columns
+                 # Priority: _relevance_score > _distance > _score > default
+                 similarity: float
+                 if "_relevance_score" in record:
+                     # Reranker output - use directly (already 0-1 range)
+                     similarity = float(record["_relevance_score"])
+                     del record["_relevance_score"]
+                 elif "_distance" in record:
+                     # Vector distance - convert to similarity
+                     similarity = max(0.0, min(1.0, 1 - float(record["_distance"])))
+                     del record["_distance"]
+                 elif "_score" in record:
+                     # BM25 score - normalize using score/(1+score)
+                     score = float(record["_score"])
+                     similarity = score / (1.0 + score)
+                     del record["_score"]
+                 else:
+                     # No score column - use default
+                     similarity = 0.5
+
+                 record["similarity"] = similarity
+
+                 # Mark as hybrid result with alpha value
+                 record["search_type"] = "hybrid"
+                 record["alpha"] = alpha
+
+                 # Apply min_similarity filter
+                 if similarity >= min_similarity:
+                     processed_results.append(record)
+
+             return processed_results
+
+         except Exception as e:
+             logger.warning(f"Hybrid search failed, falling back to vector search: {e}")
+             return self.vector_search(query_vector, limit=limit, namespace=namespace)
+
+     def batch_vector_search(
+         self,
+         query_vectors: list[np.ndarray],
+         limit_per_query: int = 3,
+         namespace: str | None = None,
+         parallel: bool = False,  # Deprecated: native batch is always efficient
+         max_workers: int = 4,  # Deprecated: native batch handles parallelism
+         include_vector: bool = False,
+     ) -> list[list[dict[str, Any]]]:
+         """Search for similar memories using multiple query vectors.
+
+         Uses native LanceDB batch search for efficiency. A single database
+         operation searches all vectors simultaneously.
+
+         Args:
+             query_vectors: List of query embedding vectors.
+             limit_per_query: Maximum results per query vector.
+             namespace: Filter to specific namespace.
+             parallel: Deprecated, kept for backward compatibility.
+             max_workers: Deprecated, kept for backward compatibility.
+             include_vector: Whether to include vector embeddings in results.
+
+         Returns:
+             List of result lists (one per query vector).
+
+         Raises:
+             StorageError: If database operation fails.
+         """
+         # Delegate to native batch search implementation
+         return self.batch_vector_search_native(
+             query_vectors=query_vectors,
+             limit_per_query=limit_per_query,
+             namespace=namespace,
+             min_similarity=0.0,
+             include_vector=include_vector,
+         )
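For a sense of how the dynamic tuning in calculate_search_params plays out, here is a minimal sketch that exercises the scaling rules without a real LanceDB table. The _StubDB class and its config values (base nprobes of 20, base refine factor of 10, a 50,000-row index threshold) are assumptions standing in for the Database side of SearchManagerProtocol; they are not part of the package.

from spatial_memory.core.db_search import SearchManager

class _StubDB:
    """Hypothetical stand-in for the Database side of SearchManagerProtocol."""
    index_nprobes = 20           # assumed config default
    index_refine_factor = 10     # assumed config default
    vector_index_threshold = 50_000
    _has_vector_index = True
    _has_fts_index = False
    table = None                 # not touched by calculate_search_params

    def _get_cached_row_count(self) -> int:
        return 2_000_000

mgr = SearchManager(_StubDB())

# 1M-10M rows: nprobes scales to max(20, 50) = 50.
# limit <= 5 doubles the refine factor: 10 * 2 = 20.
print(mgr.calculate_search_params(count=2_000_000, limit=5))    # (50, 20)
# limit > 20 halves it, floored at 2: max(10 // 2, 2) = 5.
print(mgr.calculate_search_params(count=2_000_000, limit=50))   # (50, 5)
# Overrides bypass the heuristics for that parameter only.
print(mgr.calculate_search_params(2_000_000, 5, nprobes_override=64))  # (64, 20)

The hybrid path normalizes scores the same way regardless of source: reranker relevance scores pass through unchanged, cosine distances are clamped into [0, 1] via 1 - distance, and raw BM25 scores are squashed with score / (1 + score), which maps [0, inf) monotonically onto [0, 1).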
spatial_memory/core/db_versioning.py
@@ -0,0 +1,178 @@
+ """Snapshot and version management for LanceDB database.
+
+ Provides snapshot creation, listing, and restoration capabilities
+ leveraging LanceDB's built-in versioning system.
+
+ This module is part of the database.py refactoring to separate concerns:
+ - VersionManager handles all snapshot/version operations
+ - Database class delegates to VersionManager for these operations
+ """
+
+ from __future__ import annotations
+
+ import logging
+ from typing import TYPE_CHECKING, Any, Protocol
+
+ from spatial_memory.core.errors import StorageError, ValidationError
+
+ if TYPE_CHECKING:
+     from lancedb.table import Table as LanceTable
+
+ logger = logging.getLogger(__name__)
+
+
+ class VersionManagerProtocol(Protocol):
+     """Protocol defining what VersionManager needs from Database.
+
+     This protocol enables loose coupling between VersionManager and Database,
+     preventing circular imports while maintaining type safety.
+     """
+
+     @property
+     def table(self) -> LanceTable:
+         """Access to the LanceDB table."""
+         ...
+
+     def _invalidate_count_cache(self) -> None:
+         """Invalidate the row count cache."""
+         ...
+
+     def _track_modification(self, count: int = 1) -> None:
+         """Track a modification for auto-compaction."""
+         ...
+
+     def _invalidate_namespace_cache(self) -> None:
+         """Invalidate the namespace cache."""
+         ...
+
+
+ class VersionManager:
+     """Manages database snapshots and version control.
+
+     Leverages LanceDB's native versioning to provide:
+     - Snapshot creation with semantic tags
+     - Version listing
+     - Point-in-time restoration
+
+     LanceDB automatically versions data on every write. This manager
+     provides a clean interface for working with those versions.
+
+     Example:
+         version_mgr = VersionManager(database)
+         version = version_mgr.create_snapshot("backup-2024-01")
+         snapshots = version_mgr.list_snapshots()
+         version_mgr.restore_snapshot(version)
+     """
+
+     def __init__(self, db: VersionManagerProtocol) -> None:
+         """Initialize the version manager.
+
+         Args:
+             db: Database instance providing table and cache access.
+         """
+         self._db = db
+
+     def create_snapshot(self, tag: str) -> int:
+         """Create a named snapshot of the current table state.
+
+         LanceDB automatically versions data on every write. This method
+         returns the current version number which can be used with restore_snapshot().
+
+         Args:
+             tag: Semantic version tag (e.g., "v1.0.0", "backup-2024-01").
+                 Note: Tag is logged for reference but LanceDB tracks versions
+                 numerically. Consider storing tag->version mappings externally
+                 if tag-based retrieval is needed.
+
+         Returns:
+             Version number of the snapshot.
+
+         Raises:
+             StorageError: If snapshot creation fails.
+         """
+         try:
+             version: int = self._db.table.version
+             logger.info(f"Created snapshot '{tag}' at version {version}")
+             return version
+         except Exception as e:
+             raise StorageError(f"Failed to create snapshot: {e}") from e
+
+     def list_snapshots(self) -> list[dict[str, Any]]:
+         """List available versions/snapshots.
+
+         Returns:
+             List of version information dictionaries. Each dict contains
+             at minimum 'version' key. Additional fields depend on LanceDB
+             version and available metadata.
+
+         Raises:
+             StorageError: If listing fails.
+         """
+         try:
+             versions_info: list[dict[str, Any]] = []
+
+             # Try to get version history if available
+             if hasattr(self._db.table, "list_versions"):
+                 try:
+                     versions = self._db.table.list_versions()
+                     for v in versions:
+                         if isinstance(v, dict):
+                             versions_info.append(v)
+                         elif hasattr(v, "version"):
+                             versions_info.append({
+                                 "version": v.version,
+                                 "timestamp": getattr(v, "timestamp", None),
+                             })
+                         else:
+                             versions_info.append({"version": v})
+                 except Exception as e:
+                     logger.debug(f"list_versions not fully supported: {e}")
+
+             # Always include current version
+             if not versions_info:
+                 versions_info.append({"version": self._db.table.version})
+
+             return versions_info
+         except Exception as e:
+             logger.warning(f"Could not list snapshots: {e}")
+             return [{"version": 0, "error": str(e)}]
+
+     def restore_snapshot(self, version: int) -> None:
+         """Restore table to a specific version.
+
+         This creates a NEW version that reflects the old state
+         (doesn't delete history).
+
+         Args:
+             version: The version number to restore to.
+
+         Raises:
+             ValidationError: If version is invalid.
+             StorageError: If restore fails.
+         """
+         if version < 0:
+             raise ValidationError("Version must be non-negative")
+
+         try:
+             self._db.table.restore(version)
+             self._db._invalidate_count_cache()
+             self._db._track_modification()
+             self._db._invalidate_namespace_cache()
+             logger.info(f"Restored to version {version}")
+         except Exception as e:
+             raise StorageError(f"Failed to restore snapshot: {e}") from e
+
+     def get_current_version(self) -> int:
+         """Get the current table version number.
+
+         Returns:
+             Current version number.
+
+         Raises:
+             StorageError: If version cannot be retrieved.
+         """
+         try:
+             version: int = self._db.table.version
+             return version
+         except Exception as e:
+             raise StorageError(f"Failed to get current version: {e}") from e