spatial-memory-mcp 1.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spatial_memory/__init__.py +97 -0
- spatial_memory/__main__.py +271 -0
- spatial_memory/adapters/__init__.py +7 -0
- spatial_memory/adapters/lancedb_repository.py +880 -0
- spatial_memory/config.py +769 -0
- spatial_memory/core/__init__.py +118 -0
- spatial_memory/core/cache.py +317 -0
- spatial_memory/core/circuit_breaker.py +297 -0
- spatial_memory/core/connection_pool.py +220 -0
- spatial_memory/core/consolidation_strategies.py +401 -0
- spatial_memory/core/database.py +3072 -0
- spatial_memory/core/db_idempotency.py +242 -0
- spatial_memory/core/db_indexes.py +576 -0
- spatial_memory/core/db_migrations.py +588 -0
- spatial_memory/core/db_search.py +512 -0
- spatial_memory/core/db_versioning.py +178 -0
- spatial_memory/core/embeddings.py +558 -0
- spatial_memory/core/errors.py +317 -0
- spatial_memory/core/file_security.py +701 -0
- spatial_memory/core/filesystem.py +178 -0
- spatial_memory/core/health.py +289 -0
- spatial_memory/core/helpers.py +79 -0
- spatial_memory/core/import_security.py +433 -0
- spatial_memory/core/lifecycle_ops.py +1067 -0
- spatial_memory/core/logging.py +194 -0
- spatial_memory/core/metrics.py +192 -0
- spatial_memory/core/models.py +660 -0
- spatial_memory/core/rate_limiter.py +326 -0
- spatial_memory/core/response_types.py +500 -0
- spatial_memory/core/security.py +588 -0
- spatial_memory/core/spatial_ops.py +430 -0
- spatial_memory/core/tracing.py +300 -0
- spatial_memory/core/utils.py +110 -0
- spatial_memory/core/validation.py +406 -0
- spatial_memory/factory.py +444 -0
- spatial_memory/migrations/__init__.py +40 -0
- spatial_memory/ports/__init__.py +11 -0
- spatial_memory/ports/repositories.py +630 -0
- spatial_memory/py.typed +0 -0
- spatial_memory/server.py +1214 -0
- spatial_memory/services/__init__.py +70 -0
- spatial_memory/services/decay_manager.py +411 -0
- spatial_memory/services/export_import.py +1031 -0
- spatial_memory/services/lifecycle.py +1139 -0
- spatial_memory/services/memory.py +412 -0
- spatial_memory/services/spatial.py +1152 -0
- spatial_memory/services/utility.py +429 -0
- spatial_memory/tools/__init__.py +5 -0
- spatial_memory/tools/definitions.py +695 -0
- spatial_memory/verify.py +140 -0
- spatial_memory_mcp-1.9.1.dist-info/METADATA +509 -0
- spatial_memory_mcp-1.9.1.dist-info/RECORD +55 -0
- spatial_memory_mcp-1.9.1.dist-info/WHEEL +4 -0
- spatial_memory_mcp-1.9.1.dist-info/entry_points.txt +2 -0
- spatial_memory_mcp-1.9.1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,576 @@
|
|
|
1
|
+
"""Index management for LanceDB database.
|
|
2
|
+
|
|
3
|
+
Provides vector, FTS, and scalar index creation and management.
|
|
4
|
+
|
|
5
|
+
This module is part of the database.py refactoring to separate concerns:
|
|
6
|
+
- IndexManager handles all index-related operations
|
|
7
|
+
- Database class delegates to IndexManager for these operations
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import logging
|
|
13
|
+
import math
|
|
14
|
+
import time
|
|
15
|
+
from typing import TYPE_CHECKING, Any, Protocol
|
|
16
|
+
|
|
17
|
+
from spatial_memory.core.errors import StorageError
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from lancedb.table import Table as LanceTable
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)

# Index-type names that are treated as "this is a vector index" when they
# show up in the index metadata listed for the table.
VECTOR_INDEX_TYPES = frozenset(
    (
        "HNSW",
        "HNSW_PQ",
        "HNSW_SQ",
        "IVF_FLAT",
        "IVF_HNSW_PQ",
        "IVF_HNSW_SQ",
        "IVF_PQ",
    )
)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _get_index_attr(idx: Any, attr: str, default: Any = None) -> Any:
|
|
33
|
+
"""Get an attribute from an index object (handles both dict and IndexConfig).
|
|
34
|
+
|
|
35
|
+
LanceDB 0.27+ returns IndexConfig objects, while older versions use dicts.
|
|
36
|
+
|
|
37
|
+
Args:
|
|
38
|
+
idx: Index object (dict or IndexConfig).
|
|
39
|
+
attr: Attribute name to retrieve.
|
|
40
|
+
default: Default value if attribute not found.
|
|
41
|
+
|
|
42
|
+
Returns:
|
|
43
|
+
The attribute value or default.
|
|
44
|
+
"""
|
|
45
|
+
if isinstance(idx, dict):
|
|
46
|
+
return idx.get(attr, default)
|
|
47
|
+
return getattr(idx, attr, default)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class IndexManagerProtocol(Protocol):
    """Structural interface describing what IndexManager reads from Database.

    Typing IndexManager against this protocol instead of the concrete
    Database class keeps the two loosely coupled and avoids a circular
    import, while still giving type checkers something to verify.
    """

    # --- Table access -----------------------------------------------------

    @property
    def table(self) -> LanceTable:
        """The LanceDB table on which indexes are listed and created."""

    # --- Full-text search settings ----------------------------------------

    @property
    def enable_fts(self) -> bool:
        """True when full-text search indexing is enabled."""

    @property
    def fts_language(self) -> str:
        """Language passed to the FTS index builder."""

    @property
    def fts_stem(self) -> bool:
        """True when the FTS index should apply stemming."""

    @property
    def fts_remove_stop_words(self) -> bool:
        """True when the FTS index should drop stop words."""

    # --- Vector index settings --------------------------------------------

    @property
    def index_type(self) -> str:
        """Configured vector index type name."""

    @property
    def vector_index_threshold(self) -> int:
        """Row count below which a vector index is not built automatically."""

    @property
    def auto_create_indexes(self) -> bool:
        """True when indexes may be created automatically once thresholds are met."""

    @property
    def hnsw_m(self) -> int:
        """The ``m`` parameter used for HNSW index builds."""

    @property
    def hnsw_ef_construction(self) -> int:
        """The ``ef_construction`` parameter used for HNSW index builds."""

    @property
    def index_wait_timeout_seconds(self) -> float:
        """Maximum number of seconds to wait for a new index to become ready."""

    @property
    def embedding_dim(self) -> int:
        """Dimensionality of the stored embedding vectors."""
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
class IndexManager:
    """Manages vector, FTS, and scalar indexes for a LanceDB table.

    Handles index detection, creation, and post-creation optimization.
    All table and configuration access goes through the ``db`` object
    supplied at construction time.

    The manager caches what it knows about existing indexes in three flags:
    ``has_vector_index`` and ``has_fts_index`` are tri-state (``None`` means
    "unknown / not checked yet"), ``has_scalar_indexes`` is a plain bool.

    Example:
        index_mgr = IndexManager(database)
        index_mgr.ensure_indexes()
        if not index_mgr.has_vector_index:
            index_mgr.create_vector_index()
    """

    def __init__(self, db: IndexManagerProtocol) -> None:
        """Initialize the index manager.

        Args:
            db: Database instance providing table and config access.
        """
        self._db = db
        # None = state unknown until check_existing_indexes() or a create call.
        self._has_vector_index: bool | None = None
        self._has_fts_index: bool | None = None
        self._has_scalar_indexes: bool = False

    @property
    def has_vector_index(self) -> bool | None:
        """Whether a vector index exists (None when unknown)."""
        return self._has_vector_index

    @has_vector_index.setter
    def has_vector_index(self, value: bool | None) -> None:
        self._has_vector_index = value

    @property
    def has_fts_index(self) -> bool | None:
        """Whether an FTS index exists (None when unknown)."""
        return self._has_fts_index

    @has_fts_index.setter
    def has_fts_index(self, value: bool | None) -> None:
        self._has_fts_index = value

    @property
    def has_scalar_indexes(self) -> bool:
        """Whether scalar index creation has been run."""
        return self._has_scalar_indexes

    @has_scalar_indexes.setter
    def has_scalar_indexes(self, value: bool) -> None:
        self._has_scalar_indexes = value

    def reset_index_state(self) -> None:
        """Reset all index state flags to their initial (unknown) values."""
        self._has_vector_index = None
        self._has_fts_index = None
        self._has_scalar_indexes = False

    def check_existing_indexes(self) -> None:
        """Detect which vector/FTS indexes already exist on the table.

        Detection is deliberately lenient: an index counts as a vector index
        if its type is a known vector type, or if "vector" appears in its
        columns or name; it counts as an FTS index if its type is "FTS" or
        its name contains "fts" or "content".

        Never raises. If the index listing cannot be read, both flags are
        set back to None (unknown) and a warning is logged.
        """
        try:
            indices = self._db.table.list_indices()

            self._has_vector_index = False
            self._has_fts_index = False

            for idx in indices:
                index_name = str(_get_index_attr(idx, "name", "")).lower()
                index_type = str(_get_index_attr(idx, "index_type", "")).upper()
                # A present-but-None columns value must not abort the whole
                # scan with a TypeError on the membership test below.
                columns = _get_index_attr(idx, "columns", None) or []

                # Vector index detection: check index_type or column name
                if (
                    index_type in VECTOR_INDEX_TYPES
                    or "vector" in columns
                    or "vector" in index_name
                ):
                    self._has_vector_index = True

                # FTS index detection: check index_type or name patterns
                if index_type == "FTS" or "fts" in index_name or "content" in index_name:
                    self._has_fts_index = True

            logger.debug(
                f"Existing indexes: vector={self._has_vector_index}, "
                f"fts={self._has_fts_index}"
            )
        except Exception as e:
            # Best effort: fall back to "unknown" rather than failing startup.
            logger.warning(f"Could not check existing indexes: {e}")
            self._has_vector_index = None
            self._has_fts_index = None

    def create_fts_index(self) -> None:
        """Create the full-text search index on the ``content`` column.

        Never raises: an "already exists" error is treated as success, any
        other failure is logged as a warning and leaves ``has_fts_index``
        unchanged. Callers that need to know the outcome should inspect
        ``has_fts_index`` afterwards.
        """
        try:
            self._db.table.create_fts_index(
                "content",
                use_tantivy=False,  # Use Lance native FTS
                language=self._db.fts_language,
                stem=self._db.fts_stem,
                remove_stop_words=self._db.fts_remove_stop_words,
                with_position=True,  # Enable phrase queries
                lower_case=True,  # Case-insensitive search
            )
            self._has_fts_index = True
            logger.info(
                f"Created FTS index with stemming={self._db.fts_stem}, "
                f"stop_words={self._db.fts_remove_stop_words}"
            )
        except Exception as e:
            # Check if index already exists (not an error)
            if "already exists" in str(e).lower():
                self._has_fts_index = True
                logger.debug("FTS index already exists")
            else:
                logger.warning(f"FTS index creation failed: {e}")

    def create_vector_index(self, force: bool = False) -> bool:
        """Create the vector index used for similarity search.

        Builds an HNSW_SQ index when that is the configured type, otherwise
        an IVF-based index (IVF_PQ or IVF_FLAT) with parameters derived from
        the current row count.

        Args:
            force: Build even if the table is below the row threshold or an
                index is already known to exist.

        Returns:
            True if an index was created, False if creation was skipped.

        Raises:
            StorageError: If index creation fails.
        """
        count = self._db.table.count_rows()

        # Check threshold
        if count < self._db.vector_index_threshold and not force:
            logger.info(
                f"Dataset has {count} rows, below threshold {self._db.vector_index_threshold}. "
                "Skipping vector index creation."
            )
            return False

        # Check if already exists
        if self._has_vector_index and not force:
            logger.info("Vector index already exists")
            return False

        # Handle HNSW_SQ index type
        if self._db.index_type == "HNSW_SQ":
            return self._create_hnsw_index(count)

        # IVF-based index creation (IVF_PQ or IVF_FLAT)
        return self._create_ivf_index(count)

    def _create_hnsw_index(self, count: int) -> bool:
        """Create an HNSW_SQ vector index with the configured HNSW parameters.

        HNSW (Hierarchical Navigable Small World) provides better recall than IVF
        at the cost of higher memory usage. Good for datasets where recall is critical.

        Args:
            count: Number of rows in the table (used for logging only).

        Returns:
            True if index was created.

        Raises:
            StorageError: If index creation fails.
        """
        logger.info(
            f"Creating HNSW_SQ vector index: m={self._db.hnsw_m}, "
            f"ef_construction={self._db.hnsw_ef_construction} for {count} rows"
        )

        try:
            self._db.table.create_index(
                metric="cosine",
                vector_column_name="vector",
                index_type="HNSW_SQ",
                replace=True,
                m=self._db.hnsw_m,
                ef_construction=self._db.hnsw_ef_construction,
            )

            # Wait for index to be ready with configurable timeout
            self._wait_for_index_ready("vector", self._db.index_wait_timeout_seconds)

            self._has_vector_index = True
            logger.info("HNSW_SQ vector index created successfully")

            # Optimize after index creation (may fail in some environments)
            try:
                self._db.table.optimize()
            except Exception as optimize_error:
                logger.debug(f"Optimization after index creation skipped: {optimize_error}")

            return True

        except Exception as e:
            logger.error(f"Failed to create HNSW_SQ vector index: {e}")
            raise StorageError(f"HNSW_SQ vector index creation failed: {e}") from e

    def _resolve_num_sub_vectors(self, count: int) -> int:
        """Pick a PQ sub-vector count that evenly divides the embedding dimension.

        Prefers 48 sub-vectors for <500K rows (8 dims each for 384-dim vectors,
        more precision) and 96 for >=500K rows (4 dims each, more compression),
        falling back to the largest common divisor that fits.

        Args:
            count: Number of rows in the table.

        Returns:
            A sub-vector count for which ``embedding_dim % n == 0``.

        Raises:
            StorageError: If none of the candidate counts divides the
                embedding dimension.
        """
        embedding_dim = self._db.embedding_dim
        preferred = 48 if count < 500_000 else 96
        if embedding_dim % preferred == 0:
            return preferred

        # Find a valid divisor from common sub-vector counts
        valid_divisors = [96, 48, 32, 24, 16, 12, 8, 4]
        for divisor in valid_divisors:
            if embedding_dim % divisor == 0:
                logger.info(
                    f"Adjusted num_sub_vectors from {preferred} to {divisor} "
                    f"for embedding_dim={embedding_dim}"
                )
                return divisor

        raise StorageError(
            f"Cannot create IVF-PQ index: embedding_dim={embedding_dim} "
            "has no suitable divisor for sub-vectors. "
            f"Tried divisors: {valid_divisors}"
        )

    def _create_ivf_index(self, count: int) -> bool:
        """Create an IVF_PQ or IVF_FLAT vector index.

        Uses the sqrt rule for partitions: num_partitions = sqrt(count),
        clamped to [16, 4096] and then reduced if it would not stay below the
        row count. Tables with fewer than 256 rows always get IVF_FLAT; larger
        tables use the configured type if it is IVF_PQ or IVF_FLAT, otherwise
        IVF_PQ.

        The PQ sub-vector count is only computed (and validated against the
        embedding dimension) when the chosen index type actually uses PQ, so
        an IVF_FLAT build never fails on a sub-vector divisibility check.

        Args:
            count: Number of rows in the table.

        Returns:
            True if index was created.

        Raises:
            StorageError: If no valid PQ sub-vector count exists for a PQ
                index, or if index creation fails.
        """
        # Use sqrt rule for partitions, clamped to [16, 4096]
        num_partitions = max(16, min(int(math.sqrt(count)), 4096))

        # Default sample_rate=256, so PQ training needs at least 256 rows.
        sample_rate = 256
        if count < 256:
            # Use IVF_FLAT for very small datasets (no PQ training required)
            logger.info(
                f"Dataset too small for IVF-PQ ({count} rows < 256). "
                "Using IVF_FLAT index instead."
            )
            index_type = "IVF_FLAT"
            sample_rate = max(16, count // 4)  # Lower sample rate for small data
        else:
            valid_types = ("IVF_PQ", "IVF_FLAT")
            index_type = self._db.index_type if self._db.index_type in valid_types else "IVF_PQ"

        # num_sub_vectors only applies to PQ-based indexes, so only resolve
        # (and possibly reject) it when it will really be used.
        uses_pq = "PQ" in index_type
        num_sub_vectors = self._resolve_num_sub_vectors(count) if uses_pq else None

        # Ensure num_partitions < num_vectors for KMeans clustering
        if num_partitions >= count:
            num_partitions = max(1, count // 4)  # Use 1/4 of count, minimum 1
            logger.info(f"Adjusted num_partitions to {num_partitions} for {count} rows")

        if uses_pq:
            logger.info(
                f"Creating {index_type} vector index: {num_partitions} partitions, "
                f"{num_sub_vectors} sub-vectors for {count} rows"
            )
        else:
            logger.info(
                f"Creating {index_type} vector index: {num_partitions} partitions "
                f"for {count} rows"
            )

        try:
            # LanceDB 0.27+ API: parameters passed directly to create_index
            index_kwargs: dict[str, Any] = {
                "metric": "cosine",
                "num_partitions": num_partitions,
                "vector_column_name": "vector",
                "index_type": index_type,
                "replace": True,
                "sample_rate": sample_rate,
            }
            if uses_pq:
                index_kwargs["num_sub_vectors"] = num_sub_vectors

            self._db.table.create_index(**index_kwargs)

            # Wait for index to be ready with configurable timeout
            self._wait_for_index_ready("vector", self._db.index_wait_timeout_seconds)

            self._has_vector_index = True
            logger.info(f"{index_type} vector index created successfully")

            # Optimize after index creation (may fail in some environments)
            try:
                self._db.table.optimize()
            except Exception as optimize_error:
                logger.debug(f"Optimization after index creation skipped: {optimize_error}")

            return True

        except Exception as e:
            logger.error(f"Failed to create {index_type} vector index: {e}")
            raise StorageError(f"{index_type} vector index creation failed: {e}") from e

    def _wait_for_index_ready(
        self,
        column_name: str,
        timeout_seconds: float,
        poll_interval: float = 0.5,
    ) -> None:
        """Poll the table's index list until an index on a column is ready.

        An index matches when ``column_name`` is among its columns or is
        contained in its (lower-cased) name. A matching index with no
        ``status`` field is treated as ready. Returns silently on success;
        logs a warning and returns if the timeout elapses. Never raises for
        listing errors - they are logged at debug level and polling continues.

        Args:
            column_name: Name of the column the index is on (e.g., "vector").
                LanceDB typically names indexes as "{column_name}_idx".
            timeout_seconds: Maximum time to wait. Values <= 0 disable waiting.
            poll_interval: Time between status checks.
        """
        if timeout_seconds <= 0:
            return

        # Monotonic clock: immune to wall-clock adjustments during the wait.
        deadline = time.monotonic() + timeout_seconds
        while True:
            try:
                for idx in self._db.table.list_indices():
                    idx_name = str(_get_index_attr(idx, "name", "")).lower()
                    idx_columns = _get_index_attr(idx, "columns", None) or []

                    # Match by column name in index metadata, or index name contains column
                    if column_name in idx_columns or column_name in idx_name:
                        # Index exists, check if it's ready
                        status = str(_get_index_attr(idx, "status", "ready"))
                        if status.lower() in ("ready", "complete", "built"):
                            logger.debug(f"Index on {column_name} is ready")
                            return
                        # Found but not ready yet: stop scanning, poll again.
                        break
            except Exception as e:
                logger.debug(f"Error checking index status: {e}")

            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            # Never sleep past the deadline.
            time.sleep(min(poll_interval, remaining))

        logger.warning(
            f"Timeout waiting for index on {column_name} after {timeout_seconds}s"
        )

    def create_scalar_indexes(self) -> None:
        """Create scalar indexes for frequently filtered columns.

        Creates:
        - BTREE on id (fast lookups, upserts)
        - BTREE on timestamps, importance and access_count (range queries)
        - BITMAP on namespace and source (low cardinality)
        - LABEL_LIST on tags (array contains queries)

        Creation is best-effort per column: a failure on one column is logged
        as a warning ("already exists" errors are ignored) and the remaining
        columns are still attempted. This method does not raise, and
        ``has_scalar_indexes`` is set to True once all columns were attempted.
        """
        scalar_index_plan: list[tuple[str, list[str]]] = [
            # BTREE indexes for range queries and lookups
            (
                "BTREE",
                [
                    "id",  # Fast lookups and merge_insert
                    "created_at",
                    "updated_at",
                    "last_accessed",
                    "importance",
                    "access_count",
                    "expires_at",  # TTL expiration queries
                ],
            ),
            # BITMAP indexes for low-cardinality columns
            ("BITMAP", ["namespace", "source"]),
            # LABEL_LIST index for tags array (supports array_has_any queries)
            ("LABEL_LIST", ["tags"]),
        ]

        for index_type, columns in scalar_index_plan:
            for column in columns:
                try:
                    self._db.table.create_scalar_index(
                        column,
                        index_type=index_type,
                        replace=True,
                    )
                    logger.debug(f"Created {index_type} index on {column}")
                except Exception as e:
                    if "already exists" not in str(e).lower():
                        logger.warning(f"Could not create {index_type} index on {column}: {e}")

        self._has_scalar_indexes = True
        logger.info("Scalar indexes created")

    def ensure_indexes(self, force: bool = False) -> dict[str, bool]:
        """Ensure all appropriate indexes exist.

        - Vector index: attempted when auto-creation is enabled and the row
          count has reached the configured threshold, or when ``force`` is set.
        - Scalar indexes: attempted at 1000 rows or more, or when forced.
        - FTS index: attempted when FTS is enabled and no FTS index is known.

        Args:
            force: Force index creation regardless of thresholds.

        Returns:
            Dict with keys ``vector_index``, ``scalar_indexes`` and
            ``fts_index``. ``fts_index`` is True only if the FTS index is
            actually in place after the attempt (created, or found to already
            exist), not merely because creation was attempted.

        Raises:
            StorageError: If vector index creation fails.
        """
        results = {
            "vector_index": False,
            "scalar_indexes": False,
            "fts_index": False,
        }

        count = self._db.table.count_rows()

        # Vector index
        if force or (
            self._db.auto_create_indexes and count >= self._db.vector_index_threshold
        ):
            results["vector_index"] = self.create_vector_index(force=force)

        # Scalar indexes (always create if >= 1000 rows)
        if count >= 1000 or force:
            try:
                self.create_scalar_indexes()
                results["scalar_indexes"] = True
            except Exception as e:
                logger.warning(f"Scalar index creation partially failed: {e}")

        # FTS index
        if self._db.enable_fts and not self._has_fts_index:
            try:
                self.create_fts_index()
                # create_fts_index() swallows its own failures, so report the
                # resulting state instead of assuming success.
                results["fts_index"] = bool(self._has_fts_index)
            except Exception as e:
                logger.warning(f"FTS index creation failed in ensure_indexes: {e}")

        return results
|