spatial-memory-mcp 1.6.1 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of spatial-memory-mcp might be problematic.
- spatial_memory/__init__.py +97 -0
- spatial_memory/__main__.py +270 -0
- spatial_memory/adapters/__init__.py +7 -0
- spatial_memory/adapters/lancedb_repository.py +878 -0
- spatial_memory/config.py +728 -0
- spatial_memory/core/__init__.py +118 -0
- spatial_memory/core/cache.py +317 -0
- spatial_memory/core/circuit_breaker.py +297 -0
- spatial_memory/core/connection_pool.py +220 -0
- spatial_memory/core/consolidation_strategies.py +402 -0
- spatial_memory/core/database.py +3069 -0
- spatial_memory/core/db_idempotency.py +242 -0
- spatial_memory/core/db_indexes.py +575 -0
- spatial_memory/core/db_migrations.py +584 -0
- spatial_memory/core/db_search.py +509 -0
- spatial_memory/core/db_versioning.py +177 -0
- spatial_memory/core/embeddings.py +557 -0
- spatial_memory/core/errors.py +317 -0
- spatial_memory/core/file_security.py +702 -0
- spatial_memory/core/filesystem.py +178 -0
- spatial_memory/core/health.py +289 -0
- spatial_memory/core/helpers.py +79 -0
- spatial_memory/core/import_security.py +432 -0
- spatial_memory/core/lifecycle_ops.py +1067 -0
- spatial_memory/core/logging.py +194 -0
- spatial_memory/core/metrics.py +192 -0
- spatial_memory/core/models.py +628 -0
- spatial_memory/core/rate_limiter.py +326 -0
- spatial_memory/core/response_types.py +497 -0
- spatial_memory/core/security.py +588 -0
- spatial_memory/core/spatial_ops.py +426 -0
- spatial_memory/core/tracing.py +300 -0
- spatial_memory/core/utils.py +110 -0
- spatial_memory/core/validation.py +403 -0
- spatial_memory/factory.py +407 -0
- spatial_memory/migrations/__init__.py +40 -0
- spatial_memory/ports/__init__.py +11 -0
- spatial_memory/ports/repositories.py +631 -0
- spatial_memory/py.typed +0 -0
- spatial_memory/server.py +1141 -0
- spatial_memory/services/__init__.py +70 -0
- spatial_memory/services/export_import.py +1023 -0
- spatial_memory/services/lifecycle.py +1120 -0
- spatial_memory/services/memory.py +412 -0
- spatial_memory/services/spatial.py +1147 -0
- spatial_memory/services/utility.py +409 -0
- spatial_memory/tools/__init__.py +5 -0
- spatial_memory/tools/definitions.py +695 -0
- spatial_memory/verify.py +140 -0
- spatial_memory_mcp-1.6.1.dist-info/METADATA +499 -0
- spatial_memory_mcp-1.6.1.dist-info/RECORD +54 -0
- spatial_memory_mcp-1.6.1.dist-info/WHEEL +4 -0
- spatial_memory_mcp-1.6.1.dist-info/entry_points.txt +2 -0
- spatial_memory_mcp-1.6.1.dist-info/licenses/LICENSE +21 -0
spatial_memory/services/spatial.py

@@ -0,0 +1,1147 @@
"""Spatial service for exploration operations.

This service provides the spatial layer for memory exploration:
- journey: SLERP interpolation between two memories
- wander: Temperature-based random walk through memory space
- regions: HDBSCAN clustering to discover memory regions
- visualize: UMAP projection for 2D/3D visualization

The service uses dependency injection for repository and embedding services.
"""

from __future__ import annotations

import json
import logging
import random
import re
from collections import Counter
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Literal

import numpy as np

from spatial_memory.core.errors import (
    ClusteringError,
    InsufficientMemoriesError,
    JourneyError,
    MemoryNotFoundError,
    ValidationError,
    VisualizationError,
    WanderError,
)
from spatial_memory.core.models import (
    JourneyResult,
    JourneyStep,
    MemoryResult,
    RegionCluster,
    RegionsResult,
    VisualizationEdge,
    VisualizationNode,
    VisualizationResult,
    WanderResult,
    WanderStep,
)
from spatial_memory.core.validation import validate_namespace, validate_uuid

logger = logging.getLogger(__name__)

# Check optional dependency availability at import time
try:
    import hdbscan

    HDBSCAN_AVAILABLE = True
except ImportError:
    HDBSCAN_AVAILABLE = False
    logger.debug("HDBSCAN not available - regions operation will be disabled")

try:
    import umap

    UMAP_AVAILABLE = True
except ImportError:
    UMAP_AVAILABLE = False
    logger.debug("UMAP not available - visualize operation will be disabled")

try:
    from scipy.spatial.distance import cdist

    SCIPY_AVAILABLE = True
except ImportError:
    SCIPY_AVAILABLE = False
    logger.debug("scipy not available - using fallback for similarity calculations")

# Common stop words for keyword extraction (module-level to avoid recreation)
_STOP_WORDS: frozenset[str] = frozenset({
    "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "do", "does", "did", "will", "would", "could",
    "should", "may", "might", "must", "can", "to", "of", "in", "for",
    "on", "with", "at", "by", "from", "as", "into", "through", "during",
    "before", "after", "above", "below", "between", "under", "again",
    "further", "then", "once", "here", "there", "when", "where", "why",
    "how", "all", "each", "few", "more", "most", "other", "some", "such",
    "no", "nor", "not", "only", "own", "same", "so", "than", "too",
    "very", "just", "also", "now", "and", "but", "or", "if", "it", "its",
    "this", "that", "these", "those", "i", "you", "he", "she", "we", "they",
})

if TYPE_CHECKING:
    from spatial_memory.ports.repositories import (
        EmbeddingServiceProtocol,
        MemoryRepositoryProtocol,
    )


@dataclass
class SpatialConfig:
    """Configuration for spatial operations.

    Attributes:
        journey_default_steps: Default number of interpolation steps for journey.
        journey_max_steps: Maximum allowed steps for journey.
        journey_neighbors_per_step: Number of neighbors to find per interpolation point.
        wander_default_steps: Default number of steps for random walk.
        wander_max_steps: Maximum allowed steps for wander.
        wander_default_temperature: Default temperature (randomness) for wander.
        wander_avoid_recent: Number of recent memories to avoid revisiting.
        wander_candidates_per_step: Number of candidate neighbors per step.
        regions_min_cluster_size: Minimum cluster size for HDBSCAN.
        regions_max_memories: Maximum memories to consider for clustering.
        visualize_n_neighbors: UMAP n_neighbors parameter.
        visualize_min_dist: UMAP min_dist parameter.
        visualize_max_memories: Maximum memories to include in visualization.
        visualize_similarity_threshold: Minimum similarity for edge creation.
    """

    # Journey parameters
    journey_default_steps: int = 10
    journey_max_steps: int = 20
    journey_neighbors_per_step: int = 3

    # Wander parameters
    wander_default_steps: int = 10
    wander_max_steps: int = 20
    wander_default_temperature: float = 0.5
    wander_avoid_recent: int = 5
    wander_candidates_per_step: int = 10

    # Regions parameters
    regions_min_cluster_size: int = 3
    regions_max_memories: int = 10_000

    # Visualize parameters
    visualize_n_neighbors: int = 15
    visualize_min_dist: float = 0.1
    visualize_max_memories: int = 500
    visualize_similarity_threshold: float = 0.7


# Color palette for cluster visualization
CLUSTER_COLORS = [
    "#4285F4",  # Blue
    "#EA4335",  # Red
    "#FBBC04",  # Yellow
    "#34A853",  # Green
    "#FF6D01",  # Orange
    "#46BDC6",  # Cyan
    "#7B1FA2",  # Purple
    "#E91E63",  # Pink
    "#009688",  # Teal
    "#795548",  # Brown
]


class SpatialService:
    """Service for spatial exploration of memory space.

    Uses Clean Architecture - depends on protocol interfaces, not implementations.
    """

    def __init__(
        self,
        repository: MemoryRepositoryProtocol,
        embeddings: EmbeddingServiceProtocol,
        config: SpatialConfig | None = None,
    ) -> None:
        """Initialize the spatial service.

        Args:
            repository: Repository for memory storage.
            embeddings: Service for generating embeddings.
            config: Optional configuration (uses defaults if not provided).
        """
        self._repo = repository
        self._embeddings = embeddings
        self._config = config or SpatialConfig()

    def journey(
        self,
        start_id: str,
        end_id: str,
        steps: int | None = None,
        namespace: str | None = None,
    ) -> JourneyResult:
        """Find a path between two memories using SLERP interpolation.

        Spherical Linear Interpolation (SLERP) creates smooth paths through
        embedding space, finding actual memories closest to each interpolation
        point.

        Args:
            start_id: Starting memory UUID.
            end_id: Ending memory UUID.
            steps: Number of interpolation steps (default from config).
            namespace: Optional namespace filter for intermediate memories.

        Returns:
            JourneyResult with path steps.

        Raises:
            ValidationError: If input validation fails.
            MemoryNotFoundError: If start or end memory not found.
            JourneyError: If path cannot be computed.
        """
        # Validate inputs
        start_id = validate_uuid(start_id)
        end_id = validate_uuid(end_id)
        if namespace is not None:
            namespace = validate_namespace(namespace)

        # Get step count
        actual_steps = steps if steps is not None else self._config.journey_default_steps
        if actual_steps < 2:
            raise ValidationError("Journey requires at least 2 steps")
        if actual_steps > self._config.journey_max_steps:
            raise ValidationError(
                f"Maximum journey steps is {self._config.journey_max_steps}"
            )

        # Get start and end memories with vectors
        start_result = self._repo.get_with_vector(start_id)
        if start_result is None:
            raise MemoryNotFoundError(start_id)
        start_memory, start_vector = start_result

        end_result = self._repo.get_with_vector(end_id)
        if end_result is None:
            raise MemoryNotFoundError(end_id)
        end_memory, end_vector = end_result

        try:
            # Generate interpolation points using SLERP
            interpolated_vectors, t_values = self._slerp_interpolate(
                start_vector, end_vector, actual_steps
            )

            # Find nearest memories for each interpolation point
            # Use batch search for efficiency, include vectors to avoid N+1 queries
            search_results = self._batch_vector_search(
                interpolated_vectors,
                limit_per_query=self._config.journey_neighbors_per_step,
                namespace=namespace,
                include_vector=True,  # Include vectors to avoid follow-up queries
            )

            # Build journey steps
            journey_steps: list[JourneyStep] = []
            steps_with_memories = 0

            for step_num, (interp_vec, t_val, neighbors) in enumerate(
                zip(interpolated_vectors, t_values, search_results)
            ):
                # Calculate distance from interpolation point to nearest memory
                distance_to_path = float("inf")
                if neighbors:
                    for neighbor in neighbors:
                        # Use vector from search result (included via include_vector=True)
                        if neighbor.vector is not None:
                            neighbor_vec = np.array(neighbor.vector, dtype=np.float32)
                            dist = self._cosine_distance(interp_vec, neighbor_vec)
                        else:
                            # Fallback if vector not included (shouldn't happen)
                            dist = self._cosine_distance(
                                interp_vec, self._get_vector_for_memory(neighbor.id)
                            )
                        if dist < distance_to_path:
                            distance_to_path = dist
                    steps_with_memories += 1

                # Use 0.0 if no memories found (inf means no distance calculated)
                # Clamp to 0.0 to handle floating point precision errors (e.g., -4.89e-08)
                final_distance = 0.0 if distance_to_path == float("inf") else max(0.0, distance_to_path)
                journey_steps.append(
                    JourneyStep(
                        step=step_num,
                        t=t_val,
                        position=interp_vec.tolist(),
                        nearby_memories=neighbors,
                        distance_to_path=final_distance,
                    )
                )

            # Calculate path coverage
            path_coverage = steps_with_memories / len(journey_steps) if journey_steps else 0.0

            return JourneyResult(
                start_id=start_id,
                end_id=end_id,
                steps=journey_steps,
                path_coverage=path_coverage,
            )

        except Exception as e:
            if isinstance(e, (ValidationError, MemoryNotFoundError)):
                raise
            raise JourneyError(f"Failed to compute journey: {e}") from e

    def wander(
        self,
        start_id: str,
        steps: int | None = None,
        temperature: float | None = None,
        namespace: str | None = None,
    ) -> WanderResult:
        """Perform a random walk through memory space.

        Temperature controls randomness:
        - 0.0 = Always pick the most similar (greedy)
        - 0.5 = Balanced exploration
        - 1.0 = Highly random selection

        Args:
            start_id: Starting memory UUID.
            steps: Number of steps to wander (default from config).
            temperature: Randomness factor 0.0-1.0 (default from config).
            namespace: Optional namespace filter.

        Returns:
            WanderResult with path taken.

        Raises:
            ValidationError: If input validation fails.
            MemoryNotFoundError: If start memory not found.
            WanderError: If walk cannot continue.
        """
        # Validate inputs
        start_id = validate_uuid(start_id)
        if namespace is not None:
            namespace = validate_namespace(namespace)

        # Get parameters
        actual_steps = steps if steps is not None else self._config.wander_default_steps
        if actual_steps < 1:
            raise ValidationError("Wander requires at least 1 step")
        if actual_steps > self._config.wander_max_steps:
            raise ValidationError(
                f"Maximum wander steps is {self._config.wander_max_steps}"
            )

        actual_temp = (
            temperature
            if temperature is not None
            else self._config.wander_default_temperature
        )
        if not 0.0 <= actual_temp <= 1.0:
            raise ValidationError("Temperature must be between 0.0 and 1.0")

        # Verify start memory exists
        start_result = self._repo.get_with_vector(start_id)
        if start_result is None:
            raise MemoryNotFoundError(start_id)
        current_memory, current_vector = start_result

        try:
            wander_steps: list[WanderStep] = []
            visited_ids: set[str] = {start_id}
            recent_ids: list[str] = [start_id]
            total_distance = 0.0
            prev_vector = current_vector

            for step_num in range(actual_steps):
                # Find candidates from current position
                # Include vectors to avoid follow-up get_with_vector queries
                neighbors = self._repo.search(
                    current_vector,
                    limit=self._config.wander_candidates_per_step + len(visited_ids),
                    namespace=namespace,
                    include_vector=True,
                )

                # Filter out recently visited
                candidates = [
                    n
                    for n in neighbors
                    if n.id not in recent_ids[-self._config.wander_avoid_recent :]
                ]

                if not candidates:
                    # No unvisited candidates - allow revisiting older memories
                    candidates = [n for n in neighbors if n.id not in visited_ids]

                if not candidates:
                    logger.warning(
                        f"Wander ended early at step {step_num}: no candidates"
                    )
                    break

                # Select next memory based on temperature
                next_memory, selection_prob = self._temperature_select(
                    candidates, actual_temp
                )

                # Get vector from search result (included via include_vector=True)
                if next_memory.vector is not None:
                    next_vector = np.array(next_memory.vector, dtype=np.float32)
                else:
                    # Fallback if vector not included (shouldn't happen)
                    next_result = self._repo.get_with_vector(next_memory.id)
                    if next_result is None:
                        logger.warning(f"Memory {next_memory.id} disappeared during wander")
                        break
                    _, next_vector = next_result

                step_distance = self._cosine_distance(prev_vector, next_vector)
                total_distance += step_distance

                wander_steps.append(
                    WanderStep(
                        step=step_num,
                        memory=next_memory,
                        similarity_to_previous=next_memory.similarity,
                        selection_probability=selection_prob,
                    )
                )

                visited_ids.add(next_memory.id)
                recent_ids.append(next_memory.id)
                current_vector = next_vector
                prev_vector = next_vector

            return WanderResult(
                start_id=start_id,
                steps=wander_steps,
                total_distance=total_distance,
            )

        except Exception as e:
            if isinstance(e, (ValidationError, MemoryNotFoundError)):
                raise
            raise WanderError(f"Wander failed: {e}") from e

    def regions(
        self,
        namespace: str | None = None,
        min_cluster_size: int | None = None,
        max_clusters: int | None = None,
    ) -> RegionsResult:
        """Discover memory regions using HDBSCAN clustering.

        HDBSCAN automatically determines the number of clusters and
        identifies outliers (noise points).

        Args:
            namespace: Optional namespace filter.
            min_cluster_size: Minimum points per cluster (default from config).
            max_clusters: Maximum clusters to return (None = all).

        Returns:
            RegionsResult with discovered clusters.

        Raises:
            ValidationError: If input validation fails.
            ClusteringError: If clustering fails or HDBSCAN unavailable.
            InsufficientMemoriesError: If not enough memories for clustering.
        """
        if not HDBSCAN_AVAILABLE:
            raise ClusteringError(
                "HDBSCAN is not available. Install with: pip install hdbscan"
            )

        # Validate inputs
        if namespace is not None:
            namespace = validate_namespace(namespace)

        actual_min_size = (
            min_cluster_size
            if min_cluster_size is not None
            else self._config.regions_min_cluster_size
        )
        if actual_min_size < 2:
            raise ValidationError("Minimum cluster size must be at least 2")

        try:
            # Fetch all vectors for clustering
            all_memories = self._repo.get_all(
                namespace=namespace, limit=self._config.regions_max_memories
            )

            if len(all_memories) < actual_min_size:
                raise InsufficientMemoriesError(
                    required=actual_min_size,
                    available=len(all_memories),
                    operation="regions",
                )

            # Extract IDs and vectors
            memory_map = {m.id: (m, v) for m, v in all_memories}
            memory_ids = list(memory_map.keys())
            vectors = np.array([v for _, v in all_memories], dtype=np.float32)

            # Run HDBSCAN clustering
            clusterer = hdbscan.HDBSCAN(
                min_cluster_size=actual_min_size,
                metric="euclidean",  # Works well with normalized vectors
                cluster_selection_method="eom",  # Excess of Mass
            )
            labels = clusterer.fit_predict(vectors)

            # Process clusters
            clusters: list[RegionCluster] = []
            unique_labels = set(labels)

            # Remove noise label (-1) for cluster processing
            cluster_labels = [label for label in unique_labels if label >= 0]

            for cluster_id in cluster_labels:
                # Get indices of memories in this cluster
                cluster_indices = [
                    i for i, lbl in enumerate(labels) if lbl == cluster_id
                ]
                cluster_vectors = vectors[cluster_indices]
                cluster_ids = [memory_ids[i] for i in cluster_indices]

                # Find centroid and closest memory to centroid
                centroid = cluster_vectors.mean(axis=0)
                distances_to_centroid = np.linalg.norm(
                    cluster_vectors - centroid, axis=1
                )
                centroid_idx = int(np.argmin(distances_to_centroid))
                centroid_memory_id = cluster_ids[centroid_idx]

                # Calculate coherence (inverse of average intra-cluster distance)
                avg_dist = float(distances_to_centroid.mean())
                max_possible_dist = 2.0  # Max distance for normalized vectors
                coherence = max(0.0, min(1.0, 1.0 - (avg_dist / max_possible_dist)))

                # Get representative and sample memories
                rep_memory, _ = memory_map[centroid_memory_id]
                rep_result = self._memory_to_result(rep_memory, 1.0)

                sample_results: list[MemoryResult] = []
                for sid in cluster_ids[:5]:
                    mem, _ = memory_map[sid]
                    # Calculate similarity to centroid for the sample
                    mem_vec = memory_map[sid][1]
                    sim = 1.0 - self._cosine_distance(centroid, mem_vec)
                    sample_results.append(self._memory_to_result(mem, sim))

                # Extract keywords from sample content
                sample_contents = [m.content for m in sample_results]
                keywords = self._extract_keywords(" ".join(sample_contents), n=5)

                clusters.append(
                    RegionCluster(
                        cluster_id=cluster_id,
                        size=len(cluster_ids),
                        representative_memory=rep_result,
                        sample_memories=sample_results[:3],
                        coherence=coherence,
                        keywords=keywords,
                    )
                )

            # Sort by size (largest first)
            clusters.sort(key=lambda c: c.size, reverse=True)

            # Limit clusters if requested
            if max_clusters is not None and len(clusters) > max_clusters:
                clusters = clusters[:max_clusters]

            # Count noise points
            noise_count = sum(1 for lbl in labels if lbl == -1)

            # Calculate silhouette score if possible
            clustering_quality = 0.0
            if len(cluster_labels) >= 2:
                try:
                    from sklearn.metrics import silhouette_score
                    # Filter out noise points for silhouette calculation
                    mask = labels >= 0
                    if mask.sum() >= 2:
                        clustering_quality = float(
                            silhouette_score(vectors[mask], labels[mask])
                        )
                except ImportError:
                    pass  # sklearn not available, skip quality calculation

            return RegionsResult(
                clusters=clusters,
                noise_count=noise_count,
                total_memories=len(memory_ids),
                clustering_quality=clustering_quality,
            )

        except (ValidationError, InsufficientMemoriesError, ClusteringError):
            raise
        except Exception as e:
            raise ClusteringError(f"Clustering failed: {e}") from e

    def visualize(
        self,
        memory_ids: list[str] | None = None,
        namespace: str | None = None,
        format: Literal["json", "mermaid", "svg"] = "json",
        dimensions: Literal[2, 3] = 2,
        include_edges: bool = True,
    ) -> VisualizationResult:
        """Generate a visualization of memory space using UMAP projection.

        Args:
            memory_ids: Specific memories to visualize (None = auto-select).
            namespace: Namespace filter when auto-selecting.
            format: Output format (json, mermaid, or svg).
            dimensions: Number of dimensions (2 or 3).
            include_edges: Include similarity edges between nodes.

        Returns:
            VisualizationResult with visualization data and formatted output.

        Raises:
            ValidationError: If input validation fails.
            VisualizationError: If visualization fails or UMAP unavailable.
            InsufficientMemoriesError: If not enough memories.
        """
        if not UMAP_AVAILABLE:
            raise VisualizationError(
                "UMAP is not available. Install with: pip install umap-learn"
            )

        # Validate inputs
        if namespace is not None:
            namespace = validate_namespace(namespace)

        if memory_ids is not None:
            memory_ids = [validate_uuid(mid) for mid in memory_ids]

        if dimensions not in (2, 3):
            raise ValidationError("Dimensions must be 2 or 3")

        try:
            # Get memories to visualize
            if memory_ids:
                memories_with_vectors: list[tuple[Any, np.ndarray]] = []
                for mid in memory_ids[: self._config.visualize_max_memories]:
                    result = self._repo.get_with_vector(mid)
                    if result:
                        memories_with_vectors.append(result)
            else:
                memories_with_vectors = self._repo.get_all(
                    namespace=namespace, limit=self._config.visualize_max_memories
                )

            if len(memories_with_vectors) < 5:
                raise InsufficientMemoriesError(
                    required=5,
                    available=len(memories_with_vectors),
                    operation="visualize",
                )

            # Extract vectors
            vectors = np.array(
                [v for _, v in memories_with_vectors], dtype=np.float32
            )

            # Run UMAP projection
            n_neighbors = min(
                self._config.visualize_n_neighbors, len(vectors) - 1
            )
            reducer = umap.UMAP(
                n_components=dimensions,
                n_neighbors=n_neighbors,
                min_dist=self._config.visualize_min_dist,
                metric="cosine",
                random_state=42,  # Reproducibility
            )
            embedding = reducer.fit_transform(vectors)

            # Optionally run clustering for coloring
            cluster_labels = [-1] * len(memories_with_vectors)

            if HDBSCAN_AVAILABLE and len(memories_with_vectors) >= 10:
                try:
                    clusterer = hdbscan.HDBSCAN(
                        min_cluster_size=3,
                        metric="euclidean",
                    )
                    cluster_labels = clusterer.fit_predict(vectors).tolist()
                except Exception as e:
                    logger.debug(f"Clustering for visualization failed: {e}")

            # Build visualization nodes
            nodes: list[VisualizationNode] = []
            for i, (memory, _) in enumerate(memories_with_vectors):
                # Create short label from content
                content = memory.content
                label = content[:50] + "..." if len(content) > 50 else content
                label = label.replace("\n", " ")

                nodes.append(
                    VisualizationNode(
                        id=memory.id,
                        x=float(embedding[i, 0]),
                        y=float(embedding[i, 1]) if dimensions >= 2 else 0.0,
                        label=label,
                        cluster=cluster_labels[i],
                        importance=memory.importance,
                        highlighted=False,
                    )
                )

            # Build edges if requested
            edges: list[VisualizationEdge] = []
            if include_edges:
                # Calculate pairwise similarities using vectorized operations
                similarity_matrix = self._compute_pairwise_similarities(vectors)
                threshold = self._config.visualize_similarity_threshold

                # Extract upper triangle indices where similarity >= threshold
                # (upper triangle avoids duplicate edges)
                upper_tri_indices = np.triu_indices(len(vectors), k=1)
                similarities = similarity_matrix[upper_tri_indices]

                # Filter by threshold and create edges
                mask = similarities >= threshold
                for idx in np.where(mask)[0]:
                    i, j = upper_tri_indices[0][idx], upper_tri_indices[1][idx]
                    edges.append(
                        VisualizationEdge(
                            from_id=nodes[i].id,
                            to_id=nodes[j].id,
                            weight=float(similarities[idx]),
                        )
                    )

            # Calculate bounds
            x_coords = [n.x for n in nodes]
            y_coords = [n.y for n in nodes]
            bounds = {
                "x_min": min(x_coords),
                "x_max": max(x_coords),
                "y_min": min(y_coords),
                "y_max": max(y_coords),
            }

            # Format output
            output = self._format_output(nodes, edges, format)

            return VisualizationResult(
                nodes=nodes,
                edges=edges,
                bounds=bounds,
                format=format,
                output=output,
            )

        except (ValidationError, InsufficientMemoriesError, VisualizationError):
            raise
        except Exception as e:
            raise VisualizationError(f"Visualization failed: {e}") from e

    # =========================================================================
    # Helper Methods
    # =========================================================================

    def _memory_to_result(self, memory: Any, similarity: float) -> MemoryResult:
        """Convert a Memory object to a MemoryResult.

        Args:
            memory: Memory object.
            similarity: Similarity score.

        Returns:
            MemoryResult object.
        """
        return MemoryResult(
            id=memory.id,
            content=memory.content,
            similarity=max(0.0, min(1.0, similarity)),
            namespace=memory.namespace,
            tags=memory.tags,
            importance=memory.importance,
            created_at=memory.created_at,
            metadata=memory.metadata,
        )

    def _slerp_interpolate(
        self,
        start_vec: np.ndarray,
        end_vec: np.ndarray,
        num_steps: int,
    ) -> tuple[list[np.ndarray], list[float]]:
        """Spherical Linear Interpolation between two vectors.

        SLERP maintains constant angular velocity along the geodesic path
        between two points on a hypersphere, making it ideal for semantic
        interpolation in embedding space.

        Args:
            start_vec: Starting vector.
            end_vec: Ending vector.
            num_steps: Number of interpolation points.

        Returns:
            Tuple of (interpolated vectors, t values).
        """
        # Normalize vectors
        start_norm = start_vec / (np.linalg.norm(start_vec) + 1e-10)
        end_norm = end_vec / (np.linalg.norm(end_vec) + 1e-10)

        # Calculate angle between vectors
        dot = np.clip(np.dot(start_norm, end_norm), -1.0, 1.0)
        omega = np.arccos(dot)

        t_values = list(np.linspace(0, 1, num_steps))

        # Handle nearly parallel vectors (use linear interpolation)
        if omega < 1e-6:
            linear_interp = [
                start_vec + t * (end_vec - start_vec)
                for t in t_values
            ]
            return linear_interp, t_values

        sin_omega = np.sin(omega)

        interpolated: list[np.ndarray] = []
        for t in t_values:
            coef_start = np.sin((1 - t) * omega) / sin_omega
            coef_end = np.sin(t * omega) / sin_omega
            vec = coef_start * start_norm + coef_end * end_norm
            interpolated.append(vec)

        return interpolated, t_values

    def _batch_vector_search(
        self,
        vectors: list[np.ndarray],
        limit_per_query: int,
        namespace: str | None,
        include_vector: bool = False,
    ) -> list[list[MemoryResult]]:
        """Perform batch vector search using repository's native batch capability.

        Uses the repository's batch_vector_search for efficient multi-query
        searches in a single database operation.

        Args:
            vectors: List of query vectors.
            limit_per_query: Results per query.
            namespace: Optional namespace filter.
            include_vector: Whether to include embedding vectors in results.
                Defaults to False to reduce response size.

        Returns:
            List of result lists. If include_vector=True, each MemoryResult
            includes its embedding vector.
        """
        # Use native batch search for efficiency
        raw_results = self._repo.batch_vector_search(
            query_vectors=vectors,
            limit_per_query=limit_per_query,
            namespace=namespace,
            include_vector=include_vector,
        )

        # Convert raw dict results to MemoryResult objects
        results: list[list[MemoryResult]] = []
        for query_results in raw_results:
            memory_results: list[MemoryResult] = []
            for record in query_results:
                memory_result = MemoryResult(
                    id=record["id"],
                    content=record["content"],
                    similarity=record.get("similarity", 0.0),
                    namespace=record.get("namespace", "default"),
                    tags=record.get("tags", []),
                    importance=record.get("importance", 0.5),
                    created_at=record.get("created_at"),
                    metadata=record.get("metadata", {}),
                    vector=record.get("vector") if include_vector else None,
                )
                memory_results.append(memory_result)
            results.append(memory_results)
        return results

    def _get_vector_for_memory(self, memory_id: str) -> np.ndarray:
        """Get the vector for a memory.

        Args:
            memory_id: Memory UUID.

        Returns:
            The memory's vector.
        """
        result = self._repo.get_with_vector(memory_id)
        if result is None:
            # Return zero vector if memory not found (shouldn't happen in practice)
            return np.zeros(self._embeddings.dimensions, dtype=np.float32)
        _, vector = result
        return vector

    def _cosine_distance(self, vec1: np.ndarray, vec2: np.ndarray) -> float:
        """Calculate cosine distance between two vectors.

        Args:
            vec1: First vector.
            vec2: Second vector.

        Returns:
            Cosine distance (0 = identical, 2 = opposite).
        """
        norm1 = np.linalg.norm(vec1)
        norm2 = np.linalg.norm(vec2)
        if norm1 < 1e-10 or norm2 < 1e-10:
            return 1.0  # Maximum distance for zero vectors

        similarity = np.dot(vec1, vec2) / (norm1 * norm2)
        return float(1.0 - similarity)

    def _compute_pairwise_similarities(self, vectors: np.ndarray) -> np.ndarray:
        """Compute pairwise cosine similarities using vectorized operations.

        Uses scipy.cdist if available for optimal performance, otherwise
        falls back to numpy matrix operations.

        Args:
            vectors: 2D array of shape (n_vectors, embedding_dim).

        Returns:
            Symmetric similarity matrix of shape (n_vectors, n_vectors).
            Values range from -1 (opposite) to 1 (identical).
        """
        # Normalize vectors to unit length
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        # Avoid division by zero for zero vectors
        norms = np.where(norms < 1e-10, 1.0, norms)
        normalized = vectors / norms

        if SCIPY_AVAILABLE:
            # scipy.cdist with cosine metric returns distances (1 - similarity)
            distances = cdist(normalized, normalized, metric="cosine")
            similarities = 1.0 - distances
        else:
            # Fallback: use numpy dot product (A @ A.T for normalized vectors)
            similarities = normalized @ normalized.T

        return similarities

    def _temperature_select(
        self,
        candidates: list[MemoryResult],
        temperature: float,
    ) -> tuple[MemoryResult, float]:
        """Select a candidate using temperature-based sampling.

        Args:
            candidates: List of candidate memories with similarity scores.
            temperature: Randomness factor (0 = greedy, 1 = uniform random).

        Returns:
            Tuple of (selected memory, selection probability).
        """
        if not candidates:
            raise WanderError("No candidates for temperature selection")

        if temperature == 0.0:
            # Greedy: pick highest similarity
            return max(candidates, key=lambda c: c.similarity), 1.0

        if temperature >= 1.0:
            # Random: uniform selection
            prob = 1.0 / len(candidates)
            return random.choice(candidates), prob

        # Temperature-based softmax selection
        similarities = np.array([c.similarity for c in candidates])

        # Scale by inverse temperature (lower temp = sharper distribution)
        scaled = similarities / (temperature + 1e-10)
        scaled = scaled - scaled.max()  # Numerical stability
        exp_scaled = np.exp(scaled)
        probs = exp_scaled / exp_scaled.sum()

        # Sample according to probabilities
        idx = np.random.choice(len(candidates), p=probs)
        return candidates[idx], float(probs[idx])

    def _extract_keywords(self, text: str, n: int = 5) -> list[str]:
        """Extract top keywords from text using simple frequency analysis.

        Args:
            text: Text to analyze.
            n: Number of keywords to extract.

        Returns:
            List of top keywords.
        """
        # Simple keyword extraction using word frequency
        # Tokenize and filter using module-level stop words
        words = re.findall(r"\b[a-zA-Z]+\b", text.lower())
        filtered = [w for w in words if w not in _STOP_WORDS and len(w) > 2]

        # Count frequencies
        counter = Counter(filtered)
        return [word for word, _ in counter.most_common(n)]

    def _format_output(
        self,
        nodes: list[VisualizationNode],
        edges: list[VisualizationEdge],
        format: Literal["json", "mermaid", "svg"],
    ) -> str:
        """Format visualization data for output.

        Args:
            nodes: Visualization nodes.
            edges: Visualization edges.
            format: Output format.

        Returns:
            Formatted string output.
        """
        if format == "json":
            return json.dumps(
                {
                    "nodes": [
                        {
                            "id": n.id,
                            "x": n.x,
                            "y": n.y,
                            "label": n.label,
                            "cluster": n.cluster,
                            "importance": n.importance,
                        }
                        for n in nodes
                    ],
                    "edges": [
                        {
                            "from": e.from_id,
                            "to": e.to_id,
                            "weight": e.weight,
                        }
                        for e in edges
                    ],
                },
                indent=2,
            )

        elif format == "mermaid":
            lines = ["graph LR"]

            # Add nodes with short IDs
            node_aliases = {n.id: f"N{i}" for i, n in enumerate(nodes)}
            for node in nodes:
                alias = node_aliases[node.id]
                # Escape special characters in label
                safe_label = node.label.replace('"', "'").replace("\n", " ")[:30]
                lines.append(f'    {alias}["{safe_label}"]')

            # Add edges
            for edge in edges:
                from_alias = node_aliases.get(edge.from_id)
                to_alias = node_aliases.get(edge.to_id)
                if from_alias and to_alias:
                    lines.append(f"    {from_alias} --> {to_alias}")

            return "\n".join(lines)

        elif format == "svg":
            return self._generate_svg(nodes, edges)

        else:
            raise ValidationError(f"Unknown format: {format}")

    def _generate_svg(
        self,
        nodes: list[VisualizationNode],
        edges: list[VisualizationEdge],
    ) -> str:
        """Generate SVG visualization.

        Args:
            nodes: Visualization nodes.
            edges: Visualization edges.

        Returns:
            SVG string.
        """
        width, height = 800, 600
        padding = 50

        # Calculate scale to fit nodes
        x_coords = [n.x for n in nodes]
        y_coords = [n.y for n in nodes]
        x_min, x_max = min(x_coords), max(x_coords)
        y_min, y_max = min(y_coords), max(y_coords)

        x_range = x_max - x_min if x_max != x_min else 1
        y_range = y_max - y_min if y_max != y_min else 1

        def scale_x(x: float) -> float:
            return padding + (x - x_min) / x_range * (width - 2 * padding)

        def scale_y(y: float) -> float:
            return padding + (y - y_min) / y_range * (height - 2 * padding)

        svg_lines = [
            f'<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 {width} {height}">',
            "  <style>",
            "    .node { cursor: pointer; }",
            "    .node circle { stroke: #333; stroke-width: 1; }",
            "    .node text { font-size: 10px; fill: #333; }",
            "    .edge { stroke: #ccc; stroke-width: 1; opacity: 0.5; }",
            "  </style>",
        ]

        # Draw edges
        for edge in edges:
            from_node = next((n for n in nodes if n.id == edge.from_id), None)
            to_node = next((n for n in nodes if n.id == edge.to_id), None)
            if from_node and to_node:
                x1, y1 = scale_x(from_node.x), scale_y(from_node.y)
                x2, y2 = scale_x(to_node.x), scale_y(to_node.y)
                svg_lines.append(
                    f'  <line class="edge" x1="{x1:.1f}" y1="{y1:.1f}" '
                    f'x2="{x2:.1f}" y2="{y2:.1f}" />'
                )

        # Draw nodes
        for node in nodes:
            x, y = scale_x(node.x), scale_y(node.y)
            radius = 5 + node.importance * 5  # Scale by importance
            if node.cluster >= 0:
                color = CLUSTER_COLORS[node.cluster % len(CLUSTER_COLORS)]
            else:
                color = "#999"

            svg_lines.append('  <g class="node">')
            svg_lines.append(
                f'    <circle cx="{x:.1f}" cy="{y:.1f}" r="{radius:.1f}" '
                f'fill="{color}" />'
            )
            # Add truncated label
            short_label = node.label[:20] + "..." if len(node.label) > 20 else node.label
            # Escape XML special characters
            short_label = (
                short_label.replace("&", "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;")
            )
            svg_lines.append(
                f'    <text x="{x:.1f}" y="{y + radius + 12:.1f}" '
                f'text-anchor="middle">{short_label}</text>'
            )
            svg_lines.append("  </g>")

        svg_lines.append("</svg>")
        return "\n".join(svg_lines)