spatial-memory-mcp 1.9.1 (spatial_memory_mcp-1.9.1-py3-none-any.whl)

This diff shows the content of publicly released package versions as published to their public registries; it is provided for informational purposes only.
Files changed (55)
  1. spatial_memory/__init__.py +97 -0
  2. spatial_memory/__main__.py +271 -0
  3. spatial_memory/adapters/__init__.py +7 -0
  4. spatial_memory/adapters/lancedb_repository.py +880 -0
  5. spatial_memory/config.py +769 -0
  6. spatial_memory/core/__init__.py +118 -0
  7. spatial_memory/core/cache.py +317 -0
  8. spatial_memory/core/circuit_breaker.py +297 -0
  9. spatial_memory/core/connection_pool.py +220 -0
  10. spatial_memory/core/consolidation_strategies.py +401 -0
  11. spatial_memory/core/database.py +3072 -0
  12. spatial_memory/core/db_idempotency.py +242 -0
  13. spatial_memory/core/db_indexes.py +576 -0
  14. spatial_memory/core/db_migrations.py +588 -0
  15. spatial_memory/core/db_search.py +512 -0
  16. spatial_memory/core/db_versioning.py +178 -0
  17. spatial_memory/core/embeddings.py +558 -0
  18. spatial_memory/core/errors.py +317 -0
  19. spatial_memory/core/file_security.py +701 -0
  20. spatial_memory/core/filesystem.py +178 -0
  21. spatial_memory/core/health.py +289 -0
  22. spatial_memory/core/helpers.py +79 -0
  23. spatial_memory/core/import_security.py +433 -0
  24. spatial_memory/core/lifecycle_ops.py +1067 -0
  25. spatial_memory/core/logging.py +194 -0
  26. spatial_memory/core/metrics.py +192 -0
  27. spatial_memory/core/models.py +660 -0
  28. spatial_memory/core/rate_limiter.py +326 -0
  29. spatial_memory/core/response_types.py +500 -0
  30. spatial_memory/core/security.py +588 -0
  31. spatial_memory/core/spatial_ops.py +430 -0
  32. spatial_memory/core/tracing.py +300 -0
  33. spatial_memory/core/utils.py +110 -0
  34. spatial_memory/core/validation.py +406 -0
  35. spatial_memory/factory.py +444 -0
  36. spatial_memory/migrations/__init__.py +40 -0
  37. spatial_memory/ports/__init__.py +11 -0
  38. spatial_memory/ports/repositories.py +630 -0
  39. spatial_memory/py.typed +0 -0
  40. spatial_memory/server.py +1214 -0
  41. spatial_memory/services/__init__.py +70 -0
  42. spatial_memory/services/decay_manager.py +411 -0
  43. spatial_memory/services/export_import.py +1031 -0
  44. spatial_memory/services/lifecycle.py +1139 -0
  45. spatial_memory/services/memory.py +412 -0
  46. spatial_memory/services/spatial.py +1152 -0
  47. spatial_memory/services/utility.py +429 -0
  48. spatial_memory/tools/__init__.py +5 -0
  49. spatial_memory/tools/definitions.py +695 -0
  50. spatial_memory/verify.py +140 -0
  51. spatial_memory_mcp-1.9.1.dist-info/METADATA +509 -0
  52. spatial_memory_mcp-1.9.1.dist-info/RECORD +55 -0
  53. spatial_memory_mcp-1.9.1.dist-info/WHEEL +4 -0
  54. spatial_memory_mcp-1.9.1.dist-info/entry_points.txt +2 -0
  55. spatial_memory_mcp-1.9.1.dist-info/licenses/LICENSE +21 -0
spatial_memory/core/lifecycle_ops.py
@@ -0,0 +1,1067 @@
1
+ """Core algorithms for memory lifecycle operations.
2
+
3
+ This module contains the pure algorithmic implementations for:
4
+ - Decay: Time/access-based importance reduction
5
+ - Reinforcement: Importance boosting
6
+ - Extraction: Pattern-based memory extraction from text
7
+ - Consolidation: Duplicate detection and merging
8
+
9
+ These functions are pure computations with no I/O dependencies,
10
+ enabling easy testing and reuse across different contexts.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import logging
16
+ import math
17
+ import re
18
+ from dataclasses import dataclass
19
+ from typing import Any, Literal
20
+
21
+ import numpy as np
22
+ from numpy.typing import NDArray
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+ # Type alias for vectors
27
+ Vector = NDArray[np.float32]
28
+
29
+
30
+ # =============================================================================
31
+ # Decay Algorithms
32
+ # =============================================================================
33
+
34
+
35
+ def calculate_decay_factor(
36
+ days_since_access: float,
37
+ access_count: int,
38
+ base_importance: float,
39
+ decay_function: Literal["exponential", "linear", "step"],
40
+ half_life_days: float,
41
+ access_weight: float,
42
+ ) -> float:
43
+ """
44
+ Calculate decay factor for a memory based on time and access patterns.
45
+
46
+ Implements a modified half-life regression algorithm inspired by the Ebbinghaus
47
+ forgetting curve, Duolingo HLR, and FSRS. The decay factor represents how
48
+ much the importance should be multiplied by (1.0 = no decay, 0.0 = full decay).
49
+
50
+ The effective half-life is adaptive:
51
+ - More accesses = longer half-life (slower decay)
52
+ - Higher importance = longer half-life (slower decay)
53
+
54
+ Args:
55
+ days_since_access: Days since the memory was last accessed.
56
+ Negative values are treated as 0.
57
+ access_count: Number of times the memory has been accessed.
58
+ Capped at 20 for half-life calculation to prevent overflow.
59
+ base_importance: Current importance of the memory (0-1).
60
+ decay_function: Type of decay curve.
61
+ - "exponential": Smooth decay following 2^(-t/half_life)
62
+ - "linear": Constant rate decay reaching 0 at 2*half_life
63
+ - "step": Discrete thresholds at half_life intervals
64
+ half_life_days: Base half-life in days for exponential decay.
65
+ access_weight: Weight of access count in decay calculation (0-1).
66
+ Higher values make access count more influential vs time.
67
+
68
+ Returns:
69
+ Decay factor (0.0 - 1.0) to multiply with importance.
70
+
71
+ Example:
72
+ >>> factor = calculate_decay_factor(
73
+ ... days_since_access=30,
74
+ ... access_count=5,
75
+ ... base_importance=0.7,
76
+ ... decay_function="exponential",
77
+ ... half_life_days=30,
78
+ ... access_weight=0.3,
79
+ ... )
80
+ >>> 0.0 <= factor <= 1.0
81
+ True
82
+ """
83
+ # Clamp inputs to valid ranges
84
+ days_since_access = max(0.0, days_since_access)
85
+ access_count = max(0, access_count)
86
+ base_importance = max(0.0, min(1.0, base_importance))
87
+ access_weight = max(0.0, min(1.0, access_weight))
88
+ half_life_days = max(1.0, half_life_days)
89
+
90
+ # Adaptive half-life: more accesses = longer half-life (slower decay)
91
+ # Each access adds 50% to half-life, capped at 20 accesses
92
+ access_bonus = 0.5
93
+ access_factor = (1 + access_bonus) ** min(access_count, 20)
94
+
95
+ # Higher base importance also slows decay
96
+ importance_factor = 1 + base_importance
97
+
98
+ # Calculate effective half-life
99
+ effective_half_life = half_life_days * access_factor * importance_factor
100
+
101
+ # Calculate time-based decay
102
+ if decay_function == "exponential":
103
+ # Exponential decay: importance halves every half_life days
104
+ time_decay = 2.0 ** (-days_since_access / effective_half_life)
105
+ elif decay_function == "linear":
106
+ # Linear decay: reaches zero at 2x half_life
107
+ time_decay = max(0.0, 1.0 - days_since_access / (2 * effective_half_life))
108
+ elif decay_function == "step":
109
+ # Step function: discrete drops at half_life intervals
110
+ if days_since_access < effective_half_life:
111
+ time_decay = 1.0
112
+ elif days_since_access < 2 * effective_half_life:
113
+ time_decay = 0.5
114
+ else:
115
+ time_decay = 0.25
116
+ else:
117
+ # Default to exponential for unknown functions
118
+ logger.warning("Unknown decay function '%s', using exponential", decay_function)
119
+ time_decay = 2.0 ** (-days_since_access / effective_half_life)
120
+
121
+ # Calculate access-based stability (memories accessed more are more stable)
122
+ # log1p(x)/log(100) normalizes access count to ~1.0 at 99 accesses
123
+ if access_count > 0:
124
+ access_stability = min(1.0, math.log1p(access_count) / math.log(100))
125
+ else:
126
+ access_stability = 0.0
127
+
128
+ # Blend time decay with access-based stability
129
+ # access_weight controls the balance (0 = pure time decay, 1 = pure access stability)
130
+ decay_factor = (1 - access_weight) * time_decay + access_weight * access_stability
131
+
132
+ return float(max(0.0, min(1.0, decay_factor)))
133
+
134
+
135
+ def apply_decay(
136
+ current_importance: float,
137
+ decay_factor: float,
138
+ min_importance: float,
139
+ ) -> float:
140
+ """
141
+ Apply decay to importance with a minimum floor.
142
+
143
+ Applies the calculated decay factor to the current importance score,
144
+ ensuring the result never falls below the specified minimum. This
145
+ prevents memories from becoming completely unfindable due to decay.
146
+
147
+ Args:
148
+ current_importance: Current importance score (0-1).
149
+ decay_factor: Decay factor from calculate_decay_factor (0-1).
150
+ min_importance: Minimum importance floor (0-1).
151
+ Memories will not decay below this threshold.
152
+
153
+ Returns:
154
+ New importance score after decay, clamped to [min_importance, 1.0].
155
+
156
+ Example:
157
+ >>> apply_decay(current_importance=0.8, decay_factor=0.5, min_importance=0.1)
158
+ 0.4
159
+ >>> apply_decay(current_importance=0.2, decay_factor=0.3, min_importance=0.1)
160
+ 0.1
161
+ """
162
+ # Clamp inputs
163
+ current_importance = max(0.0, min(1.0, current_importance))
164
+ decay_factor = max(0.0, min(1.0, decay_factor))
165
+ min_importance = max(0.0, min(1.0, min_importance))
166
+
167
+ # Apply decay
168
+ decayed = current_importance * decay_factor
169
+
170
+ # Enforce floor and ceiling
171
+ return max(min_importance, min(1.0, decayed))
172
+
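As a quick illustration of the decay pipeline above, the sketch below chains calculate_decay_factor and apply_decay for a few access patterns. It is a usage sketch only: the scenario values are made up, and the import path assumes the package layout shown in the file list.

```python
# Usage sketch only: toy scenarios, import path assumed from the file list above.
from spatial_memory.core.lifecycle_ops import apply_decay, calculate_decay_factor

scenarios = [
    # (days_since_access, access_count, base_importance)
    (7.0, 0, 0.5),    # recent, never re-accessed
    (30.0, 5, 0.7),   # a month old but frequently used
    (120.0, 1, 0.3),  # stale and rarely touched
]

for days, accesses, importance in scenarios:
    factor = calculate_decay_factor(
        days_since_access=days,
        access_count=accesses,
        base_importance=importance,
        decay_function="exponential",
        half_life_days=30,
        access_weight=0.3,
    )
    new_importance = apply_decay(importance, factor, min_importance=0.1)
    print(f"days={days:5.1f} accesses={accesses} -> factor={factor:.3f}, "
          f"importance {importance} -> {new_importance:.3f}")
```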
173
+
174
+ # =============================================================================
175
+ # Reinforcement Algorithms
176
+ # =============================================================================
177
+
178
+
179
+ def calculate_reinforcement(
180
+ current_importance: float,
181
+ boost_type: Literal["additive", "multiplicative", "set_value"],
182
+ boost_amount: float,
183
+ max_importance: float = 1.0,
184
+ ) -> tuple[float, float]:
185
+ """
186
+ Calculate new importance after reinforcement.
187
+
188
+ Computes the reinforced importance based on the specified boost type.
189
+ This implements the memory strengthening counterpart to decay, allowing
190
+ frequently accessed or explicitly important memories to maintain or
191
+ increase their importance.
192
+
193
+ Args:
194
+ current_importance: Current importance score (0-1).
195
+ boost_type: Type of boost to apply.
196
+ - "additive": new = current + boost_amount
197
+ - "multiplicative": new = current * (1 + boost_amount)
198
+ - "set_value": new = boost_amount (direct override)
199
+ boost_amount: Amount of boost to apply.
200
+ For additive/multiplicative: the increment/factor.
201
+ For set_value: the target importance.
202
+ max_importance: Maximum allowed importance (default 1.0).
203
+ Results are capped to this value.
204
+
205
+ Returns:
206
+ Tuple of (new_importance, actual_boost) where:
207
+ - new_importance: The resulting importance after reinforcement
208
+ - actual_boost: The actual change applied (may be less than requested
209
+ if capped by max_importance)
210
+
211
+ Example:
212
+ >>> calculate_reinforcement(0.5, "additive", 0.1)
213
+ (0.6, 0.1)
214
+ >>> calculate_reinforcement(0.5, "multiplicative", 0.2)
215
+ (0.6, 0.1)
216
+ >>> calculate_reinforcement(0.5, "set_value", 0.8)
217
+ (0.8, 0.3)
218
+ >>> calculate_reinforcement(0.9, "additive", 0.2) # Capped at 1.0
219
+ (1.0, 0.1)
220
+ """
221
+ # Clamp inputs
222
+ current_importance = max(0.0, min(1.0, current_importance))
223
+ boost_amount = max(0.0, boost_amount)
224
+ max_importance = max(0.0, min(1.0, max_importance))
225
+
226
+ # Calculate new importance based on boost type
227
+ if boost_type == "additive":
228
+ new_importance = current_importance + boost_amount
229
+ elif boost_type == "multiplicative":
230
+ new_importance = current_importance * (1.0 + boost_amount)
231
+ elif boost_type == "set_value":
232
+ new_importance = boost_amount
233
+ else:
234
+ # Unknown boost type, return unchanged
235
+ logger.warning("Unknown boost type '%s', returning unchanged", boost_type)
236
+ return current_importance, 0.0
237
+
238
+ # Cap at maximum
239
+ new_importance = min(max_importance, max(0.0, new_importance))
240
+ actual_boost = new_importance - current_importance
241
+
242
+ return new_importance, actual_boost
243
+
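A minimal sketch of the three boost types using the function above; the tuples in the comments are approximate because of floating-point rounding.

```python
# Illustrative calls; results shown in comments are approximate.
from spatial_memory.core.lifecycle_ops import calculate_reinforcement

print(calculate_reinforcement(0.5, "additive", 0.1))        # ~(0.6, 0.1)
print(calculate_reinforcement(0.5, "multiplicative", 0.2))  # ~(0.6, 0.1)
print(calculate_reinforcement(0.5, "set_value", 0.8))       # ~(0.8, 0.3)
print(calculate_reinforcement(0.9, "additive", 0.2))        # capped at max_importance=1.0
```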
244
+
245
+ # =============================================================================
246
+ # Extraction Algorithms
247
+ # =============================================================================
248
+
249
+
250
+ @dataclass
251
+ class ExtractionCandidate:
252
+ """
253
+ A candidate memory extracted from text.
254
+
255
+ Attributes:
256
+ content: The extracted text content.
257
+ confidence: Confidence score (0-1) that this is a valid memory.
258
+ pattern_type: Type of pattern that matched (e.g., "decision", "solution").
259
+ start_pos: Start position in the original text.
260
+ end_pos: End position in the original text.
261
+ """
262
+
263
+ content: str
264
+ confidence: float
265
+ pattern_type: str
266
+ start_pos: int
267
+ end_pos: int
268
+
269
+
270
+ # Default extraction patterns: (regex_pattern, base_confidence, pattern_type)
271
+ # These patterns identify memory-worthy content in conversation text
272
+ EXTRACTION_PATTERNS: list[tuple[str, float, str]] = [
273
+ # Decisions
274
+ (
275
+ r"(?:decided|chose|going with|selected|will use)\s+(.+?)(?:\.|$)",
276
+ 0.8,
277
+ "decision",
278
+ ),
279
+ # Facts/Definitions
280
+ (
281
+ r"(.+?)\s+(?:is|are|means|refers to)\s+(.+?)(?:\.|$)",
282
+ 0.6,
283
+ "definition",
284
+ ),
285
+ # Important points
286
+ (
287
+ r"(?:important|note|remember|key point)[:\s]+(.+?)(?:\.|$)",
288
+ 0.9,
289
+ "important",
290
+ ),
291
+ # Solutions/Fixes
292
+ (
293
+ r"(?:the (?:fix|solution|approach) (?:is|was))\s+(.+?)(?:\.|$)",
294
+ 0.85,
295
+ "solution",
296
+ ),
297
+ # Error diagnoses
298
+ (
299
+ r"(?:the (?:issue|problem|bug) was)\s+(.+?)(?:\.|$)",
300
+ 0.8,
301
+ "error",
302
+ ),
303
+ # Explicit save requests
304
+ (
305
+ r"(?:save|remember|note|store)(?:\s+that)?\s+(.+?)(?:\.|$)",
306
+ 0.95,
307
+ "explicit",
308
+ ),
309
+ # Patterns/Learnings
310
+ (
311
+ r"(?:the trick is|the key is|pattern:)\s+(.+?)(?:\.|$)",
312
+ 0.85,
313
+ "pattern",
314
+ ),
315
+ ]
316
+
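Because EXTRACTION_PATTERNS is plain data, a caller could pass an extended list to extract_candidates (defined below). The extra TODO tuple here is a hypothetical example, not a pattern shipped with the package.

```python
# Hypothetical extension of the pattern list; the "todo" tuple is not part of the package.
from spatial_memory.core.lifecycle_ops import EXTRACTION_PATTERNS, extract_candidates

custom_patterns = EXTRACTION_PATTERNS + [
    (r"(?:todo|follow up)[:\s]+(.+?)(?:\.|$)", 0.7, "todo"),
]
candidates = extract_candidates(
    "TODO: migrate the config loader to pydantic.",
    patterns=custom_patterns,
)
print([(c.pattern_type, round(c.confidence, 2)) for c in candidates])
```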
317
+
318
+ def score_extraction_confidence(content: str, base_confidence: float) -> float:
319
+ """
320
+ Adjust extraction confidence based on content quality signals.
321
+
322
+ Analyzes the extracted content for quality indicators that suggest
323
+ higher or lower confidence in the extraction being meaningful.
324
+
325
+ Args:
326
+ content: Extracted text content to analyze.
327
+ base_confidence: Base confidence from the pattern match.
328
+
329
+ Returns:
330
+ Adjusted confidence score clamped to [0.0, 1.0].
331
+
332
+ Quality signals that increase confidence:
333
+ - Longer content (10+ words): +0.1
334
+ - Technical terms present: +0.05
335
+ - Code snippets present: +0.1
336
+ - URL references: +0.05
337
+ - Proper sentence structure: +0.05
338
+
339
+ Quality signals that decrease confidence:
340
+ - Very short content (< 5 words): -0.1
341
+ - All caps (shouting/headers): -0.15
342
+ - Excessive punctuation: -0.1
343
+ """
344
+ score = base_confidence
345
+
346
+ # Word count analysis
347
+ words = content.split()
348
+ word_count = len(words)
349
+
350
+ if word_count >= 10:
351
+ # Longer content is typically more informative
352
+ score += 0.1
353
+ elif word_count < 5:
354
+ # Very short content may be incomplete
355
+ score -= 0.1
356
+
357
+ # Technical terms boost confidence (domain-specific knowledge)
358
+ tech_terms = {
359
+ "api",
360
+ "database",
361
+ "function",
362
+ "class",
363
+ "config",
364
+ "error",
365
+ "server",
366
+ "client",
367
+ "query",
368
+ "model",
369
+ "endpoint",
370
+ "module",
371
+ "package",
372
+ "library",
373
+ "framework",
374
+ "method",
375
+ "variable",
376
+ "parameter",
377
+ "exception",
378
+ "async",
379
+ "schema",
380
+ "interface",
381
+ "type",
382
+ }
383
+ content_lower = content.lower()
384
+ if any(term in content_lower for term in tech_terms):
385
+ score += 0.05
386
+
387
+ # Code presence boosts confidence (concrete implementation details)
388
+ has_code_block = "```" in content
389
+ has_indented_code = bool(re.search(r"^\s{4,}", content, re.MULTILINE))
390
+ has_inline_code = bool(re.search(r"`[^`]+`", content))
391
+
392
+ if has_code_block or has_indented_code:
393
+ score += 0.1
394
+ elif has_inline_code:
395
+ score += 0.05
396
+
397
+ # URL presence (references external knowledge)
398
+ if re.search(r"https?://", content):
399
+ score += 0.05
400
+
401
+ # All caps penalty (likely headers or shouting)
402
+ if content.isupper() and len(content) > 10:
403
+ score -= 0.15
404
+
405
+ # Excessive punctuation penalty
406
+ punct_count = sum(1 for c in content if c in "!?...")
407
+ if word_count > 0 and punct_count > word_count / 2:
408
+ score -= 0.1
409
+
410
+ # Proper sentence structure (starts with capital, ends with punctuation)
411
+ if content and content[0].isupper() and content[-1] in ".!?":
412
+ score += 0.05
413
+
414
+ # Clamp to valid range
415
+ return max(0.0, min(1.0, score))
416
+
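A small before/after sketch of the quality adjustments: a terse fragment is penalized, while a longer sentence with technical terms and proper structure is boosted. The strings and base confidence are arbitrary.

```python
# Arbitrary sample strings; only the relative scores matter here.
from spatial_memory.core.lifecycle_ops import score_extraction_confidence

short_note = "fix later"
detailed_note = (
    "The API client retries the query with exponential backoff "
    "when the server returns a 429 error."
)

print(score_extraction_confidence(short_note, base_confidence=0.8))     # penalized: fewer than 5 words
print(score_extraction_confidence(detailed_note, base_confidence=0.8))  # boosted: length, tech terms, sentence form
```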
417
+
418
+ def extract_candidates(
419
+ text: str,
420
+ patterns: list[tuple[str, float, str]] | None = None,
421
+ min_confidence: float = 0.5,
422
+ max_candidates: int = 20,
423
+ ) -> list[ExtractionCandidate]:
424
+ """
425
+ Extract memory candidates from text using pattern matching.
426
+
427
+ Scans the input text for patterns that indicate memory-worthy content,
428
+ such as decisions, definitions, solutions, and explicit save requests.
429
+ Returns candidates with confidence scores for further filtering.
430
+
431
+ Args:
432
+ text: Text to extract memories from.
433
+ Can be conversation transcript, documentation, notes, etc.
434
+ patterns: List of (regex, base_confidence, pattern_type) tuples.
435
+ If None, uses EXTRACTION_PATTERNS default.
436
+ min_confidence: Minimum confidence threshold (0-1).
437
+ Candidates below this are filtered out.
438
+ max_candidates: Maximum number of candidates to return.
439
+
440
+ Returns:
441
+ List of extraction candidates sorted by confidence (descending).
442
+ Limited to max_candidates entries.
443
+
444
+ Example:
445
+ >>> text = "The fix is to add a null check before accessing the property."
446
+ >>> candidates = extract_candidates(text, min_confidence=0.5)
447
+ >>> len(candidates) >= 1
448
+ True
449
+ """
450
+ if not text or not text.strip():
451
+ return []
452
+
453
+ if patterns is None:
454
+ patterns = EXTRACTION_PATTERNS
455
+
456
+ candidates: list[ExtractionCandidate] = []
457
+
458
+ for pattern, base_confidence, pattern_type in patterns:
459
+ try:
460
+ for match in re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE):
461
+ # Extract content from first capture group or full match
462
+ if match.groups():
463
+ # Join multiple groups if present (for definition patterns)
464
+ groups = [g for g in match.groups() if g]
465
+ content = " ".join(groups).strip()
466
+ else:
467
+ content = match.group(0).strip()
468
+
469
+ # Skip too short or too long
470
+ if len(content) < 10 or len(content) > 5000:
471
+ continue
472
+
473
+ # Adjust confidence based on content quality
474
+ confidence = score_extraction_confidence(content, base_confidence)
475
+
476
+ if confidence >= min_confidence:
477
+ candidates.append(
478
+ ExtractionCandidate(
479
+ content=content,
480
+ confidence=confidence,
481
+ pattern_type=pattern_type,
482
+ start_pos=match.start(),
483
+ end_pos=match.end(),
484
+ )
485
+ )
486
+ except re.error as e:
487
+ logger.warning("Invalid regex pattern '%s': %s", pattern, e)
488
+ continue
489
+
490
+ # Deduplicate overlapping extractions (keep highest confidence)
491
+ candidates = dedupe_overlapping_extractions(candidates)
492
+
493
+ # Sort by confidence and limit
494
+ candidates.sort(key=lambda c: c.confidence, reverse=True)
495
+ return candidates[:max_candidates]
496
+
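Putting the patterns and scoring together, a short invented transcript like the one below should yield a decision, a solution, and an explicit save request.

```python
# Invented transcript; candidates come back sorted by confidence, highest first.
from spatial_memory.core.lifecycle_ops import extract_candidates

transcript = (
    "We decided to use LanceDB for vector storage. "
    "The fix was to normalize embeddings before indexing. "
    "Remember that the decay half-life defaults to 30 days."
)
for cand in extract_candidates(transcript, min_confidence=0.6):
    print(f"{cand.pattern_type:10s} {cand.confidence:.2f}  {cand.content}")
```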
497
+
498
+ def dedupe_overlapping_extractions(
499
+ candidates: list[ExtractionCandidate],
500
+ ) -> list[ExtractionCandidate]:
501
+ """Remove overlapping extractions, keeping highest confidence.
502
+
503
+ Args:
504
+ candidates: List of extraction candidates.
505
+
506
+ Returns:
507
+ Deduplicated list.
508
+ """
509
+ if not candidates:
510
+ return []
511
+
512
+ # Sort by start position, then by confidence (highest first)
513
+ sorted_candidates = sorted(candidates, key=lambda c: (c.start_pos, -c.confidence))
514
+
515
+ result: list[ExtractionCandidate] = []
516
+ last_end = -1
517
+
518
+ for candidate in sorted_candidates:
519
+ # Skip if overlaps with previous kept candidate
520
+ if candidate.start_pos < last_end:
521
+ continue
522
+
523
+ result.append(candidate)
524
+ last_end = candidate.end_pos
525
+
526
+ return result
527
+
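When two patterns capture overlapping spans, only non-overlapping spans survive deduplication; a tiny sketch with hand-built candidates:

```python
# Hand-built candidates with overlapping spans; positions are arbitrary.
from spatial_memory.core.lifecycle_ops import ExtractionCandidate, dedupe_overlapping_extractions

overlapping = [
    ExtractionCandidate("the fix is to pin numpy<2", 0.85, "solution", start_pos=10, end_pos=40),
    ExtractionCandidate("fix is to pin numpy<2", 0.60, "definition", start_pos=14, end_pos=40),
]
kept = dedupe_overlapping_extractions(overlapping)
assert [c.pattern_type for c in kept] == ["solution"]
```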
528
+
529
+ # =============================================================================
530
+ # Consolidation Algorithms
531
+ # =============================================================================
532
+
533
+
534
+ def jaccard_similarity(text1: str, text2: str) -> float:
535
+ """
536
+ Calculate Jaccard similarity between two texts.
537
+
538
+ Computes the ratio of shared words to total unique words. This provides
539
+ a simple lexical similarity measure that complements semantic (vector)
540
+ similarity.
541
+
542
+ Args:
543
+ text1: First text string.
544
+ text2: Second text string.
545
+
546
+ Returns:
547
+ Jaccard similarity coefficient in range [0, 1].
548
+ 0 = no shared words, 1 = identical word sets.
549
+
550
+ Example:
551
+ >>> jaccard_similarity("hello world", "hello there")
552
+ 0.333...
553
+ >>> jaccard_similarity("hello world", "hello world")
554
+ 1.0
555
+ >>> jaccard_similarity("", "hello")
556
+ 0.0
557
+ """
558
+ if not text1 or not text2:
559
+ return 0.0
560
+
561
+ # Tokenize to lowercase words
562
+ words1 = set(re.findall(r"\w+", text1.lower()))
563
+ words2 = set(re.findall(r"\w+", text2.lower()))
564
+
565
+ if not words1 or not words2:
566
+ return 0.0
567
+
568
+ intersection = len(words1 & words2)
569
+ union = len(words1 | words2)
570
+
571
+ return intersection / union if union > 0 else 0.0
572
+
573
+
574
+ def cosine_similarity_vectors(vec1: list[float], vec2: list[float]) -> float:
575
+ """Calculate cosine similarity between two vectors.
576
+
577
+ Args:
578
+ vec1: First vector.
579
+ vec2: Second vector.
580
+
581
+ Returns:
582
+ Cosine similarity (-1 to 1).
583
+ """
584
+ if len(vec1) != len(vec2):
585
+ return 0.0
586
+
587
+ dot_product = sum(a * b for a, b in zip(vec1, vec2))
588
+ norm1 = math.sqrt(sum(a * a for a in vec1))
589
+ norm2 = math.sqrt(sum(b * b for b in vec2))
590
+
591
+ if norm1 < 1e-10 or norm2 < 1e-10:
592
+ return 0.0
593
+
594
+ return dot_product / (norm1 * norm2)
595
+
596
+
597
+ def combined_similarity(
598
+ vector_similarity: float,
599
+ content_overlap: float,
600
+ content_weight: float = 0.3,
601
+ ) -> float:
602
+ """Calculate combined similarity score.
603
+
604
+ Args:
605
+ vector_similarity: Cosine similarity of embeddings.
606
+ content_overlap: Jaccard similarity of content.
607
+ content_weight: Weight for content similarity (0-1).
608
+
609
+ Returns:
610
+ Combined similarity score (0-1).
611
+ """
612
+ return (1 - content_weight) * vector_similarity + content_weight * content_overlap
613
+
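The consolidation pass blends the two signals above. This sketch combines a lexical score and a vector score for a pair of toy inputs; the embeddings are made-up 3-D vectors.

```python
# Toy 3-D "embeddings"; real vectors would come from the embedding backend.
from spatial_memory.core.lifecycle_ops import (
    combined_similarity,
    cosine_similarity_vectors,
    jaccard_similarity,
)

a = "use exponential decay for importance"
b = "importance uses exponential decay"
vec_a = [0.90, 0.10, 0.00]
vec_b = [0.85, 0.20, 0.05]

v_sim = cosine_similarity_vectors(vec_a, vec_b)
c_sim = jaccard_similarity(a, b)
print(combined_similarity(v_sim, c_sim, content_weight=0.3))
```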
614
+
615
+ @dataclass
616
+ class ConsolidationGroup:
617
+ """
618
+ A group of similar memories to consolidate.
619
+
620
+ Attributes:
621
+ member_indices: Indices of memories in this group.
622
+ representative_idx: Index of the representative memory.
623
+ avg_similarity: Average pairwise similarity within the group.
624
+ """
625
+
626
+ member_indices: list[int]
627
+ representative_idx: int
628
+ avg_similarity: float
629
+
630
+
631
+ def find_duplicate_groups(
632
+ memory_ids: list[str],
633
+ vectors: NDArray[np.float32],
634
+ contents: list[str],
635
+ threshold: float,
636
+ content_weight: float = 0.3,
637
+ ) -> list[list[int]]:
638
+ """
639
+ Find groups of duplicate memories using Union-Find algorithm.
640
+
641
+ Identifies clusters of similar memories based on a combination of
642
+ vector similarity (semantic) and content overlap (lexical). Uses
643
+ Union-Find for efficient grouping of transitively similar memories.
644
+
645
+ This is an efficient numpy-based implementation that computes all
646
+ pairwise similarities in batch using matrix operations.
647
+
648
+ Args:
649
+ memory_ids: List of memory IDs (for logging/debugging).
650
+ vectors: 2D array of shape (n_memories, embedding_dim).
651
+ Should be normalized vectors for cosine similarity.
652
+ contents: List of memory content strings.
653
+ threshold: Minimum combined similarity to consider duplicates (0-1).
654
+ Higher values = stricter matching.
655
+ content_weight: Weight of content (Jaccard) similarity vs vector.
656
+ 0.0 = pure vector similarity
657
+ 1.0 = pure content similarity
658
+ Default 0.3 balances both.
659
+
660
+ Returns:
661
+ List of duplicate groups, where each group is a list of indices
662
+ into the original arrays. Only groups with 2+ members are returned.
663
+
664
+ Example:
665
+ >>> ids = ["a", "b", "c"]
666
+ >>> vectors = np.array([[1, 0], [0.99, 0.1], [0, 1]], dtype=np.float32)
667
+ >>> contents = ["hello world", "hello world!", "goodbye"]
668
+ >>> groups = find_duplicate_groups(ids, vectors, contents, threshold=0.8)
669
+ >>> len(groups) >= 1 # First two should group together
670
+ True
671
+ """
672
+ n = len(memory_ids)
673
+
674
+ if n == 0:
675
+ return []
676
+
677
+ if n < 2:
678
+ return []
679
+
680
+ if n != vectors.shape[0] or n != len(contents):
681
+ raise ValueError(
682
+ f"Mismatched lengths: memory_ids={n}, vectors={vectors.shape[0]}, "
683
+ f"contents={len(contents)}"
684
+ )
685
+
686
+ # Union-Find data structure
687
+ parent = list(range(n))
688
+ rank = [0] * n
689
+
690
+ def find(x: int) -> int:
691
+ """Find with path compression."""
692
+ if parent[x] != x:
693
+ parent[x] = find(parent[x])
694
+ return parent[x]
695
+
696
+ def union(x: int, y: int) -> None:
697
+ """Union by rank."""
698
+ px, py = find(x), find(y)
699
+ if px != py:
700
+ if rank[px] < rank[py]:
701
+ px, py = py, px
702
+ parent[py] = px
703
+ if rank[px] == rank[py]:
704
+ rank[px] += 1
705
+
706
+ # Calculate pairwise vector similarities using cosine similarity
707
+ # For normalized vectors: cosine_sim = dot product
708
+ # Normalize vectors first
709
+ norms = np.linalg.norm(vectors, axis=1, keepdims=True)
710
+ norms = np.maximum(norms, 1e-10) # Avoid division by zero
711
+ normalized_vectors = vectors / norms
712
+
713
+ # Compute cosine similarity matrix efficiently
714
+ vector_sim = np.dot(normalized_vectors, normalized_vectors.T)
715
+
716
+ # Compare all pairs
717
+ for i in range(n):
718
+ for j in range(i + 1, n):
719
+ # Vector similarity (already computed)
720
+ v_sim = float(vector_sim[i, j])
721
+
722
+ # Content similarity (Jaccard)
723
+ c_sim = jaccard_similarity(contents[i], contents[j])
724
+
725
+ # Combined score
726
+ combined = (1 - content_weight) * v_sim + content_weight * c_sim
727
+
728
+ if combined >= threshold:
729
+ union(i, j)
730
+
731
+ # Group by root
732
+ groups: dict[int, list[int]] = {}
733
+ for i in range(n):
734
+ root = find(i)
735
+ if root not in groups:
736
+ groups[root] = []
737
+ groups[root].append(i)
738
+
739
+ # Return only groups with duplicates (2+ members)
740
+ return [g for g in groups.values() if len(g) > 1]
741
+
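An end-to-end sketch with three toy memories and 2-D embeddings; the first two are near-duplicates and should land in one group at a 0.8 threshold.

```python
# Toy 2-D embeddings stand in for real embedding vectors.
import numpy as np

from spatial_memory.core.lifecycle_ops import find_duplicate_groups

ids = ["m1", "m2", "m3"]
vectors = np.array([[1.0, 0.0], [0.98, 0.05], [0.0, 1.0]], dtype=np.float32)
contents = ["retry on HTTP 429", "retry on http 429 errors", "rotate the API key monthly"]

groups = find_duplicate_groups(ids, vectors, contents, threshold=0.8, content_weight=0.3)
print(groups)  # expected: [[0, 1]]
```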
742
+
743
+ def find_duplicate_groups_with_callbacks(
744
+ memory_count: int,
745
+ get_vector_similarity: Any, # callable[[int, int], float]
746
+ get_content_similarity: Any, # callable[[int, int], float]
747
+ threshold: float,
748
+ content_weight: float = 0.3,
749
+ ) -> list[ConsolidationGroup]:
750
+ """
751
+ Find groups of duplicate memories using Union-Find with callback functions.
752
+
753
+ This is an alternative implementation that uses callback functions instead
754
+ of precomputed arrays. Useful when vectors/contents are lazily loaded.
755
+
756
+ Args:
757
+ memory_count: Number of memories.
758
+ get_vector_similarity: Function(i, j) -> float for vector similarity.
759
+ get_content_similarity: Function(i, j) -> float for content similarity.
760
+ threshold: Minimum combined similarity for grouping.
761
+ content_weight: Weight for content similarity.
762
+
763
+ Returns:
764
+ List of consolidation groups with indices.
765
+ """
766
+ if memory_count < 2:
767
+ return []
768
+
769
+ # Union-Find data structure
770
+ parent = list(range(memory_count))
771
+ rank = [0] * memory_count
772
+
773
+ def find(x: int) -> int:
774
+ if parent[x] != x:
775
+ parent[x] = find(parent[x]) # Path compression
776
+ return parent[x]
777
+
778
+ def union(x: int, y: int) -> None:
779
+ px, py = find(x), find(y)
780
+ if px != py:
781
+ # Union by rank
782
+ if rank[px] < rank[py]:
783
+ px, py = py, px
784
+ parent[py] = px
785
+ if rank[px] == rank[py]:
786
+ rank[px] += 1
787
+
788
+ # Track similarities for average calculation
789
+ similarities: dict[tuple[int, int], float] = {}
790
+
791
+ # Compare all pairs
792
+ for i in range(memory_count):
793
+ for j in range(i + 1, memory_count):
794
+ v_sim = get_vector_similarity(i, j)
795
+ c_sim = get_content_similarity(i, j)
796
+ combined = combined_similarity(v_sim, c_sim, content_weight)
797
+
798
+ if combined >= threshold:
799
+ union(i, j)
800
+ similarities[(i, j)] = combined
801
+
802
+ # Group by root
803
+ groups_dict: dict[int, list[int]] = {}
804
+ for i in range(memory_count):
805
+ root = find(i)
806
+ groups_dict.setdefault(root, []).append(i)
807
+
808
+ # Convert to ConsolidationGroup objects (only groups with 2+ members)
809
+ result: list[ConsolidationGroup] = []
810
+ for members in groups_dict.values():
811
+ if len(members) < 2:
812
+ continue
813
+
814
+ # Calculate average similarity
815
+ total_sim = 0.0
816
+ pair_count = 0
817
+ for idx_i in range(len(members)):
818
+ for idx_j in range(idx_i + 1, len(members)):
819
+ mi, mj = members[idx_i], members[idx_j]
820
+ key = (min(mi, mj), max(mi, mj))
821
+ if key in similarities:
822
+ total_sim += similarities[key]
823
+ pair_count += 1
824
+
825
+ avg_sim = total_sim / pair_count if pair_count > 0 else threshold
826
+
827
+ result.append(
828
+ ConsolidationGroup(
829
+ member_indices=members,
830
+ representative_idx=members[0], # Will be updated by caller
831
+ avg_similarity=avg_sim,
832
+ )
833
+ )
834
+
835
+ return result
836
+
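The callback variant accepts similarity functions instead of precomputed arrays, so vectors and contents can stay wherever they already live. A sketch backed by two small in-memory lists:

```python
# Small in-memory lists; in practice the callbacks could hit a cache or the database.
from spatial_memory.core.lifecycle_ops import (
    cosine_similarity_vectors,
    find_duplicate_groups_with_callbacks,
    jaccard_similarity,
)

vectors = [[1.0, 0.0], [0.97, 0.10], [0.0, 1.0]]
contents = ["cache invalidation bug", "bug in cache invalidation", "release checklist"]

groups = find_duplicate_groups_with_callbacks(
    memory_count=len(contents),
    get_vector_similarity=lambda i, j: cosine_similarity_vectors(vectors[i], vectors[j]),
    get_content_similarity=lambda i, j: jaccard_similarity(contents[i], contents[j]),
    threshold=0.8,
)
for group in groups:
    print(group.member_indices, round(group.avg_similarity, 3))
```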
837
+
838
+ def select_representative(
839
+ members: list[dict[str, Any]],
840
+ strategy: Literal[
841
+ "keep_newest", "keep_oldest", "keep_highest_importance", "merge_content"
842
+ ],
843
+ ) -> int:
844
+ """
845
+ Select the representative memory index based on strategy.
846
+
847
+ Determines which memory should be kept as the canonical version
848
+ when merging a group of similar memories.
849
+
850
+ Args:
851
+ members: List of memory dictionaries with 'created_at', 'importance' keys.
852
+ strategy: Selection strategy.
853
+ - "keep_newest": Most recently created memory
854
+ - "keep_oldest": Oldest memory (canonical/original)
855
+ - "keep_highest_importance": Most important memory
856
+ - "merge_content": Longest content (most comprehensive)
857
+
858
+ Returns:
859
+ Index of the representative memory within the list.
860
+ """
861
+ if not members:
862
+ return 0
863
+
864
+ if strategy == "keep_newest":
865
+ return max(range(len(members)), key=lambda i: members[i].get("created_at", 0))
866
+ elif strategy == "keep_oldest":
867
+ return min(
868
+ range(len(members)), key=lambda i: members[i].get("created_at", float("inf"))
869
+ )
870
+ elif strategy == "keep_highest_importance":
871
+ return max(range(len(members)), key=lambda i: members[i].get("importance", 0))
872
+ elif strategy == "merge_content":
873
+ # For merge, pick the longest content as base
874
+ return max(range(len(members)), key=lambda i: len(members[i].get("content", "")))
875
+ else:
876
+ return 0
877
+
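A short sketch of strategy selection over a two-member group; simplified epoch-second timestamps stand in for real timestamps here.

```python
# Simplified records: epoch-second timestamps are assumed for brevity.
from spatial_memory.core.lifecycle_ops import select_representative

members = [
    {"content": "short note", "created_at": 1_710_000_000, "importance": 0.4},
    {"content": "longer, more detailed note about the same topic",
     "created_at": 1_700_000_000, "importance": 0.9},
]
print(select_representative(members, "keep_newest"))              # 0 (latest created_at)
print(select_representative(members, "keep_highest_importance"))  # 1
print(select_representative(members, "merge_content"))            # 1 (longest content)
```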
878
+
879
+ def merge_memory_content(contents: list[str], separator: str = "\n\n---\n\n") -> str:
880
+ """
881
+ Merge multiple memory contents into one.
882
+
883
+ Combines content from multiple memories, removing duplicates while
884
+ preserving the order of first occurrence.
885
+
886
+ Args:
887
+ contents: List of content strings to merge.
888
+ separator: Separator between merged contents.
889
+
890
+ Returns:
891
+ Merged content string with duplicates removed.
892
+ """
893
+ # Remove duplicates while preserving order
894
+ seen: set[str] = set()
895
+ unique_contents: list[str] = []
896
+ for content in contents:
897
+ normalized = content.strip()
898
+ if normalized and normalized not in seen:
899
+ seen.add(normalized)
900
+ unique_contents.append(content.strip())
901
+
902
+ return separator.join(unique_contents)
903
+
904
+
905
+ def merge_memory_metadata(memories: list[dict[str, Any]]) -> dict[str, Any]:
906
+ """
907
+ Merge metadata from multiple memories.
908
+
909
+ Consolidates metadata intelligently:
910
+ - created_at: earliest (preserve provenance)
911
+ - last_accessed: latest (preserve recency)
912
+ - access_count: sum (preserve total usage)
913
+ - importance: max (preserve significance)
914
+ - tags: union (preserve all categorization)
915
+ - metadata: merged with special 'consolidated_from' field
916
+
917
+ Args:
918
+ memories: List of memory dictionaries.
919
+
920
+ Returns:
921
+ Merged metadata dictionary.
922
+ """
923
+ if not memories:
924
+ return {}
925
+
926
+ created_dates: list[Any] = [
927
+ m.get("created_at") for m in memories if m.get("created_at") is not None
928
+ ]
929
+ accessed_dates: list[Any] = [
930
+ m.get("last_accessed") for m in memories if m.get("last_accessed") is not None
931
+ ]
932
+
933
+ result: dict[str, Any] = {
934
+ "created_at": min(created_dates) if created_dates else None,
935
+ "last_accessed": max(accessed_dates) if accessed_dates else None,
936
+ "access_count": sum(m.get("access_count", 0) for m in memories),
937
+ "importance": max((m.get("importance", 0) for m in memories), default=0.5),
938
+ "tags": list(set(tag for m in memories for tag in m.get("tags", []))),
939
+ "source": "consolidated",
940
+ "metadata": {
941
+ "consolidated_from": [m.get("id") for m in memories if m.get("id")],
942
+ },
943
+ }
944
+
945
+ return result
946
+
947
+
948
+ def merge_memories(
949
+ memories: list[dict[str, Any]],
950
+ vectors: list[NDArray[np.float32]],
951
+ strategy: Literal["keep_newest", "keep_oldest", "keep_highest_importance", "merge_content"],
952
+ ) -> tuple[dict[str, Any], NDArray[np.float32]]:
953
+ """
954
+ Merge multiple memories into one according to strategy.
955
+
956
+ Combines similar memories using the specified strategy for content
957
+ selection, while intelligently merging metadata like timestamps,
958
+ access counts, and tags.
959
+
960
+ Args:
961
+ memories: List of memory dictionaries to merge.
962
+ Each must have: content, created_at, last_accessed,
963
+ access_count, importance, tags, id.
964
+ vectors: List of embedding vectors corresponding to memories.
965
+ strategy: How to select/merge content.
966
+ - "keep_newest": Use content from most recently created memory
967
+ - "keep_oldest": Use content from oldest memory (canonical)
968
+ - "keep_highest_importance": Use content from most important memory
969
+ - "merge_content": Combine all content with separators
970
+
971
+ Returns:
972
+ Tuple of (merged_memory_dict, merged_vector) where:
973
+ - merged_memory_dict contains the merged content and metadata
974
+ - merged_vector is the weighted average of input vectors (normalized)
975
+
976
+ Raises:
977
+ ValueError: If memories list is empty or lengths mismatch.
978
+
979
+ Example:
980
+ >>> memories = [
981
+ ... {"content": "A", "created_at": dt1, "importance": 0.5, ...},
982
+ ... {"content": "B", "created_at": dt2, "importance": 0.8, ...},
983
+ ... ]
984
+ >>> vectors = [v1, v2]
985
+ >>> merged, vec = merge_memories(memories, vectors, "keep_highest_importance")
986
+ >>> merged["content"] # "B" (higher importance)
987
+ """
988
+ if not memories:
989
+ raise ValueError("Cannot merge empty list of memories")
990
+
991
+ if len(memories) != len(vectors):
992
+ raise ValueError(
993
+ f"Mismatched lengths: memories={len(memories)}, vectors={len(vectors)}"
994
+ )
995
+
996
+ # Select primary memory based on strategy
997
+ if strategy == "keep_newest":
998
+ primary = max(memories, key=lambda m: m["created_at"])
999
+ content = primary["content"]
1000
+ elif strategy == "keep_oldest":
1001
+ primary = min(memories, key=lambda m: m["created_at"])
1002
+ content = primary["content"]
1003
+ elif strategy == "keep_highest_importance":
1004
+ primary = max(memories, key=lambda m: m.get("importance", 0.5))
1005
+ content = primary["content"]
1006
+ elif strategy == "merge_content":
1007
+ # Combine all content with separator
1008
+ contents = [m["content"] for m in memories]
1009
+ content = merge_memory_content(contents)
1010
+ else:
1011
+ # Default to keeping highest importance
1012
+ logger.warning("Unknown strategy '%s', using keep_highest_importance", strategy)
1013
+ primary = max(memories, key=lambda m: m.get("importance", 0.5))
1014
+ content = primary["content"]
1015
+
1016
+ # Merge metadata from all memories
1017
+ merged = merge_memory_metadata(memories)
1018
+ merged["content"] = content
1019
+
1020
+ # Calculate merged vector as weighted average by content length
1021
+ vectors_array = np.array(vectors, dtype=np.float32)
1022
+ weights = np.array([len(m["content"]) for m in memories], dtype=np.float32)
1023
+ total_weight = weights.sum()
1024
+
1025
+ if total_weight > 0:
1026
+ weights = weights / total_weight
1027
+ merged_vector = np.sum(vectors_array * weights[:, np.newaxis], axis=0)
1028
+ else:
1029
+ # Fallback to simple average
1030
+ merged_vector = np.mean(vectors_array, axis=0)
1031
+
1032
+ # Normalize the merged vector
1033
+ norm = np.linalg.norm(merged_vector)
1034
+ if norm > 1e-10:
1035
+ merged_vector = merged_vector / norm
1036
+
1037
+ return merged, merged_vector.astype(np.float32)
1038
+
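Tying the selection, content, metadata, and vector steps together, this sketch merges two invented near-duplicate memories with toy 2-D vectors; the field names follow the docstring's expectations and also exercise merge_memory_metadata.

```python
# Invented memories with toy 2-D vectors; field names follow the docstring above.
from datetime import datetime, timedelta, timezone

import numpy as np

from spatial_memory.core.lifecycle_ops import merge_memories

now = datetime.now(timezone.utc)
memories = [
    {
        "id": "m1",
        "content": "Use exponential decay with a 30 day half-life.",
        "created_at": now - timedelta(days=10),
        "last_accessed": now - timedelta(days=1),
        "access_count": 3,
        "importance": 0.6,
        "tags": ["decay"],
    },
    {
        "id": "m2",
        "content": "Importance decays exponentially; the half-life defaults to 30 days.",
        "created_at": now - timedelta(days=2),
        "last_accessed": now,
        "access_count": 1,
        "importance": 0.8,
        "tags": ["decay", "defaults"],
    },
]
vectors = [
    np.array([0.90, 0.10], dtype=np.float32),
    np.array([0.88, 0.15], dtype=np.float32),
]

merged, merged_vec = merge_memories(memories, vectors, "keep_highest_importance")
print(merged["content"])           # content of m2 (higher importance)
print(merged["access_count"])      # 4 (summed across members)
print(merged["metadata"])          # {'consolidated_from': ['m1', 'm2']}
print(np.linalg.norm(merged_vec))  # ~1.0 after normalization
```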
1039
+
1040
+ def cosine_similarity_matrix(vectors: NDArray[np.float32]) -> NDArray[np.float32]:
1041
+ """
1042
+ Compute pairwise cosine similarity matrix for a set of vectors.
1043
+
1044
+ Efficient batch computation of cosine similarity between all pairs
1045
+ of vectors using matrix multiplication.
1046
+
1047
+ Args:
1048
+ vectors: 2D array of shape (n_vectors, embedding_dim).
1049
+
1050
+ Returns:
1051
+ 2D array of shape (n_vectors, n_vectors) containing pairwise
1052
+ cosine similarities. Values range from -1 to 1.
1053
+
1054
+ Example:
1055
+ >>> vectors = np.array([[1, 0], [0, 1], [1, 1]], dtype=np.float32)
1056
+ >>> sim = cosine_similarity_matrix(vectors)
1057
+ >>> abs(sim[0, 0] - 1.0) < 0.001 # Self-similarity = 1
1058
+ True
1059
+ """
1060
+ # Normalize vectors
1061
+ norms = np.linalg.norm(vectors, axis=1, keepdims=True)
1062
+ norms = np.maximum(norms, 1e-10)
1063
+ normalized = vectors / norms
1064
+
1065
+ # Cosine similarity = dot product of normalized vectors
1066
+ similarity: NDArray[np.float32] = np.dot(normalized, normalized.T).astype(np.float32)
1067
+ return similarity
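For completeness, a quick check of the pairwise matrix on three toy vectors.

```python
# Toy vectors; the diagonal should be ~1.0 and sim[0, 2] == sim[1, 2] ~ 0.707.
import numpy as np

from spatial_memory.core.lifecycle_ops import cosine_similarity_matrix

vecs = np.array([[1, 0], [0, 1], [1, 1]], dtype=np.float32)
print(cosine_similarity_matrix(vecs).round(3))
```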