spatial-memory-mcp 1.5.3__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of spatial-memory-mcp might be problematic.
- spatial_memory/__init__.py +1 -1
- spatial_memory/__main__.py +241 -2
- spatial_memory/adapters/lancedb_repository.py +74 -5
- spatial_memory/config.py +10 -2
- spatial_memory/core/__init__.py +9 -0
- spatial_memory/core/connection_pool.py +41 -3
- spatial_memory/core/consolidation_strategies.py +402 -0
- spatial_memory/core/database.py +774 -918
- spatial_memory/core/db_idempotency.py +242 -0
- spatial_memory/core/db_indexes.py +575 -0
- spatial_memory/core/db_migrations.py +584 -0
- spatial_memory/core/db_search.py +509 -0
- spatial_memory/core/db_versioning.py +177 -0
- spatial_memory/core/embeddings.py +65 -18
- spatial_memory/core/errors.py +75 -3
- spatial_memory/core/filesystem.py +178 -0
- spatial_memory/core/models.py +4 -0
- spatial_memory/core/rate_limiter.py +26 -9
- spatial_memory/core/response_types.py +497 -0
- spatial_memory/core/validation.py +86 -2
- spatial_memory/factory.py +407 -0
- spatial_memory/migrations/__init__.py +40 -0
- spatial_memory/ports/repositories.py +52 -2
- spatial_memory/server.py +131 -189
- spatial_memory/services/export_import.py +61 -43
- spatial_memory/services/lifecycle.py +397 -122
- spatial_memory/services/memory.py +2 -2
- spatial_memory/services/spatial.py +129 -46
- {spatial_memory_mcp-1.5.3.dist-info → spatial_memory_mcp-1.6.0.dist-info}/METADATA +83 -3
- spatial_memory_mcp-1.6.0.dist-info/RECORD +54 -0
- spatial_memory_mcp-1.5.3.dist-info/RECORD +0 -44
- {spatial_memory_mcp-1.5.3.dist-info → spatial_memory_mcp-1.6.0.dist-info}/WHEEL +0 -0
- {spatial_memory_mcp-1.5.3.dist-info → spatial_memory_mcp-1.6.0.dist-info}/entry_points.txt +0 -0
- {spatial_memory_mcp-1.5.3.dist-info → spatial_memory_mcp-1.6.0.dist-info}/licenses/LICENSE +0 -0

spatial_memory/services/lifecycle.py
@@ -16,9 +16,10 @@ from __future__ import annotations
 
 import logging
 from dataclasses import dataclass
-from datetime import datetime, timezone
 from typing import TYPE_CHECKING, Any, Literal
 
+import numpy as np
+
 from spatial_memory.core.errors import (
     ConsolidationError,
     DecayError,
@@ -26,6 +27,10 @@ from spatial_memory.core.errors import (
     ReinforcementError,
     ValidationError,
 )
+from spatial_memory.core.consolidation_strategies import (
+    ConsolidationAction,
+    get_strategy,
+)
 from spatial_memory.core.lifecycle_ops import (
     apply_decay,
     calculate_decay_factor,
@@ -34,9 +39,6 @@ from spatial_memory.core.lifecycle_ops import (
     extract_candidates,
     find_duplicate_groups,
     jaccard_similarity,
-    merge_memory_content,
-    merge_memory_metadata,
-    select_representative,
 )
 from spatial_memory.core.models import (
     ConsolidateResult,
@@ -53,7 +55,7 @@ from spatial_memory.core.models import (
 
 # Alias for backward compatibility
 ConsolidationGroupResult = ConsolidationGroup
-from spatial_memory.core.utils import to_naive_utc, utc_now_naive
+from spatial_memory.core.utils import to_naive_utc, utc_now, utc_now_naive
 from spatial_memory.core.validation import validate_namespace
 
 logger = logging.getLogger(__name__)
@@ -110,6 +112,7 @@ class LifecycleConfig:
     consolidate_min_threshold: float = 0.7
     consolidate_content_weight: float = 0.3
     consolidate_max_batch: int = 1000
+    consolidate_chunk_size: int = 200  # Process in smaller chunks for memory efficiency
 
 
 # =============================================================================
@@ -271,16 +274,31 @@ class LifecycleService:
             total_decay_factor / len(all_memories) if all_memories else 1.0
         )
 
-        # Apply updates if not dry run
+        # Apply updates if not dry run - use batch update for efficiency
         failed_updates: list[str] = []
        if not dry_run and memories_to_update:
             logger.info(f"Applying decay to {len(memories_to_update)} memories")
-
-
-
-
-
-
+            # Convert to batch update format: list of (memory_id, updates_dict)
+            batch_updates = [
+                (memory_id, {"importance": new_importance})
+                for memory_id, new_importance in memories_to_update
+            ]
+            try:
+                success_count, failed_ids = self._repo.update_batch(batch_updates)
+                failed_updates = failed_ids
+                logger.debug(
+                    f"Batch decay update: {success_count} succeeded, "
+                    f"{len(failed_ids)} failed"
+                )
+            except Exception as e:
+                logger.warning(f"Batch decay update failed: {e}")
+                # Fall back to individual updates on batch failure
+                for memory_id, new_importance in memories_to_update:
+                    try:
+                        self._repo.update(memory_id, {"importance": new_importance})
+                    except Exception as update_err:
+                        logger.warning(f"Failed to update {memory_id}: {update_err}")
+                        failed_updates.append(memory_id)
 
         return DecayResult(
             memories_analyzed=len(all_memories),
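For readers skimming the hunk above: the decay path now assumes the repository exposes an `update_batch` method taking `(memory_id, updates_dict)` pairs and returning `(success_count, failed_ids)`, with per-item `update` calls as a fallback. The following is only a minimal sketch of that calling pattern, using a hypothetical in-memory repository in place of the LanceDB-backed one:

```python
from typing import Any


class FakeRepo:
    """Hypothetical stand-in for the real repository; only the call shapes matter."""

    def __init__(self) -> None:
        self.rows: dict[str, dict[str, Any]] = {"a": {"importance": 0.9}, "b": {"importance": 0.5}}

    def update_batch(self, updates: list[tuple[str, dict[str, Any]]]) -> tuple[int, list[str]]:
        ok, failed = 0, []
        for memory_id, fields in updates:
            if memory_id in self.rows:
                self.rows[memory_id].update(fields)
                ok += 1
            else:
                failed.append(memory_id)
        return ok, failed

    def update(self, memory_id: str, fields: dict[str, Any]) -> None:
        self.rows[memory_id].update(fields)  # raises KeyError if the id is missing


def apply_decay_updates(repo: FakeRepo, decayed: list[tuple[str, float]]) -> list[str]:
    """Mirror of the batch-then-fallback pattern used in the decay hunk above."""
    batch = [(mid, {"importance": imp}) for mid, imp in decayed]
    try:
        _, failed = repo.update_batch(batch)
        return failed
    except Exception:
        failed = []
        for mid, imp in decayed:
            try:
                repo.update(mid, {"importance": imp})
            except Exception:
                failed.append(mid)
        return failed


print(apply_decay_updates(FakeRepo(), [("a", 0.81), ("missing", 0.1)]))  # -> ['missing']
```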
@@ -345,14 +363,30 @@ class LifecycleService:
         failed_updates: list[str] = []
         total_boost = 0.0
 
-
-
+        # Batch fetch all memories in a single query
+        memory_map = self._repo.get_batch(memory_ids)
 
-
+        # Track which memories were not found
+        for memory_id in memory_ids:
+            if memory_id not in memory_map:
                 not_found.append(memory_id)
                 logger.warning(f"Memory not found for reinforcement: {memory_id}")
-                continue
 
+        if not memory_map:
+            return ReinforceResult(
+                memories_reinforced=0,
+                avg_boost=0.0,
+                reinforced=[],
+                not_found=not_found,
+                failed_updates=[],
+            )
+
+        # Calculate reinforcement for all found memories
+        now = utc_now()
+        batch_updates: list[tuple[str, dict[str, Any]]] = []
+        reinforcement_info: list[tuple[str, Memory, float, float]] = []  # id, memory, new_imp, boost
+
+        for memory_id, memory in memory_map.items():
             # Calculate new importance
             new_importance, actual_boost = calculate_reinforcement(
                 current_importance=memory.importance,
@@ -364,12 +398,36 @@ class LifecycleService:
             # Prepare update
             updates: dict[str, Any] = {"importance": new_importance}
             if update_access:
-                updates["last_accessed"] =
+                updates["last_accessed"] = now
                 updates["access_count"] = memory.access_count + 1
 
-
-
-
+            batch_updates.append((memory_id, updates))
+            reinforcement_info.append((memory_id, memory, new_importance, actual_boost))
+
+        # Apply all updates in a single batch operation
+        try:
+            success_count, batch_failed_ids = self._repo.update_batch(batch_updates)
+            failed_updates = batch_failed_ids
+            logger.debug(
+                f"Batch reinforce update: {success_count} succeeded, "
+                f"{len(batch_failed_ids)} failed"
+            )
+        except Exception as e:
+            logger.warning(f"Batch reinforce update failed: {e}, falling back to individual updates")
+            # Fall back to individual updates on batch failure
+            batch_failed_ids = []
+            for memory_id, updates in batch_updates:
+                try:
+                    self._repo.update(memory_id, updates)
+                except Exception as update_err:
+                    logger.warning(f"Failed to reinforce {memory_id}: {update_err}")
+                    batch_failed_ids.append(memory_id)
+            failed_updates = batch_failed_ids
+
+        # Build reinforced results for successful updates
+        failed_set = set(failed_updates)
+        for memory_id, memory, new_importance, actual_boost in reinforcement_info:
+            if memory_id not in failed_set:
                 reinforced.append(
                     ReinforcedMemory(
                         id=memory_id,
@@ -382,9 +440,6 @@ class LifecycleService:
                     )
                 )
                 total_boost += actual_boost
-            except Exception as e:
-                logger.warning(f"Failed to reinforce {memory_id}: {e}")
-                failed_updates.append(memory_id)
 
         avg_boost = total_boost / len(reinforced) if reinforced else 0.0
 
@@ -462,11 +517,17 @@ class LifecycleService:
                 extractions=[],
             )
 
+        # Generate embeddings for all candidates in a single batch
+        # This is much more efficient than generating one at a time
+        candidate_texts = [c.content for c in candidates]
+        candidate_vectors = self._embeddings.embed_batch(candidate_texts)
+        logger.debug(f"Generated {len(candidate_vectors)} embeddings in batch")
+
         extractions: list[ExtractedMemory] = []
         memories_created = 0
         deduplicated_count = 0
 
-        for candidate in candidates:
+        for candidate, vector in zip(candidates, candidate_vectors):
             extraction = ExtractedMemory(
                 content=candidate.content,
                 confidence=candidate.confidence,
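The extraction path above replaces per-candidate embedding calls with a single `embed_batch` call and then zips the vectors back onto the candidates. A toy sketch of that pairing, assuming only that `embed_batch` returns one vector per input text in order (the embeddings port itself is not shown in this diff):

```python
import numpy as np


def embed_batch(texts: list[str]) -> list[np.ndarray]:
    # Stand-in embedder: deterministic toy vectors, one per text, order-preserving.
    return [np.full(3, float(len(t)), dtype=np.float32) for t in texts]


candidates = ["prefer pytest over unittest", "the API key lives in .env"]
vectors = embed_batch(candidates)

# One pass over (candidate, vector) pairs, exactly like the new extraction loop.
for text, vec in zip(candidates, vectors):
    print(text, vec.shape)
```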
@@ -477,22 +538,24 @@ class LifecycleService:
                 memory_id=None,
             )
 
-            # Check for duplicates if requested
+            # Check for duplicates if requested (use pre-computed vector)
             if deduplicate:
-                is_duplicate = self.
-                    candidate.content,
-
-
+                is_duplicate = self._check_duplicate_with_vector(
+                    content=candidate.content,
+                    vector=vector,
+                    namespace=effective_namespace,
+                    threshold=dedup_threshold,
                 )
                 if is_duplicate:
                     deduplicated_count += 1
                     extractions.append(extraction)
                     continue
 
-            # Store the extracted memory
+            # Store the extracted memory (use pre-computed vector)
             try:
-                memory_id = self.
+                memory_id = self._store_extracted_memory_with_vector(
                     content=candidate.content,
+                    vector=vector,
                     namespace=effective_namespace,
                     confidence=candidate.confidence,
                     pattern_type=candidate.pattern_type,
@@ -557,25 +620,20 @@ class LifecycleService:
                 "similarity_threshold must be between 0.7 and 0.99"
             )
 
-
-
-
-
-
-        ):
-            raise ValidationError(f"Invalid strategy: {strategy}")
+        # Validate strategy using the strategy registry
+        try:
+            strategy_impl = get_strategy(strategy)
+        except ValueError as e:
+            raise ValidationError(str(e)) from e
 
         if max_groups < 1:
             raise ValidationError("max_groups must be at least 1")
 
         try:
-            #
-
-                namespace=namespace,
-                limit=self._config.consolidate_max_batch,
-            )
+            # Get total count to decide processing strategy
+            total_count = self._repo.count(namespace=namespace)
 
-            if
+            if total_count < 2:
                 logger.info("Not enough memories for consolidation")
                 return ConsolidateResult(
                     groups_found=0,
@@ -585,9 +643,34 @@ class LifecycleService:
                     dry_run=dry_run,
                 )
 
-            #
-
+            # Use chunked processing for large namespaces to reduce memory usage
+            chunk_size = min(
+                self._config.consolidate_chunk_size,
+                self._config.consolidate_max_batch,
+            )
+            use_chunked = total_count > chunk_size
 
+            if use_chunked:
+                logger.info(
+                    f"Using chunked consolidation: {total_count} memories in "
+                    f"chunks of {chunk_size}"
+                )
+                return self._consolidate_chunked(
+                    namespace=namespace,
+                    similarity_threshold=similarity_threshold,
+                    strategy=strategy,
+                    dry_run=dry_run,
+                    max_groups=max_groups,
+                    chunk_size=chunk_size,
+                )
+
+            # Standard single-pass processing for smaller namespaces
+            all_memories = self._repo.get_all(
+                namespace=namespace,
+                limit=self._config.consolidate_max_batch,
+            )
+
+            # Build lookup structures
             memories = [m for m, _ in all_memories]
             vectors_list = [v for _, v in all_memories]
             vectors_array = np.array(vectors_list, dtype=np.float32)
@@ -636,12 +719,6 @@ class LifecycleService:
                 group_member_dicts = [memory_dicts[i] for i in member_indices]
                 group_member_ids = [str(d["id"]) for d in group_member_dicts]
 
-                # Select representative
-                rep_idx = select_representative(group_member_dicts, strategy)
-                rep_id = str(group_member_dicts[rep_idx]["id"])
-
-                action = "preview" if dry_run else "merged"
-
                 # Calculate average similarity for the group
                 total_sim = 0.0
                 pair_count = 0
@@ -665,81 +742,32 @@ class LifecycleService:
                         pair_count += 1
                 avg_similarity = total_sim / pair_count if pair_count > 0 else 0.0
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                        metadata=merged_meta.get("metadata", {}),
-                    )
-
-                    # DELETE FIRST pattern: remove originals before adding merge
-                    # This prevents duplicates if add fails after delete
-                    deleted_ids: list[str] = []
-                    try:
-                        for mid in group_member_ids:
-                            self._repo.delete(mid)
-                            deleted_ids.append(mid)
-                            memories_deleted += 1
-                    except Exception as del_err:
-                        # Partial delete - log for manual recovery
-                        logger.critical(
-                            f"Partial consolidation failure: deleted {deleted_ids}, "
-                            f"failed on {mid}: {del_err}. "
-                            f"Remaining members may need manual cleanup: "
-                            f"{[m for m in group_member_ids if m not in deleted_ids]}"
-                        )
-                        raise
-
-                    # Now add the merged memory
-                    try:
-                        new_id = self._repo.add(merged_memory, new_vector)
-                    except Exception as add_err:
-                        # CRITICAL: Originals deleted but merge failed
-                        # Log for manual recovery - data is in merged_content
-                        logger.critical(
-                            f"Consolidation add failed after deleting originals. "
-                            f"Deleted IDs: {deleted_ids}. "
-                            f"Merged content (save for recovery): {merged_content[:500]}... "
-                            f"Error: {add_err}"
-                        )
-                        raise
-
-                    rep_id = new_id
-                    memories_merged += 1
-                    action = "merged"
-                else:
-                    # Keep representative, delete others
-                    for mid in group_member_ids:
-                        if mid != rep_id:
-                            self._repo.delete(mid)
-                            memories_deleted += 1
-                    memories_merged += 1
-                    action = "kept_representative"
-            except Exception as e:
-                logger.warning(f"Failed to consolidate group: {e}")
-                action = "failed"
+                # Apply consolidation strategy
+                try:
+                    action_result: ConsolidationAction = strategy_impl.apply(
+                        members=group_member_dicts,
+                        member_ids=group_member_ids,
+                        namespace=namespace,
+                        repository=self._repo,
+                        embeddings=self._embeddings,
+                        dry_run=dry_run,
+                    )
+                    memories_merged += action_result.memories_merged
+                    memories_deleted += action_result.memories_deleted
+                except Exception as e:
+                    logger.warning(f"Failed to consolidate group: {e}")
+                    action_result = ConsolidationAction(
+                        representative_id=group_member_ids[0],
+                        deleted_ids=[],
+                        action="failed",
+                    )
 
                 result_groups.append(
                     ConsolidationGroupResult(
-                        representative_id=
+                        representative_id=action_result.representative_id,
                         member_ids=group_member_ids,
                         avg_similarity=avg_similarity,
-                        action_taken=action,
+                        action_taken=action_result.action,
                     )
                 )
 
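The hunk above delegates per-group handling to a strategy object obtained from `get_strategy()`, which returns a `ConsolidationAction`. The real implementations live in the new `spatial_memory/core/consolidation_strategies.py` (not shown in this diff); the following is only a rough sketch of what such a registry might look like, with the `ConsolidationAction` fields taken from how the diff constructs the object and everything else (the `KeepNewestStrategy` internals in particular) being guesses:

```python
from dataclasses import dataclass, field
from typing import Any, Protocol


@dataclass
class ConsolidationAction:
    representative_id: str
    deleted_ids: list[str] = field(default_factory=list)
    action: str = "preview"
    memories_merged: int = 0
    memories_deleted: int = 0


class ConsolidationStrategy(Protocol):
    def apply(self, *, members: list[dict[str, Any]], member_ids: list[str],
              namespace: str, repository: Any, embeddings: Any,
              dry_run: bool) -> ConsolidationAction: ...


class KeepNewestStrategy:
    """Illustrative only: keep the most recently created member, delete the rest."""

    def apply(self, *, members, member_ids, namespace, repository, embeddings, dry_run):
        newest = max(members, key=lambda m: m["created_at"])
        rep_id = str(newest["id"])
        to_delete = [mid for mid in member_ids if mid != rep_id]
        if not dry_run:
            for mid in to_delete:
                repository.delete(mid)
        return ConsolidationAction(
            representative_id=rep_id,
            deleted_ids=[] if dry_run else to_delete,
            action="preview" if dry_run else "kept_representative",
            memories_merged=0 if dry_run else 1,
            memories_deleted=0 if dry_run else len(to_delete),
        )


_STRATEGIES: dict[str, ConsolidationStrategy] = {"keep_newest": KeepNewestStrategy()}


def get_strategy(name: str) -> ConsolidationStrategy:
    # Mirrors the ValueError -> ValidationError handling seen in the consolidate hunk.
    try:
        return _STRATEGIES[name]
    except KeyError as exc:
        raise ValueError(f"Invalid strategy: {name}") from exc
```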
@@ -760,6 +788,171 @@ class LifecycleService:
     # Helper Methods
     # =========================================================================
 
+    def _consolidate_chunked(
+        self,
+        namespace: str,
+        similarity_threshold: float,
+        strategy: Literal[
+            "keep_newest", "keep_oldest", "keep_highest_importance", "merge_content"
+        ],
+        dry_run: bool,
+        max_groups: int,
+        chunk_size: int,
+    ) -> ConsolidateResult:
+        """Process consolidation in memory-efficient chunks.
+
+        Processes memories in smaller chunks to reduce peak memory usage.
+        Note: This may miss duplicates that span chunk boundaries.
+
+        Args:
+            namespace: Namespace to consolidate.
+            similarity_threshold: Minimum similarity for duplicates.
+            strategy: How to handle duplicates.
+            dry_run: Preview without changes.
+            max_groups: Maximum groups to process total.
+            chunk_size: Memories per chunk.
+
+        Returns:
+            Aggregated ConsolidateResult from all chunks.
+        """
+        all_groups: list[ConsolidationGroupResult] = []
+        total_merged = 0
+        total_deleted = 0
+        offset = 0
+        groups_remaining = max_groups
+
+        while groups_remaining > 0:
+            # Fetch chunk of memories
+            chunk_memories = self._repo.get_all(
+                namespace=namespace,
+                limit=chunk_size,
+            )
+
+            # Skip already processed memories by filtering by offset
+            # Note: This is a simplified approach - in production, you'd want
+            # to track processed IDs or use cursor-based pagination
+            if offset > 0:
+                # Re-fetch with offset simulation (get more and skip)
+                all_chunk = self._repo.get_all(
+                    namespace=namespace,
+                    limit=offset + chunk_size,
+                )
+                if len(all_chunk) <= offset:
+                    # No more memories to process
+                    break
+                chunk_memories = all_chunk[offset:offset + chunk_size]
+
+            if len(chunk_memories) < 2:
+                break
+
+            # Build lookup structures for this chunk
+            memories = [m for m, _ in chunk_memories]
+            vectors_list = [v for _, v in chunk_memories]
+            vectors_array = np.array(vectors_list, dtype=np.float32)
+            memory_ids = [m.id for m in memories]
+            contents = [m.content for m in memories]
+            memory_dicts: list[dict[str, Any]] = [
+                {
+                    "id": m.id,
+                    "content": m.content,
+                    "created_at": m.created_at,
+                    "last_accessed": m.last_accessed,
+                    "access_count": m.access_count,
+                    "importance": m.importance,
+                    "tags": list(m.tags),
+                }
+                for m in memories
+            ]
+
+            # Find duplicate groups in this chunk
+            group_indices = find_duplicate_groups(
+                memory_ids=memory_ids,
+                vectors=vectors_array,
+                contents=contents,
+                threshold=similarity_threshold,
+                content_weight=self._config.consolidate_content_weight,
+            )
+
+            # Limit groups for this chunk
+            group_indices = group_indices[:groups_remaining]
+
+            if not group_indices:
+                offset += len(chunk_memories)
+                continue
+
+            # Get strategy implementation
+            strategy_impl = get_strategy(strategy)
+
+            # Process groups in this chunk
+            for member_indices in group_indices:
+                group_member_dicts = [memory_dicts[i] for i in member_indices]
+                group_member_ids = [str(d["id"]) for d in group_member_dicts]
+
+                # Calculate average similarity
+                total_sim = 0.0
+                pair_count = 0
+                for i_idx, i in enumerate(member_indices):
+                    for j in member_indices[i_idx + 1:]:
+                        v1, v2 = vectors_array[i], vectors_array[j]
+                        dot = float(np.dot(v1, v2))
+                        norm1 = float(np.linalg.norm(v1))
+                        norm2 = float(np.linalg.norm(v2))
+                        if norm1 > 1e-10 and norm2 > 1e-10:
+                            v_sim = dot / (norm1 * norm2)
+                        else:
+                            v_sim = 0.0
+                        c_sim = jaccard_similarity(contents[i], contents[j])
+                        combined = combined_similarity(
+                            v_sim, c_sim, self._config.consolidate_content_weight
+                        )
+                        total_sim += combined
+                        pair_count += 1
+                avg_similarity = total_sim / pair_count if pair_count > 0 else 0.0
+
+                # Apply consolidation strategy
+                try:
+                    action_result: ConsolidationAction = strategy_impl.apply(
+                        members=group_member_dicts,
+                        member_ids=group_member_ids,
+                        namespace=namespace,
+                        repository=self._repo,
+                        embeddings=self._embeddings,
+                        dry_run=dry_run,
+                    )
+                    total_merged += action_result.memories_merged
+                    total_deleted += action_result.memories_deleted
+                except Exception as e:
+                    logger.warning(f"Failed to consolidate group: {e}")
+                    action_result = ConsolidationAction(
+                        representative_id=group_member_ids[0],
+                        deleted_ids=[],
+                        action="failed",
+                    )
+
+                all_groups.append(
+                    ConsolidationGroupResult(
+                        representative_id=action_result.representative_id,
+                        member_ids=group_member_ids,
+                        avg_similarity=avg_similarity,
+                        action_taken=action_result.action,
+                    )
+                )
+                groups_remaining -= 1
+
+            offset += len(chunk_memories)
+            logger.debug(
+                f"Processed chunk at offset {offset - len(chunk_memories)}, "
+                f"found {len(group_indices)} groups"
+            )
+
+        return ConsolidateResult(
+            groups_found=len(all_groups),
+            memories_merged=total_merged,
+            memories_deleted=total_deleted,
+            groups=all_groups,
+            dry_run=dry_run,
+        )
+
     def _check_duplicate(
         self,
         content: str,
@@ -843,3 +1036,85 @@ class LifecycleService:
         )
 
         return self._repo.add(memory, vector)
+
+    def _check_duplicate_with_vector(
+        self,
+        content: str,
+        vector: np.ndarray,
+        namespace: str,
+        threshold: float,
+    ) -> bool:
+        """Check if similar content already exists using pre-computed vector.
+
+        Args:
+            content: Content to check.
+            vector: Pre-computed embedding vector.
+            namespace: Namespace to search.
+            threshold: Similarity threshold.
+
+        Returns:
+            True if a similar memory exists.
+        """
+        try:
+            # Search for similar memories using pre-computed vector
+            results = self._repo.search(vector, limit=5, namespace=namespace)
+
+            for result in results:
+                # Check vector similarity
+                if result.similarity >= threshold:
+                    return True
+
+                # Also check content overlap
+                content_sim = jaccard_similarity(content, result.content)
+                combined = combined_similarity(
+                    result.similarity,
+                    content_sim,
+                    self._config.consolidate_content_weight,
+                )
+                if combined >= threshold:
+                    return True
+
+            return False
+
+        except Exception as e:
+            logger.warning(f"Duplicate check failed: {e}")
+            return False
+
+    def _store_extracted_memory_with_vector(
+        self,
+        content: str,
+        vector: np.ndarray,
+        namespace: str,
+        confidence: float,
+        pattern_type: str,
+    ) -> str:
+        """Store an extracted memory using pre-computed vector.
+
+        Args:
+            content: Memory content.
+            vector: Pre-computed embedding vector.
+            namespace: Target namespace.
+            confidence: Extraction confidence.
+            pattern_type: Type of pattern matched.
+
+        Returns:
+            The new memory's ID.
+        """
+        # Scale importance by confidence but keep lower than manual memories
+        importance = self._config.extract_default_importance * confidence
+
+        # Create memory
+        memory = Memory(
+            id="",  # Will be assigned
+            content=content,
+            namespace=namespace,
+            tags=[f"extracted-{pattern_type}"],
+            importance=importance,
+            source=MemorySource.EXTRACTED,
+            metadata={
+                "extraction_confidence": confidence,
+                "extraction_pattern": pattern_type,
+            },
+        )
+
+        return self._repo.add(memory, vector)

spatial_memory/services/memory.py
@@ -404,9 +404,9 @@ class MemoryService:
         if not memory_ids:
             raise ValidationError("Memory ID list cannot be empty")
 
-        deleted_count = self._repo.delete_batch(memory_ids)
+        deleted_count, deleted_ids = self._repo.delete_batch(memory_ids)
 
         return ForgetResult(
             deleted=deleted_count,
-            ids=
+            ids=deleted_ids,
         )
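The memory.py hunk implies a contract change in the repository port: `delete_batch` now returns a `(count, ids)` tuple instead of a bare count, which lines up with the `ports/repositories.py` and `adapters/lancedb_repository.py` entries in the file list. A hypothetical before/after for implementers of that port, inferred only from the removed and added lines above:

```python
from typing import Protocol


class MemoryRepositoryV15(Protocol):
    # 1.5.x shape inferred from the removed line: only a count came back.
    def delete_batch(self, memory_ids: list[str]) -> int: ...


class MemoryRepositoryV16(Protocol):
    # 1.6.0 shape inferred from the added line: count plus the ids actually deleted.
    def delete_batch(self, memory_ids: list[str]) -> tuple[int, list[str]]: ...
```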