tribalmemory-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tribalmemory/__init__.py +3 -0
- tribalmemory/a21/__init__.py +38 -0
- tribalmemory/a21/config/__init__.py +20 -0
- tribalmemory/a21/config/providers.py +104 -0
- tribalmemory/a21/config/system.py +184 -0
- tribalmemory/a21/container/__init__.py +8 -0
- tribalmemory/a21/container/container.py +212 -0
- tribalmemory/a21/providers/__init__.py +32 -0
- tribalmemory/a21/providers/base.py +241 -0
- tribalmemory/a21/providers/deduplication.py +99 -0
- tribalmemory/a21/providers/lancedb.py +232 -0
- tribalmemory/a21/providers/memory.py +128 -0
- tribalmemory/a21/providers/mock.py +54 -0
- tribalmemory/a21/providers/openai.py +151 -0
- tribalmemory/a21/providers/timestamp.py +88 -0
- tribalmemory/a21/system.py +293 -0
- tribalmemory/cli.py +298 -0
- tribalmemory/interfaces.py +306 -0
- tribalmemory/mcp/__init__.py +9 -0
- tribalmemory/mcp/__main__.py +6 -0
- tribalmemory/mcp/server.py +484 -0
- tribalmemory/performance/__init__.py +1 -0
- tribalmemory/performance/benchmarks.py +285 -0
- tribalmemory/performance/corpus_generator.py +171 -0
- tribalmemory/portability/__init__.py +1 -0
- tribalmemory/portability/embedding_metadata.py +320 -0
- tribalmemory/server/__init__.py +9 -0
- tribalmemory/server/__main__.py +6 -0
- tribalmemory/server/app.py +187 -0
- tribalmemory/server/config.py +115 -0
- tribalmemory/server/models.py +206 -0
- tribalmemory/server/routes.py +378 -0
- tribalmemory/services/__init__.py +15 -0
- tribalmemory/services/deduplication.py +115 -0
- tribalmemory/services/embeddings.py +273 -0
- tribalmemory/services/import_export.py +506 -0
- tribalmemory/services/memory.py +275 -0
- tribalmemory/services/vector_store.py +360 -0
- tribalmemory/testing/__init__.py +22 -0
- tribalmemory/testing/embedding_utils.py +110 -0
- tribalmemory/testing/fixtures.py +123 -0
- tribalmemory/testing/metrics.py +256 -0
- tribalmemory/testing/mocks.py +560 -0
- tribalmemory/testing/semantic_expansions.py +91 -0
- tribalmemory/utils.py +23 -0
- tribalmemory-0.1.0.dist-info/METADATA +275 -0
- tribalmemory-0.1.0.dist-info/RECORD +51 -0
- tribalmemory-0.1.0.dist-info/WHEEL +5 -0
- tribalmemory-0.1.0.dist-info/entry_points.txt +3 -0
- tribalmemory-0.1.0.dist-info/licenses/LICENSE +190 -0
- tribalmemory-0.1.0.dist-info/top_level.txt +1 -0
tribalmemory/testing/mocks.py
ADDED

@@ -0,0 +1,560 @@
"""Mock implementations for testing."""

import asyncio
import hashlib
import random
import re
import time
from datetime import datetime
from typing import Optional

from ..interfaces import (
    IEmbeddingService,
    IVectorStore,
    IMemoryService,
    IDeduplicationService,
    ITimestampService,
    MemoryEntry,
    MemorySource,
    RecallResult,
    StoreResult,
)
from .embedding_utils import hash_to_embedding_extended
from .semantic_expansions import (
    SHORT_IMPORTANT_WORDS,
    get_expanded_terms,
    get_word_variants,
)


# Scoring constants for text matching in recall
CANDIDATE_MULTIPLIER = 3  # Fetch N times more candidates than limit for re-ranking
MIN_CANDIDATE_THRESHOLD = 0.1  # Minimum similarity for candidate consideration
BASE_TEXT_MATCH_SCORE = 0.7  # Base score when meaningful words overlap
OVERLAP_BOOST_PER_WORD = 0.05  # Additional score per overlapping word


class MockEmbeddingService(IEmbeddingService):
    """Mock embedding service for testing.

    Uses deterministic hashing for reproducible tests.
    Can be configured to simulate failures and latency.
    """

    def __init__(
        self,
        embedding_dim: int = 1536,
        latency_ms: float = 0,
        failure_rate: float = 0,
        timeout_after_n: Optional[int] = None,
        skip_latency: bool = False,
    ):
        """Initialize mock embedding service.

        Args:
            embedding_dim: Dimension of generated embeddings.
            latency_ms: Simulated latency per call.
            failure_rate: Probability of failure (0.0-1.0).
            timeout_after_n: Simulate timeout after N calls.
            skip_latency: If True, skip all latency simulation (fast mode for dev).
        """
        self.embedding_dim = embedding_dim
        self.latency_ms = latency_ms
        self.failure_rate = failure_rate
        self.timeout_after_n = timeout_after_n
        self.skip_latency = skip_latency
        self._call_count = 0

    async def embed(self, text: str) -> list[float]:
        """Generate deterministic embedding from text hash."""
        self._call_count += 1

        # Simulate timeout
        if self.timeout_after_n and self._call_count > self.timeout_after_n:
            if not self.skip_latency:
                await asyncio.sleep(30)  # Will trigger timeout

        # Simulate latency
        if self.latency_ms > 0 and not self.skip_latency:
            await asyncio.sleep(self.latency_ms / 1000)

        # Simulate failures
        if self.failure_rate > 0 and random.random() < self.failure_rate:
            raise RuntimeError("Mock embedding API failure")

        return self._hash_to_embedding(text)

    async def embed_batch(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings for batch."""
        return [await self.embed(t) for t in texts]

    def similarity(self, a: list[float], b: list[float]) -> float:
        """Calculate cosine similarity.

        Note: Zero vectors (all zeros) return 0.0 similarity with any other vector.
        This is intentional - zero vectors indicate corrupted/missing embeddings
        and should not match anything. Tests in test_negative_security.py verify
        that corrupted embeddings are excluded from results.
        """
        import math
        dot = sum(x * y for x, y in zip(a, b))
        norm_a = math.sqrt(sum(x * x for x in a))
        norm_b = math.sqrt(sum(x * x for x in b))
        if norm_a == 0 or norm_b == 0:
            return 0.0
        return dot / (norm_a * norm_b)

    def _hash_to_embedding(self, text: str) -> list[float]:
        """Convert text to deterministic embedding that preserves semantic similarity.

        Delegates to shared utility for consistent behavior across mock implementations.
        Uses extended version with sliding window hashes for substring matching.
        """
        return hash_to_embedding_extended(text, self.embedding_dim)
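
# Usage sketch (not part of the published mocks.py): embeddings are
# deterministic, so the same text always yields the same vector, and fault
# injection is driven entirely by constructor arguments. Assumes
# hash_to_embedding_extended returns a non-zero vector for non-empty text.
async def _demo_embedding_service():
    svc = MockEmbeddingService()
    a = await svc.embed("joe prefers concise responses")
    b = await svc.embed("joe prefers concise responses")
    assert a == b  # deterministic across calls
    assert abs(svc.similarity(a, b) - 1.0) < 1e-9  # identical vectors
    flaky = MockEmbeddingService(failure_rate=1.0)
    try:
        await flaky.embed("anything")
    except RuntimeError:
        pass  # every call fails when failure_rate=1.0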


class MockVectorStore(IVectorStore):
    """In-memory vector store for testing."""

    def __init__(
        self,
        embedding_service: IEmbeddingService,
        latency_ms: float = 0,
        max_capacity: Optional[int] = None
    ):
        self.embedding_service = embedding_service
        self.latency_ms = latency_ms
        self.max_capacity = max_capacity
        self._store: dict[str, MemoryEntry] = {}
        self._deleted: set[str] = set()

    async def store(self, entry: MemoryEntry) -> StoreResult:
        """Store a memory entry."""
        if self.latency_ms > 0:
            await asyncio.sleep(self.latency_ms / 1000)

        if self.max_capacity and len(self._store) >= self.max_capacity:
            return StoreResult(
                success=False,
                error="Storage capacity reached"
            )

        self._store[entry.id] = entry
        return StoreResult(success=True, memory_id=entry.id)

    async def recall(
        self,
        query_embedding: list[float],
        limit: int = 10,
        min_similarity: float = 0.7,
        filters: Optional[dict] = None,
    ) -> list[RecallResult]:
        """Recall memories similar to query."""
        start = time.perf_counter()

        if self.latency_ms > 0:
            await asyncio.sleep(self.latency_ms / 1000)

        results = []
        for memory in self._store.values():
            if memory.id in self._deleted:
                continue
            if memory.embedding is None:
                continue

            # Apply filters
            if filters and "tags" in filters and filters["tags"]:
                if not any(t in memory.tags for t in filters["tags"]):
                    continue

            sim = self.embedding_service.similarity(query_embedding, memory.embedding)
            if sim >= min_similarity:
                results.append((memory, sim))

        # Sort by similarity, take top limit
        results.sort(key=lambda x: x[1], reverse=True)
        results = results[:limit]

        elapsed_ms = (time.perf_counter() - start) * 1000

        return [
            RecallResult(
                memory=mem,
                similarity_score=sim,
                retrieval_time_ms=elapsed_ms
            )
            for mem, sim in results
        ]

    async def get(self, memory_id: str) -> Optional[MemoryEntry]:
        """Get a specific memory by ID."""
        if memory_id in self._deleted:
            return None
        return self._store.get(memory_id)

    async def delete(self, memory_id: str) -> bool:
        """Soft delete a memory."""
        if memory_id in self._store:
            self._deleted.add(memory_id)
            return True
        return False

    async def list(
        self,
        limit: int = 1000,
        offset: int = 0,
        filters: Optional[dict] = None,
    ) -> list[MemoryEntry]:
        """List memories with optional filtering."""
        entries = [
            m for m in list(self._store.values())
            if m.id not in self._deleted
        ]

        if filters and "tags" in filters and filters["tags"]:
            entries = [e for e in entries if any(t in e.tags for t in filters["tags"])]

        return entries[offset:offset + limit]

    async def count(self, filters: Optional[dict] = None) -> int:
        """Count memories matching filters."""
        entries = await self.list(limit=100000, filters=filters)
        return len(entries)

    def clear(self):
        """Clear all data (for test cleanup)."""
        self._store.clear()
        self._deleted.clear()

    async def __aenter__(self):
        """Async context manager entry."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit - clears store to prevent test pollution."""
        self.clear()
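
# Usage sketch (not part of the published mocks.py): the async context manager
# guarantees a clean store between tests, and deletes are soft - a deleted
# entry is hidden from get/recall/list but remains in _store. MemoryEntry
# fields mirror the constructor calls used elsewhere in this module.
async def _demo_vector_store():
    svc = MockEmbeddingService()
    async with MockVectorStore(svc) as store:
        entry = MemoryEntry(
            content="use rg instead of grep",
            embedding=await svc.embed("use rg instead of grep"),
            source_instance="test-instance",
            source_type=MemorySource.AUTO_CAPTURE,
            tags=["tooling"],
        )
        result = await store.store(entry)
        assert result.success
        await store.delete(entry.id)
        assert await store.get(entry.id) is None  # soft-deleted, so hidden
    # leaving the block calls clear(), so the next test starts empty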


class MockDeduplicationService(IDeduplicationService):
    """Mock deduplication service."""

    def __init__(
        self,
        vector_store: MockVectorStore,
        embedding_service: IEmbeddingService
    ):
        self.vector_store = vector_store
        self.embedding_service = embedding_service

    async def is_duplicate(
        self,
        content: str,
        embedding: list[float],
        threshold: float = 0.90
    ) -> tuple[bool, Optional[str]]:
        """Check if content is duplicate.

        Default threshold lowered to 0.90 to catch near-duplicates like:
        - "Joe prefers concise responses" vs "Joe likes concise answers"
        - Typo corrections and minor paraphrases

        Returns:
            Tuple of (is_duplicate, duplicate_of_id)
        """
        similar = await self.find_similar(content, embedding, threshold)
        if similar:
            return True, similar[0][0]
        return False, None

    async def find_similar(
        self,
        content: str,
        embedding: list[float],
        threshold: float = 0.85,
        limit: int = 10,
    ) -> list[tuple[str, float]]:
        """Find similar memories."""
        results = await self.vector_store.recall(
            embedding,
            limit=limit,
            min_similarity=threshold
        )
        return [(r.memory.id, r.similarity_score) for r in results]
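
# Usage sketch (not part of the published mocks.py): against an empty store
# nothing is a duplicate; once an entry with the same content is stored, the
# identical deterministic embedding scores ~1.0 and trips the 0.90 default.
async def _demo_dedup():
    svc = MockEmbeddingService()
    store = MockVectorStore(svc)
    dedup = MockDeduplicationService(store, svc)
    text = "Joe prefers concise responses"
    emb = await svc.embed(text)
    assert await dedup.is_duplicate(text, emb) == (False, None)  # store is empty
    # ...after the entry is stored, the same call returns (True, <memory_id>)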


class MockMemoryService(IMemoryService):
    """High-level mock memory service for testing."""

    def __init__(
        self,
        instance_id: str = "test-instance",
        embedding_service: Optional[IEmbeddingService] = None,
        vector_store: Optional[IVectorStore] = None
    ):
        self.instance_id = instance_id
        self.embedding_service = embedding_service or MockEmbeddingService()
        self.vector_store = vector_store or MockVectorStore(self.embedding_service)
        self.dedup_service = MockDeduplicationService(
            self.vector_store,
            self.embedding_service
        )

    async def remember(
        self,
        content: str,
        source_type: MemorySource = MemorySource.AUTO_CAPTURE,
        context: Optional[str] = None,
        tags: Optional[list[str]] = None,
        skip_dedup: bool = False,
    ) -> StoreResult:
        """Store a new memory."""
        # Validate
        if not content or not content.strip():
            return StoreResult(success=False, error="Empty content not allowed")

        # Generate embedding
        embedding = await self.embedding_service.embed(content)

        # Check for duplicates
        if not skip_dedup:
            is_dup, dup_id = await self.dedup_service.is_duplicate(content, embedding)
            if is_dup:
                return StoreResult(success=False, duplicate_of=dup_id)

        # Create entry
        entry = MemoryEntry(
            content=content,
            embedding=embedding,
            source_instance=self.instance_id,
            source_type=source_type,
            context=context,
            tags=tags or []
        )

        return await self.vector_store.store(entry)

    async def recall(
        self,
        query: str,
        limit: int = 5,
        min_relevance: float = 0.7,
        tags: Optional[list[str]] = None,
    ) -> list[RecallResult]:
        """Recall relevant memories.

        Uses both embedding similarity and text matching to better simulate
        real semantic search behavior in the mock.
        """
        query_embedding = await self.embedding_service.embed(query)
        filters = {"tags": tags} if tags else None

        # Get results from vector store with lowered threshold
        # We'll re-filter based on combined score
        results = await self.vector_store.recall(
            query_embedding,
            limit=limit * CANDIDATE_MULTIPLIER,
            min_similarity=min(MIN_CANDIDATE_THRESHOLD, min_relevance / 2),
            filters=filters
        )

        # Boost scores based on text matching (simulates semantic similarity better)
        query_lower = query.lower()
        # Filter out very short words for matching (stopwords-ish)
        query_words = {w for w in re.findall(r'\b\w+\b', query_lower) if len(w) > 2}

        # Add common short words that matter
        query_words.update(
            w for w in re.findall(r'\b\w+\b', query_lower)
            if w in SHORT_IMPORTANT_WORDS
        )

        # Expand query words with variants (pseudo-stemming)
        expanded_query = set()
        for w in query_words:
            expanded_query.update(get_word_variants(w))
        query_words = expanded_query

        # Apply semantic expansions for common concepts
        query_words = get_expanded_terms(query_words, query_lower)

        def is_corrupted_embedding(emb: list[float] | None) -> bool:
            """Check if embedding is corrupted (zero vector, NaN, etc.)."""
            if emb is None:
                return True
            if all(x == 0.0 for x in emb):
                return True
            if any(x != x for x in emb):  # NaN check
                return True
            return False

        boosted_results = []
        for r in results:
            # Skip memories with corrupted embeddings (security consideration)
            if is_corrupted_embedding(r.memory.embedding):
                continue

            content_lower = r.memory.content.lower()
            content_words = {w for w in re.findall(r'\b\w+\b', content_lower) if len(w) > 2}

            # Calculate text match boost
            text_boost = 0.0

            # Exact substring match is strong signal
            if query_lower in content_lower:
                text_boost = max(text_boost, 0.9)

            # Word overlap scoring
            if query_words and content_words:
                overlap = query_words & content_words
                # If any meaningful (>=3 chars) words overlap, it's relevant
                meaningful_overlap = [w for w in overlap if len(w) >= 3]
                if meaningful_overlap:
                    # More overlap = higher score, base score is 0.7 (meets default threshold)
                    score = BASE_TEXT_MATCH_SCORE + OVERLAP_BOOST_PER_WORD * len(meaningful_overlap)
                    text_boost = max(text_boost, score)
                elif overlap:
                    text_boost = max(text_boost, 0.5)

            # Combined score: max of embedding sim and text boost
            combined_score = max(r.similarity_score, text_boost)

            if combined_score >= min_relevance:
                boosted_results.append(RecallResult(
                    memory=r.memory,
                    similarity_score=combined_score,
                    retrieval_time_ms=r.retrieval_time_ms
                ))

        # Also check memories not returned by vector search (text-only matches)
        returned_ids = {r.memory.id for r in results}
        all_memories = await self.vector_store.list(limit=1000, filters=filters)

        for memory in all_memories:
            if memory.id in returned_ids or memory.id in self.vector_store._deleted:
                continue

            # Skip memories with corrupted embeddings (security consideration)
            if is_corrupted_embedding(memory.embedding):
                continue

            content_lower = memory.content.lower()
            content_words = {w for w in re.findall(r'\b\w+\b', content_lower) if len(w) > 2}

            text_boost = 0.0
            if query_lower in content_lower:
                text_boost = 0.9
            elif query_words and content_words:
                overlap = query_words & content_words
                meaningful_overlap = [w for w in overlap if len(w) >= 3]
                if meaningful_overlap:
                    text_boost = (
                        BASE_TEXT_MATCH_SCORE + OVERLAP_BOOST_PER_WORD * len(meaningful_overlap)
                    )
                elif overlap:
                    text_boost = 0.5

            if text_boost >= min_relevance:
                boosted_results.append(RecallResult(
                    memory=memory,
                    similarity_score=text_boost,
                    retrieval_time_ms=0.0
                ))

        # Sort by score and limit
        boosted_results.sort(key=lambda x: x.similarity_score, reverse=True)
        return boosted_results[:limit]

    async def correct(
        self,
        original_id: str,
        corrected_content: str,
        context: Optional[str] = None
    ) -> StoreResult:
        """Store a correction to an existing memory."""
        embedding = await self.embedding_service.embed(corrected_content)

        entry = MemoryEntry(
            content=corrected_content,
            embedding=embedding,
            source_instance=self.instance_id,
            source_type=MemorySource.CORRECTION,
            context=context,
            supersedes=original_id
        )

        return await self.vector_store.store(entry)

    async def forget(self, memory_id: str) -> bool:
        """Forget a memory."""
        return await self.vector_store.delete(memory_id)

    async def get(self, memory_id: str) -> Optional[MemoryEntry]:
        """Get a memory by ID with full provenance."""
        return await self.vector_store.get(memory_id)
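
# Usage sketch (not part of the published mocks.py): remember() dedups by
# default, and recall() can surface a memory on word overlap alone - here
# "tests" in the query and "testing"/"tdd" in the stored content overlap
# after pseudo-stemming, so the text boost (0.7 + 0.05 * 2 = 0.8) clears the
# default min_relevance even if the hash embeddings diverge.
async def _demo_memory_service():
    mem = MockMemoryService()
    stored = await mem.remember("We follow strict TDD for all testing", tags=["workflow"])
    assert stored.success
    dup = await mem.remember("We follow strict TDD for all testing")
    assert not dup.success and dup.duplicate_of == stored.memory_id
    hits = await mem.recall("how do we run tests?")
    assert hits and hits[0].memory.content.startswith("We follow")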


class MockTimestampService(ITimestampService):
    """Mock RFC 3161 timestamp service for testing.

    Generates deterministic timestamps for reproducible tests.
    Does NOT provide cryptographic guarantees - use real TSA in production.
    """

    def __init__(self, fail_verify: bool = False):
        """Initialize mock timestamp service.

        Args:
            fail_verify: If True, verify() always returns False (for testing failures)
        """
        self.fail_verify = fail_verify
        self._timestamps: dict[bytes, datetime] = {}

    async def timestamp(self, data: bytes) -> bytes:
        """Generate a mock timestamp token.

        Token format: "MOCK_TSA|{iso_timestamp}|{data_hash}"
        This is NOT RFC 3161 compliant - use for testing only.
        """
        import hashlib

        now = datetime.utcnow()
        data_hash = hashlib.sha256(data).hexdigest()[:16]
        token = f"MOCK_TSA|{now.isoformat()}|{data_hash}".encode()

        # Store for verification
        self._timestamps[token] = now

        return token

    async def verify(self, data: bytes, token: bytes) -> tuple[bool, Optional[datetime]]:
        """Verify a mock timestamp token.

        Returns:
            Tuple of (is_valid, timestamp_datetime)
        """
        if self.fail_verify:
            return False, None

        try:
            decoded = token.decode()
            if not decoded.startswith("MOCK_TSA|"):
                return False, None

            parts = decoded.split("|")
            if len(parts) != 3:
                return False, None

            timestamp_str = parts[1]
            stored_hash = parts[2]

            # Verify data hash matches
            import hashlib
            actual_hash = hashlib.sha256(data).hexdigest()[:16]
            if actual_hash != stored_hash:
                return False, None

            timestamp = datetime.fromisoformat(timestamp_str)
            return True, timestamp

        except Exception:
            return False, None

tribalmemory/testing/semantic_expansions.py
ADDED

@@ -0,0 +1,91 @@
"""Shared semantic expansion utilities for mock implementations.

Provides centralized semantic term dictionaries and helpers for tests.
"""

# Short words that are meaningful in technical contexts
SHORT_IMPORTANT_WORDS = {'pr', 'rm', 'ui', 'ux', 'ai', 'ml', 'db', 'js', 'ts'}

# Semantic expansion dictionaries by domain
TECH_TERMS = {
    'next', 'nextjs', 'react', 'tailwind', 'supabase', 'claude', 'api',
    'typescript', 'javascript', 'python', 'database', 'postgresql', 'backend',
    'frontend', 'framework', 'styling', 'css', 'app', 'router'
}

WORKFLOW_TERMS = {'pr', 'prs', 'review', 'commit', 'branch', 'merge', 'git'}

TESTING_TERMS = {'tdd', 'tests', 'testing', 'test', 'unit', 'coverage'}

FILE_TERMS = {'delete', 'deletion', 'trash', 'rm', 'remove', 'file', 'files'}

TIMESTAMP_TERMS = {'rfc', '3161', 'timestamp', 'provenance', 'blockchain'}

TIMEZONE_TERMS = {
    'timezone', 'eastern', 'mountain', 'pacific',
    'central', 'utc', 'summer', 'winter'
}


def get_word_variants(word: str) -> set[str]:
    """Get common variants of a word (pseudo-stemming).

    Args:
        word: Base word to expand.

    Returns:
        Set of word variants including original.
    """
    variants = {word}
    # Remove common suffixes
    for suffix in ['ing', 'tion', 'ation', 'ed', 'er', 'ly', 's', 'es']:
        if word.endswith(suffix) and len(word) > len(suffix) + 2:
            root = word[:-len(suffix)]
            variants.add(root)
            # Also add other forms of the root
            variants.add(root + 's')
            variants.add(root + 'ing')
    # Add common suffixes to the word
    variants.add(word + 's')
    variants.add(word + 'ing')
    variants.add(word + 'tion')
    return variants
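
# Worked example (a sketch, not part of the published module): the expansion
# deliberately over-generates - junk variants like 'reviewinging' are harmless
# because matching is a set intersection against real words.
# get_word_variants('reviewing') -> {'reviewing', 'review', 'reviews',
#     'reviewings', 'reviewinging', 'reviewingtion'}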


def get_expanded_terms(query_words: set[str], query_lower: str) -> set[str]:
    """Expand query words with semantic related terms.

    Args:
        query_words: Initial set of query words.
        query_lower: Lowercase query string for substring checks.

    Returns:
        Expanded set of query words.
    """
    expanded = set(query_words)

    # Tech/stack related
    if 'tech' in query_words or 'stack' in query_words or 'technology' in query_lower:
        expanded.update(TECH_TERMS)

    # Workflow/process related
    if 'workflow' in query_words or 'process' in query_words:
        expanded.update(WORKFLOW_TERMS)

    # Testing related
    if 'testing' in query_words or 'test' in query_words:
        expanded.update(TESTING_TERMS)

    # File operations
    if 'delete' in query_words or 'files' in query_words or 'file' in query_words:
        expanded.update(FILE_TERMS)

    # Timestamp/provenance
    if 'timestamp' in query_words or 'provenance' in query_words:
        expanded.update(TIMESTAMP_TERMS)

    # Timezone related
    if 'timezone' in query_words or 'time' in query_words:
        expanded.update(TIMEZONE_TERMS)

    return expanded
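
# Worked example (a sketch, not part of the published module): a query like
# "what tech stack do we use?" carries 'tech' and 'stack', so the whole
# TECH_TERMS set is folded in, and a stored memory mentioning only
# 'supabase' or 'tailwind' can still overlap with the query.
# get_expanded_terms({'tech', 'stack'}, 'what tech stack do we use?')
#     == {'tech', 'stack'} | TECH_TERMS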

tribalmemory/utils.py
ADDED

@@ -0,0 +1,23 @@
"""Shared utility functions for TribalMemory.

This module provides common functions used across multiple components
to prevent code duplication and ensure consistency.
"""

import math


def normalize_embedding(embedding: list[float]) -> list[float]:
    """Normalize embedding to unit length for consistent similarity math.

    Args:
        embedding: Vector of floats representing an embedding.

    Returns:
        Normalized embedding with unit length (L2 norm = 1).
        Returns the original embedding if it has zero magnitude.
    """
    norm = math.sqrt(sum(x * x for x in embedding))
    if norm == 0:
        return embedding
    return [x / norm for x in embedding]
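
# Worked example (a sketch, not part of the published module): after
# normalization a plain dot product equals cosine similarity, since both
# norms are 1.
# normalize_embedding([3.0, 4.0]) -> [0.6, 0.8]   (L2 norm 5 -> 1)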
|