tribalmemory-0.1.0-py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
Files changed (51)
  1. tribalmemory/__init__.py +3 -0
  2. tribalmemory/a21/__init__.py +38 -0
  3. tribalmemory/a21/config/__init__.py +20 -0
  4. tribalmemory/a21/config/providers.py +104 -0
  5. tribalmemory/a21/config/system.py +184 -0
  6. tribalmemory/a21/container/__init__.py +8 -0
  7. tribalmemory/a21/container/container.py +212 -0
  8. tribalmemory/a21/providers/__init__.py +32 -0
  9. tribalmemory/a21/providers/base.py +241 -0
  10. tribalmemory/a21/providers/deduplication.py +99 -0
  11. tribalmemory/a21/providers/lancedb.py +232 -0
  12. tribalmemory/a21/providers/memory.py +128 -0
  13. tribalmemory/a21/providers/mock.py +54 -0
  14. tribalmemory/a21/providers/openai.py +151 -0
  15. tribalmemory/a21/providers/timestamp.py +88 -0
  16. tribalmemory/a21/system.py +293 -0
  17. tribalmemory/cli.py +298 -0
  18. tribalmemory/interfaces.py +306 -0
  19. tribalmemory/mcp/__init__.py +9 -0
  20. tribalmemory/mcp/__main__.py +6 -0
  21. tribalmemory/mcp/server.py +484 -0
  22. tribalmemory/performance/__init__.py +1 -0
  23. tribalmemory/performance/benchmarks.py +285 -0
  24. tribalmemory/performance/corpus_generator.py +171 -0
  25. tribalmemory/portability/__init__.py +1 -0
  26. tribalmemory/portability/embedding_metadata.py +320 -0
  27. tribalmemory/server/__init__.py +9 -0
  28. tribalmemory/server/__main__.py +6 -0
  29. tribalmemory/server/app.py +187 -0
  30. tribalmemory/server/config.py +115 -0
  31. tribalmemory/server/models.py +206 -0
  32. tribalmemory/server/routes.py +378 -0
  33. tribalmemory/services/__init__.py +15 -0
  34. tribalmemory/services/deduplication.py +115 -0
  35. tribalmemory/services/embeddings.py +273 -0
  36. tribalmemory/services/import_export.py +506 -0
  37. tribalmemory/services/memory.py +275 -0
  38. tribalmemory/services/vector_store.py +360 -0
  39. tribalmemory/testing/__init__.py +22 -0
  40. tribalmemory/testing/embedding_utils.py +110 -0
  41. tribalmemory/testing/fixtures.py +123 -0
  42. tribalmemory/testing/metrics.py +256 -0
  43. tribalmemory/testing/mocks.py +560 -0
  44. tribalmemory/testing/semantic_expansions.py +91 -0
  45. tribalmemory/utils.py +23 -0
  46. tribalmemory-0.1.0.dist-info/METADATA +275 -0
  47. tribalmemory-0.1.0.dist-info/RECORD +51 -0
  48. tribalmemory-0.1.0.dist-info/WHEEL +5 -0
  49. tribalmemory-0.1.0.dist-info/entry_points.txt +3 -0
  50. tribalmemory-0.1.0.dist-info/licenses/LICENSE +190 -0
  51. tribalmemory-0.1.0.dist-info/top_level.txt +1 -0
tribalmemory/testing/mocks.py ADDED
@@ -0,0 +1,560 @@
+ """Mock implementations for testing."""
+
+ import asyncio
+ import hashlib
+ import random
+ import re
+ import time
+ from datetime import datetime
+ from typing import Optional
+
+ from ..interfaces import (
+     IEmbeddingService,
+     IVectorStore,
+     IMemoryService,
+     IDeduplicationService,
+     ITimestampService,
+     MemoryEntry,
+     MemorySource,
+     RecallResult,
+     StoreResult,
+ )
+ from .embedding_utils import hash_to_embedding_extended
+ from .semantic_expansions import (
+     SHORT_IMPORTANT_WORDS,
+     get_expanded_terms,
+     get_word_variants,
+ )
+
+
+ # Scoring constants for text matching in recall
+ CANDIDATE_MULTIPLIER = 3  # Fetch N times more candidates than limit for re-ranking
+ MIN_CANDIDATE_THRESHOLD = 0.1  # Minimum similarity for candidate consideration
+ BASE_TEXT_MATCH_SCORE = 0.7  # Base score when meaningful words overlap
+ OVERLAP_BOOST_PER_WORD = 0.05  # Additional score per overlapping word
+
+
+ class MockEmbeddingService(IEmbeddingService):
+     """Mock embedding service for testing.
+
+     Uses deterministic hashing for reproducible tests.
+     Can be configured to simulate failures and latency.
+     """
+
+     def __init__(
+         self,
+         embedding_dim: int = 1536,
+         latency_ms: float = 0,
+         failure_rate: float = 0,
+         timeout_after_n: Optional[int] = None,
+         skip_latency: bool = False,
+     ):
+         """Initialize mock embedding service.
+
+         Args:
+             embedding_dim: Dimension of generated embeddings.
+             latency_ms: Simulated latency per call.
+             failure_rate: Probability of failure (0.0-1.0).
+             timeout_after_n: Simulate timeout after N calls.
+             skip_latency: If True, skip all latency simulation (fast mode for dev).
+         """
+         self.embedding_dim = embedding_dim
+         self.latency_ms = latency_ms
+         self.failure_rate = failure_rate
+         self.timeout_after_n = timeout_after_n
+         self.skip_latency = skip_latency
+         self._call_count = 0
+
+     async def embed(self, text: str) -> list[float]:
+         """Generate deterministic embedding from text hash."""
+         self._call_count += 1
+
+         # Simulate timeout
+         if self.timeout_after_n and self._call_count > self.timeout_after_n:
+             if not self.skip_latency:
+                 await asyncio.sleep(30)  # Will trigger timeout
+
+         # Simulate latency
+         if self.latency_ms > 0 and not self.skip_latency:
+             await asyncio.sleep(self.latency_ms / 1000)
+
+         # Simulate failures
+         if self.failure_rate > 0 and random.random() < self.failure_rate:
+             raise RuntimeError("Mock embedding API failure")
+
+         return self._hash_to_embedding(text)
+
+     async def embed_batch(self, texts: list[str]) -> list[list[float]]:
+         """Generate embeddings for batch."""
+         return [await self.embed(t) for t in texts]
+
+     def similarity(self, a: list[float], b: list[float]) -> float:
+         """Calculate cosine similarity.
+
+         Note: Zero vectors (all zeros) return 0.0 similarity with any other vector.
+         This is intentional - zero vectors indicate corrupted/missing embeddings
+         and should not match anything. Tests in test_negative_security.py verify
+         that corrupted embeddings are excluded from results.
+         """
+         import math
+         dot = sum(x * y for x, y in zip(a, b))
+         norm_a = math.sqrt(sum(x * x for x in a))
+         norm_b = math.sqrt(sum(x * x for x in b))
+         if norm_a == 0 or norm_b == 0:
+             return 0.0
+         return dot / (norm_a * norm_b)
+
+     def _hash_to_embedding(self, text: str) -> list[float]:
+         """Convert text to deterministic embedding that preserves semantic similarity.
+
+         Delegates to shared utility for consistent behavior across mock implementations.
+         Uses extended version with sliding window hashes for substring matching.
+         """
+         return hash_to_embedding_extended(text, self.embedding_dim)
+
+
+ class MockVectorStore(IVectorStore):
+     """In-memory vector store for testing."""
+
+     def __init__(
+         self,
+         embedding_service: IEmbeddingService,
+         latency_ms: float = 0,
+         max_capacity: Optional[int] = None
+     ):
+         self.embedding_service = embedding_service
+         self.latency_ms = latency_ms
+         self.max_capacity = max_capacity
+         self._store: dict[str, MemoryEntry] = {}
+         self._deleted: set[str] = set()
+
+     async def store(self, entry: MemoryEntry) -> StoreResult:
+         """Store a memory entry."""
+         if self.latency_ms > 0:
+             await asyncio.sleep(self.latency_ms / 1000)
+
+         if self.max_capacity and len(self._store) >= self.max_capacity:
+             return StoreResult(
+                 success=False,
+                 error="Storage capacity reached"
+             )
+
+         self._store[entry.id] = entry
+         return StoreResult(success=True, memory_id=entry.id)
+
+     async def recall(
+         self,
+         query_embedding: list[float],
+         limit: int = 10,
+         min_similarity: float = 0.7,
+         filters: Optional[dict] = None,
+     ) -> list[RecallResult]:
+         """Recall memories similar to query."""
+         start = time.perf_counter()
+
+         if self.latency_ms > 0:
+             await asyncio.sleep(self.latency_ms / 1000)
+
+         results = []
+         for memory in self._store.values():
+             if memory.id in self._deleted:
+                 continue
+             if memory.embedding is None:
+                 continue
+
+             # Apply filters
+             if filters and "tags" in filters and filters["tags"]:
+                 if not any(t in memory.tags for t in filters["tags"]):
+                     continue
+
+             sim = self.embedding_service.similarity(query_embedding, memory.embedding)
+             if sim >= min_similarity:
+                 results.append((memory, sim))
+
+         # Sort by similarity, take top limit
+         results.sort(key=lambda x: x[1], reverse=True)
+         results = results[:limit]
+
+         elapsed_ms = (time.perf_counter() - start) * 1000
+
+         return [
+             RecallResult(
+                 memory=mem,
+                 similarity_score=sim,
+                 retrieval_time_ms=elapsed_ms
+             )
+             for mem, sim in results
+         ]
+
+     async def get(self, memory_id: str) -> Optional[MemoryEntry]:
+         """Get a specific memory by ID."""
+         if memory_id in self._deleted:
+             return None
+         return self._store.get(memory_id)
+
+     async def delete(self, memory_id: str) -> bool:
+         """Soft delete a memory."""
+         if memory_id in self._store:
+             self._deleted.add(memory_id)
+             return True
+         return False
+
+     async def list(
+         self,
+         limit: int = 1000,
+         offset: int = 0,
+         filters: Optional[dict] = None,
+     ) -> list[MemoryEntry]:
+         """List memories with optional filtering."""
+         entries = [
+             m for m in list(self._store.values())
+             if m.id not in self._deleted
+         ]
+
+         if filters and "tags" in filters and filters["tags"]:
+             entries = [e for e in entries if any(t in e.tags for t in filters["tags"])]
+
+         return entries[offset:offset + limit]
+
+     async def count(self, filters: Optional[dict] = None) -> int:
+         """Count memories matching filters."""
+         entries = await self.list(limit=100000, filters=filters)
+         return len(entries)
+
+     def clear(self):
+         """Clear all data (for test cleanup)."""
+         self._store.clear()
+         self._deleted.clear()
+
+     async def __aenter__(self):
+         """Async context manager entry."""
+         return self
+
+     async def __aexit__(self, exc_type, exc_val, exc_tb):
+         """Async context manager exit - clears store to prevent test pollution."""
+         self.clear()
+
+
+ class MockDeduplicationService(IDeduplicationService):
+     """Mock deduplication service."""
+
+     def __init__(
+         self,
+         vector_store: MockVectorStore,
+         embedding_service: IEmbeddingService
+     ):
+         self.vector_store = vector_store
+         self.embedding_service = embedding_service
+
+     async def is_duplicate(
+         self,
+         content: str,
+         embedding: list[float],
+         threshold: float = 0.90
+     ) -> tuple[bool, Optional[str]]:
+         """Check if content is duplicate.
+
+         Default threshold lowered to 0.90 to catch near-duplicates like:
+         - "Joe prefers concise responses" vs "Joe likes concise answers"
+         - Typo corrections and minor paraphrases
+
+         Returns:
+             Tuple of (is_duplicate, duplicate_of_id)
+         """
+         similar = await self.find_similar(content, embedding, threshold)
+         if similar:
+             return True, similar[0][0]
+         return False, None
+
+     async def find_similar(
+         self,
+         content: str,
+         embedding: list[float],
+         threshold: float = 0.85,
+         limit: int = 10,
+     ) -> list[tuple[str, float]]:
+         """Find similar memories."""
+         results = await self.vector_store.recall(
+             embedding,
+             limit=limit,
+             min_similarity=threshold
+         )
+         return [(r.memory.id, r.similarity_score) for r in results]
+
+
+ class MockMemoryService(IMemoryService):
+     """High-level mock memory service for testing."""
+
+     def __init__(
+         self,
+         instance_id: str = "test-instance",
+         embedding_service: Optional[IEmbeddingService] = None,
+         vector_store: Optional[IVectorStore] = None
+     ):
+         self.instance_id = instance_id
+         self.embedding_service = embedding_service or MockEmbeddingService()
+         self.vector_store = vector_store or MockVectorStore(self.embedding_service)
+         self.dedup_service = MockDeduplicationService(
+             self.vector_store,
+             self.embedding_service
+         )
+
+     async def remember(
+         self,
+         content: str,
+         source_type: MemorySource = MemorySource.AUTO_CAPTURE,
+         context: Optional[str] = None,
+         tags: Optional[list[str]] = None,
+         skip_dedup: bool = False,
+     ) -> StoreResult:
+         """Store a new memory."""
+         # Validate
+         if not content or not content.strip():
+             return StoreResult(success=False, error="Empty content not allowed")
+
+         # Generate embedding
+         embedding = await self.embedding_service.embed(content)
+
+         # Check for duplicates
+         if not skip_dedup:
+             is_dup, dup_id = await self.dedup_service.is_duplicate(content, embedding)
+             if is_dup:
+                 return StoreResult(success=False, duplicate_of=dup_id)
+
+         # Create entry
+         entry = MemoryEntry(
+             content=content,
+             embedding=embedding,
+             source_instance=self.instance_id,
+             source_type=source_type,
+             context=context,
+             tags=tags or []
+         )
+
+         return await self.vector_store.store(entry)
+
+     async def recall(
+         self,
+         query: str,
+         limit: int = 5,
+         min_relevance: float = 0.7,
+         tags: Optional[list[str]] = None,
+     ) -> list[RecallResult]:
+         """Recall relevant memories.
+
+         Uses both embedding similarity and text matching to better simulate
+         real semantic search behavior in the mock.
+         """
+         query_embedding = await self.embedding_service.embed(query)
+         filters = {"tags": tags} if tags else None
+
+         # Get results from vector store with lowered threshold
+         # We'll re-filter based on combined score
+         results = await self.vector_store.recall(
+             query_embedding,
+             limit=limit * CANDIDATE_MULTIPLIER,
+             min_similarity=min(MIN_CANDIDATE_THRESHOLD, min_relevance / 2),
+             filters=filters
+         )
+
+         # Boost scores based on text matching (simulates semantic similarity better)
+         query_lower = query.lower()
+         # Filter out very short words for matching (stopwords-ish)
+         query_words = {w for w in re.findall(r'\b\w+\b', query_lower) if len(w) > 2}
+
+         # Add common short words that matter
+         query_words.update(
+             w for w in re.findall(r'\b\w+\b', query_lower)
+             if w in SHORT_IMPORTANT_WORDS
+         )
+
+         # Expand query words with variants (pseudo-stemming)
+         expanded_query = set()
+         for w in query_words:
+             expanded_query.update(get_word_variants(w))
+         query_words = expanded_query
+
+         # Apply semantic expansions for common concepts
+         query_words = get_expanded_terms(query_words, query_lower)
+
+         def is_corrupted_embedding(emb: list[float] | None) -> bool:
+             """Check if embedding is corrupted (zero vector, NaN, etc.)."""
+             if emb is None:
+                 return True
+             if all(x == 0.0 for x in emb):
+                 return True
+             if any(x != x for x in emb):  # NaN check
+                 return True
+             return False
+
+         boosted_results = []
+         for r in results:
+             # Skip memories with corrupted embeddings (security consideration)
+             if is_corrupted_embedding(r.memory.embedding):
+                 continue
+
+             content_lower = r.memory.content.lower()
+             content_words = {w for w in re.findall(r'\b\w+\b', content_lower) if len(w) > 2}
+
+             # Calculate text match boost
+             text_boost = 0.0
+
+             # Exact substring match is strong signal
+             if query_lower in content_lower:
+                 text_boost = max(text_boost, 0.9)
+
+             # Word overlap scoring
+             if query_words and content_words:
+                 overlap = query_words & content_words
+                 # If any meaningful (>=3 chars) words overlap, it's relevant
+                 meaningful_overlap = [w for w in overlap if len(w) >= 3]
+                 if meaningful_overlap:
+                     # More overlap = higher score, base score is 0.7 (meets default threshold)
+                     score = BASE_TEXT_MATCH_SCORE + OVERLAP_BOOST_PER_WORD * len(meaningful_overlap)
+                     text_boost = max(text_boost, score)
+                 elif overlap:
+                     text_boost = max(text_boost, 0.5)
+
+             # Combined score: max of embedding sim and text boost
+             combined_score = max(r.similarity_score, text_boost)
+
+             if combined_score >= min_relevance:
+                 boosted_results.append(RecallResult(
+                     memory=r.memory,
+                     similarity_score=combined_score,
+                     retrieval_time_ms=r.retrieval_time_ms
+                 ))
+
+         # Also check memories not returned by vector search (text-only matches)
+         returned_ids = {r.memory.id for r in results}
+         all_memories = await self.vector_store.list(limit=1000, filters=filters)
+
+         for memory in all_memories:
+             if memory.id in returned_ids or memory.id in self.vector_store._deleted:
+                 continue
+
+             # Skip memories with corrupted embeddings (security consideration)
+             if is_corrupted_embedding(memory.embedding):
+                 continue
+
+             content_lower = memory.content.lower()
+             content_words = {w for w in re.findall(r'\b\w+\b', content_lower) if len(w) > 2}
+
+             text_boost = 0.0
+             if query_lower in content_lower:
+                 text_boost = 0.9
+             elif query_words and content_words:
+                 overlap = query_words & content_words
+                 meaningful_overlap = [w for w in overlap if len(w) >= 3]
+                 if meaningful_overlap:
+                     text_boost = (
+                         BASE_TEXT_MATCH_SCORE + OVERLAP_BOOST_PER_WORD * len(meaningful_overlap)
+                     )
+                 elif overlap:
+                     text_boost = 0.5
+
+             if text_boost >= min_relevance:
+                 boosted_results.append(RecallResult(
+                     memory=memory,
+                     similarity_score=text_boost,
+                     retrieval_time_ms=0.0
+                 ))
+
+         # Sort by score and limit
+         boosted_results.sort(key=lambda x: x.similarity_score, reverse=True)
+         return boosted_results[:limit]
+
+     async def correct(
+         self,
+         original_id: str,
+         corrected_content: str,
+         context: Optional[str] = None
+     ) -> StoreResult:
+         """Store a correction to an existing memory."""
+         embedding = await self.embedding_service.embed(corrected_content)
+
+         entry = MemoryEntry(
+             content=corrected_content,
+             embedding=embedding,
+             source_instance=self.instance_id,
+             source_type=MemorySource.CORRECTION,
+             context=context,
+             supersedes=original_id
+         )
+
+         return await self.vector_store.store(entry)
+
+     async def forget(self, memory_id: str) -> bool:
+         """Forget a memory."""
+         return await self.vector_store.delete(memory_id)
+
+     async def get(self, memory_id: str) -> Optional[MemoryEntry]:
+         """Get a memory by ID with full provenance."""
+         return await self.vector_store.get(memory_id)
+
+
+ class MockTimestampService(ITimestampService):
+     """Mock RFC 3161 timestamp service for testing.
+
+     Generates deterministic timestamps for reproducible tests.
+     Does NOT provide cryptographic guarantees - use real TSA in production.
+     """
+
+     def __init__(self, fail_verify: bool = False):
+         """Initialize mock timestamp service.
+
+         Args:
+             fail_verify: If True, verify() always returns False (for testing failures)
+         """
+         self.fail_verify = fail_verify
+         self._timestamps: dict[bytes, datetime] = {}
+
+     async def timestamp(self, data: bytes) -> bytes:
+         """Generate a mock timestamp token.
+
+         Token format: "MOCK_TSA|{iso_timestamp}|{data_hash}"
+         This is NOT RFC 3161 compliant - use for testing only.
+         """
+         import hashlib
+
+         now = datetime.utcnow()
+         data_hash = hashlib.sha256(data).hexdigest()[:16]
+         token = f"MOCK_TSA|{now.isoformat()}|{data_hash}".encode()
+
+         # Store for verification
+         self._timestamps[token] = now
+
+         return token
+
+     async def verify(self, data: bytes, token: bytes) -> tuple[bool, Optional[datetime]]:
+         """Verify a mock timestamp token.
+
+         Returns:
+             Tuple of (is_valid, timestamp_datetime)
+         """
+         if self.fail_verify:
+             return False, None
+
+         try:
+             decoded = token.decode()
+             if not decoded.startswith("MOCK_TSA|"):
+                 return False, None
+
+             parts = decoded.split("|")
+             if len(parts) != 3:
+                 return False, None
+
+             timestamp_str = parts[1]
+             stored_hash = parts[2]
+
+             # Verify data hash matches
+             import hashlib
+             actual_hash = hashlib.sha256(data).hexdigest()[:16]
+             if actual_hash != stored_hash:
+                 return False, None
+
+             timestamp = datetime.fromisoformat(timestamp_str)
+             return True, timestamp
+
+         except Exception:
+             return False, None
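
A minimal sketch of how these mocks might compose in a test (assuming pytest with pytest-asyncio; the test names and stored strings below are illustrative, not part of the package):

    import pytest
    from tribalmemory.testing.mocks import MockMemoryService, MockTimestampService

    @pytest.mark.asyncio
    async def test_remember_recall_and_dedup():
        svc = MockMemoryService()

        # First store succeeds; re-storing identical content is rejected as a
        # duplicate (identical text hashes to an identical embedding, so
        # similarity is 1.0, above the 0.90 dedup threshold).
        first = await svc.remember("Joe prefers concise responses")
        assert first.success
        dup = await svc.remember("Joe prefers concise responses")
        assert not dup.success and dup.duplicate_of == first.memory_id

        # Recall combines embedding similarity with text matching: an exact
        # substring match scores 0.9, clearing the default min_relevance of 0.7.
        results = await svc.recall("concise responses")
        assert results and results[0].memory.id == first.memory_id

    @pytest.mark.asyncio
    async def test_timestamp_round_trip():
        ts = MockTimestampService()
        token = await ts.timestamp(b"payload")
        ok, when = await ts.verify(b"payload", token)
        assert ok and when is not None
        ok, _ = await ts.verify(b"tampered", token)  # hash mismatch
        assert not ok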
tribalmemory/testing/semantic_expansions.py ADDED
@@ -0,0 +1,91 @@
+ """Shared semantic expansion utilities for mock implementations.
+
+ Provides centralized semantic term dictionaries and helpers for tests.
+ """
+
+ # Short words that are meaningful in technical contexts
+ SHORT_IMPORTANT_WORDS = {'pr', 'rm', 'ui', 'ux', 'ai', 'ml', 'db', 'js', 'ts'}
+
+ # Semantic expansion dictionaries by domain
+ TECH_TERMS = {
+     'next', 'nextjs', 'react', 'tailwind', 'supabase', 'claude', 'api',
+     'typescript', 'javascript', 'python', 'database', 'postgresql', 'backend',
+     'frontend', 'framework', 'styling', 'css', 'app', 'router'
+ }
+
+ WORKFLOW_TERMS = {'pr', 'prs', 'review', 'commit', 'branch', 'merge', 'git'}
+
+ TESTING_TERMS = {'tdd', 'tests', 'testing', 'test', 'unit', 'coverage'}
+
+ FILE_TERMS = {'delete', 'deletion', 'trash', 'rm', 'remove', 'file', 'files'}
+
+ TIMESTAMP_TERMS = {'rfc', '3161', 'timestamp', 'provenance', 'blockchain'}
+
+ TIMEZONE_TERMS = {
+     'timezone', 'eastern', 'mountain', 'pacific',
+     'central', 'utc', 'summer', 'winter'
+ }
+
+
+ def get_word_variants(word: str) -> set[str]:
+     """Get common variants of a word (pseudo-stemming).
+
+     Args:
+         word: Base word to expand.
+
+     Returns:
+         Set of word variants including the original.
+     """
+     variants = {word}
+     # Remove common suffixes
+     for suffix in ['ing', 'tion', 'ation', 'ed', 'er', 'ly', 's', 'es']:
+         if word.endswith(suffix) and len(word) > len(suffix) + 2:
+             root = word[:-len(suffix)]
+             variants.add(root)
+             # Also add other forms of the root
+             variants.add(root + 's')
+             variants.add(root + 'ing')
+     # Add common suffixes to the word
+     variants.add(word + 's')
+     variants.add(word + 'ing')
+     variants.add(word + 'tion')
+     return variants
+
+
+ def get_expanded_terms(query_words: set[str], query_lower: str) -> set[str]:
+     """Expand query words with semantically related terms.
+
+     Args:
+         query_words: Initial set of query words.
+         query_lower: Lowercase query string for substring checks.
+
+     Returns:
+         Expanded set of query words.
+     """
+     expanded = set(query_words)
+
+     # Tech/stack related
+     if 'tech' in query_words or 'stack' in query_words or 'technology' in query_lower:
+         expanded.update(TECH_TERMS)
+
+     # Workflow/process related
+     if 'workflow' in query_words or 'process' in query_words:
+         expanded.update(WORKFLOW_TERMS)
+
+     # Testing related
+     if 'testing' in query_words or 'test' in query_words:
+         expanded.update(TESTING_TERMS)
+
+     # File operations
+     if 'delete' in query_words or 'files' in query_words or 'file' in query_words:
+         expanded.update(FILE_TERMS)
+
+     # Timestamp/provenance
+     if 'timestamp' in query_words or 'provenance' in query_words:
+         expanded.update(TIMESTAMP_TERMS)
+
+     # Timezone related
+     if 'timezone' in query_words or 'time' in query_words:
+         expanded.update(TIMEZONE_TERMS)
+
+     return expanded
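
A quick sketch of what these helpers return (subset assertions, since get_word_variants also emits generated forms such as "testings" and "testingtion"):

    from tribalmemory.testing.semantic_expansions import (
        get_expanded_terms,
        get_word_variants,
    )

    # Pseudo-stemming: suffixes are stripped and the root is re-suffixed.
    assert {'test', 'tests', 'testing'} <= get_word_variants('testing')

    # Domain expansion: a query mentioning 'tech' pulls in TECH_TERMS.
    expanded = get_expanded_terms({'tech', 'stack'}, 'what is our tech stack')
    assert 'react' in expanded and 'supabase' in expanded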
tribalmemory/utils.py ADDED
@@ -0,0 +1,23 @@
+ """Shared utility functions for TribalMemory.
+
+ This module provides common functions used across multiple components
+ to prevent code duplication and ensure consistency.
+ """
+
+ import math
+
+
+ def normalize_embedding(embedding: list[float]) -> list[float]:
+     """Normalize embedding to unit length for consistent similarity math.
+
+     Args:
+         embedding: Vector of floats representing an embedding.
+
+     Returns:
+         Normalized embedding with unit length (L2 norm = 1).
+         Returns the original embedding if it has zero magnitude.
+     """
+     norm = math.sqrt(sum(x * x for x in embedding))
+     if norm == 0:
+         return embedding
+     return [x / norm for x in embedding]
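
Since normalized vectors have unit length, a plain dot product between them equals cosine similarity. A small usage sketch (the vectors are arbitrary examples):

    from tribalmemory.utils import normalize_embedding

    v = normalize_embedding([3.0, 4.0])
    assert v == [0.6, 0.8]  # L2 norm of [3, 4] is 5, so each component is divided by 5

    # Zero-magnitude vectors pass through unchanged rather than dividing by zero.
    assert normalize_embedding([0.0, 0.0]) == [0.0, 0.0]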