memorytrace 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. engram/__init__.py +8 -0
  2. engram/__main__.py +6 -0
  3. engram/cli/__init__.py +1 -0
  4. engram/cli/app.py +291 -0
  5. engram/cli/formatters.py +90 -0
  6. engram/cli/simple.py +267 -0
  7. engram/config.py +72 -0
  8. engram/engine.py +612 -0
  9. engram/exceptions.py +41 -0
  10. engram/extraction/__init__.py +6 -0
  11. engram/extraction/base.py +20 -0
  12. engram/extraction/llm_extractor.py +197 -0
  13. engram/extraction/ner/__init__.py +7 -0
  14. engram/extraction/ner/cjk.py +63 -0
  15. engram/extraction/ner/english.py +109 -0
  16. engram/extraction/ner/korean.py +106 -0
  17. engram/extraction/regex_extractor.py +188 -0
  18. engram/integrations/__init__.py +1 -0
  19. engram/integrations/mcp_server.py +213 -0
  20. engram/integrations/sdk.py +194 -0
  21. engram/models/__init__.py +19 -0
  22. engram/models/entity.py +72 -0
  23. engram/models/fact.py +58 -0
  24. engram/models/quality.py +61 -0
  25. engram/models/relation.py +26 -0
  26. engram/models/search.py +96 -0
  27. engram/models/session.py +53 -0
  28. engram/models/source.py +73 -0
  29. engram/quality/__init__.py +8 -0
  30. engram/quality/confidence.py +38 -0
  31. engram/quality/conflict.py +79 -0
  32. engram/quality/decay.py +28 -0
  33. engram/quality/gate.py +120 -0
  34. engram/quality/pii.py +80 -0
  35. engram/search/__init__.py +13 -0
  36. engram/search/base.py +20 -0
  37. engram/search/fts5_search.py +210 -0
  38. engram/search/hybrid.py +99 -0
  39. engram/search/semantic.py +186 -0
  40. engram/search/tokenizer.py +85 -0
  41. engram/session/__init__.py +6 -0
  42. engram/session/context.py +87 -0
  43. engram/session/manager.py +152 -0
  44. engram/session/working_memory.py +57 -0
  45. engram/storage/__init__.py +6 -0
  46. engram/storage/base.py +63 -0
  47. engram/storage/markdown_export.py +144 -0
  48. engram/storage/migrations.py +30 -0
  49. engram/storage/sqlite_store.py +615 -0
  50. memorytrace-0.1.0.dist-info/METADATA +138 -0
  51. memorytrace-0.1.0.dist-info/RECORD +54 -0
  52. memorytrace-0.1.0.dist-info/WHEEL +4 -0
  53. memorytrace-0.1.0.dist-info/entry_points.txt +3 -0
  54. memorytrace-0.1.0.dist-info/licenses/LICENSE +21 -0
engram/quality/pii.py ADDED
@@ -0,0 +1,80 @@
1
+ """PII (Personally Identifiable Information) detection and masking."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from typing import Optional
7
+
8
+ from engram.models.quality import PIIMatch
9
+
10
# PII detection patterns, keyed by the PII type name reported in matches.
# Every pattern is compiled once, at import time.
_PII_PATTERNS: dict[str, re.Pattern] = {
    # Four groups of four digits, optionally separated by "-" or whitespace.
    "credit_card": re.compile(
        r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b'
    ),
    # 3-2-4 digit groups separated by hyphens.
    "ssn": re.compile(
        r'\b\d{3}-\d{2}-\d{4}\b'
    ),
    # A known key prefix followed by at least 20 key characters.
    "api_key": re.compile(
        r'\b(?:sk-|ak-|AKIA|ghp_|gho_|github_pat_)[A-Za-z0-9_\-]{20,}\b'
    ),
    "email": re.compile(
        # Inside a character class "|" is a literal pipe, not alternation, so
        # the TLD class must be [A-Za-z]; "[A-Z|a-z]" also accepted "|".
        r'\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b'
    ),
    # Hyphen-separated number starting with 0.
    "phone_kr": re.compile(
        r'\b0\d{1,2}-\d{3,4}-\d{4}\b'
    ),
    # "+" country code followed by up to three digit groups.
    "phone_intl": re.compile(
        r'\+\d{1,3}[-\s]?\d{1,4}[-\s]?\d{3,4}[-\s]?\d{4}\b'
    ),
    "rrn": re.compile(
        # Korean Resident Registration Number (주민등록번호)
        r'\b\d{6}[-\s]?[1-4]\d{6}\b'
    ),
}
35
+
36
# Replacement token written over every masked span.
_REDACTED = "[REDACTED]"


class PIIDetector:
    """Detects and masks PII in text using the module-level pattern table."""

    def scan(self, text: str) -> list[PIIMatch]:
        """Scan text for every known PII pattern.

        Args:
            text: The text to inspect.

        Returns:
            One PIIMatch per regex hit, sorted by ascending start offset.
            Different patterns may report overlapping spans. The ``original``
            field holds a ``[type:Nchars]`` placeholder, never the matched
            value itself.
        """
        matches: list[PIIMatch] = []
        for pii_type, pattern in _PII_PATTERNS.items():
            for m in pattern.finditer(text):
                # Store only type and length, not the original PII value
                original_display = f"[{pii_type}:{len(m.group())}chars]"
                matches.append(PIIMatch(
                    pii_type=pii_type,
                    start=m.start(),
                    end=m.end(),
                    original=original_display,
                ))
        # Ascending by position; mask() does its own ordering.
        matches.sort(key=lambda x: x.start)
        return matches

    def mask(self, text: str, matches: Optional[list[PIIMatch]] = None) -> str:
        """Replace PII occurrences with [REDACTED].

        Args:
            text: The text to mask.
            matches: Matches whose ``start``/``end`` are offsets into ``text``.
                If omitted, the text is scanned first.

        Returns:
            The text with every matched span replaced by the redaction token.
            Spans that overlap are merged and replaced by a single token.
        """
        if matches is None:
            matches = self.scan(text)
        if not matches:
            return text

        # Merge overlapping spans first. Replacing them one by one with the
        # original offsets would shift later offsets and either swallow
        # neighbouring text or leave part of a match unmasked.
        merged: list[list[int]] = []
        for start, end in sorted((m.start, m.end) for m in matches):
            if merged and start < merged[-1][1]:
                merged[-1][1] = max(merged[-1][1], end)
            else:
                merged.append([start, end])

        # Rebuild the string in one left-to-right pass.
        pieces: list[str] = []
        cursor = 0
        for start, end in merged:
            pieces.append(text[cursor:start])
            pieces.append(_REDACTED)
            cursor = end
        pieces.append(text[cursor:])
        return "".join(pieces)

    def has_pii(self, text: str) -> bool:
        """Return True as soon as any PII pattern matches the text."""
        return any(pattern.search(text) for pattern in _PII_PATTERNS.values())
@@ -0,0 +1,13 @@
1
+ """Search backends for Engram."""
2
+
3
+ from engram.search.base import SearchBackend
4
+ from engram.search.fts5_search import FTS5Search
5
+ from engram.search.tokenizer import tokenize, build_fts5_query, escape_fts5_query
6
+
7
+ __all__ = [
8
+ "SearchBackend",
9
+ "FTS5Search",
10
+ "tokenize",
11
+ "build_fts5_query",
12
+ "escape_fts5_query",
13
+ ]
engram/search/base.py ADDED
@@ -0,0 +1,20 @@
1
+ """Search backend protocol."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Protocol, runtime_checkable
6
+
7
+ from engram.models.search import SearchOptions, SearchResult
8
+
9
+
10
@runtime_checkable
class SearchBackend(Protocol):
    """Structural interface shared by the search backends.

    The protocol is runtime-checkable, so ``isinstance`` only verifies that
    the four methods exist, not their signatures.
    """

    def search(self, options: SearchOptions) -> SearchResult:
        """Run a search described by ``options`` and return its result."""

    def index_entity(self, entity_id: str) -> None:
        """Add or refresh the index entry for ``entity_id``."""

    def remove_from_index(self, entity_id: str) -> None:
        """Drop the index entry for ``entity_id``."""

    def reindex_all(self) -> int:
        """Rebuild the whole index; presumably returns the number of entries indexed (confirm with implementers)."""
@@ -0,0 +1,210 @@
1
+ """FTS5-based BM25 search — genuine Okapi BM25 via SQLite's built-in ranking."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import sqlite3
7
+ import time
8
+ from pathlib import Path
9
+ from typing import Optional
10
+
11
+ from engram.models.entity import Entity, EntityState, Tier
12
+ from engram.models.fact import Fact, FactStatus
13
+ from engram.models.search import SearchHit, SearchOptions, SearchResult
14
+ from engram.models.source import Source, SourceType
15
+ from engram.search.tokenizer import build_fts5_query
16
+
17
+
18
class FTS5Search:
    """Keyword search over the ``memory_fts`` FTS5 table, ranked by bm25().

    Matches are joined against ``entities`` so each hit carries the full
    entity row, and the entity's current facts are loaded alongside.

    bm25() column weights, in FTS column order (higher = more important):
    - entity_id: 0.0 (not searchable)
    - entity_name: 10.0
    - entity_type: 1.0
    - fact_text: 3.0
    - state_text: 2.0
    - summary: 5.0
    """

    # Column order in FTS table: entity_id, entity_name, entity_type, fact_text, state_text, summary
    _BM25_WEIGHTS = (0.0, 10.0, 1.0, 3.0, 2.0, 5.0)
    # Same weights, pre-rendered for interpolation into the SQL text.
    _BM25_WEIGHTS_SQL = "0.0, 10.0, 1.0, 3.0, 2.0, 5.0"

    def __init__(self, db_path: Path):
        self.db_path = Path(db_path).resolve()
        self._conn: Optional[sqlite3.Connection] = None

    def _get_conn(self) -> sqlite3.Connection:
        """Return the cached connection, opening it on first use."""
        if self._conn is not None:
            return self._conn
        self._conn = sqlite3.connect(str(self.db_path))
        self._conn.execute("PRAGMA busy_timeout=5000")
        self._conn.row_factory = sqlite3.Row
        return self._conn

    def close(self) -> None:
        """Close the cached connection, if one is open."""
        if self._conn is None:
            return
        self._conn.close()
        self._conn = None

    def search(self, options: SearchOptions) -> SearchResult:
        """Run a BM25-ranked full-text search.

        Returns an empty result when the query yields no FTS5 expression or
        when SQLite rejects the expression; other operational errors
        propagate. Hits stop being added once the token budget would be
        exceeded (``max_tokens <= 0`` disables the budget).
        """
        started = time.monotonic()

        match_expr = build_fts5_query(options.query)
        if not match_expr or match_expr == '""':
            return SearchResult(
                query=options.query,
                hits=[],
                total_count=0,
                search_time_ms=0.0,
            )

        conn = self._get_conn()

        # Weights come from a class constant and are never user-supplied, so
        # interpolating them into the SQL text is safe.
        statement = f"""
            SELECT
                e.*,
                bm25(memory_fts, {self._BM25_WEIGHTS_SQL}) AS rank,
                snippet(memory_fts, 3, '>>>', '<<<', '...', 40) AS snippet
            FROM memory_fts fts
            JOIN entities e ON e.id = fts.entity_id
            WHERE memory_fts MATCH ?
        """
        bind: list = [match_expr]

        # Optional filters; each clause is appended together with its values
        # so placeholder order and bind order stay aligned.
        clauses: list[str] = []
        if options.entity_types:
            marks = ",".join(["?"] * len(options.entity_types))
            clauses.append(f"e.entity_type IN ({marks})")
            bind.extend(options.entity_types)
        if options.tiers:
            marks = ",".join(["?"] * len(options.tiers))
            clauses.append(f"e.tier IN ({marks})")
            bind.extend(options.tiers)
        if not options.include_archived:
            clauses.append("e.tier != 'archival'")
        if options.date_from:
            clauses.append("e.updated_at >= ?")
            bind.append(options.date_from.isoformat())
        if options.date_to:
            clauses.append("e.updated_at <= ?")
            bind.append(options.date_to.isoformat())

        for clause in clauses:
            statement += f" AND {clause}"
        statement += " ORDER BY rank LIMIT ?"
        bind.append(options.max_results)

        try:
            rows = conn.execute(statement, bind).fetchall()
        except sqlite3.OperationalError as exc:
            message = str(exc).lower()
            # Only FTS5 query syntax errors become an empty result.
            if not ("fts5" in message or "syntax" in message or "parse" in message):
                raise
            return SearchResult(
                query=options.query,
                hits=[],
                total_count=0,
                search_time_ms=(time.monotonic() - started) * 1000,
            )

        hits: list[SearchHit] = []
        used_tokens = 0

        for row in rows:
            entity = self._row_to_entity(row)
            # Strip the highlight markers that snippet() inserted.
            excerpt = (row["snippet"] or "").replace(">>>", "").replace("<<<", "")

            facts = self._get_entity_facts(conn, entity.id, options.min_confidence)

            # Rough token estimate: 1 token per 4 characters.
            preview = " ".join(f.raw_text for f in facts[:3])
            cost = len(f"{entity.name} {excerpt} {preview}") // 4

            if options.max_tokens > 0 and used_tokens + cost > options.max_tokens:
                break

            hits.append(SearchHit(
                entity=entity,
                facts=facts[:5],  # at most five facts per hit
                relevance_score=abs(row["rank"]),  # bm25() returns negative scores
                snippet=excerpt,
                token_count=cost,
            ))
            used_tokens += cost

        elapsed_ms = (time.monotonic() - started) * 1000

        return SearchResult(
            query=options.query,
            hits=hits,
            total_count=len(rows),  # rows fetched; may exceed len(hits) due to token budget
            search_time_ms=round(elapsed_ms, 2),
            total_tokens=used_tokens,
        )

    def _get_entity_facts(
        self, conn: sqlite3.Connection, entity_id: str, min_confidence: float
    ) -> list[Fact]:
        """Load up to ten current facts for an entity at or above a confidence floor."""
        fact_rows = conn.execute(
            """SELECT * FROM facts
               WHERE entity_id = ? AND valid_to IS NULL AND superseded_by IS NULL
               AND status NOT IN ('expired', 'retracted', 'conflicted')
               AND confidence >= ?
               ORDER BY confidence DESC, created_at DESC
               LIMIT 10""",
            (entity_id, min_confidence),
        ).fetchall()

        return [
            Fact(
                id=r["id"],
                entity_id=r["entity_id"],
                subject=r["subject"],
                predicate=r["predicate"],
                object=r["object"],
                raw_text=r["raw_text"],
                source=Source(
                    type=SourceType(r["source_type"]),
                    author=r["source_author"] or "",
                    channel=r["source_channel"] or "",
                ),
                confidence=r["confidence"],
                status=FactStatus(r["status"]),
            )
            for r in fact_rows
        ]

    def _row_to_entity(self, row: sqlite3.Row) -> Entity:
        """Build an Entity from a joined result row, decoding its JSON columns."""
        raw_state = row["state"]
        raw_aliases = row["aliases"]
        state_data = json.loads(raw_state) if raw_state else {}
        aliases = json.loads(raw_aliases) if raw_aliases else []
        return Entity(
            id=row["id"],
            name=row["name"],
            entity_type=row["entity_type"],
            tier=Tier(row["tier"]),
            summary=row["summary"] or "",
            aliases=aliases,
            state=EntityState.from_dict(state_data),
            access_count=row["access_count"] or 0,
        )
@@ -0,0 +1,99 @@
1
+ """Hybrid search — Reciprocal Rank Fusion of FTS5 + semantic search."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import time
6
+
7
+ from engram.models.search import SearchHit, SearchOptions, SearchResult
8
+ from engram.search.fts5_search import FTS5Search
9
+ from engram.search.semantic import SemanticSearch
10
+
11
+
12
class HybridSearch:
    """Reciprocal Rank Fusion (RRF) of keyword and semantic search.

    RRF score = sum over sources of weight / (k + rank), with 1-based ranks
    and k=60 by default (the standard constant).

    This is simple, effective, and well-studied — no learned weights needed.
    """

    def __init__(
        self,
        fts: FTS5Search,
        semantic: SemanticSearch,
        k: int = 60,
        fts_weight: float = 1.0,
        semantic_weight: float = 1.0,
    ):
        self.fts = fts
        self.semantic = semantic
        self.k = k
        self.fts_weight = fts_weight
        self.semantic_weight = semantic_weight

    def close(self) -> None:
        """Close both backends; the semantic one is closed even if FTS fails."""
        try:
            self.fts.close()
        finally:
            self.semantic.close()

    def search(self, options: SearchOptions) -> SearchResult:
        """Execute hybrid search with RRF fusion.

        Both backends are queried with three times the requested result
        count and no token budget; the fused ranking is then cut to
        ``options.max_results`` and the caller's token budget
        (``max_tokens <= 0`` means unlimited). The returned hits are the
        backends' hit objects with ``relevance_score`` and ``token_count``
        overwritten.
        """
        start = time.monotonic()

        # Over-fetch from each backend so fusion has enough candidates. Every
        # filter on the incoming options is forwarded, including the date
        # range, which was previously dropped.
        expanded = SearchOptions(
            query=options.query,
            max_results=options.max_results * 3,
            max_tokens=0,  # No token limit for individual searches
            min_confidence=options.min_confidence,
            entity_types=options.entity_types,
            tiers=options.tiers,
            include_archived=options.include_archived,
            date_from=options.date_from,
            date_to=options.date_to,
        )

        fts_result = self.fts.search(expanded)
        sem_result = self.semantic.search(expanded)

        scores: dict[str, float] = {}
        hit_map: dict[str, SearchHit] = {}
        self._accumulate(fts_result.hits, self.fts_weight, scores, hit_map)
        self._accumulate(sem_result.hits, self.semantic_weight, scores, hit_map)

        # Highest fused score first, capped at the requested result count.
        sorted_ids = sorted(scores.keys(), key=lambda eid: scores[eid], reverse=True)
        sorted_ids = sorted_ids[:options.max_results]

        # Apply the caller's token budget to the fused list.
        hits: list[SearchHit] = []
        total_tokens = 0
        for eid in sorted_ids:
            hit = hit_map[eid]
            hit.relevance_score = scores[eid]

            # Rough estimate: 1 token per 4 characters of name + snippet.
            token_count = len(f"{hit.entity.name} {hit.snippet}") // 4
            if options.max_tokens > 0 and total_tokens + token_count > options.max_tokens:
                break
            hit.token_count = token_count
            hits.append(hit)
            total_tokens += token_count

        elapsed = (time.monotonic() - start) * 1000
        return SearchResult(
            query=options.query,
            hits=hits,
            total_count=len(scores),
            search_time_ms=round(elapsed, 2),
            total_tokens=total_tokens,
        )

    def _accumulate(self, ranked_hits, weight: float, scores: dict, hit_map: dict) -> None:
        """Add one backend's RRF contributions to ``scores``; keep the first hit seen per entity."""
        for rank, hit in enumerate(ranked_hits, start=1):
            eid = hit.entity.id
            scores[eid] = scores.get(eid, 0) + weight / (self.k + rank)
            hit_map.setdefault(eid, hit)
@@ -0,0 +1,186 @@
1
+ """Optional embedding-based semantic search.
2
+
3
+ Requires: pip install engram[semantic]
4
+ For embeddings API: pip install engram[llm]
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import sqlite3
11
+ import struct
12
+ import time
13
+ from datetime import datetime
14
+ from pathlib import Path
15
+ from typing import Optional
16
+
17
+ from engram.models.entity import Entity, EntityState, Tier
18
+ from engram.models.search import SearchHit, SearchOptions, SearchResult
19
+
20
+
21
class SemanticSearch:
    """Embedding-based semantic search using numpy cosine similarity.

    Embeddings live in an ``embeddings`` table in the same SQLite database
    as the ``entities`` table. Every search loads the candidate embeddings
    and scores them in memory, which is fast enough for <10k entities.
    """

    def __init__(
        self,
        db_path: Path,
        embedding_provider: str = "openai",
        embedding_model: str = "text-embedding-3-small",
        embedding_dim: int = 1536,
    ):
        """Open (and if needed create) the embeddings table.

        Raises:
            ImportError: If numpy is not installed.
        """
        self.db_path = Path(db_path).resolve()
        self.embedding_provider = embedding_provider
        self.embedding_model = embedding_model
        self.embedding_dim = embedding_dim
        self._conn: Optional[sqlite3.Connection] = None
        self._client = None
        self._np = None

        # NOTE(review): the wheel is published as "memorytrace"; confirm the
        # install hints in these messages name the right distribution.
        try:
            import numpy as np
        except ImportError as exc:
            raise ImportError("numpy not installed. Run: pip install engram[semantic]") from exc
        self._np = np

        self._ensure_table()

    def _get_conn(self) -> sqlite3.Connection:
        """Return the cached connection, opening it on first use."""
        if self._conn is None:
            self._conn = sqlite3.connect(str(self.db_path))
            self._conn.execute("PRAGMA busy_timeout=5000")
            self._conn.row_factory = sqlite3.Row
        return self._conn

    def _ensure_table(self) -> None:
        """Create the embeddings table if it does not exist yet."""
        conn = self._get_conn()
        conn.execute("""
            CREATE TABLE IF NOT EXISTS embeddings (
                entity_id TEXT PRIMARY KEY,
                embedding BLOB NOT NULL,
                model TEXT NOT NULL,
                updated_at TEXT NOT NULL
            )
        """)
        conn.commit()

    def close(self) -> None:
        """Close the cached connection, if one is open."""
        if self._conn:
            self._conn.close()
            self._conn = None

    def _get_embedding_client(self):
        """Lazy-load the embedding API client.

        Raises:
            ImportError: If the OpenAI SDK is not installed.
            NotImplementedError: For the 'anthropic' provider.
            ValueError: For any other unknown provider.
        """
        if self._client is not None:
            return self._client
        if self.embedding_provider == "openai":
            try:
                from openai import OpenAI
            except ImportError as exc:
                raise ImportError("OpenAI SDK not installed. Run: pip install engram[llm]") from exc
            self._client = OpenAI()
        elif self.embedding_provider == "anthropic":
            raise NotImplementedError("Anthropic does not provide an embedding API. Use 'openai' provider.")
        else:
            # Fail here instead of returning None and crashing later on
            # ``client.embeddings``.
            raise ValueError(f"Unsupported embedding provider: {self.embedding_provider!r}")
        return self._client

    def embed(self, text: str) -> bytes:
        """Compute an embedding for text (truncated to 8000 characters).

        Returns:
            The vector packed as native-order 4-byte floats.
        """
        client = self._get_embedding_client()
        response = client.embeddings.create(
            model=self.embedding_model,
            input=text[:8000],
        )
        vector = response.data[0].embedding
        return struct.pack(f"{len(vector)}f", *vector)

    def _bytes_to_vector(self, data: bytes):
        """Convert packed native-order floats back to a float32 numpy array."""
        # frombuffer reads the packed floats without an intermediate tuple
        # of Python floats; the resulting array is read-only.
        return self._np.frombuffer(data, dtype=self._np.float32)

    def index_entity(self, entity_id: str, text: str) -> None:
        """Compute and store (or replace) the embedding for an entity."""
        embedding_bytes = self.embed(text)
        conn = self._get_conn()
        conn.execute(
            """INSERT OR REPLACE INTO embeddings (entity_id, embedding, model, updated_at)
               VALUES (?, ?, ?, ?)""",
            (entity_id, embedding_bytes, self.embedding_model, datetime.now().isoformat()),
        )
        conn.commit()

    def remove_from_index(self, entity_id: str) -> None:
        """Delete the stored embedding for an entity, if any."""
        conn = self._get_conn()
        conn.execute("DELETE FROM embeddings WHERE entity_id = ?", (entity_id,))
        conn.commit()

    @staticmethod
    def _entity_filters(options) -> tuple:
        """Build the ``AND ...`` SQL suffix and bind values for the entity filters.

        Mirrors the filters of the FTS5 backend so both backends answer the
        same question. Expects the ``entities`` table to be aliased ``e``.
        """
        clauses: list[str] = []
        params: list = []
        if options.entity_types:
            placeholders = ",".join("?" for _ in options.entity_types)
            clauses.append(f"e.entity_type IN ({placeholders})")
            params.extend(options.entity_types)
        if options.tiers:
            placeholders = ",".join("?" for _ in options.tiers)
            clauses.append(f"e.tier IN ({placeholders})")
            params.extend(options.tiers)
        if not options.include_archived:
            clauses.append("e.tier != 'archival'")
        if options.date_from:
            clauses.append("e.updated_at >= ?")
            params.append(options.date_from.isoformat())
        if options.date_to:
            clauses.append("e.updated_at <= ?")
            params.append(options.date_to.isoformat())
        return "".join(f" AND {clause}" for clause in clauses), params

    def search(self, options: SearchOptions) -> SearchResult:
        """Semantic search using cosine similarity.

        Only embeddings produced by the configured model, and belonging to
        entities that pass the filters in ``options``, are scored. Returns an
        empty result when the query embedding has zero norm or nothing
        qualifies.
        """
        start = time.monotonic()
        np = self._np

        # Embed and normalise the query.
        query_vec = self._bytes_to_vector(self.embed(options.query))
        query_norm = np.linalg.norm(query_vec)
        if query_norm == 0:
            return SearchResult(query=options.query)
        query_vec = query_vec / query_norm

        # Load candidates. Restricting to the current model keeps vectors
        # from a different embedding space out of the ranking; the join
        # applies the entity filters and drops orphaned embeddings.
        conn = self._get_conn()
        filter_sql, filter_params = self._entity_filters(options)
        rows = conn.execute(
            "SELECT m.entity_id, m.embedding FROM embeddings m"
            " JOIN entities e ON e.id = m.entity_id"
            " WHERE m.model = ?" + filter_sql,
            [self.embedding_model, *filter_params],
        ).fetchall()

        if not rows:
            return SearchResult(query=options.query)

        # Compute cosine similarities.
        scores: list[tuple[str, float]] = []
        for row in rows:
            vec = self._bytes_to_vector(row["embedding"])
            if vec.shape != query_vec.shape:
                # Dimensionality differs from the query; cannot be compared.
                continue
            vec_norm = np.linalg.norm(vec)
            if vec_norm == 0:
                continue
            similarity = float(np.dot(query_vec, vec / vec_norm))
            scores.append((row["entity_id"], similarity))

        scores.sort(key=lambda x: x[1], reverse=True)
        scores = scores[:options.max_results]

        # Hydrate the top-scoring entities.
        hits: list[SearchHit] = []
        for entity_id, score in scores:
            entity_row = conn.execute(
                "SELECT * FROM entities WHERE id = ?", (entity_id,)
            ).fetchone()
            if not entity_row:
                continue

            state_data = json.loads(entity_row["state"]) if entity_row["state"] else {}
            entity = Entity(
                id=entity_row["id"],
                name=entity_row["name"],
                entity_type=entity_row["entity_type"],
                tier=Tier(entity_row["tier"]),
                summary=entity_row["summary"] or "",
                # Same fields the FTS5 backend populates, so hits from both
                # backends are interchangeable after fusion.
                aliases=json.loads(entity_row["aliases"]) if entity_row["aliases"] else [],
                state=EntityState.from_dict(state_data),
                access_count=entity_row["access_count"] or 0,
            )

            hits.append(SearchHit(
                entity=entity,
                relevance_score=score,
                snippet=entity.summary[:200] if entity.summary else "",
            ))

        elapsed = (time.monotonic() - start) * 1000
        return SearchResult(
            query=options.query,
            hits=hits,
            total_count=len(hits),
            search_time_ms=round(elapsed, 2),
        )
@@ -0,0 +1,85 @@
1
+ """Unicode-aware tokenizer with CJK support."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+
7
# Common English stopwords (minimal set for search quality)
_ENGLISH_STOPWORDS = frozenset({
    "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
    "of", "with", "by", "from", "is", "was", "are", "were", "be", "been",
    "being", "have", "has", "had", "do", "does", "did", "will", "would",
    "could", "should", "may", "might", "shall", "can", "need", "must",
    "not", "no", "nor", "so", "if", "then", "than", "that", "this",
    "these", "those", "it", "its", "he", "she", "they", "we", "you",
    "i", "me", "my", "your", "his", "her", "our", "their", "who",
    "what", "which", "when", "where", "how", "why", "all", "each",
    "every", "both", "few", "more", "most", "some", "any", "such",
    "only", "own", "same", "very", "just", "about", "above", "after",
    "again", "also", "as", "because", "before", "between", "during",
    "into", "over", "through", "under", "until", "up", "while",
})

# Korean stopwords (particles, endings)
_KOREAN_STOPWORDS = frozenset({
    "은", "는", "이", "가", "을", "를", "에", "의", "로", "으로",
    "와", "과", "도", "만", "부터", "까지", "에서", "한", "할", "하는",
    "했다", "합니다", "있는", "있다", "없다", "되는", "된", "되다",
    "그", "저", "이것", "그것", "저것", "여기", "거기", "저기",
})

# A token is one maximal run of characters from a single script class;
# anything else (spaces, punctuation) acts as a separator.
_TOKEN_PATTERN = re.compile(
    r'[a-zA-Z0-9_]+'  # ASCII words
    r'|[\uAC00-\uD7AF]+'  # Hangul syllables
    r'|[\u4E00-\u9FFF]+'  # CJK unified ideographs
    r'|[\u3040-\u309F\u30A0-\u30FF]+'  # Hiragana + Katakana
    , re.UNICODE)


def tokenize(text: str, remove_stopwords: bool = True) -> list[str]:
    """Split text into lowercase tokens.

    Recognises ASCII word runs plus Hangul, CJK ideograph and kana runs.
    With ``remove_stopwords`` enabled, English and Korean stopwords and all
    single-character tokens are dropped.
    """
    found = _TOKEN_PATTERN.findall(text.lower())
    if remove_stopwords:
        kept: list[str] = []
        for candidate in found:
            if len(candidate) <= 1:
                continue
            if candidate in _ENGLISH_STOPWORDS or candidate in _KOREAN_STOPWORDS:
                continue
            kept.append(candidate)
        return kept
    return found
52
+
53
+
54
def escape_fts5_query(query: str) -> str:
    """Turn arbitrary text into an FTS5 query made only of quoted literals.

    FTS5 special chars: * " ( ) : ^
    The text is tokenized with stopwords kept, and each token is wrapped in
    double quotes so FTS5 treats it as a literal. Tokens are joined with
    spaces (implicit AND). Returns '""' when nothing usable remains.
    """
    quoted = []
    for raw in tokenize(query, remove_stopwords=False):
        # Strip any embedded double quotes before quoting the token.
        literal = raw.replace('"', '')
        if not literal:
            continue
        quoted.append(f'"{literal}"')
    if not quoted:
        return '""'
    return " ".join(quoted)
71
+
72
+
73
def build_fts5_query(query: str) -> str:
    """Build an FTS5 query string from natural language input.

    Stopwords are removed and each remaining token is double-quoted. If
    every word was filtered out, the unfiltered input is escaped instead so
    that the query is not silently emptied.
    """
    kept = tokenize(query, remove_stopwords=True)
    if kept:
        # Quote each token for safety
        return " ".join(['"' + token + '"' for token in kept])
    # Fall back to raw escaped query if all words were stopwords
    return escape_fts5_query(query)