memorytrace 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- engram/__init__.py +8 -0
- engram/__main__.py +6 -0
- engram/cli/__init__.py +1 -0
- engram/cli/app.py +291 -0
- engram/cli/formatters.py +90 -0
- engram/cli/simple.py +267 -0
- engram/config.py +72 -0
- engram/engine.py +612 -0
- engram/exceptions.py +41 -0
- engram/extraction/__init__.py +6 -0
- engram/extraction/base.py +20 -0
- engram/extraction/llm_extractor.py +197 -0
- engram/extraction/ner/__init__.py +7 -0
- engram/extraction/ner/cjk.py +63 -0
- engram/extraction/ner/english.py +109 -0
- engram/extraction/ner/korean.py +106 -0
- engram/extraction/regex_extractor.py +188 -0
- engram/integrations/__init__.py +1 -0
- engram/integrations/mcp_server.py +213 -0
- engram/integrations/sdk.py +194 -0
- engram/models/__init__.py +19 -0
- engram/models/entity.py +72 -0
- engram/models/fact.py +58 -0
- engram/models/quality.py +61 -0
- engram/models/relation.py +26 -0
- engram/models/search.py +96 -0
- engram/models/session.py +53 -0
- engram/models/source.py +73 -0
- engram/quality/__init__.py +8 -0
- engram/quality/confidence.py +38 -0
- engram/quality/conflict.py +79 -0
- engram/quality/decay.py +28 -0
- engram/quality/gate.py +120 -0
- engram/quality/pii.py +80 -0
- engram/search/__init__.py +13 -0
- engram/search/base.py +20 -0
- engram/search/fts5_search.py +210 -0
- engram/search/hybrid.py +99 -0
- engram/search/semantic.py +186 -0
- engram/search/tokenizer.py +85 -0
- engram/session/__init__.py +6 -0
- engram/session/context.py +87 -0
- engram/session/manager.py +152 -0
- engram/session/working_memory.py +57 -0
- engram/storage/__init__.py +6 -0
- engram/storage/base.py +63 -0
- engram/storage/markdown_export.py +144 -0
- engram/storage/migrations.py +30 -0
- engram/storage/sqlite_store.py +615 -0
- memorytrace-0.1.0.dist-info/METADATA +138 -0
- memorytrace-0.1.0.dist-info/RECORD +54 -0
- memorytrace-0.1.0.dist-info/WHEEL +4 -0
- memorytrace-0.1.0.dist-info/entry_points.txt +3 -0
- memorytrace-0.1.0.dist-info/licenses/LICENSE +21 -0
engram/quality/pii.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""PII (Personally Identifiable Information) detection and masking."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
from engram.models.quality import PIIMatch
|
|
9
|
+
|
|
10
|
+
# PII detection patterns
|
|
11
|
+
_PII_PATTERNS: dict[str, re.Pattern] = {
    # 16 digits in four groups of four, optionally separated by "-" or whitespace.
    "credit_card": re.compile(
        r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b'
    ),
    # 3-2-4 digit groups separated by hyphens.
    "ssn": re.compile(
        r'\b\d{3}-\d{2}-\d{4}\b'
    ),
    # A known key prefix followed by at least 20 key characters.
    "api_key": re.compile(
        r'\b(?:sk-|ak-|AKIA|ghp_|gho_|github_pat_)[A-Za-z0-9_\-]{20,}\b'
    ),
    # The final label is letters only. The previous class "[A-Z|a-z]" also
    # accepted a literal "|" because "|" is not an alternation inside [...].
    "email": re.compile(
        r'\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b'
    ),
    "phone_kr": re.compile(
        r'\b0\d{1,2}-\d{3,4}-\d{4}\b'
    ),
    "phone_intl": re.compile(
        r'\+\d{1,3}[-\s]?\d{1,4}[-\s]?\d{3,4}[-\s]?\d{4}\b'
    ),
    "rrn": re.compile(
        # Korean Resident Registration Number
        r'\b\d{6}[-\s]?[1-4]\d{6}\b'
    ),
}

_REDACTED = "[REDACTED]"


class PIIDetector:
    """Detects and masks PII in text using the module-level regex table."""

    def scan(self, text: str) -> list[PIIMatch]:
        """Scan ``text`` with every PII pattern.

        Returns:
            One ``PIIMatch`` per regex hit, ordered by ascending start offset.
            Matches produced by different patterns may overlap each other.
        """
        matches: list[PIIMatch] = []
        for pii_type, pattern in _PII_PATTERNS.items():
            for m in pattern.finditer(text):
                # Store only type and length, never the matched PII value itself.
                original_display = f"[{pii_type}:{len(m.group())}chars]"
                matches.append(PIIMatch(
                    pii_type=pii_type,
                    start=m.start(),
                    end=m.end(),
                    original=original_display,
                ))
        # Ascending by position; mask() does its own ordering before replacing.
        matches.sort(key=lambda x: x.start)
        return matches

    def mask(self, text: str, matches: Optional[list[PIIMatch]] = None) -> str:
        """Replace PII occurrences in ``text`` with ``[REDACTED]``.

        Args:
            text: The text to mask.
            matches: Spans to redact (objects exposing ``start`` and ``end``).
                When ``None``, ``scan(text)`` is run first.

        Returns:
            ``text`` with each redacted region replaced by one marker. Spans
            that overlap are merged first: splicing them one by one would
            apply offsets to an already-modified string and remove text that
            was never part of a match.
        """
        if matches is None:
            matches = self.scan(text)
        if not matches:
            return text

        # Coalesce overlapping spans. Spans that merely touch stay separate,
        # so each still gets its own marker.
        merged: list[tuple[int, int]] = []
        for m in sorted(matches, key=lambda x: (x.start, x.end)):
            if merged and m.start < merged[-1][1]:
                prev_start, prev_end = merged[-1]
                merged[-1] = (prev_start, max(prev_end, m.end))
            else:
                merged.append((m.start, m.end))

        # Build the result left to right from the untouched original text.
        pieces: list[str] = []
        cursor = 0
        for start, end in merged:
            pieces.append(text[cursor:start])
            pieces.append(_REDACTED)
            cursor = end
        pieces.append(text[cursor:])
        return "".join(pieces)

    def has_pii(self, text: str) -> bool:
        """Return True as soon as any PII pattern matches ``text``."""
        return any(pattern.search(text) for pattern in _PII_PATTERNS.values())
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Search backends for Engram."""
|
|
2
|
+
|
|
3
|
+
from engram.search.base import SearchBackend
|
|
4
|
+
from engram.search.fts5_search import FTS5Search
|
|
5
|
+
from engram.search.tokenizer import tokenize, build_fts5_query, escape_fts5_query
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"SearchBackend",
|
|
9
|
+
"FTS5Search",
|
|
10
|
+
"tokenize",
|
|
11
|
+
"build_fts5_query",
|
|
12
|
+
"escape_fts5_query",
|
|
13
|
+
]
|
engram/search/base.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""Search backend protocol."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Protocol, runtime_checkable
|
|
6
|
+
|
|
7
|
+
from engram.models.search import SearchOptions, SearchResult
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@runtime_checkable
class SearchBackend(Protocol):
    """Structural interface that search backends are expected to satisfy.

    Because the protocol is ``runtime_checkable``, ``isinstance`` checks
    against it only verify that the four methods exist, not their signatures.
    """

    def search(self, options: SearchOptions) -> SearchResult:
        """Run the search described by ``options`` and return its result."""
        ...

    def index_entity(self, entity_id: str) -> None:
        """Add or refresh the index entry for ``entity_id``."""
        ...

    def remove_from_index(self, entity_id: str) -> None:
        """Drop the index entry for ``entity_id``."""
        ...

    def reindex_all(self) -> int:
        """Rebuild the index; presumably returns the number of entries indexed."""
        ...
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
"""FTS5-based BM25 search — genuine Okapi BM25 via SQLite's built-in ranking."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import sqlite3
|
|
7
|
+
import time
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Optional
|
|
10
|
+
|
|
11
|
+
from engram.models.entity import Entity, EntityState, Tier
|
|
12
|
+
from engram.models.fact import Fact, FactStatus
|
|
13
|
+
from engram.models.search import SearchHit, SearchOptions, SearchResult
|
|
14
|
+
from engram.models.source import Source, SourceType
|
|
15
|
+
from engram.search.tokenizer import build_fts5_query
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class FTS5Search:
    """Keyword search over the ``memory_fts`` FTS5 table, ranked by ``bm25()``.

    The per-column weights passed to ``bm25()`` (higher = more important):
    - entity_name: 10.0
    - summary: 5.0
    - fact_text: 3.0
    - state_text: 2.0
    - entity_type: 1.0
    - entity_id: 0.0 (contributes nothing to the score)
    """

    # Weights follow the FTS table's column order, which is expected to be:
    # entity_id, entity_name, entity_type, fact_text, state_text, summary.
    _BM25_WEIGHTS = (0.0, 10.0, 1.0, 3.0, 2.0, 5.0)
    # The same weights pre-rendered for interpolation into the SQL text.
    _BM25_WEIGHTS_SQL = "0.0, 10.0, 1.0, 3.0, 2.0, 5.0"

    def __init__(self, db_path: Path):
        """Remember the resolved database path; the connection opens lazily."""
        self.db_path = Path(db_path).resolve()
        self._conn: Optional[sqlite3.Connection] = None

    def _get_conn(self) -> sqlite3.Connection:
        """Return the cached connection, opening and configuring it on first use."""
        if self._conn is not None:
            return self._conn
        self._conn = sqlite3.connect(str(self.db_path))
        self._conn.execute("PRAGMA busy_timeout=5000")
        # Rows are accessed by column name throughout this class.
        self._conn.row_factory = sqlite3.Row
        return self._conn

    def close(self) -> None:
        """Close the cached connection, if one is open."""
        if self._conn is None:
            return
        self._conn.close()
        self._conn = None

    def search(self, options: SearchOptions) -> SearchResult:
        """Run a BM25-ranked full-text search.

        Args:
            options: Query text plus filters (entity types, tiers, archive
                flag, date range), result limit, token budget and the
                minimum fact confidence.

        Returns:
            A ``SearchResult`` whose hits are in rank order. ``total_count``
            is the number of rows the query returned, which can exceed the
            number of hits when the token budget cuts the list short.

        Raises:
            sqlite3.OperationalError: For database errors whose message does
                not look like an FTS5 query syntax problem.
        """
        started = time.monotonic()

        match_expr = build_fts5_query(options.query)
        if not match_expr or match_expr == '""':
            # Nothing searchable in the query: skip the database entirely.
            return SearchResult(
                query=options.query,
                hits=[],
                total_count=0,
                search_time_ms=0.0,
            )

        conn = self._get_conn()

        # The weights are a class-level constant, never derived from user
        # input, so interpolating them into the SQL text is safe.
        sql = f"""
            SELECT
                e.*,
                bm25(memory_fts, {self._BM25_WEIGHTS_SQL}) AS rank,
                snippet(memory_fts, 3, '>>>', '<<<', '...', 40) AS snippet
            FROM memory_fts fts
            JOIN entities e ON e.id = fts.entity_id
            WHERE memory_fts MATCH ?
        """
        params: list = [match_expr]

        # Collect optional filter clauses; values are always bound, not inlined.
        clauses: list[str] = []
        if options.entity_types:
            marks = ",".join("?" for _ in options.entity_types)
            clauses.append(f"e.entity_type IN ({marks})")
            params.extend(options.entity_types)
        if options.tiers:
            marks = ",".join("?" for _ in options.tiers)
            clauses.append(f"e.tier IN ({marks})")
            params.extend(options.tiers)
        if not options.include_archived:
            clauses.append("e.tier != 'archival'")
        if options.date_from:
            clauses.append("e.updated_at >= ?")
            params.append(options.date_from.isoformat())
        if options.date_to:
            clauses.append("e.updated_at <= ?")
            params.append(options.date_to.isoformat())

        sql += "".join(f" AND {clause}" for clause in clauses)
        sql += " ORDER BY rank LIMIT ?"
        params.append(options.max_results)

        try:
            rows = conn.execute(sql, params).fetchall()
        except sqlite3.OperationalError as e:
            message = str(e).lower()
            # Only FTS5 query syntax errors become an empty result.
            looks_like_query_error = any(
                marker in message for marker in ("fts5", "syntax", "parse")
            )
            if not looks_like_query_error:
                raise
            return SearchResult(
                query=options.query,
                hits=[],
                total_count=0,
                search_time_ms=(time.monotonic() - started) * 1000,
            )

        hits: list[SearchHit] = []
        total_tokens = 0
        budget = options.max_tokens  # <= 0 means unlimited

        for row in rows:
            entity = self._row_to_entity(row)
            # Strip the highlight markers the snippet() call inserted.
            snippet = (row["snippet"] or "").replace(">>>", "").replace("<<<", "")

            facts = self._get_entity_facts(conn, entity.id, options.min_confidence)

            # Rough token estimate: about four characters per token.
            fact_preview = " ".join(f.raw_text for f in facts[:3])
            token_count = len(f"{entity.name} {snippet} {fact_preview}") // 4

            if budget > 0 and total_tokens + token_count > budget:
                break

            hits.append(SearchHit(
                entity=entity,
                facts=facts[:5],  # cap the facts attached to each hit
                relevance_score=abs(row["rank"]),  # bm25() yields negative scores
                snippet=snippet,
                token_count=token_count,
            ))
            total_tokens += token_count

        elapsed_ms = (time.monotonic() - started) * 1000

        return SearchResult(
            query=options.query,
            hits=hits,
            total_count=len(rows),
            search_time_ms=round(elapsed_ms, 2),
            total_tokens=total_tokens,
        )

    def _get_entity_facts(
        self, conn: sqlite3.Connection, entity_id: str, min_confidence: float
    ) -> list[Fact]:
        """Load up to ten current facts for an entity at or above ``min_confidence``.

        Facts with a ``valid_to`` or ``superseded_by`` value, or with status
        expired/retracted/conflicted, are excluded. Results are ordered by
        confidence, then creation time, both descending.
        """
        fact_rows = conn.execute(
            """SELECT * FROM facts
            WHERE entity_id = ? AND valid_to IS NULL AND superseded_by IS NULL
            AND status NOT IN ('expired', 'retracted', 'conflicted')
            AND confidence >= ?
            ORDER BY confidence DESC, created_at DESC
            LIMIT 10""",
            (entity_id, min_confidence),
        ).fetchall()

        return [
            Fact(
                id=r["id"],
                entity_id=r["entity_id"],
                subject=r["subject"],
                predicate=r["predicate"],
                object=r["object"],
                raw_text=r["raw_text"],
                source=Source(
                    type=SourceType(r["source_type"]),
                    author=r["source_author"] or "",
                    channel=r["source_channel"] or "",
                ),
                confidence=r["confidence"],
                status=FactStatus(r["status"]),
            )
            for r in fact_rows
        ]

    def _row_to_entity(self, row: sqlite3.Row) -> Entity:
        """Build an ``Entity`` from a row carrying the ``entities`` columns."""
        raw_state = row["state"]
        state_data = json.loads(raw_state) if raw_state else {}
        return Entity(
            id=row["id"],
            name=row["name"],
            entity_type=row["entity_type"],
            tier=Tier(row["tier"]),
            summary=row["summary"] or "",
            # aliases is stored as JSON text; NULL or empty means no aliases.
            aliases=json.loads(row["aliases"]) if row["aliases"] else [],
            state=EntityState.from_dict(state_data),
            access_count=row["access_count"] or 0,
        )
|
engram/search/hybrid.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""Hybrid search — Reciprocal Rank Fusion of FTS5 + semantic search."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import time
|
|
6
|
+
|
|
7
|
+
from engram.models.search import SearchHit, SearchOptions, SearchResult
|
|
8
|
+
from engram.search.fts5_search import FTS5Search
|
|
9
|
+
from engram.search.semantic import SemanticSearch
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class HybridSearch:
    """Reciprocal Rank Fusion (RRF) of keyword and semantic search.

    RRF score = sum over sources of weight / (k + rank), with 1-based ranks
    and k=60 by default. No learned weights are needed.
    """

    def __init__(
        self,
        fts: FTS5Search,
        semantic: SemanticSearch,
        k: int = 60,
        fts_weight: float = 1.0,
        semantic_weight: float = 1.0,
    ):
        """Store the two backends and the fusion parameters.

        Args:
            fts: Keyword search backend.
            semantic: Embedding search backend.
            k: RRF damping constant added to every rank.
            fts_weight: Multiplier for the keyword backend's contribution.
            semantic_weight: Multiplier for the semantic backend's contribution.
        """
        self.fts = fts
        self.semantic = semantic
        self.k = k
        self.fts_weight = fts_weight
        self.semantic_weight = semantic_weight

    def close(self) -> None:
        """Close both underlying backends."""
        self.fts.close()
        self.semantic.close()

    def _fuse(self, fts_hits, sem_hits):
        """Accumulate RRF scores for two ranked hit lists.

        Returns:
            ``(scores, hit_map)``. ``scores`` maps entity id to its fused
            score. ``hit_map`` maps entity id to the first hit seen for it;
            keyword hits are processed first, so they win when both backends
            return the same entity.
        """
        scores: dict[str, float] = {}
        hit_map: dict[str, SearchHit] = {}
        ranked_sources = (
            (self.fts_weight, fts_hits),
            (self.semantic_weight, sem_hits),
        )
        for weight, ranked in ranked_sources:
            for rank, hit in enumerate(ranked, start=1):
                eid = hit.entity.id
                scores[eid] = scores.get(eid, 0) + weight / (self.k + rank)
                hit_map.setdefault(eid, hit)
        return scores, hit_map

    def search(self, options: SearchOptions) -> SearchResult:
        """Run both backends and merge their rankings with RRF.

        Args:
            options: The caller's search options. Filters are forwarded to
                both backends; ``max_results`` and ``max_tokens`` are applied
                to the fused list.

        Returns:
            A ``SearchResult`` whose hits carry the fused RRF score as
            ``relevance_score``. ``total_count`` is the number of distinct
            entities returned by either backend.
        """
        start = time.monotonic()

        # Over-fetch from each backend so fusion has more candidates, and
        # defer the token budget to the fused list.
        expanded = SearchOptions(
            query=options.query,
            max_results=options.max_results * 3,
            max_tokens=0,
            min_confidence=options.min_confidence,
            entity_types=options.entity_types,
            tiers=options.tiers,
            include_archived=options.include_archived,
            # Forward the date range too; omitting it silently disabled the
            # caller's date filters in hybrid mode.
            date_from=options.date_from,
            date_to=options.date_to,
        )

        fts_result = self.fts.search(expanded)
        sem_result = self.semantic.search(expanded)

        scores, hit_map = self._fuse(fts_result.hits, sem_result.hits)

        # Highest fused score first; ties keep first-seen order (stable sort).
        sorted_ids = sorted(scores.keys(), key=lambda eid: scores[eid], reverse=True)
        sorted_ids = sorted_ids[:options.max_results]

        hits: list[SearchHit] = []
        total_tokens = 0
        for eid in sorted_ids:
            hit = hit_map[eid]
            hit.relevance_score = scores[eid]

            # Rough token estimate: about four characters per token.
            token_count = len(f"{hit.entity.name} {hit.snippet}") // 4
            # max_tokens <= 0 means unlimited.
            if options.max_tokens > 0 and total_tokens + token_count > options.max_tokens:
                break
            hit.token_count = token_count
            hits.append(hit)
            total_tokens += token_count

        elapsed = (time.monotonic() - start) * 1000
        return SearchResult(
            query=options.query,
            hits=hits,
            total_count=len(scores),
            search_time_ms=round(elapsed, 2),
            total_tokens=total_tokens,
        )
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
"""Optional embedding-based semantic search.
|
|
2
|
+
|
|
3
|
+
Requires: pip install engram[semantic]
|
|
4
|
+
For embeddings API: pip install engram[llm]
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
import sqlite3
|
|
11
|
+
import struct
|
|
12
|
+
import time
|
|
13
|
+
from datetime import datetime
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Optional
|
|
16
|
+
|
|
17
|
+
from engram.models.entity import Entity, EntityState, Tier
|
|
18
|
+
from engram.models.search import SearchHit, SearchOptions, SearchResult
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class SemanticSearch:
    """Embedding-based semantic search using numpy cosine similarity.

    Embeddings live in an ``embeddings`` table in the same SQLite database
    and are compared in memory against the query vector on every search.
    """

    def __init__(
        self,
        db_path: Path,
        embedding_provider: str = "openai",
        embedding_model: str = "text-embedding-3-small",
        embedding_dim: int = 1536,
    ):
        """Store the configuration, import numpy and create the table.

        Raises:
            ImportError: If numpy is not installed.
        """
        self.db_path = Path(db_path).resolve()
        self.embedding_provider = embedding_provider
        self.embedding_model = embedding_model
        self.embedding_dim = embedding_dim
        self._conn: Optional[sqlite3.Connection] = None
        self._client = None
        self._np = None

        try:
            import numpy as np
        except ImportError as err:
            raise ImportError(
                "numpy not installed. Run: pip install engram[semantic]"
            ) from err
        self._np = np

        self._ensure_table()

    def _get_conn(self) -> sqlite3.Connection:
        """Return the cached connection, opening and configuring it on first use."""
        if self._conn is None:
            self._conn = sqlite3.connect(str(self.db_path))
            self._conn.execute("PRAGMA busy_timeout=5000")
            self._conn.row_factory = sqlite3.Row
        return self._conn

    def _ensure_table(self) -> None:
        """Create the ``embeddings`` table if it does not exist yet."""
        conn = self._get_conn()
        conn.execute("""
            CREATE TABLE IF NOT EXISTS embeddings (
                entity_id TEXT PRIMARY KEY,
                embedding BLOB NOT NULL,
                model TEXT NOT NULL,
                updated_at TEXT NOT NULL
            )
        """)
        conn.commit()

    def close(self) -> None:
        """Close the cached connection, if one is open."""
        if self._conn is not None:
            self._conn.close()
            self._conn = None

    def _get_embedding_client(self):
        """Create the embedding API client on first use and cache it.

        Raises:
            ImportError: If the OpenAI SDK is not installed.
            NotImplementedError: For the ``anthropic`` provider.
            ValueError: For any other provider name. Previously this case
                returned ``None`` and failed later with an AttributeError.
        """
        if self._client is not None:
            return self._client
        if self.embedding_provider == "openai":
            try:
                from openai import OpenAI
            except ImportError as err:
                raise ImportError(
                    "OpenAI SDK not installed. Run: pip install engram[llm]"
                ) from err
            self._client = OpenAI()
        elif self.embedding_provider == "anthropic":
            raise NotImplementedError("Anthropic does not provide an embedding API. Use 'openai' provider.")
        else:
            raise ValueError(
                f"Unsupported embedding provider: {self.embedding_provider!r}"
            )
        return self._client

    def embed(self, text: str) -> bytes:
        """Compute the embedding of ``text`` and return it as packed floats.

        Only the first 8000 characters of ``text`` are sent to the API.
        """
        client = self._get_embedding_client()
        response = client.embeddings.create(
            model=self.embedding_model,
            input=text[:8000],
        )
        vector = response.data[0].embedding
        return struct.pack(f"{len(vector)}f", *vector)

    def _bytes_to_vector(self, data: bytes):
        """Unpack bytes produced by ``embed`` into a float32 numpy array."""
        n = len(data) // 4
        floats = struct.unpack(f"{n}f", data)
        return self._np.array(floats, dtype=self._np.float32)

    def index_entity(self, entity_id: str, text: str) -> None:
        """Embed ``text`` and store (or replace) the vector for ``entity_id``."""
        embedding_bytes = self.embed(text)
        conn = self._get_conn()
        conn.execute(
            """INSERT OR REPLACE INTO embeddings (entity_id, embedding, model, updated_at)
            VALUES (?, ?, ?, ?)""",
            (entity_id, embedding_bytes, self.embedding_model, datetime.now().isoformat()),
        )
        conn.commit()

    def remove_from_index(self, entity_id: str) -> None:
        """Delete the stored embedding for ``entity_id``, if any."""
        conn = self._get_conn()
        conn.execute("DELETE FROM embeddings WHERE entity_id = ?", (entity_id,))
        conn.commit()

    @staticmethod
    def _entity_filters(options) -> tuple[str, list]:
        """Translate the option filters into SQL clauses on ``entities e``.

        Uses the same clauses and parameter binding as ``FTS5Search.search``
        so both backends agree on which entities are eligible.

        Returns:
            ``(sql, params)`` where ``sql`` is a possibly empty string of
            ``" AND ..."`` fragments and ``params`` are the values to bind.
        """
        clauses: list[str] = []
        params: list = []
        if options.entity_types:
            marks = ",".join("?" for _ in options.entity_types)
            clauses.append(f"e.entity_type IN ({marks})")
            params.extend(options.entity_types)
        if options.tiers:
            marks = ",".join("?" for _ in options.tiers)
            clauses.append(f"e.tier IN ({marks})")
            params.extend(options.tiers)
        if not options.include_archived:
            clauses.append("e.tier != 'archival'")
        if options.date_from:
            clauses.append("e.updated_at >= ?")
            params.append(options.date_from.isoformat())
        if options.date_to:
            clauses.append("e.updated_at <= ?")
            params.append(options.date_to.isoformat())
        return "".join(f" AND {clause}" for clause in clauses), params

    def search(self, options: SearchOptions) -> SearchResult:
        """Rank indexed entities by cosine similarity to the query.

        The entity type, tier, archive and date filters in ``options`` are
        applied before ranking. Stored vectors with zero norm, or with a
        different dimension from the query vector, are skipped.

        Returns:
            A ``SearchResult`` with at most ``options.max_results`` hits in
            descending similarity order; empty when the query is blank or
            nothing is indexed.
        """
        start = time.monotonic()
        np = self._np

        # A blank query cannot be embedded meaningfully; skip the API call.
        if not options.query or not options.query.strip():
            return SearchResult(query=options.query)

        query_vec = self._bytes_to_vector(self.embed(options.query))
        query_norm = np.linalg.norm(query_vec)
        if query_norm == 0:
            return SearchResult(query=options.query)
        query_vec = query_vec / query_norm

        # Load only embeddings whose entity exists and passes the filters.
        conn = self._get_conn()
        filter_sql, filter_params = self._entity_filters(options)
        rows = conn.execute(
            "SELECT m.entity_id AS entity_id, m.embedding AS embedding"
            " FROM embeddings m JOIN entities e ON e.id = m.entity_id"
            " WHERE 1 = 1" + filter_sql,
            filter_params,
        ).fetchall()

        if not rows:
            return SearchResult(query=options.query)

        scores: list[tuple[str, float]] = []
        for row in rows:
            vec = self._bytes_to_vector(row["embedding"])
            if vec.shape != query_vec.shape:
                # A vector of another dimension cannot be compared.
                continue
            vec_norm = np.linalg.norm(vec)
            if vec_norm == 0:
                continue
            similarity = float(np.dot(query_vec, vec / vec_norm))
            scores.append((row["entity_id"], similarity))

        scores.sort(key=lambda x: x[1], reverse=True)
        scores = scores[:options.max_results]

        hits: list[SearchHit] = []
        for entity_id, score in scores:
            entity_row = conn.execute(
                "SELECT * FROM entities WHERE id = ?", (entity_id,)
            ).fetchone()
            if not entity_row:
                continue

            state_data = json.loads(entity_row["state"]) if entity_row["state"] else {}
            # Populate the same fields FTS5Search does, so hits from either
            # backend carry equivalent entities.
            entity = Entity(
                id=entity_row["id"],
                name=entity_row["name"],
                entity_type=entity_row["entity_type"],
                tier=Tier(entity_row["tier"]),
                summary=entity_row["summary"] or "",
                aliases=json.loads(entity_row["aliases"]) if entity_row["aliases"] else [],
                state=EntityState.from_dict(state_data),
                access_count=entity_row["access_count"] or 0,
            )

            hits.append(SearchHit(
                entity=entity,
                relevance_score=score,
                snippet=entity.summary[:200] if entity.summary else "",
            ))

        elapsed = (time.monotonic() - start) * 1000
        return SearchResult(
            query=options.query,
            hits=hits,
            total_count=len(hits),
            search_time_ms=round(elapsed, 2),
        )
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""Unicode-aware tokenizer with CJK support."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
# Common English stopwords (minimal set for search quality)
|
|
8
|
+
_ENGLISH_STOPWORDS = frozenset({
    "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
    "of", "with", "by", "from", "is", "was", "are", "were", "be", "been",
    "being", "have", "has", "had", "do", "does", "did", "will", "would",
    "could", "should", "may", "might", "shall", "can", "need", "must",
    "not", "no", "nor", "so", "if", "then", "than", "that", "this",
    "these", "those", "it", "its", "he", "she", "they", "we", "you",
    "i", "me", "my", "your", "his", "her", "our", "their", "who",
    "what", "which", "when", "where", "how", "why", "all", "each",
    "every", "both", "few", "more", "most", "some", "any", "such",
    "only", "own", "same", "very", "just", "about", "above", "after",
    "again", "also", "as", "because", "before", "between", "during",
    "into", "over", "through", "under", "until", "up", "while",
})

# Korean stopwords (particles, endings)
_KOREAN_STOPWORDS = frozenset({
    "은", "는", "이", "가", "을", "를", "에", "의", "로", "으로",
    "와", "과", "도", "만", "부터", "까지", "에서", "한", "할", "하는",
    "했다", "합니다", "있는", "있다", "없다", "되는", "된", "되다",
    "그", "저", "이것", "그것", "저것", "여기", "거기", "저기",
})

# A token is a maximal run of characters from exactly one of these classes.
_TOKEN_PATTERN = re.compile(
    "|".join((
        r'[a-zA-Z0-9_]+',                 # ASCII words
        r'[\uAC00-\uD7AF]+',              # Hangul syllables
        r'[\u4E00-\u9FFF]+',              # CJK unified ideographs
        r'[\u3040-\u309F\u30A0-\u30FF]+', # Hiragana + Katakana
    )),
    re.UNICODE,
)


def tokenize(text: str, remove_stopwords: bool = True) -> list[str]:
    """Split ``text`` into lowercase tokens.

    Tokens are runs of ASCII word characters, Hangul syllables, CJK
    ideographs, or kana; everything else acts as a separator.

    Args:
        text: The text to tokenize.
        remove_stopwords: When true, drop English and Korean stopwords and
            every single-character token.

    Returns:
        The tokens in order of appearance.
    """
    found = _TOKEN_PATTERN.findall(text.lower())
    if remove_stopwords:
        return [
            tok for tok in found
            if len(tok) > 1
            and tok not in _ENGLISH_STOPWORDS
            and tok not in _KOREAN_STOPWORDS
        ]
    return found
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def escape_fts5_query(query: str) -> str:
    """Turn ``query`` into an FTS5 expression made of quoted literal tokens.

    FTS5 treats characters such as ``* " ( ) : ^`` specially. Tokenizing the
    query (stopwords kept) and wrapping each token in double quotes makes
    every term a plain literal.

    Returns:
        The quoted tokens joined by spaces (FTS5's implicit AND), or ``'""'``
        when the query yields no tokens.
    """
    quoted = [
        f'"{stripped}"'
        for stripped in (
            # Remove any embedded double quotes before quoting the token.
            tok.replace('"', '')
            for tok in tokenize(query, remove_stopwords=False)
        )
        if stripped
    ]
    if not quoted:
        return '""'
    return " ".join(quoted)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def build_fts5_query(query: str) -> str:
    """Build an FTS5 match expression from natural-language input.

    The query is tokenized with stopwords removed and each surviving token
    is double-quoted so FTS5 reads it as a literal. If no token survives,
    the expression is built by ``escape_fts5_query``, which keeps stopwords.

    Returns:
        An FTS5-compatible query string.
    """
    kept = tokenize(query, remove_stopwords=True)
    if kept:
        return " ".join('"' + tok + '"' for tok in kept)
    # Every word was filtered out: fall back to the escaped raw query.
    return escape_fts5_query(query)
|