brainlayer 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- brainlayer/__init__.py +3 -0
- brainlayer/cli/__init__.py +1545 -0
- brainlayer/cli/wizard.py +132 -0
- brainlayer/cli_new.py +151 -0
- brainlayer/client.py +164 -0
- brainlayer/clustering.py +736 -0
- brainlayer/daemon.py +1105 -0
- brainlayer/dashboard/README.md +129 -0
- brainlayer/dashboard/__init__.py +5 -0
- brainlayer/dashboard/app.py +151 -0
- brainlayer/dashboard/search.py +229 -0
- brainlayer/dashboard/views.py +230 -0
- brainlayer/embeddings.py +131 -0
- brainlayer/engine.py +550 -0
- brainlayer/index_new.py +87 -0
- brainlayer/mcp/__init__.py +1558 -0
- brainlayer/migrate.py +205 -0
- brainlayer/paths.py +43 -0
- brainlayer/pipeline/__init__.py +47 -0
- brainlayer/pipeline/analyze_communication.py +508 -0
- brainlayer/pipeline/brain_graph.py +567 -0
- brainlayer/pipeline/chat_tags.py +63 -0
- brainlayer/pipeline/chunk.py +422 -0
- brainlayer/pipeline/classify.py +472 -0
- brainlayer/pipeline/cluster_sampling.py +73 -0
- brainlayer/pipeline/enrichment.py +810 -0
- brainlayer/pipeline/extract.py +66 -0
- brainlayer/pipeline/extract_claude_desktop.py +149 -0
- brainlayer/pipeline/extract_corrections.py +231 -0
- brainlayer/pipeline/extract_markdown.py +195 -0
- brainlayer/pipeline/extract_whatsapp.py +227 -0
- brainlayer/pipeline/git_overlay.py +301 -0
- brainlayer/pipeline/longitudinal_analyzer.py +568 -0
- brainlayer/pipeline/obsidian_export.py +455 -0
- brainlayer/pipeline/operation_grouping.py +486 -0
- brainlayer/pipeline/plan_linking.py +313 -0
- brainlayer/pipeline/sanitize.py +549 -0
- brainlayer/pipeline/semantic_style.py +574 -0
- brainlayer/pipeline/session_enrichment.py +472 -0
- brainlayer/pipeline/style_embed.py +67 -0
- brainlayer/pipeline/style_index.py +139 -0
- brainlayer/pipeline/temporal_chains.py +203 -0
- brainlayer/pipeline/time_batcher.py +248 -0
- brainlayer/pipeline/unified_timeline.py +569 -0
- brainlayer/storage.py +66 -0
- brainlayer/store.py +155 -0
- brainlayer/taxonomy.json +80 -0
- brainlayer/vector_store.py +1891 -0
- brainlayer-1.0.0.dist-info/METADATA +313 -0
- brainlayer-1.0.0.dist-info/RECORD +53 -0
- brainlayer-1.0.0.dist-info/WHEEL +4 -0
- brainlayer-1.0.0.dist-info/entry_points.txt +4 -0
- brainlayer-1.0.0.dist-info/licenses/LICENSE +190 -0
|
@@ -0,0 +1,1891 @@
|
|
|
1
|
+
"""SQLite-vec based vector store for fast search."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import struct
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Dict, List, Optional
|
|
8
|
+
|
|
9
|
+
import apsw
|
|
10
|
+
import apsw.bestpractice
|
|
11
|
+
import sqlite_vec
|
|
12
|
+
|
|
13
|
+
# Apply APSW best practices
|
|
14
|
+
apsw.bestpractice.apply(apsw.bestpractice.recommended)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Per-source minimum content length (in characters) required for enrichment.
_SOURCE_MIN_CHARS = {
    "whatsapp": 15,
    "telegram": 15,
}
# Threshold used for every source that has no entry above (and for None).
_DEFAULT_MIN_CHARS = 50


def source_aware_min_chars(source: Optional[str]) -> int:
    """Return the minimum character count a message needs to be enriched.

    Short-form messaging sources (WhatsApp, Telegram) get a lower threshold
    because meaningful messages there are often only 15-50 characters long.

    Args:
        source: Source identifier of the message, or None when unknown.
            The lookup is an exact, case-sensitive match.

    Returns:
        The source-specific threshold, or the default for None and for any
        source without a dedicated entry.
    """
    try:
        return _SOURCE_MIN_CHARS[source]
    except KeyError:
        # None and unlisted sources both land here.
        return _DEFAULT_MIN_CHARS
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _safe_json_loads(value: Any) -> list:
|
|
36
|
+
"""Safely parse a JSON string, returning [] on None or invalid JSON."""
|
|
37
|
+
if not value:
|
|
38
|
+
return []
|
|
39
|
+
try:
|
|
40
|
+
return json.loads(value)
|
|
41
|
+
except (json.JSONDecodeError, TypeError):
|
|
42
|
+
return []
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _escape_fts5_query(query: str) -> str:
|
|
46
|
+
"""Escape a query string for FTS5 MATCH.
|
|
47
|
+
|
|
48
|
+
FTS5 treats certain characters as syntax: ., *, ^, ", (, ), +, -, NOT, AND, OR, NEAR.
|
|
49
|
+
We wrap each word in double quotes so they're treated as literal terms,
|
|
50
|
+
joined with OR for lenient matching (any term matches).
|
|
51
|
+
Empty/whitespace-only queries return a wildcard match-all.
|
|
52
|
+
"""
|
|
53
|
+
if not query or not query.strip():
|
|
54
|
+
return "*"
|
|
55
|
+
# Split into words, wrap each in double quotes (escaping any internal quotes)
|
|
56
|
+
terms = []
|
|
57
|
+
for word in query.split():
|
|
58
|
+
# Remove internal double quotes to prevent FTS5 injection
|
|
59
|
+
clean = word.replace('"', "")
|
|
60
|
+
if clean:
|
|
61
|
+
terms.append(f'"{clean}"')
|
|
62
|
+
# Use OR between terms so matching is lenient (any term matches)
|
|
63
|
+
# Without OR, FTS5 defaults to AND (all terms must be present)
|
|
64
|
+
return " OR ".join(terms) if terms else "*"
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def serialize_f32(vector: List[float]) -> bytes:
    """Pack a sequence of floats into the raw float32 blob sqlite-vec stores.

    Args:
        vector: The embedding values; its length fixes the blob size
            (4 bytes per element).

    Returns:
        The values packed back to back as C floats in native byte order.
    """
    packer = struct.Struct(f"{len(vector)}f")
    return packer.pack(*vector)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class VectorStore:
|
|
73
|
+
"""SQLite-vec based vector store."""
|
|
74
|
+
|
|
75
|
+
def __init__(self, db_path: Path):
    """Open the vector store at ``db_path``, creating it if necessary.

    Args:
        db_path: Location of the SQLite database file. A ``str`` or other
            path-like value is accepted and converted to ``Path``. Missing
            parent directories are created.
    """
    if not isinstance(db_path, Path):
        # Callers passing a plain string would otherwise fail on `.parent`.
        db_path = Path(db_path)
    self.db_path = db_path
    self.db_path.parent.mkdir(parents=True, exist_ok=True)
    self._init_db()
|
|
79
|
+
|
|
80
|
+
def _init_db(self) -> None:
    """Open the connection, load sqlite-vec and create or upgrade the schema.

    Tables, indexes, virtual tables and triggers are created with
    ``IF NOT EXISTS``; columns added in later releases are appended with
    ``ALTER TABLE`` only when ``PRAGMA table_info`` does not list them, so
    the method can run against both new and existing database files.
    Finally, the chunk FTS5 index is backfilled when chunks exist but the
    index is empty.
    """
    connection = apsw.Connection(str(self.db_path))
    self.conn = connection
    # Extension loading is enabled only while sqlite-vec is being loaded.
    connection.enableloadextension(True)
    connection.loadextension(sqlite_vec.loadable_path())
    connection.enableloadextension(False)

    db = connection.cursor()

    # AIDEV-NOTE: busy_timeout is critical for multi-process access (daemon + MCP + enrichment).
    # Without this, concurrent writes get SQLITE_BUSY immediately and crash silently.
    db.execute("PRAGMA busy_timeout = 5000")

    # ------------------------------------------------------------------
    # Core chunk storage
    # ------------------------------------------------------------------
    db.execute("""
        CREATE TABLE IF NOT EXISTS chunks (
            id TEXT PRIMARY KEY,
            content TEXT NOT NULL,
            metadata TEXT NOT NULL,
            source_file TEXT NOT NULL,
            project TEXT,
            content_type TEXT,
            value_type TEXT,
            char_count INTEGER,
            source TEXT,
            sender TEXT,
            language TEXT,
            conversation_id TEXT,
            position INTEGER,
            context_summary TEXT
        )
    """)

    # Columns that may be missing from databases created by older versions.
    chunk_column_upgrades = (
        ("source", "TEXT"),
        ("sender", "TEXT"),
        ("language", "TEXT"),
        ("conversation_id", "TEXT"),
        ("position", "INTEGER"),
        ("context_summary", "TEXT"),
        ("tags", "TEXT"),
        ("tag_confidence", "REAL"),
        # Enrichment columns (Phase 5)
        ("summary", "TEXT"),
        ("importance", "REAL"),
        ("intent", "TEXT"),
        ("enriched_at", "TEXT"),
        # Extended enrichment columns (Phase 3 - Gemini backfill)
        ("primary_symbols", "TEXT"),  # JSON array of classes/functions/files
        ("resolved_query", "TEXT"),  # HyDE-style hypothetical question
        ("epistemic_level", "TEXT"),  # hypothesis/substantiated/validated
        ("version_scope", "TEXT"),  # version or system state discussed
        ("debt_impact", "TEXT"),  # introduction/resolution/none
        ("external_deps", "TEXT"),  # JSON array of libraries/APIs
        # Phase 3: created_at for date filtering
        ("created_at", "TEXT"),  # ISO 8601 timestamp of when chunk was created/ingested
    )
    # Row index 1 of PRAGMA table_info is the column name.
    present_chunk_cols = set()
    for info in db.execute("PRAGMA table_info(chunks)"):
        present_chunk_cols.add(info[1])
    for col, typ in chunk_column_upgrades:
        if col in present_chunk_cols:
            continue
        db.execute(f"ALTER TABLE chunks ADD COLUMN {col} {typ}")

    # Indexes backing the search filters.
    chunk_indexes = (
        ("idx_chunks_source", "source"),
        ("idx_chunks_sender", "sender"),
        ("idx_chunks_conversation", "conversation_id"),
        ("idx_chunks_intent", "intent"),
        ("idx_chunks_importance", "importance"),
        ("idx_chunks_enriched", "enriched_at"),
        ("idx_chunks_created", "created_at"),
    )
    for idx, col in chunk_indexes:
        db.execute(f"CREATE INDEX IF NOT EXISTS {idx} ON chunks({col})")

    # Vector table with 1024 dimensions for bge-large-en-v1.5
    db.execute("""
        CREATE VIRTUAL TABLE IF NOT EXISTS chunk_vectors USING vec0(
            chunk_id TEXT PRIMARY KEY,
            embedding FLOAT[1024]
        )
    """)

    # FTS5 full-text search table for hybrid search
    db.execute("""
        CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
            content, chunk_id UNINDEXED
        )
    """)

    # Triggers that mirror inserts, deletes and content updates into FTS5.
    for trigger_ddl in (
        """
        CREATE TRIGGER IF NOT EXISTS chunks_fts_insert AFTER INSERT ON chunks BEGIN
            INSERT INTO chunks_fts(content, chunk_id) VALUES (new.content, new.id);
        END
        """,
        """
        CREATE TRIGGER IF NOT EXISTS chunks_fts_delete AFTER DELETE ON chunks BEGIN
            DELETE FROM chunks_fts WHERE chunk_id = old.id;
        END
        """,
        """
        CREATE TRIGGER IF NOT EXISTS chunks_fts_update AFTER UPDATE OF content ON chunks BEGIN
            DELETE FROM chunks_fts WHERE chunk_id = old.id;
            INSERT INTO chunks_fts(content, chunk_id) VALUES (new.content, new.id);
        END
        """,
    ):
        db.execute(trigger_ddl)

    # ------------------------------------------------------------------
    # Phase 8b: Git overlay tables
    # ------------------------------------------------------------------
    db.execute("""
        CREATE TABLE IF NOT EXISTS session_context (
            session_id TEXT PRIMARY KEY,
            project TEXT,
            branch TEXT,
            pr_number INTEGER,
            commit_shas TEXT,
            files_changed TEXT,
            started_at TEXT,
            ended_at TEXT,
            created_at TEXT
        )
    """)
    # Phase 8c: Plan linking columns on session_context
    present_sc_cols = set()
    for info in db.execute("PRAGMA table_info(session_context)"):
        present_sc_cols.add(info[1])
    for col in ("plan_name", "plan_phase", "story_id"):
        if col in present_sc_cols:
            continue
        db.execute(f"ALTER TABLE session_context ADD COLUMN {col} TEXT")
    db.execute("""
        CREATE TABLE IF NOT EXISTS file_interactions (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            file_path TEXT NOT NULL,
            timestamp TEXT,
            session_id TEXT,
            action TEXT,
            chunk_id TEXT,
            project TEXT
        )
    """)
    for index_ddl in (
        "CREATE INDEX IF NOT EXISTS idx_file_interactions_path ON file_interactions(file_path)",
        "CREATE INDEX IF NOT EXISTS idx_file_interactions_session ON file_interactions(session_id)",
        "CREATE INDEX IF NOT EXISTS idx_session_context_project ON session_context(project)",
    ):
        db.execute(index_ddl)

    # ------------------------------------------------------------------
    # Phase 8a: Operations table
    # ------------------------------------------------------------------
    db.execute("""
        CREATE TABLE IF NOT EXISTS operations (
            id TEXT PRIMARY KEY,
            session_id TEXT NOT NULL,
            operation_type TEXT,
            chunk_ids TEXT,
            summary TEXT,
            outcome TEXT,
            started_at TEXT,
            ended_at TEXT,
            step_count INTEGER DEFAULT 0,
            created_at TEXT
        )
    """)
    for index_ddl in (
        "CREATE INDEX IF NOT EXISTS idx_operations_session ON operations(session_id)",
        "CREATE INDEX IF NOT EXISTS idx_operations_type ON operations(operation_type)",
    ):
        db.execute(index_ddl)

    # ------------------------------------------------------------------
    # Phase 8d: Topic chains table
    # ------------------------------------------------------------------
    db.execute("""
        CREATE TABLE IF NOT EXISTS topic_chains (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            file_path TEXT NOT NULL,
            session_a TEXT NOT NULL,
            session_b TEXT NOT NULL,
            shared_actions INTEGER DEFAULT 0,
            time_delta_hours REAL,
            project TEXT,
            created_at TEXT
        )
    """)
    for index_ddl in (
        "CREATE INDEX IF NOT EXISTS idx_topic_chains_file ON topic_chains(file_path)",
        "CREATE INDEX IF NOT EXISTS idx_topic_chains_session ON topic_chains(session_a)",
    ):
        db.execute(index_ddl)

    # ------------------------------------------------------------------
    # Phase 7: Session-level enrichment table
    # ------------------------------------------------------------------
    db.execute("""
        CREATE TABLE IF NOT EXISTS session_enrichments (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            session_id TEXT NOT NULL UNIQUE,
            file_path TEXT,
            enrichment_version TEXT NOT NULL DEFAULT '1.0',
            enrichment_model TEXT,
            enrichment_timestamp TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%fZ','now')),

            -- Timing (flat — for temporal queries)
            session_start_time TEXT,
            session_end_time TEXT,
            duration_seconds INTEGER,

            -- Message dynamics (flat — for aggregation dashboards)
            message_count INTEGER NOT NULL DEFAULT 0,
            user_message_count INTEGER NOT NULL DEFAULT 0,
            assistant_message_count INTEGER NOT NULL DEFAULT 0,
            tool_call_count INTEGER NOT NULL DEFAULT 0,

            -- Content analysis (flat — for filtering)
            session_summary TEXT,
            primary_intent TEXT,
            outcome TEXT CHECK(outcome IN ('success','partial_success','failure','abandoned','ongoing')),
            complexity_score INTEGER CHECK(complexity_score BETWEEN 1 AND 10),

            -- Quality scores (flat — for dashboards)
            session_quality_score INTEGER CHECK(session_quality_score BETWEEN 1 AND 10),

            -- Decisions, corrections, learnings (JSON — variable-length arrays)
            decisions_made TEXT DEFAULT '[]',
            corrections TEXT DEFAULT '[]',
            learnings TEXT DEFAULT '[]',
            mistakes TEXT DEFAULT '[]',
            patterns TEXT DEFAULT '[]',

            -- Topic tags (JSON array)
            topic_tags TEXT DEFAULT '[]',

            -- Tool usage (JSON — per-tool stats)
            tool_usage_stats TEXT DEFAULT '[]',

            -- Narrative (text — for human reading)
            what_worked TEXT,
            what_failed TEXT,

            -- Embedding for session-level semantic search
            summary_embedding BLOB
        )
    """)
    # NOTE: idx_session_enrichments_project indexes primary_intent despite
    # its name; the name is kept so existing databases are not re-indexed.
    for index_ddl in (
        "CREATE INDEX IF NOT EXISTS idx_session_enrichments_session ON session_enrichments(session_id)",
        "CREATE INDEX IF NOT EXISTS idx_session_enrichments_project ON session_enrichments(primary_intent)",
        "CREATE INDEX IF NOT EXISTS idx_session_enrichments_outcome ON session_enrichments(outcome)",
        "CREATE INDEX IF NOT EXISTS idx_session_enrichments_quality ON session_enrichments(session_quality_score)",
    ):
        db.execute(index_ddl)

    # Phase 7: FTS5 for session narrative search
    db.execute("""
        CREATE VIRTUAL TABLE IF NOT EXISTS session_enrichments_fts USING fts5(
            session_summary, what_worked, what_failed, session_id UNINDEXED
        )
    """)

    # Backfill the chunk FTS5 index for databases that were populated
    # before the index (and its triggers) existed.
    fts_rows = list(db.execute("SELECT COUNT(*) FROM chunks_fts"))[0][0]
    chunk_rows = list(db.execute("SELECT COUNT(*) FROM chunks"))[0][0]
    if fts_rows == 0 and chunk_rows > 0:
        db.execute("""
            INSERT INTO chunks_fts(content, chunk_id)
            SELECT content, id FROM chunks
        """)
|
|
330
|
+
|
|
331
|
+
def upsert_chunks(self, chunks: List[Dict[str, Any]], embeddings: List[List[float]]) -> int:
    """Insert or update chunks together with their embeddings.

    Existing rows are updated in place: only the ingestion columns listed
    in the ``ON CONFLICT`` clause are overwritten, so enrichment columns
    survive a re-index, and ``created_at`` keeps its stored value when one
    is already present.

    The whole batch runs inside a single transaction, so a failure part-way
    through cannot leave a chunk row without its vector (or vice versa).

    Args:
        chunks: Chunk dicts. ``id``, ``content``, ``metadata`` and
            ``source_file`` are required; ``project``, ``content_type``,
            ``value_type``, ``char_count``, ``source`` and ``created_at``
            are optional.
        embeddings: One embedding per chunk, in the same order.

    Returns:
        The number of chunks written.

    Raises:
        ValueError: If ``chunks`` and ``embeddings`` differ in length.
    """
    if len(chunks) != len(embeddings):
        raise ValueError("Chunks and embeddings must have same length")
    if not chunks:
        return 0

    upsert_chunk_sql = """
        INSERT INTO chunks
        (id, content, metadata, source_file, project,
         content_type, value_type, char_count, source, created_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(id) DO UPDATE SET
            content = excluded.content,
            metadata = excluded.metadata,
            source_file = excluded.source_file,
            project = excluded.project,
            content_type = excluded.content_type,
            value_type = excluded.value_type,
            char_count = excluded.char_count,
            source = excluded.source,
            created_at = COALESCE(chunks.created_at, excluded.created_at)
    """
    insert_vector_sql = """
        INSERT INTO chunk_vectors (chunk_id, embedding)
        VALUES (?, ?)
    """

    # `with connection:` wraps the block in a transaction that is committed
    # on success and rolled back if any statement raises.
    with self.conn:
        cursor = self.conn.cursor()
        for chunk, embedding in zip(chunks, embeddings):
            chunk_id = chunk["id"]

            cursor.execute(
                upsert_chunk_sql,
                (
                    chunk_id,
                    chunk["content"],
                    json.dumps(chunk["metadata"]),
                    chunk["source_file"],
                    chunk.get("project"),
                    chunk.get("content_type"),
                    chunk.get("value_type"),
                    chunk.get("char_count", 0),
                    chunk.get("source", "claude_code"),
                    chunk.get("created_at"),
                ),
            )

            # Upsert vector - vec0 doesn't support INSERT OR REPLACE, so delete first
            cursor.execute("DELETE FROM chunk_vectors WHERE chunk_id = ?", (chunk_id,))
            cursor.execute(insert_vector_sql, (chunk_id, serialize_f32(embedding)))

    return len(chunks)
|
|
384
|
+
|
|
385
|
+
def search(
    self,
    query_embedding: Optional[List[float]] = None,
    query_text: Optional[str] = None,
    n_results: int = 10,
    project_filter: Optional[str] = None,
    content_type_filter: Optional[str] = None,
    source_filter: Optional[str] = None,
    sender_filter: Optional[str] = None,
    language_filter: Optional[str] = None,
    tag_filter: Optional[str] = None,
    intent_filter: Optional[str] = None,
    importance_min: Optional[float] = None,
    date_from: Optional[str] = None,
    date_to: Optional[str] = None,
) -> Dict[str, List]:
    """Search chunks by embedding similarity or by literal substring.

    When ``query_embedding`` is given, a sqlite-vec KNN query is run and
    rows come back ordered by distance; ``query_text`` is ignored in that
    case. Otherwise ``query_text`` is matched as a literal substring of the
    chunk content (``%`` and ``_`` are not wildcards) and rows are ordered
    by ``char_count`` descending.

    Args:
        query_embedding: Query vector for similarity search.
        query_text: Substring to look for when no embedding is given.
        n_results: KNN ``k`` for vector search, ``LIMIT`` for text search.
        project_filter, content_type_filter, source_filter, sender_filter,
        language_filter, intent_filter: Exact-match filters; falsy values
            are ignored.
        tag_filter: Require this value to be an element of the chunk's
            JSON ``tags`` array.
        importance_min: Lower bound (inclusive) on ``importance``.
        date_from, date_to: Inclusive bounds compared against
            ``created_at`` as stored.

    Returns:
        A dict with ``ids``, ``documents``, ``metadatas`` and ``distances``,
        each holding one inner list with one entry per hit. ``distances``
        entries are None for text search.

    Raises:
        ValueError: If neither ``query_embedding`` nor ``query_text`` is given.
    """
    if query_embedding is None and query_text is None:
        raise ValueError("Either query_embedding or query_text must be provided")

    vector_mode = query_embedding is not None
    # The KNN query joins chunk_vectors (v) with chunks (c), so chunk columns
    # need the "c." qualifier there; the text query reads chunks directly.
    col = "c." if vector_mode else ""

    where_clauses: List[str] = []
    filter_params: list = []

    for column, wanted in (
        ("project", project_filter),
        ("content_type", content_type_filter),
        ("source", source_filter),
        ("sender", sender_filter),
        ("language", language_filter),
    ):
        if wanted:
            where_clauses.append(f"{col}{column} = ?")
            filter_params.append(wanted)
    if tag_filter:
        where_clauses.append(
            f"{col}tags IS NOT NULL AND json_valid({col}tags) = 1 "
            f"AND EXISTS (SELECT 1 FROM json_each({col}tags) WHERE value = ?)"
        )
        filter_params.append(tag_filter)
    if intent_filter:
        where_clauses.append(f"{col}intent = ?")
        filter_params.append(intent_filter)
    if importance_min is not None:
        where_clauses.append(f"{col}importance >= ?")
        filter_params.append(importance_min)
    if date_from:
        where_clauses.append(f"{col}created_at >= ?")
        filter_params.append(date_from)
    if date_to:
        where_clauses.append(f"{col}created_at <= ?")
        filter_params.append(date_to)

    cursor = self.conn.cursor()

    if vector_mode:
        where_sql = ""
        if where_clauses:
            where_sql = "AND " + " AND ".join(where_clauses)

        # sqlite-vec KNN: MATCH and k must bind before filter params
        # NOTE(review): the filters share the WHERE clause with the KNN
        # constraint, so fewer than n_results rows may come back - confirm
        # against sqlite-vec's filtering semantics.
        params = [serialize_f32(query_embedding), n_results] + filter_params
        query = f"""
            SELECT c.id, c.content, c.metadata, c.source_file, c.project,
                   c.content_type, c.value_type, c.char_count,
                   v.distance,
                   c.summary, c.tags, c.importance, c.intent,
                   c.created_at, c.source
            FROM chunk_vectors v
            JOIN chunks c ON v.chunk_id = c.id
            WHERE v.embedding MATCH ? AND k = ? {where_sql}
            ORDER BY v.distance
        """
    else:
        # Escape LIKE metacharacters so the query text is matched literally
        # (otherwise "snake_case" would also match "snakeXcase").
        literal = (
            str(query_text)
            .replace("\\", "\\\\")
            .replace("%", "\\%")
            .replace("_", "\\_")
        )
        text_clauses = ["content LIKE ? ESCAPE '\\'"] + where_clauses
        params = [f"%{literal}%"] + filter_params + [n_results]
        query = f"""
            SELECT id, content, metadata, source_file, project,
                   content_type, value_type, char_count,
                   NULL as distance,
                   summary, tags, importance, intent,
                   created_at, source
            FROM chunks
            WHERE {" AND ".join(text_clauses)}
            ORDER BY char_count DESC
            LIMIT ?
        """

    rows = list(cursor.execute(query, params))

    ids = []
    documents = []
    metadatas = []
    distances = []

    for row in rows:
        (
            chunk_id, content, raw_metadata, source_file, project,
            content_type, value_type, char_count, distance,
            summary, tags, importance, intent, created_at, source,
        ) = row

        metadata = json.loads(raw_metadata)
        # Column values take precedence over same-named keys in the stored JSON.
        metadata.update(
            {
                "source_file": source_file,
                "project": project,
                "content_type": content_type,
                "value_type": value_type,
                "char_count": char_count,
            }
        )
        # Enrichment fields (may be None if not yet enriched)
        if summary:
            metadata["summary"] = summary
        if tags:
            try:
                metadata["tags"] = json.loads(tags)
            except (json.JSONDecodeError, TypeError):
                # Malformed tags are omitted rather than failing the search.
                pass
        if importance is not None:
            metadata["importance"] = importance
        if intent:
            metadata["intent"] = intent
        # Temporal and source metadata
        if created_at:
            metadata["created_at"] = created_at
        if source:
            metadata["source"] = source

        ids.append(chunk_id)
        documents.append(content)
        metadatas.append(metadata)
        distances.append(distance)  # None for text search

    return {
        "ids": [ids],
        "documents": [documents],
        "metadatas": [metadatas],
        "distances": [distances],
    }
|
|
566
|
+
|
|
567
|
+
def enrich_results_with_session_context(self, results: Dict[str, List]) -> Dict[str, List]:
    """Attach session-level enrichment to each search hit, in place.

    The session id of a hit is the file name of its ``source_file`` without
    the extension. When ``session_enrichments`` has a row for that id, the
    non-null values are copied into the hit's metadata under
    ``session_summary``, ``session_intent``, ``session_outcome`` and
    ``session_quality``.

    Args:
        results: A result dict as produced by ``search``; only
            ``results["metadatas"][0]`` is read and modified.

    Returns:
        The same ``results`` object.
    """
    import os

    metadatas = results.get("metadatas")
    if not metadatas or not metadatas[0]:
        return results

    cursor = self.conn.cursor()
    # session id -> dict of non-null enrichment values, or None when the
    # session has no enrichment row; avoids one query per hit.
    looked_up: Dict[str, Optional[Dict]] = {}
    output_keys = ("session_summary", "session_intent", "session_outcome", "session_quality")

    for meta in metadatas[0]:
        source_file = meta.get("source_file", "")
        if not source_file:
            continue

        session_id, _extension = os.path.splitext(os.path.basename(source_file))
        if not session_id:
            continue

        if session_id not in looked_up:
            matches = list(
                cursor.execute(
                    """SELECT session_summary, primary_intent, outcome,
                              session_quality_score
                       FROM session_enrichments WHERE session_id = ?""",
                    (session_id,),
                )
            )
            if matches:
                looked_up[session_id] = {
                    key: value for key, value in zip(output_keys, matches[0]) if value is not None
                }
            else:
                looked_up[session_id] = None

        found = looked_up[session_id]
        if found:
            meta.update(found)

    return results
|
|
618
|
+
|
|
619
|
+
def count(self) -> int:
    """Return the number of rows in the ``chunks`` table."""
    rows = list(self.conn.cursor().execute("SELECT COUNT(*) FROM chunks"))
    if not rows:
        return 0
    return rows[0][0]
|
|
624
|
+
|
|
625
|
+
def get_stats(self) -> Dict[str, Any]:
    """Return summary statistics for the stored chunks.

    Returns:
        A dict with ``total_chunks`` (row count), ``projects`` and
        ``content_types``. The two lists are sorted so repeated calls give
        a stable order. They are derived from at most 100 distinct
        (project, content_type) pairs in which both values are non-null,
        so they may be incomplete for large or sparsely labelled stores.
    """
    total = self.count()
    if total == 0:
        return {"total_chunks": 0, "projects": [], "content_types": []}

    pairs = list(
        self.conn.cursor().execute("""
            SELECT DISTINCT project, content_type
            FROM chunks
            WHERE project IS NOT NULL AND content_type IS NOT NULL
            LIMIT 100
        """)
    )

    # key=str keeps sorting well-defined even if a column holds mixed types.
    projects = sorted({project for project, _ in pairs}, key=str)
    content_types = sorted({content_type for _, content_type in pairs}, key=str)

    return {
        "total_chunks": total,
        "projects": projects,
        "content_types": content_types,
    }
|
|
656
|
+
|
|
657
|
+
def get_all_chunks(self, limit: int = 10000) -> List[Dict[str, Any]]:
    """Load up to ``limit`` chunks, e.g. for fitting a BM25 model.

    Args:
        limit: Maximum number of rows to read (bounded for performance).

    Returns:
        One dict per chunk with ``id``, ``content``, ``metadata`` (parsed
        JSON, or {} when the stored value is empty), ``source_file``,
        ``project`` and ``content_type``.
    """
    select_sql = """
        SELECT id, content, metadata, source_file, project, content_type
        FROM chunks
        LIMIT ?
    """
    fetched = list(self.conn.cursor().execute(select_sql, (limit,)))

    chunk_dicts = []
    for chunk_id, content, raw_metadata, source_file, project, content_type in fetched:
        parsed_metadata = json.loads(raw_metadata) if raw_metadata else {}
        chunk_dicts.append(
            {
                "id": chunk_id,
                "content": content,
                "metadata": parsed_metadata,
                "source_file": source_file,
                "project": project,
                "content_type": content_type,
            }
        )
    return chunk_dicts
|
|
682
|
+
|
|
683
|
+
def hybrid_search(
    self,
    query_embedding: List[float],
    query_text: str,
    n_results: int = 10,
    project_filter: Optional[str] = None,
    content_type_filter: Optional[str] = None,
    source_filter: Optional[str] = None,
    sender_filter: Optional[str] = None,
    language_filter: Optional[str] = None,
    tag_filter: Optional[str] = None,
    intent_filter: Optional[str] = None,
    importance_min: Optional[float] = None,
    date_from: Optional[str] = None,
    date_to: Optional[str] = None,
    k: int = 60,
) -> Dict[str, List]:
    """Hybrid search combining semantic (vector) + keyword (FTS5) via Reciprocal Rank Fusion.

    Args:
        query_embedding: Query vector, forwarded to ``self.search``.
        query_text: Raw query text; escaped before being used in FTS5 MATCH.
        n_results: Number of fused results to return. Each source is asked
            for ``n_results * 3`` candidates.
        project_filter, content_type_filter, tag_filter, intent_filter,
        importance_min, date_from, date_to: Forwarded to ``self.search`` and
            also applied in the keyword query's WHERE clause.
        source_filter: Forwarded to ``self.search``; keyword-only hits are
            checked against it after the query.
        sender_filter, language_filter: Forwarded to ``self.search`` only.
        k: Constant added to each 0-based rank in the ``1 / (k + rank)`` score.

    Returns:
        Dict with ``ids``, ``documents``, ``metadatas`` and ``distances``, each
        a single-element list wrapping the per-result list. ``distances``
        holds ``None`` for results found only by the keyword search.
    """
    candidate_limit = n_results * 3

    # 1. Semantic search — get more results than requested so fusion has
    #    material to work with.
    semantic = self.search(
        query_embedding=query_embedding,
        n_results=candidate_limit,
        project_filter=project_filter,
        content_type_filter=content_type_filter,
        source_filter=source_filter,
        sender_filter=sender_filter,
        language_filter=language_filter,
        tag_filter=tag_filter,
        intent_filter=intent_filter,
        importance_min=importance_min,
        date_from=date_from,
        date_to=date_to,
    )

    # 2. FTS5 keyword search
    cursor = self.conn.cursor()
    # AIDEV-NOTE: FTS5 MATCH requires escaped query text. Special chars like
    # '.', '*', '"', '(', ')' cause syntax errors if passed raw.
    fts_query = _escape_fts5_query(query_text)
    fts_extra: List[str] = []
    fts_params: list = [fts_query]
    # project/content_type are filtered in SQL so they are applied BEFORE the
    # LIMIT; filtering afterwards could leave far fewer keyword hits than
    # requested, and content_type was previously not applied to keyword-only
    # hits at all.
    # NOTE(review): assumes self.search treats both filters as exact matches
    # on the chunks columns — confirm.
    if project_filter:
        fts_extra.append("AND c.project = ?")
        fts_params.append(project_filter)
    if content_type_filter:
        fts_extra.append("AND c.content_type = ?")
        fts_params.append(content_type_filter)
    if tag_filter:
        fts_extra.append(
            "AND c.tags IS NOT NULL AND json_valid(c.tags) = 1 AND EXISTS (SELECT 1 FROM json_each(c.tags) WHERE value = ?)"
        )
        fts_params.append(tag_filter)
    if intent_filter:
        fts_extra.append("AND c.intent = ?")
        fts_params.append(intent_filter)
    if importance_min is not None:
        fts_extra.append("AND c.importance >= ?")
        fts_params.append(importance_min)
    if date_from:
        fts_extra.append("AND c.created_at >= ?")
        fts_params.append(date_from)
    if date_to:
        fts_extra.append("AND c.created_at <= ?")
        fts_params.append(date_to)
    fts_params.append(candidate_limit)

    fts_results = list(
        cursor.execute(
            f"""
            SELECT f.chunk_id, f.rank,
                   c.content, c.metadata, c.source_file, c.project,
                   c.content_type, c.value_type, c.char_count,
                   c.summary, c.tags, c.importance, c.intent,
                   c.created_at, c.source
            FROM chunks_fts f
            JOIN chunks c ON f.chunk_id = c.id
            WHERE chunks_fts MATCH ? {" ".join(fts_extra)}
            ORDER BY f.rank
            LIMIT ?
            """,
            fts_params,
        )
    )

    # Keyword rank map (position in the FTS result list) plus row payloads.
    fts_ranks = {}
    fts_data = {}
    for rank, row in enumerate(fts_results):
        chunk_id = row[0]
        fts_ranks[chunk_id] = rank
        fts_data[chunk_id] = {
            "content": row[2],
            "metadata": json.loads(row[3]) if row[3] else {},
            "source_file": row[4],
            "project": row[5],
            "content_type": row[6],
            "value_type": row[7],
            "char_count": row[8],
            "summary": row[9],
            "tags": row[10],
            "importance": row[11],
            "intent": row[12],
            "created_at": row[13],
            "source": row[14],
        }

    # 3. Reciprocal Rank Fusion — deduplicate by chunk_id.
    # Semantic rank map keyed by chunk_id; the first (best) occurrence wins.
    semantic_by_id = {}
    for rank, cid in enumerate(semantic["ids"][0]):
        if cid and cid not in semantic_by_id:
            semantic_by_id[cid] = {
                "rank": rank,
                "doc": semantic["documents"][0][rank],
                "meta": semantic["metadatas"][0][rank],
                "dist": semantic["distances"][0][rank],
            }

    # Ordered union of ids from both sources. An ordered container (rather
    # than a set) keeps the order of equal-score results reproducible, since
    # the sort below is stable.
    candidate_ids = list(dict.fromkeys([*semantic_by_id, *fts_ranks]))

    scored = []
    for cid in candidate_ids:
        sem_entry = semantic_by_id.get(cid)
        fts_rank = fts_ranks.get(cid)

        score = 0.0
        if sem_entry is not None:
            score += 1.0 / (k + sem_entry["rank"])
        if fts_rank is not None:
            score += 1.0 / (k + fts_rank)

        if sem_entry is not None:
            # Prefer the semantic payload: it carries a distance.
            doc = sem_entry["doc"]
            meta = sem_entry["meta"]
            dist = sem_entry["dist"]
        else:
            # Keyword-only hit: rebuild metadata from the chunks columns.
            data = fts_data[cid]
            doc = data["content"]
            meta = data["metadata"].copy()
            meta.update(
                {
                    "source_file": data["source_file"],
                    "project": data["project"],
                    "content_type": data["content_type"],
                    "value_type": data["value_type"],
                    "char_count": data["char_count"],
                }
            )
            if data.get("summary"):
                meta["summary"] = data["summary"]
            if data.get("tags"):
                try:
                    meta["tags"] = json.loads(data["tags"])
                except (json.JSONDecodeError, TypeError):
                    # Best effort: a malformed tags column just means no tags.
                    pass
            if data.get("importance") is not None:
                meta["importance"] = data["importance"]
            if data.get("intent"):
                meta["intent"] = data["intent"]
            if data.get("created_at"):
                meta["created_at"] = data["created_at"]
            if data.get("source"):
                meta["source"] = data["source"]
            dist = None

            # The source filter stays a post-check because the effective
            # source may come from the metadata JSON when the column is empty.
            # NOTE(review): sender_filter / language_filter are still not
            # applied to keyword-only hits; the columns they map to are not
            # visible from this method.
            if source_filter and meta.get("source") != source_filter:
                continue

        scored.append((score, cid, doc, meta, dist))

    # Sort by RRF score descending and keep the top n_results.
    scored.sort(key=lambda item: item[0], reverse=True)
    top = scored[:n_results]

    return {
        "ids": [[item[1] for item in top]],
        "documents": [[item[2] for item in top]],
        "metadatas": [[item[3] for item in top]],
        "distances": [[item[4] for item in top]],
    }
|
|
878
|
+
|
|
879
|
+
def get_context(self, chunk_id: str, before: int = 3, after: int = 3) -> Dict[str, Any]:
    """Fetch a chunk together with its neighbours in the same conversation.

    Neighbours are the chunks sharing the target's ``conversation_id`` whose
    ``position`` lies within ``[position - before, position + after]``,
    ordered by position (the target itself is included and flagged with
    ``is_target``). When the chunk is missing, or has no conversation
    id/position, the result carries an ``error`` message and an empty context.
    """
    cursor = self.conn.cursor()

    # Look up where the target sits inside its conversation.
    lookup_sql = """
        SELECT conversation_id, position, content, metadata
        FROM chunks WHERE id = ?
    """
    matches = list(cursor.execute(lookup_sql, (chunk_id,)))
    if len(matches) == 0:
        return {"target": None, "context": [], "error": "Chunk not found"}

    conv_id, position, content, _metadata = matches[0]

    if position is None or not conv_id:
        return {
            "target": {"id": chunk_id, "content": content, "position": None},
            "context": [],
            "error": "Chunk has no conversation context (conversation_id/position not set)",
        }

    # Pull the window of chunks around the target.
    window_sql = """
        SELECT id, content, position, content_type
        FROM chunks
        WHERE conversation_id = ?
          AND position BETWEEN ? AND ?
        ORDER BY position
    """
    window = list(cursor.execute(window_sql, (conv_id, position - before, position + after)))

    neighbours = [
        {
            "id": row_id,
            "content": row_content,
            "position": row_position,
            "content_type": row_type,
            "is_target": row_id == chunk_id,
        }
        for row_id, row_content, row_position, row_type in window
    ]

    return {
        "target": {"id": chunk_id, "content": content, "position": position},
        "context": neighbours,
    }
|
|
936
|
+
|
|
937
|
+
def get_unenriched_chunks(
    self,
    batch_size: int = 50,
    content_types: Optional[List[str]] = None,
    min_char_count: Optional[int] = None,
    source: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Return a batch of chunks whose ``enriched_at`` is still NULL.

    Rows are taken newest-first (``rowid DESC``), capped at ``batch_size``,
    and must have ``char_count`` at or above the minimum. When
    ``min_char_count`` is not given, the minimum comes from
    ``source_aware_min_chars(source)``. ``source`` and ``content_types``
    narrow the selection further when provided.
    """
    if min_char_count is None:
        threshold = source_aware_min_chars(source)
    else:
        threshold = min_char_count

    conditions = ["enriched_at IS NULL", "char_count >= ?"]
    bindings: list = [threshold]

    if source:
        conditions.append("source = ?")
        bindings.append(source)

    if content_types:
        marks = ",".join(["?"] * len(content_types))
        conditions.append(f"content_type IN ({marks})")
        bindings += list(content_types)

    bindings.append(batch_size)

    sql = f"""
        SELECT id, content, source_file, project, content_type,
               conversation_id, position, char_count
        FROM chunks
        WHERE {" AND ".join(conditions)}
        ORDER BY rowid DESC
        LIMIT ?
    """
    fetched = list(self.conn.cursor().execute(sql, bindings))

    fields = (
        "id",
        "content",
        "source_file",
        "project",
        "content_type",
        "conversation_id",
        "position",
        "char_count",
    )
    return [dict(zip(fields, row)) for row in fetched]
|
|
993
|
+
|
|
994
|
+
def update_enrichment(
    self,
    chunk_id: str,
    summary: Optional[str] = None,
    tags: Optional[List[str]] = None,
    importance: Optional[float] = None,
    intent: Optional[str] = None,
    primary_symbols: Optional[List[str]] = None,
    resolved_query: Optional[str] = None,
    epistemic_level: Optional[str] = None,
    version_scope: Optional[str] = None,
    debt_impact: Optional[str] = None,
    external_deps: Optional[List[str]] = None,
) -> None:
    """Write enrichment fields onto one chunk and stamp ``enriched_at``.

    ``enriched_at`` is always set to the current UTC time (ISO format). Every
    other field is written only when its argument is not ``None``; the list
    fields (``tags``, ``primary_symbols``, ``external_deps``) are stored as
    JSON text. The UPDATE is attempted up to three times when the database
    reports ``apsw.BusyError``, sleeping 0.5s then 1.0s between attempts; the
    third failure is re-raised.
    """
    cursor = self.conn.cursor()
    from datetime import datetime, timezone
    import time as _time

    # (column, value, store-as-JSON) in the order the columns are written.
    candidates = (
        ("summary", summary, False),
        ("tags", tags, True),
        ("importance", importance, False),
        ("intent", intent, False),
        ("primary_symbols", primary_symbols, True),
        ("resolved_query", resolved_query, False),
        ("epistemic_level", epistemic_level, False),
        ("version_scope", version_scope, False),
        ("debt_impact", debt_impact, False),
        ("external_deps", external_deps, True),
    )

    assignments = ["enriched_at = ?"]
    values: list = [datetime.now(timezone.utc).isoformat()]
    for column, value, as_json in candidates:
        if value is None:
            continue
        assignments.append(f"{column} = ?")
        values.append(json.dumps(value) if as_json else value)

    values.append(chunk_id)
    statement = f"UPDATE chunks SET {', '.join(assignments)} WHERE id = ?"

    # Retry on SQLITE_BUSY — concurrent access from daemon/MCP/enrichment
    attempt = 0
    while True:
        try:
            cursor.execute(statement, values)
            return
        except apsw.BusyError:
            if attempt >= 2:
                raise
            _time.sleep(0.5 * (attempt + 1))
        attempt += 1
|
|
1059
|
+
|
|
1060
|
+
def get_enrichment_stats(self) -> Dict[str, Any]:
    """Summarise enrichment progress over the ``chunks`` table.

    Two percentages are reported: ``percent`` measures enriched chunks
    against the enrichable ones (total minus those whose ``enriched_at``
    starts with ``skipped:``), while ``naive_percent`` counts enriched plus
    skipped against the total. ``remaining`` is the number of chunks with a
    NULL ``enriched_at``; ``by_intent`` maps each non-NULL intent to its
    chunk count.
    """
    cursor = self.conn.cursor()

    def scalar(sql: str) -> int:
        # First column of the first row of a COUNT(*) query.
        return list(cursor.execute(sql))[0][0]

    total = scalar("SELECT COUNT(*) FROM chunks")
    enriched = scalar(
        "SELECT COUNT(*) FROM chunks WHERE enriched_at IS NOT NULL AND enriched_at NOT LIKE 'skipped:%'"
    )
    skipped = scalar("SELECT COUNT(*) FROM chunks WHERE enriched_at LIKE 'skipped:%'")
    remaining = scalar("SELECT COUNT(*) FROM chunks WHERE enriched_at IS NULL")
    enrichable = total - skipped

    intent_counts = {}
    intent_sql = """
        SELECT intent, COUNT(*) FROM chunks
        WHERE intent IS NOT NULL
        GROUP BY intent ORDER BY COUNT(*) DESC
    """
    for intent, n_chunks in list(cursor.execute(intent_sql)):
        intent_counts[intent] = n_chunks

    if enrichable > 0:
        percent = round(enriched / enrichable * 100, 1)
    else:
        percent = 0
    if total > 0:
        naive_percent = round((enriched + skipped) / total * 100, 1)
    else:
        naive_percent = 0

    return {
        "total_chunks": total,
        "enrichable": enrichable,
        "enriched": enriched,
        "skipped": skipped,
        "remaining": remaining,
        "percent": percent,
        "naive_percent": naive_percent,
        "by_intent": intent_counts,
    }
|
|
1095
|
+
|
|
1096
|
+
# ─── Phase 8b: Git Overlay Methods ──────────────────────────────
|
|
1097
|
+
|
|
1098
|
+
def store_session_context(
    self,
    session_id: str,
    project: str,
    branch: Optional[str] = None,
    pr_number: Optional[int] = None,
    commit_shas: Optional[List[str]] = None,
    files_changed: Optional[List[str]] = None,
    started_at: Optional[str] = None,
    ended_at: Optional[str] = None,
    plan_name: Optional[str] = None,
    plan_phase: Optional[str] = None,
    story_id: Optional[str] = None,
) -> None:
    """Insert or replace the git context row for a session.

    When ``plan_name`` is not supplied, the session's existing row (if any)
    is consulted so that a git-overlay re-run does not wipe plan links: the
    stored ``plan_name`` is reused, and the stored ``plan_phase`` /
    ``story_id`` fill in whenever the corresponding argument is falsy.
    ``commit_shas`` and ``files_changed`` are stored as JSON text, or NULL
    when empty/None. ``created_at`` is set to the database's current time on
    every call.
    """
    cursor = self.conn.cursor()

    # Carry plan fields over from the stored row when no plan name is given.
    if plan_name is None:
        stored = list(
            cursor.execute(
                "SELECT plan_name, plan_phase, story_id FROM session_context WHERE session_id = ?",
                (session_id,),
            )
        )
        if stored:
            prev_name, prev_phase, prev_story = stored[0]
            plan_name = prev_name
            if not plan_phase:
                plan_phase = prev_phase
            if not story_id:
                story_id = prev_story

    shas_json = json.dumps(commit_shas) if commit_shas else None
    files_json = json.dumps(files_changed) if files_changed else None

    upsert_sql = """
        INSERT OR REPLACE INTO session_context
        (session_id, project, branch, pr_number, commit_shas,
         files_changed, started_at, ended_at, created_at,
         plan_name, plan_phase, story_id)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, datetime('now'),
                ?, ?, ?)
    """
    row_values = (
        session_id,
        project,
        branch,
        pr_number,
        shas_json,
        files_json,
        started_at,
        ended_at,
        plan_name,
        plan_phase,
        story_id,
    )
    cursor.execute(upsert_sql, row_values)
|
|
1154
|
+
|
|
1155
|
+
def store_file_interactions(self, interactions: List[Dict[str, Any]]) -> int:
    """Insert file-interaction records one row at a time.

    Each record must provide ``file_path`` and ``session_id``; ``timestamp``,
    ``chunk_id`` and ``project`` default to NULL and ``action`` to
    ``"unknown"``.

    Returns:
        The number of rows inserted (0 for an empty input).
    """
    if not interactions:
        return 0

    insert_sql = """
        INSERT INTO file_interactions
        (file_path, timestamp, session_id, action, chunk_id, project)
        VALUES (?, ?, ?, ?, ?, ?)
    """
    cursor = self.conn.cursor()
    stored = 0
    for record in interactions:
        row_values = (
            record["file_path"],
            record.get("timestamp"),
            record["session_id"],
            record.get("action", "unknown"),
            record.get("chunk_id"),
            record.get("project"),
        )
        cursor.execute(insert_sql, row_values)
        stored += 1
    return stored
|
|
1179
|
+
|
|
1180
|
+
def get_file_timeline(
    self,
    file_path: str,
    project: Optional[str] = None,
    limit: int = 50,
) -> List[Dict[str, Any]]:
    """Get ordered timeline of interactions with a file.

    Args:
        file_path: Path or path fragment; matched as a literal substring of
            the stored ``file_path``.
        project: When given, only interactions recorded for this project.
        limit: Maximum number of rows returned.

    Returns:
        Interaction dicts ordered by ascending timestamp, each with
        ``file_path``, ``timestamp``, ``session_id``, ``action``, ``project``
        and the ``branch`` / ``pr_number`` of the matching session context
        (``None`` when the session has no context row).
    """
    cursor = self.conn.cursor()

    # '_' and '%' are LIKE wildcards. Without escaping, "my_file.py" would
    # also match "myXfile.py", so neutralise them (and the escape character
    # itself) before wrapping the text in %...%.
    literal = (
        file_path.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    )

    query = """
        SELECT fi.file_path, fi.timestamp, fi.session_id, fi.action,
               fi.project, sc.branch, sc.pr_number
        FROM file_interactions fi
        LEFT JOIN session_context sc ON fi.session_id = sc.session_id
        WHERE fi.file_path LIKE ? ESCAPE '\\'
    """
    params: list = [f"%{literal}%"]
    if project:
        query += " AND fi.project = ?"
        params.append(project)
    query += " ORDER BY fi.timestamp ASC LIMIT ?"
    params.append(limit)

    return [
        {
            "file_path": row[0],
            "timestamp": row[1],
            "session_id": row[2],
            "action": row[3],
            "project": row[4],
            "branch": row[5],
            "pr_number": row[6],
        }
        for row in cursor.execute(query, params)
    ]
|
|
1216
|
+
|
|
1217
|
+
def get_session_context(self, session_id: str) -> Optional[Dict[str, Any]]:
    """Get git context for a session.

    Returns:
        ``None`` when no ``session_context`` row exists for ``session_id``;
        otherwise a dict of the row's columns, read by position from
        ``SELECT *``. ``commit_shas`` and ``files_changed`` are passed through
        ``_safe_json_loads``. Plan-linking keys are present only when the row
        actually carries those columns.
    """
    cursor = self.conn.cursor()
    rows = list(cursor.execute("SELECT * FROM session_context WHERE session_id = ?", (session_id,)))
    if not rows:
        return None

    row = rows[0]
    result = {
        "session_id": row[0],
        "project": row[1],
        "branch": row[2],
        "pr_number": row[3],
        "commit_shas": _safe_json_loads(row[4]),
        "files_changed": _safe_json_loads(row[5]),
        "started_at": row[6],
        "ended_at": row[7],
        "created_at": row[8],
    }

    # Plan linking columns (may not exist in old DBs). Each index is checked
    # on its own: a single "len(row) > 9" guard would raise IndexError on a
    # table that has only some of the three columns.
    for index, key in enumerate(("plan_name", "plan_phase", "story_id"), start=9):
        if len(row) > index:
            result[key] = row[index]
    return result
|
|
1241
|
+
|
|
1242
|
+
def update_session_plan(
    self,
    session_id: str,
    plan_name: Optional[str] = None,
    plan_phase: Optional[str] = None,
    story_id: Optional[str] = None,
) -> bool:
    """Overwrite the plan-linking fields of an existing session row.

    All three fields are written exactly as passed, so an argument left at
    ``None`` sets that column to NULL.

    Returns:
        ``False`` when no row exists for ``session_id`` (nothing is written),
        ``True`` after the update has been issued.
    """
    cursor = self.conn.cursor()

    found = list(
        cursor.execute(
            "SELECT 1 FROM session_context WHERE session_id = ?",
            (session_id,),
        )
    )
    if len(found) == 0:
        return False

    update_sql = """
        UPDATE session_context
        SET plan_name = ?, plan_phase = ?, story_id = ?
        WHERE session_id = ?
    """
    cursor.execute(update_sql, (plan_name, plan_phase, story_id, session_id))
    return True
|
|
1271
|
+
|
|
1272
|
+
def get_sessions_by_plan(
    self,
    plan_name: Optional[str] = None,
    project: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """List sessions that have a plan link, ordered by ``started_at``.

    Only rows with a non-NULL ``plan_name`` are considered. ``plan_name`` and
    ``project`` each narrow the result to an exact match when provided.
    """
    fields = (
        "session_id",
        "project",
        "branch",
        "pr_number",
        "started_at",
        "ended_at",
        "plan_name",
        "plan_phase",
        "story_id",
    )

    sql = (
        "SELECT session_id, project, branch, pr_number,"
        " started_at, ended_at, plan_name, plan_phase, story_id"
        " FROM session_context"
        " WHERE plan_name IS NOT NULL"
    )
    bindings: list = []
    for clause, value in ((" AND plan_name = ?", plan_name), (" AND project = ?", project)):
        if value:
            sql += clause
            bindings.append(value)
    sql += " ORDER BY started_at ASC"

    cursor = self.conn.cursor()
    return [dict(zip(fields, row)) for row in cursor.execute(sql, bindings)]
|
|
1310
|
+
|
|
1311
|
+
def get_plan_linking_stats(self) -> Dict[str, Any]:
    """Report how many sessions are linked to a plan, and to which plans.

    Returns:
        Dict with ``total_sessions``, ``linked_sessions`` (rows with a
        non-NULL ``plan_name``), ``unlinked_sessions`` (the difference) and
        ``plans`` mapping each plan name to its session count.
    """
    cursor = self.conn.cursor()

    total_rows = list(cursor.execute("SELECT COUNT(*) FROM session_context"))
    total = total_rows[0][0]

    linked_rows = list(cursor.execute("SELECT COUNT(*) FROM session_context WHERE plan_name IS NOT NULL"))
    linked = linked_rows[0][0]

    per_plan = {}
    plan_rows = list(
        cursor.execute(
            "SELECT plan_name, COUNT(*) FROM session_context"
            " WHERE plan_name IS NOT NULL"
            " GROUP BY plan_name ORDER BY COUNT(*) DESC"
        )
    )
    for name, n_sessions in plan_rows:
        per_plan[name] = n_sessions

    return {
        "total_sessions": total,
        "linked_sessions": linked,
        "unlinked_sessions": total - linked,
        "plans": per_plan,
    }
|
|
1329
|
+
|
|
1330
|
+
def clear_plan_links(self, project: Optional[str] = None) -> int:
    """Reset ``plan_name``, ``plan_phase`` and ``story_id`` to NULL.

    With ``project`` given, only that project's sessions are reset; otherwise
    every session is.

    Returns:
        The number of sessions in scope that had a non-NULL ``plan_name``
        before the reset.
    """
    count_sql = "SELECT COUNT(*) FROM session_context WHERE plan_name IS NOT NULL"
    reset_sql = "UPDATE session_context SET plan_name = NULL, plan_phase = NULL, story_id = NULL"
    scope: tuple = ()
    if project:
        count_sql += " AND project = ?"
        reset_sql += " WHERE project = ?"
        scope = (project,)

    cursor = self.conn.cursor()
    # Count first: once the reset has run there is nothing left to count.
    counted = list(cursor.execute(count_sql, scope))
    cursor.execute(reset_sql, scope)

    if not counted:
        return 0
    return counted[0][0]
|
|
1348
|
+
|
|
1349
|
+
def get_git_overlay_stats(self) -> Dict[str, Any]:
    """Count rows in the git-overlay tables.

    Returns:
        Dict with ``sessions_with_context`` (rows in ``session_context``),
        ``file_interactions`` (rows in ``file_interactions``) and
        ``unique_files`` (distinct ``file_path`` values among them).
    """
    cursor = self.conn.cursor()
    counts = {}
    for key, sql in (
        ("sessions_with_context", "SELECT COUNT(*) FROM session_context"),
        ("file_interactions", "SELECT COUNT(*) FROM file_interactions"),
        ("unique_files", "SELECT COUNT(DISTINCT file_path) FROM file_interactions"),
    ):
        counts[key] = list(cursor.execute(sql))[0][0]
    return counts
|
|
1360
|
+
|
|
1361
|
+
def clear_session_git_data(self, session_id: str) -> None:
    """Delete a session's git-overlay rows so it can be re-processed.

    Removes the session's row from ``session_context`` first, then its rows
    from ``file_interactions``.
    """
    cursor = self.conn.cursor()
    for statement in (
        "DELETE FROM session_context WHERE session_id = ?",
        "DELETE FROM file_interactions WHERE session_id = ?",
    ):
        cursor.execute(statement, (session_id,))
|
|
1366
|
+
|
|
1367
|
+
def store_operations(
    self,
    operations: List[Dict[str, Any]],
) -> int:
    """Store operation groups.

    Rows are written with INSERT OR REPLACE, and every row written by one
    call shares the same ``created_at`` timestamp (current UTC time, ISO
    format).

    Args:
        operations: List of dicts with id, session_id,
            operation_type, chunk_ids, summary, outcome,
            started_at, ended_at, step_count. ``id`` and ``session_id``
            are required; ``chunk_ids`` defaults to an empty list and
            ``step_count`` to 0.

    Returns:
        Number of operations stored.
    """
    if not operations:
        return 0

    # Import both names locally so the block does not rely on a module-level
    # ``datetime`` binding.
    from datetime import datetime, timezone

    cursor = self.conn.cursor()
    now = datetime.now(timezone.utc).isoformat()
    count = 0
    for op in operations:
        # "or []" also covers an explicit chunk_ids=None, which would
        # otherwise be stored as JSON "null" and read back as None instead
        # of a list.
        chunk_ids_json = json.dumps(op.get("chunk_ids") or [])
        cursor.execute(
            """INSERT OR REPLACE INTO operations
               (id, session_id, operation_type, chunk_ids,
                summary, outcome, started_at, ended_at,
                step_count, created_at)
               VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
            (
                op["id"],
                op["session_id"],
                op.get("operation_type"),
                chunk_ids_json,
                op.get("summary"),
                op.get("outcome"),
                op.get("started_at"),
                op.get("ended_at"),
                op.get("step_count", 0),
                now,
            ),
        )
        count += 1
    return count
|
|
1411
|
+
|
|
1412
|
+
def get_session_operations(
    self,
    session_id: str,
) -> List[Dict[str, Any]]:
    """Return a session's operations ordered by ``started_at``.

    The ``chunk_ids`` column is decoded from JSON; an empty column or text
    that fails to decode yields an empty list.
    """
    select_sql = """SELECT id, session_id, operation_type,
                  chunk_ids, summary, outcome,
                  started_at, ended_at, step_count
           FROM operations
           WHERE session_id = ?
           ORDER BY started_at"""
    fetched = list(self.conn.cursor().execute(select_sql, (session_id,)))

    operations = []
    for op_id, op_session, op_type, raw_ids, summary, outcome, started, ended, steps in fetched:
        decoded = []
        if raw_ids:
            try:
                decoded = json.loads(raw_ids)
            except (json.JSONDecodeError, TypeError):
                # Undecodable chunk_ids are treated as "no chunks".
                pass
        operations.append(
            {
                "id": op_id,
                "session_id": op_session,
                "operation_type": op_type,
                "chunk_ids": decoded,
                "summary": summary,
                "outcome": outcome,
                "started_at": started,
                "ended_at": ended,
                "step_count": steps,
            }
        )
    return operations
|
|
1451
|
+
|
|
1452
|
+
def get_operations_stats(self) -> Dict[str, Any]:
    """Summarise the ``operations`` table.

    Returns:
        Dict with ``total_operations``, ``sessions_with_operations`` (distinct
        session ids) and ``by_type`` mapping each operation type to its row
        count, where an empty/NULL type is reported as ``"unknown"``.
    """
    cursor = self.conn.cursor()

    total = list(cursor.execute("SELECT COUNT(*) FROM operations"))[0][0]

    type_rows = list(
        cursor.execute(
            """SELECT operation_type, COUNT(*)
               FROM operations
               GROUP BY operation_type
               ORDER BY COUNT(*) DESC"""
        )
    )

    session_rows = list(
        cursor.execute(
            """SELECT COUNT(DISTINCT session_id)
               FROM operations"""
        )
    )
    session_count = session_rows[0][0]

    breakdown = {}
    for op_type, n_ops in type_rows:
        breakdown[op_type or "unknown"] = n_ops

    return {
        "total_operations": total,
        "sessions_with_operations": session_count,
        "by_type": breakdown,
    }
|
|
1475
|
+
|
|
1476
|
+
def clear_session_operations(self, session_id: str) -> None:
    """Delete every ``operations`` row belonging to ``session_id``."""
    delete_sql = "DELETE FROM operations WHERE session_id = ?"
    self.conn.cursor().execute(delete_sql, (session_id,))
|
|
1483
|
+
|
|
1484
|
+
def store_topic_chains(
    self,
    chains: List[Dict[str, Any]],
) -> int:
    """Insert topic-chain rows and return how many were written.

    Each chain must provide ``file_path``, ``session_a`` and ``session_b``;
    ``shared_actions`` defaults to 0, ``time_delta_hours`` and ``project`` to
    NULL. All rows written by one call share one ``created_at`` timestamp
    (current UTC time, ISO format).
    """
    if not chains:
        return 0

    cursor = self.conn.cursor()
    from datetime import timezone

    created_at = datetime.now(timezone.utc).isoformat()
    insert_sql = """INSERT INTO topic_chains
               (file_path, session_a, session_b,
                shared_actions, time_delta_hours,
                project, created_at)
               VALUES (?, ?, ?, ?, ?, ?, ?)"""

    stored = 0
    for link in chains:
        row_values = (
            link["file_path"],
            link["session_a"],
            link["session_b"],
            link.get("shared_actions", 0),
            link.get("time_delta_hours"),
            link.get("project"),
            created_at,
        )
        cursor.execute(insert_sql, row_values)
        stored += 1
    return stored
|
|
1515
|
+
|
|
1516
|
+
def get_file_chains(
    self,
    file_path: str,
    limit: int = 20,
) -> List[Dict[str, Any]]:
    """Get topic chains for a file (sessions linked by file).

    ``file_path`` is matched as a literal substring of
    ``topic_chains.file_path``. LIKE metacharacters (``%`` and ``_``) in the
    argument are escaped, so a name such as ``my_file.py`` no longer matches
    ``myXfile.py``.

    Args:
        file_path: Substring to look for in the stored file path.
        limit: Maximum number of chains to return.

    Returns:
        A list of dicts with keys ``file_path``, ``session_a``, ``session_b``,
        ``shared_actions``, ``time_delta_hours``, ``project``, ``branch_a``
        and ``branch_b``, ordered by ``time_delta_hours``. The branch values
        come from a LEFT JOIN on ``session_context`` and are ``None`` when a
        session has no context row.
    """
    # Backslash is the ESCAPE character, so it must be doubled first.
    escaped = (
        file_path.replace("\\", "\\\\")
        .replace("%", "\\%")
        .replace("_", "\\_")
    )
    keys = (
        "file_path",
        "session_a",
        "session_b",
        "shared_actions",
        "time_delta_hours",
        "project",
        "branch_a",
        "branch_b",
    )
    cursor = self.conn.cursor()
    rows = list(
        cursor.execute(
            """SELECT tc.file_path, tc.session_a,
                tc.session_b, tc.shared_actions,
                tc.time_delta_hours, tc.project,
                sa.branch AS branch_a,
                sb.branch AS branch_b
            FROM topic_chains tc
            LEFT JOIN session_context sa
                ON tc.session_a = sa.session_id
            LEFT JOIN session_context sb
                ON tc.session_b = sb.session_id
            WHERE tc.file_path LIKE ? ESCAPE '\\'
            ORDER BY tc.time_delta_hours
            LIMIT ?""",
            (f"%{escaped}%", limit),
        )
    )
    return [dict(zip(keys, row)) for row in rows]
|
|
1554
|
+
|
|
1555
|
+
def get_file_regression(
    self,
    file_path: str,
    project: Optional[str] = None,
) -> Dict[str, Any]:
    """Get regression info for a file.

    Builds the time-ordered list of ``file_interactions`` rows whose path
    contains ``file_path`` (literal substring; ``%`` and ``_`` are escaped),
    then walks it newest-first to find the most recent interaction whose
    session has at least one ``operations`` row with ``outcome = 'success'``.
    The success check is per session; the ``operations`` query does not
    filter by file.

    Args:
        file_path: Substring to look for in the stored file path.
        project: When truthy, restrict interactions to this project.

    Returns:
        Dict with keys ``file_path`` (the argument as given), ``timeline``
        (all matching interactions, oldest first), ``last_success`` (the
        timeline entry described above, or ``None``) and ``changes_after``
        (timeline entries whose timestamp is strictly greater than that of
        ``last_success``; empty when there is none or it has no timestamp).
    """
    cursor = self.conn.cursor()

    # Backslash is the ESCAPE character, so it must be doubled first.
    escaped = (
        file_path.replace("\\", "\\\\")
        .replace("%", "\\%")
        .replace("_", "\\_")
    )

    # Get all interactions for this file, ordered by time
    query = """
        SELECT fi.file_path, fi.timestamp,
            fi.session_id, fi.action,
            fi.project,
            sc.branch, sc.pr_number
        FROM file_interactions fi
        LEFT JOIN session_context sc
            ON fi.session_id = sc.session_id
        WHERE fi.file_path LIKE ? ESCAPE '\\'
    """
    params: list = [f"%{escaped}%"]
    if project:
        query += " AND fi.project = ?"
        params.append(project)
    query += " ORDER BY fi.timestamp"

    keys = (
        "file_path",
        "timestamp",
        "session_id",
        "action",
        "project",
        "branch",
        "pr_number",
    )
    # Materialise fully before the cursor is reused for the lookups below.
    timeline = [dict(zip(keys, row)) for row in cursor.execute(query, params)]

    if not timeline:
        return {
            "file_path": file_path,
            "timeline": [],
            "last_success": None,
            "changes_after": [],
        }

    # A session usually appears in many timeline entries; remember the answer
    # so each session is looked up at most once.
    session_has_success: Dict[Any, bool] = {}
    last_success = None
    for entry in reversed(timeline):
        sid = entry["session_id"]
        if not sid:
            continue
        if sid not in session_has_success:
            ops = list(
                cursor.execute(
                    """SELECT outcome FROM operations
                    WHERE session_id = ?
                    AND outcome = 'success'
                    LIMIT 1""",
                    (sid,),
                )
            )
            session_has_success[sid] = bool(ops)
        if session_has_success[sid]:
            last_success = entry
            break

    # Get all entries after last success (timestamps compared as stored)
    changes_after = []
    if last_success and last_success.get("timestamp"):
        cutoff = last_success["timestamp"]
        changes_after = [e for e in timeline if (e.get("timestamp") or "") > cutoff]

    return {
        "file_path": file_path,
        "timeline": timeline,
        "last_success": last_success,
        "changes_after": changes_after,
    }
|
|
1645
|
+
|
|
1646
|
+
def get_topic_chain_stats(self) -> Dict[str, Any]:
    """Summarise the ``topic_chains`` table.

    Returns:
        Dict with ``total_chains`` (row count) and ``unique_files``
        (number of distinct ``file_path`` values).
    """
    cur = self.conn.cursor()
    # Each aggregate query yields exactly one single-column row.
    ((chain_count,),) = cur.execute("SELECT COUNT(*) FROM topic_chains")
    ((file_count,),) = cur.execute(
        """SELECT COUNT(DISTINCT file_path)
        FROM topic_chains"""
    )
    stats = {}
    stats["total_chains"] = chain_count
    stats["unique_files"] = file_count
    return stats
|
|
1660
|
+
|
|
1661
|
+
def clear_topic_chains(self, project: Optional[str] = None) -> None:
    """Delete topic chains, for one project or for all of them.

    Args:
        project: When truthy, only rows with this ``project`` value are
            removed. ``None`` or an empty string removes every row.
    """
    cur = self.conn.cursor()
    if not project:
        cur.execute("DELETE FROM topic_chains")
        return
    cur.execute(
        "DELETE FROM topic_chains WHERE project = ?",
        (project,),
    )
|
|
1671
|
+
|
|
1672
|
+
# --- Phase 7: Session Enrichment CRUD ---
|
|
1673
|
+
|
|
1674
|
+
def upsert_session_enrichment(self, enrichment: Dict[str, Any]) -> None:
    """Insert a session enrichment row, or overwrite the one already stored.

    The row is keyed on ``session_id``. On conflict every inserted column
    except ``session_id`` and ``file_path`` is replaced and
    ``enrichment_timestamp`` is reset to the current time. The matching row
    in ``session_enrichments_fts`` is then deleted and, when any of
    ``session_summary``, ``what_worked`` or ``what_failed`` is truthy,
    re-inserted.

    Args:
        enrichment: Field values for the row. ``session_id`` is required.
            List/dict fields are JSON-encoded unless already strings. The
            caller's dict is not modified.

    Raises:
        KeyError: If ``enrichment`` has no ``session_id``.
    """
    cur = self.conn.cursor()
    # Shallow copy so JSON encoding below never touches the caller's dict.
    record = dict(enrichment)
    session_id = record["session_id"]

    # Columns stored as JSON text.
    json_columns = (
        "decisions_made",
        "corrections",
        "learnings",
        "mistakes",
        "patterns",
        "topic_tags",
        "tool_usage_stats",
    )
    for name in json_columns:
        if name not in record:
            continue
        if isinstance(record[name], str):
            continue
        record[name] = json.dumps(record[name])

    # (column, default) pairs in the order of the INSERT column list,
    # excluding the leading session_id.
    column_defaults = (
        ("file_path", None),
        ("enrichment_version", "1.0"),
        ("enrichment_model", None),
        ("session_start_time", None),
        ("session_end_time", None),
        ("duration_seconds", None),
        ("message_count", 0),
        ("user_message_count", 0),
        ("assistant_message_count", 0),
        ("tool_call_count", 0),
        ("session_summary", None),
        ("primary_intent", None),
        ("outcome", None),
        ("complexity_score", None),
        ("session_quality_score", None),
        ("decisions_made", "[]"),
        ("corrections", "[]"),
        ("learnings", "[]"),
        ("mistakes", "[]"),
        ("patterns", "[]"),
        ("topic_tags", "[]"),
        ("tool_usage_stats", "[]"),
        ("what_worked", None),
        ("what_failed", None),
        ("summary_embedding", None),
    )
    bindings = (session_id,) + tuple(
        record.get(column, fallback) for column, fallback in column_defaults
    )

    cur.execute(
        """
        INSERT INTO session_enrichments (
            session_id, file_path, enrichment_version, enrichment_model,
            session_start_time, session_end_time, duration_seconds,
            message_count, user_message_count, assistant_message_count, tool_call_count,
            session_summary, primary_intent, outcome, complexity_score,
            session_quality_score,
            decisions_made, corrections, learnings, mistakes, patterns,
            topic_tags, tool_usage_stats,
            what_worked, what_failed,
            summary_embedding
        ) VALUES (
            ?, ?, ?, ?,
            ?, ?, ?,
            ?, ?, ?, ?,
            ?, ?, ?, ?,
            ?,
            ?, ?, ?, ?, ?,
            ?, ?,
            ?, ?,
            ?
        )
        ON CONFLICT(session_id) DO UPDATE SET
            enrichment_version = excluded.enrichment_version,
            enrichment_model = excluded.enrichment_model,
            enrichment_timestamp = strftime('%Y-%m-%dT%H:%M:%fZ','now'),
            session_start_time = excluded.session_start_time,
            session_end_time = excluded.session_end_time,
            duration_seconds = excluded.duration_seconds,
            message_count = excluded.message_count,
            user_message_count = excluded.user_message_count,
            assistant_message_count = excluded.assistant_message_count,
            tool_call_count = excluded.tool_call_count,
            session_summary = excluded.session_summary,
            primary_intent = excluded.primary_intent,
            outcome = excluded.outcome,
            complexity_score = excluded.complexity_score,
            session_quality_score = excluded.session_quality_score,
            decisions_made = excluded.decisions_made,
            corrections = excluded.corrections,
            learnings = excluded.learnings,
            mistakes = excluded.mistakes,
            patterns = excluded.patterns,
            topic_tags = excluded.topic_tags,
            tool_usage_stats = excluded.tool_usage_stats,
            what_worked = excluded.what_worked,
            what_failed = excluded.what_failed,
            summary_embedding = excluded.summary_embedding
        """,
        bindings,
    )

    # Update FTS5: drop the old entry, then re-add it if there is any text.
    cur.execute(
        "DELETE FROM session_enrichments_fts WHERE session_id = ?",
        (session_id,),
    )
    text_columns = ("session_summary", "what_worked", "what_failed")
    if any(record.get(column) for column in text_columns):
        fts_values = tuple(record.get(column, "") for column in text_columns)
        cur.execute(
            """INSERT INTO session_enrichments_fts
            (session_summary, what_worked, what_failed, session_id)
            VALUES (?, ?, ?, ?)""",
            fts_values + (session_id,),
        )
|
|
1792
|
+
|
|
1793
|
+
# Column names for session_enrichments (must match CREATE TABLE order)
|
|
1794
|
+
_SESSION_ENRICHMENT_COLS = [
|
|
1795
|
+
"id",
|
|
1796
|
+
"session_id",
|
|
1797
|
+
"file_path",
|
|
1798
|
+
"enrichment_version",
|
|
1799
|
+
"enrichment_model",
|
|
1800
|
+
"enrichment_timestamp",
|
|
1801
|
+
"session_start_time",
|
|
1802
|
+
"session_end_time",
|
|
1803
|
+
"duration_seconds",
|
|
1804
|
+
"message_count",
|
|
1805
|
+
"user_message_count",
|
|
1806
|
+
"assistant_message_count",
|
|
1807
|
+
"tool_call_count",
|
|
1808
|
+
"session_summary",
|
|
1809
|
+
"primary_intent",
|
|
1810
|
+
"outcome",
|
|
1811
|
+
"complexity_score",
|
|
1812
|
+
"session_quality_score",
|
|
1813
|
+
"decisions_made",
|
|
1814
|
+
"corrections",
|
|
1815
|
+
"learnings",
|
|
1816
|
+
"mistakes",
|
|
1817
|
+
"patterns",
|
|
1818
|
+
"topic_tags",
|
|
1819
|
+
"tool_usage_stats",
|
|
1820
|
+
"what_worked",
|
|
1821
|
+
"what_failed",
|
|
1822
|
+
"summary_embedding",
|
|
1823
|
+
]
|
|
1824
|
+
|
|
1825
|
+
def get_session_enrichment(self, session_id: str) -> Optional[Dict[str, Any]]:
    """Fetch the stored enrichment record for one session.

    Args:
        session_id: Value matched against ``session_enrichments.session_id``.

    Returns:
        ``None`` when no row matches. Otherwise a dict built by pairing
        ``_SESSION_ENRICHMENT_COLS`` with the first matching row, with the
        JSON-encoded columns passed through ``_safe_json_loads``.
    """
    cur = self.conn.cursor()
    matches = list(
        cur.execute(
            "SELECT * FROM session_enrichments WHERE session_id = ?",
            (session_id,),
        )
    )
    if not matches:
        return None

    record = dict(zip(self._SESSION_ENRICHMENT_COLS, matches[0]))

    # Columns stored as JSON text are decoded in place.
    json_columns = (
        "decisions_made",
        "corrections",
        "learnings",
        "mistakes",
        "patterns",
        "topic_tags",
        "tool_usage_stats",
    )
    for name in json_columns:
        record[name] = _safe_json_loads(record.get(name))
    return record
|
|
1850
|
+
|
|
1851
|
+
def list_enriched_sessions(self) -> List[str]:
    """List the ``session_id`` of every row in ``session_enrichments``."""
    rows = self.conn.cursor().execute("SELECT session_id FROM session_enrichments")
    # Each row holds a single column; unpack it directly.
    return [sid for (sid,) in rows]
|
|
1855
|
+
|
|
1856
|
+
def get_session_enrichment_stats(self) -> Dict[str, Any]:
    """Get session enrichment statistics.

    Returns:
        Dict with:
          * ``total_enriched_sessions``: row count of ``session_enrichments``.
          * ``by_outcome``: count per non-NULL ``outcome`` value.
          * ``by_intent``: count per non-NULL ``primary_intent`` value.
          * ``avg_quality_score``: mean of the non-NULL
            ``session_quality_score`` values rounded to one decimal, or
            ``None`` when no row has a score.
    """
    cursor = self.conn.cursor()
    total = list(cursor.execute("SELECT COUNT(*) FROM session_enrichments"))[0][0]
    # Each query yields (value, count) rows, which dict() consumes directly.
    by_outcome = dict(
        cursor.execute(
            "SELECT outcome, COUNT(*) FROM session_enrichments WHERE outcome IS NOT NULL GROUP BY outcome"
        )
    )
    by_intent = dict(
        cursor.execute(
            "SELECT primary_intent, COUNT(*) FROM session_enrichments WHERE primary_intent IS NOT NULL GROUP BY primary_intent"
        )
    )
    # AVG() over zero rows is NULL, which arrives here as None.
    avg_quality = list(
        cursor.execute(
            "SELECT AVG(session_quality_score) FROM session_enrichments WHERE session_quality_score IS NOT NULL"
        )
    )[0][0]
    return {
        "total_enriched_sessions": total,
        "by_outcome": by_outcome,
        "by_intent": by_intent,
        # Compare against None explicitly: an average of 0 is a real value
        # and must not be reported as "no data".
        "avg_quality_score": round(avg_quality, 1) if avg_quality is not None else None,
    }
|
|
1881
|
+
|
|
1882
|
+
def close(self) -> None:
    """Close ``self.conn`` if the attribute exists; otherwise do nothing."""
    try:
        connection = self.conn
    except AttributeError:
        # No connection attribute on this object, so nothing to close.
        return
    connection.close()
|
|
1886
|
+
|
|
1887
|
+
def __enter__(self):
    """Enter a ``with`` block, returning this object as the context value."""
    return self
|
|
1889
|
+
|
|
1890
|
+
def __exit__(self, exc_type, exc_val, exc_tb):
    """Call ``close()`` when leaving a ``with`` block.

    The exception arguments are ignored and nothing is returned, so an
    exception raised inside the block is not suppressed.
    """
    self.close()
|