brainlayer 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. brainlayer/__init__.py +3 -0
  2. brainlayer/cli/__init__.py +1545 -0
  3. brainlayer/cli/wizard.py +132 -0
  4. brainlayer/cli_new.py +151 -0
  5. brainlayer/client.py +164 -0
  6. brainlayer/clustering.py +736 -0
  7. brainlayer/daemon.py +1105 -0
  8. brainlayer/dashboard/README.md +129 -0
  9. brainlayer/dashboard/__init__.py +5 -0
  10. brainlayer/dashboard/app.py +151 -0
  11. brainlayer/dashboard/search.py +229 -0
  12. brainlayer/dashboard/views.py +230 -0
  13. brainlayer/embeddings.py +131 -0
  14. brainlayer/engine.py +550 -0
  15. brainlayer/index_new.py +87 -0
  16. brainlayer/mcp/__init__.py +1558 -0
  17. brainlayer/migrate.py +205 -0
  18. brainlayer/paths.py +43 -0
  19. brainlayer/pipeline/__init__.py +47 -0
  20. brainlayer/pipeline/analyze_communication.py +508 -0
  21. brainlayer/pipeline/brain_graph.py +567 -0
  22. brainlayer/pipeline/chat_tags.py +63 -0
  23. brainlayer/pipeline/chunk.py +422 -0
  24. brainlayer/pipeline/classify.py +472 -0
  25. brainlayer/pipeline/cluster_sampling.py +73 -0
  26. brainlayer/pipeline/enrichment.py +810 -0
  27. brainlayer/pipeline/extract.py +66 -0
  28. brainlayer/pipeline/extract_claude_desktop.py +149 -0
  29. brainlayer/pipeline/extract_corrections.py +231 -0
  30. brainlayer/pipeline/extract_markdown.py +195 -0
  31. brainlayer/pipeline/extract_whatsapp.py +227 -0
  32. brainlayer/pipeline/git_overlay.py +301 -0
  33. brainlayer/pipeline/longitudinal_analyzer.py +568 -0
  34. brainlayer/pipeline/obsidian_export.py +455 -0
  35. brainlayer/pipeline/operation_grouping.py +486 -0
  36. brainlayer/pipeline/plan_linking.py +313 -0
  37. brainlayer/pipeline/sanitize.py +549 -0
  38. brainlayer/pipeline/semantic_style.py +574 -0
  39. brainlayer/pipeline/session_enrichment.py +472 -0
  40. brainlayer/pipeline/style_embed.py +67 -0
  41. brainlayer/pipeline/style_index.py +139 -0
  42. brainlayer/pipeline/temporal_chains.py +203 -0
  43. brainlayer/pipeline/time_batcher.py +248 -0
  44. brainlayer/pipeline/unified_timeline.py +569 -0
  45. brainlayer/storage.py +66 -0
  46. brainlayer/store.py +155 -0
  47. brainlayer/taxonomy.json +80 -0
  48. brainlayer/vector_store.py +1891 -0
  49. brainlayer-1.0.0.dist-info/METADATA +313 -0
  50. brainlayer-1.0.0.dist-info/RECORD +53 -0
  51. brainlayer-1.0.0.dist-info/WHEEL +4 -0
  52. brainlayer-1.0.0.dist-info/entry_points.txt +4 -0
  53. brainlayer-1.0.0.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,1891 @@
1
+ """SQLite-vec based vector store for fast search."""
2
+
3
+ import json
4
+ import struct
5
+ from datetime import datetime
6
+ from pathlib import Path
7
+ from typing import Any, Dict, List, Optional
8
+
9
+ import apsw
10
+ import apsw.bestpractice
11
+ import sqlite_vec
12
+
13
+ # Apply APSW best practices
14
+ apsw.bestpractice.apply(apsw.bestpractice.recommended)
15
+
16
+
17
# Per-source minimum content length (in characters) used for enrichment.
_SOURCE_MIN_CHARS = {
    "whatsapp": 15,
    "telegram": 15,
}
# Threshold applied to every source that has no entry above.
_DEFAULT_MIN_CHARS = 50


def source_aware_min_chars(source: Optional[str]) -> int:
    """Look up the enrichment length threshold for a message source.

    Args:
        source: Source identifier such as ``"whatsapp"``; ``None`` is allowed.

    Returns:
        The entry from ``_SOURCE_MIN_CHARS`` when ``source`` is listed there
        (exact, case-sensitive match), otherwise ``_DEFAULT_MIN_CHARS``.
    """
    threshold = _DEFAULT_MIN_CHARS
    if source is not None:
        threshold = _SOURCE_MIN_CHARS.get(source, _DEFAULT_MIN_CHARS)
    return threshold
33
+
34
+
35
+ def _safe_json_loads(value: Any) -> list:
36
+ """Safely parse a JSON string, returning [] on None or invalid JSON."""
37
+ if not value:
38
+ return []
39
+ try:
40
+ return json.loads(value)
41
+ except (json.JSONDecodeError, TypeError):
42
+ return []
43
+
44
+
45
+ def _escape_fts5_query(query: str) -> str:
46
+ """Escape a query string for FTS5 MATCH.
47
+
48
+ FTS5 treats certain characters as syntax: ., *, ^, ", (, ), +, -, NOT, AND, OR, NEAR.
49
+ We wrap each word in double quotes so they're treated as literal terms,
50
+ joined with OR for lenient matching (any term matches).
51
+ Empty/whitespace-only queries return a wildcard match-all.
52
+ """
53
+ if not query or not query.strip():
54
+ return "*"
55
+ # Split into words, wrap each in double quotes (escaping any internal quotes)
56
+ terms = []
57
+ for word in query.split():
58
+ # Remove internal double quotes to prevent FTS5 injection
59
+ clean = word.replace('"', "")
60
+ if clean:
61
+ terms.append(f'"{clean}"')
62
+ # Use OR between terms so matching is lenient (any term matches)
63
+ # Without OR, FTS5 defaults to AND (all terms must be present)
64
+ return " OR ".join(terms) if terms else "*"
65
+
66
+
67
def serialize_f32(vector: List[float]) -> bytes:
    """Pack a sequence of floats into raw float32 bytes for sqlite-vec.

    Uses ``struct`` format ``"<n>f"`` (native byte order, 4 bytes per value),
    so the result is ``4 * len(vector)`` bytes long.
    """
    layout = struct.Struct("%df" % len(vector))
    return layout.pack(*vector)
70
+
71
+
72
+ class VectorStore:
73
+ """SQLite-vec based vector store."""
74
+
75
+ def __init__(self, db_path: Path):
76
+ self.db_path = db_path
77
+ self.db_path.parent.mkdir(parents=True, exist_ok=True)
78
+ self._init_db()
79
+
80
def _init_db(self) -> None:
    """Open the database, load sqlite-vec, and create or upgrade the schema.

    Every table, index, and trigger is created with ``IF NOT EXISTS`` and
    missing columns are added with ``ALTER TABLE``, so the same code path
    serves both a brand-new file and an existing database. The opened
    connection is stored on ``self.conn``.
    """
    self.conn = apsw.Connection(str(self.db_path))
    # Extension loading is switched on only while sqlite-vec is loaded.
    self.conn.enableloadextension(True)
    self.conn.loadextension(sqlite_vec.loadable_path())
    self.conn.enableloadextension(False)

    cursor = self.conn.cursor()

    # AIDEV-NOTE: busy_timeout is critical for multi-process access (daemon + MCP + enrichment).
    # Without this, concurrent writes get SQLITE_BUSY immediately and crash silently.
    cursor.execute("PRAGMA busy_timeout = 5000")

    # Create tables
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS chunks (
            id TEXT PRIMARY KEY,
            content TEXT NOT NULL,
            metadata TEXT NOT NULL,
            source_file TEXT NOT NULL,
            project TEXT,
            content_type TEXT,
            value_type TEXT,
            char_count INTEGER,
            source TEXT,
            sender TEXT,
            language TEXT,
            conversation_id TEXT,
            position INTEGER,
            context_summary TEXT
        )
    """)

    # Add columns if upgrading existing DB (check existing columns first).
    # row[1] of PRAGMA table_info is the column name. Columns from "tags"
    # onward are not part of the CREATE TABLE above, so a fresh database
    # receives them through this loop as well.
    existing_cols = {row[1] for row in cursor.execute("PRAGMA table_info(chunks)")}
    for col, typ in [
        ("source", "TEXT"),
        ("sender", "TEXT"),
        ("language", "TEXT"),
        ("conversation_id", "TEXT"),
        ("position", "INTEGER"),
        ("context_summary", "TEXT"),
        ("tags", "TEXT"),
        ("tag_confidence", "REAL"),
        # Enrichment columns (Phase 5)
        ("summary", "TEXT"),
        ("importance", "REAL"),
        ("intent", "TEXT"),
        ("enriched_at", "TEXT"),
        # Extended enrichment columns (Phase 3 — Gemini backfill)
        ("primary_symbols", "TEXT"),  # JSON array of classes/functions/files
        ("resolved_query", "TEXT"),  # HyDE-style hypothetical question
        ("epistemic_level", "TEXT"),  # hypothesis/substantiated/validated
        ("version_scope", "TEXT"),  # version or system state discussed
        ("debt_impact", "TEXT"),  # introduction/resolution/none
        ("external_deps", "TEXT"),  # JSON array of libraries/APIs
        # Phase 3: created_at for date filtering
        ("created_at", "TEXT"),  # ISO 8601 timestamp of when chunk was created/ingested
    ]:
        if col not in existing_cols:
            # Identifiers come from the hard-coded list above, not from input.
            cursor.execute(f"ALTER TABLE chunks ADD COLUMN {col} {typ}")

    # Indexes for filtering
    for idx, col in [
        ("idx_chunks_source", "source"),
        ("idx_chunks_sender", "sender"),
        ("idx_chunks_conversation", "conversation_id"),
        ("idx_chunks_intent", "intent"),
        ("idx_chunks_importance", "importance"),
        ("idx_chunks_enriched", "enriched_at"),
        ("idx_chunks_created", "created_at"),
    ]:
        cursor.execute(f"CREATE INDEX IF NOT EXISTS {idx} ON chunks({col})")

    # Create vector table with 1024 dimensions for bge-large-en-v1.5
    cursor.execute("""
        CREATE VIRTUAL TABLE IF NOT EXISTS chunk_vectors USING vec0(
            chunk_id TEXT PRIMARY KEY,
            embedding FLOAT[1024]
        )
    """)

    # FTS5 full-text search table for hybrid search
    cursor.execute("""
        CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
            content, chunk_id UNINDEXED
        )
    """)

    # Triggers to keep FTS5 in sync with chunks table
    cursor.execute("""
        CREATE TRIGGER IF NOT EXISTS chunks_fts_insert AFTER INSERT ON chunks BEGIN
            INSERT INTO chunks_fts(content, chunk_id) VALUES (new.content, new.id);
        END
    """)
    cursor.execute("""
        CREATE TRIGGER IF NOT EXISTS chunks_fts_delete AFTER DELETE ON chunks BEGIN
            DELETE FROM chunks_fts WHERE chunk_id = old.id;
        END
    """)
    # An update of `content` is mirrored as delete + re-insert in the FTS table.
    cursor.execute("""
        CREATE TRIGGER IF NOT EXISTS chunks_fts_update AFTER UPDATE OF content ON chunks BEGIN
            DELETE FROM chunks_fts WHERE chunk_id = old.id;
            INSERT INTO chunks_fts(content, chunk_id) VALUES (new.content, new.id);
        END
    """)

    # Phase 8b: Git overlay tables
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS session_context (
            session_id TEXT PRIMARY KEY,
            project TEXT,
            branch TEXT,
            pr_number INTEGER,
            commit_shas TEXT,
            files_changed TEXT,
            started_at TEXT,
            ended_at TEXT,
            created_at TEXT
        )
    """)
    # Phase 8c: Plan linking columns on session_context
    existing_sc_cols = {row[1] for row in cursor.execute("PRAGMA table_info(session_context)")}
    for col in ("plan_name", "plan_phase", "story_id"):
        if col not in existing_sc_cols:
            cursor.execute(f"ALTER TABLE session_context ADD COLUMN {col} TEXT")
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS file_interactions (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            file_path TEXT NOT NULL,
            timestamp TEXT,
            session_id TEXT,
            action TEXT,
            chunk_id TEXT,
            project TEXT
        )
    """)
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_file_interactions_path ON file_interactions(file_path)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_file_interactions_session ON file_interactions(session_id)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_session_context_project ON session_context(project)")

    # Phase 8a: Operations table
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS operations (
            id TEXT PRIMARY KEY,
            session_id TEXT NOT NULL,
            operation_type TEXT,
            chunk_ids TEXT,
            summary TEXT,
            outcome TEXT,
            started_at TEXT,
            ended_at TEXT,
            step_count INTEGER DEFAULT 0,
            created_at TEXT
        )
    """)
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_operations_session ON operations(session_id)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_operations_type ON operations(operation_type)")

    # Phase 8d: Topic chains table
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS topic_chains (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            file_path TEXT NOT NULL,
            session_a TEXT NOT NULL,
            session_b TEXT NOT NULL,
            shared_actions INTEGER DEFAULT 0,
            time_delta_hours REAL,
            project TEXT,
            created_at TEXT
        )
    """)
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_topic_chains_file ON topic_chains(file_path)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_topic_chains_session ON topic_chains(session_a)")

    # Phase 7: Session-level enrichment table
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS session_enrichments (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            session_id TEXT NOT NULL UNIQUE,
            file_path TEXT,
            enrichment_version TEXT NOT NULL DEFAULT '1.0',
            enrichment_model TEXT,
            enrichment_timestamp TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%fZ','now')),

            -- Timing (flat — for temporal queries)
            session_start_time TEXT,
            session_end_time TEXT,
            duration_seconds INTEGER,

            -- Message dynamics (flat — for aggregation dashboards)
            message_count INTEGER NOT NULL DEFAULT 0,
            user_message_count INTEGER NOT NULL DEFAULT 0,
            assistant_message_count INTEGER NOT NULL DEFAULT 0,
            tool_call_count INTEGER NOT NULL DEFAULT 0,

            -- Content analysis (flat — for filtering)
            session_summary TEXT,
            primary_intent TEXT,
            outcome TEXT CHECK(outcome IN ('success','partial_success','failure','abandoned','ongoing')),
            complexity_score INTEGER CHECK(complexity_score BETWEEN 1 AND 10),

            -- Quality scores (flat — for dashboards)
            session_quality_score INTEGER CHECK(session_quality_score BETWEEN 1 AND 10),

            -- Decisions, corrections, learnings (JSON — variable-length arrays)
            decisions_made TEXT DEFAULT '[]',
            corrections TEXT DEFAULT '[]',
            learnings TEXT DEFAULT '[]',
            mistakes TEXT DEFAULT '[]',
            patterns TEXT DEFAULT '[]',

            -- Topic tags (JSON array)
            topic_tags TEXT DEFAULT '[]',

            -- Tool usage (JSON — per-tool stats)
            tool_usage_stats TEXT DEFAULT '[]',

            -- Narrative (text — for human reading)
            what_worked TEXT,
            what_failed TEXT,

            -- Embedding for session-level semantic search
            summary_embedding BLOB
        )
    """)
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_session_enrichments_session ON session_enrichments(session_id)")
    # NOTE(review): despite the "_project" suffix this index covers primary_intent.
    cursor.execute(
        "CREATE INDEX IF NOT EXISTS idx_session_enrichments_project ON session_enrichments(primary_intent)"
    )
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_session_enrichments_outcome ON session_enrichments(outcome)")
    cursor.execute(
        "CREATE INDEX IF NOT EXISTS idx_session_enrichments_quality ON session_enrichments(session_quality_score)"
    )

    # Phase 7: FTS5 for session narrative search
    cursor.execute("""
        CREATE VIRTUAL TABLE IF NOT EXISTS session_enrichments_fts USING fts5(
            session_summary, what_worked, what_failed, session_id UNINDEXED
        )
    """)

    # Check if FTS5 needs backfill (existing DB without FTS5 data).
    # The copy runs only when chunks has rows and chunks_fts has none.
    fts_count = list(cursor.execute("SELECT COUNT(*) FROM chunks_fts"))[0][0]
    chunk_count = list(cursor.execute("SELECT COUNT(*) FROM chunks"))[0][0]
    if chunk_count > 0 and fts_count == 0:
        cursor.execute("""
            INSERT INTO chunks_fts(content, chunk_id)
            SELECT content, id FROM chunks
        """)
330
+
331
+ def upsert_chunks(self, chunks: List[Dict[str, Any]], embeddings: List[List[float]]) -> int:
332
+ """Upsert chunks with embeddings."""
333
+ if len(chunks) != len(embeddings):
334
+ raise ValueError("Chunks and embeddings must have same length")
335
+
336
+ cursor = self.conn.cursor()
337
+
338
+ for chunk, embedding in zip(chunks, embeddings):
339
+ chunk_id = chunk["id"]
340
+
341
+ # Upsert chunk — preserve enrichment columns on re-index
342
+ cursor.execute(
343
+ """
344
+ INSERT INTO chunks
345
+ (id, content, metadata, source_file, project,
346
+ content_type, value_type, char_count, source, created_at)
347
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
348
+ ON CONFLICT(id) DO UPDATE SET
349
+ content = excluded.content,
350
+ metadata = excluded.metadata,
351
+ source_file = excluded.source_file,
352
+ project = excluded.project,
353
+ content_type = excluded.content_type,
354
+ value_type = excluded.value_type,
355
+ char_count = excluded.char_count,
356
+ source = excluded.source,
357
+ created_at = COALESCE(chunks.created_at, excluded.created_at)
358
+ """,
359
+ (
360
+ chunk_id,
361
+ chunk["content"],
362
+ json.dumps(chunk["metadata"]),
363
+ chunk["source_file"],
364
+ chunk.get("project"),
365
+ chunk.get("content_type"),
366
+ chunk.get("value_type"),
367
+ chunk.get("char_count", 0),
368
+ chunk.get("source", "claude_code"),
369
+ chunk.get("created_at"),
370
+ ),
371
+ )
372
+
373
+ # Upsert vector - vec0 doesn't support INSERT OR REPLACE, so delete first
374
+ cursor.execute("DELETE FROM chunk_vectors WHERE chunk_id = ?", (chunk_id,))
375
+ cursor.execute(
376
+ """
377
+ INSERT INTO chunk_vectors (chunk_id, embedding)
378
+ VALUES (?, ?)
379
+ """,
380
+ (chunk_id, serialize_f32(embedding)),
381
+ )
382
+
383
+ return len(chunks)
384
+
385
+ def search(
386
+ self,
387
+ query_embedding: Optional[List[float]] = None,
388
+ query_text: Optional[str] = None,
389
+ n_results: int = 10,
390
+ project_filter: Optional[str] = None,
391
+ content_type_filter: Optional[str] = None,
392
+ source_filter: Optional[str] = None,
393
+ sender_filter: Optional[str] = None,
394
+ language_filter: Optional[str] = None,
395
+ tag_filter: Optional[str] = None,
396
+ intent_filter: Optional[str] = None,
397
+ importance_min: Optional[float] = None,
398
+ date_from: Optional[str] = None,
399
+ date_to: Optional[str] = None,
400
+ ) -> Dict[str, List]:
401
+ """Search chunks by embedding or text."""
402
+
403
+ cursor = self.conn.cursor()
404
+
405
+ if query_embedding is not None:
406
+ # Vector similarity search
407
+ query_bytes = serialize_f32(query_embedding)
408
+
409
+ where_clauses = []
410
+ filter_params: list = []
411
+
412
+ if project_filter:
413
+ where_clauses.append("c.project = ?")
414
+ filter_params.append(project_filter)
415
+ if content_type_filter:
416
+ where_clauses.append("c.content_type = ?")
417
+ filter_params.append(content_type_filter)
418
+ if source_filter:
419
+ where_clauses.append("c.source = ?")
420
+ filter_params.append(source_filter)
421
+ if sender_filter:
422
+ where_clauses.append("c.sender = ?")
423
+ filter_params.append(sender_filter)
424
+ if language_filter:
425
+ where_clauses.append("c.language = ?")
426
+ filter_params.append(language_filter)
427
+ if tag_filter:
428
+ where_clauses.append(
429
+ "c.tags IS NOT NULL AND json_valid(c.tags) = 1 AND EXISTS (SELECT 1 FROM json_each(c.tags) WHERE value = ?)"
430
+ )
431
+ filter_params.append(tag_filter)
432
+ if intent_filter:
433
+ where_clauses.append("c.intent = ?")
434
+ filter_params.append(intent_filter)
435
+ if importance_min is not None:
436
+ where_clauses.append("c.importance >= ?")
437
+ filter_params.append(importance_min)
438
+ if date_from:
439
+ where_clauses.append("c.created_at >= ?")
440
+ filter_params.append(date_from)
441
+ if date_to:
442
+ where_clauses.append("c.created_at <= ?")
443
+ filter_params.append(date_to)
444
+
445
+ where_sql = ""
446
+ if where_clauses:
447
+ where_sql = "AND " + " AND ".join(where_clauses)
448
+
449
+ # sqlite-vec KNN: MATCH and k must bind before filter params
450
+ params = [query_bytes, n_results] + filter_params
451
+ query = f"""
452
+ SELECT c.id, c.content, c.metadata, c.source_file, c.project,
453
+ c.content_type, c.value_type, c.char_count,
454
+ v.distance,
455
+ c.summary, c.tags, c.importance, c.intent,
456
+ c.created_at, c.source
457
+ FROM chunk_vectors v
458
+ JOIN chunks c ON v.chunk_id = c.id
459
+ WHERE v.embedding MATCH ? AND k = ? {where_sql}
460
+ ORDER BY v.distance
461
+ """
462
+
463
+ results = list(cursor.execute(query, params))
464
+
465
+ elif query_text is not None:
466
+ # Text search using LIKE
467
+ where_clauses = ["content LIKE ?"]
468
+ params = [f"%{query_text}%"]
469
+
470
+ if project_filter:
471
+ where_clauses.append("project = ?")
472
+ params.append(project_filter)
473
+ if content_type_filter:
474
+ where_clauses.append("content_type = ?")
475
+ params.append(content_type_filter)
476
+ if source_filter:
477
+ where_clauses.append("source = ?")
478
+ params.append(source_filter)
479
+ if sender_filter:
480
+ where_clauses.append("sender = ?")
481
+ params.append(sender_filter)
482
+ if language_filter:
483
+ where_clauses.append("language = ?")
484
+ params.append(language_filter)
485
+ if tag_filter:
486
+ where_clauses.append(
487
+ "tags IS NOT NULL AND json_valid(tags) = 1 AND EXISTS (SELECT 1 FROM json_each(tags) WHERE value = ?)"
488
+ )
489
+ params.append(tag_filter)
490
+ if intent_filter:
491
+ where_clauses.append("intent = ?")
492
+ params.append(intent_filter)
493
+ if importance_min is not None:
494
+ where_clauses.append("importance >= ?")
495
+ params.append(importance_min)
496
+ if date_from:
497
+ where_clauses.append("created_at >= ?")
498
+ params.append(date_from)
499
+ if date_to:
500
+ where_clauses.append("created_at <= ?")
501
+ params.append(date_to)
502
+
503
+ params.append(n_results)
504
+
505
+ query = f"""
506
+ SELECT id, content, metadata, source_file, project,
507
+ content_type, value_type, char_count,
508
+ NULL as distance,
509
+ summary, tags, importance, intent,
510
+ created_at, source
511
+ FROM chunks
512
+ WHERE {" AND ".join(where_clauses)}
513
+ ORDER BY char_count DESC
514
+ LIMIT ?
515
+ """
516
+
517
+ results = list(cursor.execute(query, params))
518
+ else:
519
+ raise ValueError("Either query_embedding or query_text must be provided")
520
+
521
+ # Format results
522
+ ids = []
523
+ documents = []
524
+ metadatas = []
525
+ distances = []
526
+
527
+ for row in results:
528
+ ids.append(row[0]) # chunk id
529
+ documents.append(row[1]) # content
530
+ metadata = json.loads(row[2]) # metadata
531
+ metadata.update(
532
+ {
533
+ "source_file": row[3],
534
+ "project": row[4],
535
+ "content_type": row[5],
536
+ "value_type": row[6],
537
+ "char_count": row[7],
538
+ }
539
+ )
540
+ # Enrichment fields (may be None if not yet enriched)
541
+ if row[9]:
542
+ metadata["summary"] = row[9]
543
+ if row[10]:
544
+ try:
545
+ metadata["tags"] = json.loads(row[10])
546
+ except (json.JSONDecodeError, TypeError):
547
+ pass
548
+ if row[11] is not None:
549
+ metadata["importance"] = row[11]
550
+ if row[12]:
551
+ metadata["intent"] = row[12]
552
+ # Temporal and source metadata
553
+ if row[13]:
554
+ metadata["created_at"] = row[13]
555
+ if row[14]:
556
+ metadata["source"] = row[14]
557
+ metadatas.append(metadata)
558
+ distances.append(row[8]) # distance (None for text search)
559
+
560
+ return {
561
+ "ids": [ids],
562
+ "documents": [documents],
563
+ "metadatas": [metadatas],
564
+ "distances": [distances],
565
+ }
566
+
567
+ def enrich_results_with_session_context(self, results: Dict[str, List]) -> Dict[str, List]:
568
+ """Add session enrichment metadata to search results.
569
+
570
+ For each result, if its session has been enriched, add session_summary,
571
+ session_outcome, and session_quality_score to the metadata.
572
+ """
573
+ if not results.get("metadatas") or not results["metadatas"][0]:
574
+ return results
575
+
576
+ cursor = self.conn.cursor()
577
+ # Cache session lookups to avoid repeated queries
578
+ session_cache: Dict[str, Optional[Dict]] = {}
579
+
580
+ for meta in results["metadatas"][0]:
581
+ source_file = meta.get("source_file", "")
582
+ if not source_file:
583
+ continue
584
+
585
+ # Extract session ID from source_file
586
+ import os
587
+
588
+ session_id = os.path.splitext(os.path.basename(source_file))[0]
589
+ if not session_id:
590
+ continue
591
+
592
+ if session_id not in session_cache:
593
+ rows = list(
594
+ cursor.execute(
595
+ """SELECT session_summary, primary_intent, outcome,
596
+ session_quality_score
597
+ FROM session_enrichments WHERE session_id = ?""",
598
+ (session_id,),
599
+ )
600
+ )
601
+ if rows:
602
+ session_cache[session_id] = {
603
+ "session_summary": rows[0][0],
604
+ "session_intent": rows[0][1],
605
+ "session_outcome": rows[0][2],
606
+ "session_quality": rows[0][3],
607
+ }
608
+ else:
609
+ session_cache[session_id] = None
610
+
611
+ enrichment = session_cache[session_id]
612
+ if enrichment:
613
+ for k, v in enrichment.items():
614
+ if v is not None:
615
+ meta[k] = v
616
+
617
+ return results
618
+
619
+ def count(self) -> int:
620
+ """Get total number of chunks."""
621
+ cursor = self.conn.cursor()
622
+ result = list(cursor.execute("SELECT COUNT(*) FROM chunks"))
623
+ return result[0][0] if result else 0
624
+
625
+ def get_stats(self) -> Dict[str, Any]:
626
+ """Get collection statistics."""
627
+ count = self.count()
628
+
629
+ if count == 0:
630
+ return {"total_chunks": 0, "projects": [], "content_types": []}
631
+
632
+ cursor = self.conn.cursor()
633
+
634
+ # Get unique projects and content types
635
+ results = list(
636
+ cursor.execute("""
637
+ SELECT DISTINCT project, content_type
638
+ FROM chunks
639
+ WHERE project IS NOT NULL AND content_type IS NOT NULL
640
+ LIMIT 100
641
+ """)
642
+ )
643
+
644
+ projects = set()
645
+ content_types = set()
646
+
647
+ for project, content_type in results:
648
+ projects.add(project)
649
+ content_types.add(content_type)
650
+
651
+ return {
652
+ "total_chunks": count,
653
+ "projects": list(projects),
654
+ "content_types": list(content_types),
655
+ }
656
+
657
+ def get_all_chunks(self, limit: int = 10000) -> List[Dict[str, Any]]:
658
+ """Get all chunks for BM25 fitting (limited for performance)."""
659
+ cursor = self.conn.cursor()
660
+ results = list(
661
+ cursor.execute(
662
+ """
663
+ SELECT id, content, metadata, source_file, project, content_type
664
+ FROM chunks
665
+ LIMIT ?
666
+ """,
667
+ (limit,),
668
+ )
669
+ )
670
+
671
+ return [
672
+ {
673
+ "id": row[0],
674
+ "content": row[1],
675
+ "metadata": json.loads(row[2]) if row[2] else {},
676
+ "source_file": row[3],
677
+ "project": row[4],
678
+ "content_type": row[5],
679
+ }
680
+ for row in results
681
+ ]
682
+
683
def hybrid_search(
    self,
    query_embedding: List[float],
    query_text: str,
    n_results: int = 10,
    project_filter: Optional[str] = None,
    content_type_filter: Optional[str] = None,
    source_filter: Optional[str] = None,
    sender_filter: Optional[str] = None,
    language_filter: Optional[str] = None,
    tag_filter: Optional[str] = None,
    intent_filter: Optional[str] = None,
    importance_min: Optional[float] = None,
    date_from: Optional[str] = None,
    date_to: Optional[str] = None,
    k: int = 60,
) -> Dict[str, List]:
    """Hybrid search combining semantic (vector) + keyword (FTS5) via Reciprocal Rank Fusion.

    Both legs fetch ``n_results * 3`` candidates under the same filters.
    Each chunk scores ``1 / (k + rank)`` per leg it appears in (rank is
    0-based); the scores are summed and the top ``n_results`` returned.

    Args:
        query_embedding: Vector passed to :meth:`search` for the semantic leg.
        query_text: Free text for the FTS5 leg. When it contains no usable
            terms the keyword leg is skipped.
        n_results: Number of fused results to return.
        project_filter, content_type_filter, source_filter, sender_filter,
        language_filter, tag_filter, intent_filter, importance_min,
        date_from, date_to: Same meaning as in :meth:`search`; applied to
            both legs.
        k: RRF constant added to every rank before taking the reciprocal.

    Returns:
        Dict with keys ``ids``, ``documents``, ``metadatas`` and
        ``distances``, each a list holding one inner list. ``distances`` is
        ``None`` for chunks found only by the keyword leg.
    """
    fetch_limit = n_results * 3

    # 1. Semantic search — get more results for fusion
    semantic = self.search(
        query_embedding=query_embedding,
        n_results=fetch_limit,
        project_filter=project_filter,
        content_type_filter=content_type_filter,
        source_filter=source_filter,
        sender_filter=sender_filter,
        language_filter=language_filter,
        tag_filter=tag_filter,
        intent_filter=intent_filter,
        importance_min=importance_min,
        date_from=date_from,
        date_to=date_to,
    )

    # chunk_id -> (rank, document, metadata, distance); first occurrence wins.
    semantic_by_id: Dict[str, tuple] = {}
    for rank, (cid, doc, meta, dist) in enumerate(
        zip(
            semantic["ids"][0],
            semantic["documents"][0],
            semantic["metadatas"][0],
            semantic["distances"][0],
        )
    ):
        if cid and cid not in semantic_by_id:
            semantic_by_id[cid] = (rank, doc, meta, dist)

    # 2. FTS5 keyword search
    # AIDEV-NOTE: FTS5 MATCH requires escaped query text. Special chars like
    # '.', '*', '"', '(', ')' cause syntax errors if passed raw.
    # Wrap each term in double quotes to treat as literal strings.
    fts_query = _escape_fts5_query(query_text)
    fts_rows: list = []
    # The helper returns "*" when no term survives escaping. FTS5 accepts
    # "*" only as a prefix marker after a token, so that value is not sent
    # to MATCH and the keyword leg is skipped.
    if fts_query != "*":
        fts_extra: List[str] = []
        fts_params: list = [fts_query]
        # Every filter is applied in SQL, before LIMIT, so keyword-only hits
        # obey the same constraints as the semantic leg.
        for column, wanted in (
            ("project", project_filter),
            ("content_type", content_type_filter),
            ("source", source_filter),
            ("sender", sender_filter),
            ("language", language_filter),
        ):
            if wanted:
                fts_extra.append(f"AND c.{column} = ?")
                fts_params.append(wanted)
        if tag_filter:
            fts_extra.append(
                "AND c.tags IS NOT NULL AND json_valid(c.tags) = 1 "
                "AND EXISTS (SELECT 1 FROM json_each(c.tags) WHERE value = ?)"
            )
            fts_params.append(tag_filter)
        if intent_filter:
            fts_extra.append("AND c.intent = ?")
            fts_params.append(intent_filter)
        if importance_min is not None:
            fts_extra.append("AND c.importance >= ?")
            fts_params.append(importance_min)
        if date_from:
            fts_extra.append("AND c.created_at >= ?")
            fts_params.append(date_from)
        if date_to:
            fts_extra.append("AND c.created_at <= ?")
            fts_params.append(date_to)
        fts_params.append(fetch_limit)

        fts_rows = list(
            self.conn.cursor().execute(
                f"""
                SELECT f.chunk_id, f.rank,
                       c.content, c.metadata, c.source_file, c.project,
                       c.content_type, c.value_type, c.char_count,
                       c.summary, c.tags, c.importance, c.intent,
                       c.created_at, c.source
                FROM chunks_fts f
                JOIN chunks c ON f.chunk_id = c.id
                WHERE chunks_fts MATCH ? {" ".join(fts_extra)}
                ORDER BY f.rank
                LIMIT ?
                """,
                fts_params,
            )
        )

    # Keyword ranks for all hits; document/metadata only for chunks the
    # semantic leg did not return (the semantic copy carries the distance).
    fts_ranks: Dict[str, int] = {}
    fts_only: Dict[str, tuple] = {}
    for rank, row in enumerate(fts_rows):
        chunk_id = row[0]
        if chunk_id in fts_ranks:
            continue  # keep the best rank if the FTS table holds duplicates
        fts_ranks[chunk_id] = rank
        if chunk_id in semantic_by_id:
            continue

        meta = json.loads(row[3]) if row[3] else {}
        meta.update(
            {
                "source_file": row[4],
                "project": row[5],
                "content_type": row[6],
                "value_type": row[7],
                "char_count": row[8],
            }
        )
        if row[9]:
            meta["summary"] = row[9]
        if row[10]:
            try:
                meta["tags"] = json.loads(row[10])
            except (json.JSONDecodeError, TypeError):
                # Unparseable tags are omitted rather than failing the search.
                pass
        if row[11] is not None:
            meta["importance"] = row[11]
        if row[12]:
            meta["intent"] = row[12]
        if row[13]:
            meta["created_at"] = row[13]
        if row[14]:
            meta["source"] = row[14]
        fts_only[chunk_id] = (row[2], meta)

    # 3. Reciprocal Rank Fusion — deduplicate by chunk_id
    scored = []
    for cid in set(semantic_by_id) | set(fts_ranks):
        score = 0.0
        sem_entry = semantic_by_id.get(cid)
        fts_rank = fts_ranks.get(cid)

        if sem_entry is not None:
            sem_rank, doc, meta, dist = sem_entry
            score += 1.0 / (k + sem_rank)
        else:
            doc, meta = fts_only[cid]
            dist = None
        if fts_rank is not None:
            score += 1.0 / (k + fts_rank)

        scored.append((score, cid, doc, meta, dist))

    # Highest score first; chunk id breaks ties so the order is reproducible.
    scored.sort(key=lambda item: (-item[0], item[1]))
    top = scored[:n_results]

    return {
        "ids": [[item[1] for item in top]],
        "documents": [[item[2] for item in top]],
        "metadatas": [[item[3] for item in top]],
        "distances": [[item[4] for item in top]],
    }
878
+
879
def get_context(self, chunk_id: str, before: int = 3, after: int = 3) -> Dict[str, Any]:
    """Return the chunks neighbouring ``chunk_id`` inside its conversation.

    The chunk's ``conversation_id`` and ``position`` are looked up first;
    every chunk of that conversation whose position lies within
    ``[position - before, position + after]`` is then returned in position
    order (the target itself included and flagged with ``is_target``).

    Returns:
        A dict with ``target`` and ``context`` keys. An ``error`` key is
        added when the chunk does not exist or carries no
        conversation_id/position.
    """
    db = self.conn.cursor()

    # Step 1: locate the target chunk.
    found = list(
        db.execute(
            """
            SELECT conversation_id, position, content, metadata
            FROM chunks WHERE id = ?
            """,
            (chunk_id,),
        )
    )
    if not found:
        return {"target": None, "context": [], "error": "Chunk not found"}

    conv_id, position, content, _metadata = found[0]

    # Without a conversation and a position there is no window to compute.
    if position is None or not conv_id:
        return {
            "target": {"id": chunk_id, "content": content, "position": None},
            "context": [],
            "error": "Chunk has no conversation context (conversation_id/position not set)",
        }

    # Step 2: fetch the positional window around the target.
    window_start = position - before
    window_end = position + after
    neighbours = list(
        db.execute(
            """
            SELECT id, content, position, content_type
            FROM chunks
            WHERE conversation_id = ?
            AND position BETWEEN ? AND ?
            ORDER BY position
            """,
            (conv_id, window_start, window_end),
        )
    )

    context = [
        {
            "id": row_id,
            "content": row_content,
            "position": row_position,
            "content_type": row_type,
            "is_target": row_id == chunk_id,
        }
        for row_id, row_content, row_position, row_type in neighbours
    ]

    return {
        "target": {"id": chunk_id, "content": content, "position": position},
        "context": context,
    }
936
+
937
def get_unenriched_chunks(
    self,
    batch_size: int = 50,
    content_types: Optional[List[str]] = None,
    min_char_count: Optional[int] = None,
    source: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Fetch a batch of chunks whose ``enriched_at`` is still NULL.

    Args:
        batch_size: Maximum number of rows returned (SQL ``LIMIT``).
        content_types: When given, only chunks with one of these
            ``content_type`` values are returned.
        min_char_count: Minimum ``char_count``. When None, the threshold is
            obtained from ``source_aware_min_chars(source)``.
        source: When given, only chunks with this ``source`` are returned.

    Returns:
        A list of dicts (id, content, source_file, project, content_type,
        conversation_id, position, char_count), highest rowid first.
    """
    cursor = self.conn.cursor()

    if min_char_count is None:
        threshold = source_aware_min_chars(source)
    else:
        threshold = min_char_count

    conditions = ["enriched_at IS NULL", "char_count >= ?"]
    bindings: list = [threshold]

    if source:
        conditions.append("source = ?")
        bindings.append(source)

    if content_types:
        marks = ",".join(["?"] * len(content_types))
        conditions.append(f"content_type IN ({marks})")
        bindings.extend(content_types)

    # LIMIT is the final placeholder in the statement.
    bindings.append(batch_size)

    rows = list(
        cursor.execute(
            f"""
            SELECT id, content, source_file, project, content_type,
            conversation_id, position, char_count
            FROM chunks
            WHERE {" AND ".join(conditions)}
            ORDER BY rowid DESC
            LIMIT ?
            """,
            bindings,
        )
    )

    keys = (
        "id",
        "content",
        "source_file",
        "project",
        "content_type",
        "conversation_id",
        "position",
        "char_count",
    )
    return [{key: row[idx] for idx, key in enumerate(keys)} for row in rows]
993
+
994
def update_enrichment(
    self,
    chunk_id: str,
    summary: Optional[str] = None,
    tags: Optional[List[str]] = None,
    importance: Optional[float] = None,
    intent: Optional[str] = None,
    primary_symbols: Optional[List[str]] = None,
    resolved_query: Optional[str] = None,
    epistemic_level: Optional[str] = None,
    version_scope: Optional[str] = None,
    debt_impact: Optional[str] = None,
    external_deps: Optional[List[str]] = None,
) -> None:
    """Write enrichment fields for one chunk and stamp ``enriched_at``.

    ``enriched_at`` is always set to the current UTC time in ISO format.
    Every other column is only touched when its argument is not None;
    list arguments (tags, primary_symbols, external_deps) are stored as
    JSON text.

    Raises:
        apsw.BusyError: If the database is still busy on the third attempt.
    """
    import time as _time
    from datetime import datetime, timezone

    cursor = self.conn.cursor()

    assignments = ["enriched_at = ?"]
    values: list = [datetime.now(timezone.utc).isoformat()]

    # (column, raw value, needs JSON encoding) in the original column order.
    optional_columns = (
        ("summary", summary, False),
        ("tags", tags, True),
        ("importance", importance, False),
        ("intent", intent, False),
        ("primary_symbols", primary_symbols, True),
        ("resolved_query", resolved_query, False),
        ("epistemic_level", epistemic_level, False),
        ("version_scope", version_scope, False),
        ("debt_impact", debt_impact, False),
        ("external_deps", external_deps, True),
    )
    for column, raw, as_json in optional_columns:
        if raw is None:
            continue
        assignments.append(f"{column} = ?")
        values.append(json.dumps(raw) if as_json else raw)

    values.append(chunk_id)
    statement = f"UPDATE chunks SET {', '.join(assignments)} WHERE id = ?"

    # Retry on SQLITE_BUSY — concurrent access from daemon/MCP/enrichment.
    # Waits 0.5s, then 1.0s; the third failure is re-raised.
    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            cursor.execute(statement, values)
        except apsw.BusyError:
            if attempt == max_attempts:
                raise
            _time.sleep(0.5 * attempt)
        else:
            return
1059
+
1060
def get_enrichment_stats(self) -> Dict[str, Any]:
    """Summarise chunk enrichment progress.

    Counts are derived from ``chunks.enriched_at``:

    * ``enriched``  – non-NULL and not starting with ``skipped:``
    * ``skipped``   – starts with ``skipped:``
    * ``remaining`` – NULL
    * ``enrichable`` – total minus skipped

    ``percent`` is enriched / enrichable; ``naive_percent`` is
    (enriched + skipped) / total. Both are rounded to one decimal and are 0
    when their denominator is 0. ``by_intent`` maps each non-NULL intent to
    its chunk count.
    """
    cursor = self.conn.cursor()

    def scalar(sql: str) -> Any:
        # First column of the first row of a single-value query.
        return list(cursor.execute(sql))[0][0]

    total = scalar("SELECT COUNT(*) FROM chunks")
    enriched = scalar(
        "SELECT COUNT(*) FROM chunks WHERE enriched_at IS NOT NULL AND enriched_at NOT LIKE 'skipped:%'"
    )
    skipped = scalar("SELECT COUNT(*) FROM chunks WHERE enriched_at LIKE 'skipped:%'")
    remaining = scalar("SELECT COUNT(*) FROM chunks WHERE enriched_at IS NULL")
    enrichable = total - skipped

    intent_rows = list(
        cursor.execute("""
            SELECT intent, COUNT(*) FROM chunks
            WHERE intent IS NOT NULL
            GROUP BY intent ORDER BY COUNT(*) DESC
            """)
    )
    by_intent = {}
    for intent, occurrences in intent_rows:
        by_intent[intent] = occurrences

    if enrichable > 0:
        percent = round(enriched / enrichable * 100, 1)
    else:
        percent = 0
    if total > 0:
        naive_percent = round((enriched + skipped) / total * 100, 1)
    else:
        naive_percent = 0

    return {
        "total_chunks": total,
        "enrichable": enrichable,
        "enriched": enriched,
        "skipped": skipped,
        "remaining": remaining,
        "percent": percent,
        "naive_percent": naive_percent,
        "by_intent": by_intent,
    }
1095
+
1096
+ # ─── Phase 8b: Git Overlay Methods ──────────────────────────────
1097
+
1098
def store_session_context(
    self,
    session_id: str,
    project: str,
    branch: Optional[str] = None,
    pr_number: Optional[int] = None,
    commit_shas: Optional[List[str]] = None,
    files_changed: Optional[List[str]] = None,
    started_at: Optional[str] = None,
    ended_at: Optional[str] = None,
    plan_name: Optional[str] = None,
    plan_phase: Optional[str] = None,
    story_id: Optional[str] = None,
) -> None:
    """Insert or replace the git context row for a session.

    When ``plan_name`` is None and a row already exists for the session,
    the stored plan_name is carried over, and plan_phase/story_id fall back
    to the stored values if the arguments are falsy. This keeps plan links
    alive across git-overlay re-runs. When ``plan_name`` is given, the plan
    fields are written exactly as passed.

    ``commit_shas`` and ``files_changed`` are stored as JSON text, or NULL
    when empty/None. ``created_at`` is set by SQLite to ``datetime('now')``.
    """
    cursor = self.conn.cursor()

    if plan_name is None:
        # Look up the current plan link so the REPLACE below does not wipe it.
        previous = list(
            cursor.execute(
                "SELECT plan_name, plan_phase, story_id FROM session_context WHERE session_id = ?",
                (session_id,),
            )
        )
        if previous:
            old_name, old_phase, old_story = previous[0]
            plan_name = old_name
            if not plan_phase:
                plan_phase = old_phase
            if not story_id:
                story_id = old_story

    shas_json = json.dumps(commit_shas) if commit_shas else None
    files_json = json.dumps(files_changed) if files_changed else None

    row = (
        session_id,
        project,
        branch,
        pr_number,
        shas_json,
        files_json,
        started_at,
        ended_at,
        plan_name,
        plan_phase,
        story_id,
    )
    cursor.execute(
        """
        INSERT OR REPLACE INTO session_context
        (session_id, project, branch, pr_number, commit_shas,
        files_changed, started_at, ended_at, created_at,
        plan_name, plan_phase, story_id)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, datetime('now'),
        ?, ?, ?)
        """,
        row,
    )
1154
+
1155
def store_file_interactions(self, interactions: List[Dict[str, Any]]) -> int:
    """Insert file interaction records one by one.

    Each record must contain ``file_path`` and ``session_id``;
    ``timestamp``, ``chunk_id`` and ``project`` default to None and
    ``action`` defaults to ``"unknown"``.

    Returns:
        The number of rows inserted (0 for an empty input).
    """
    if not interactions:
        return 0

    cursor = self.conn.cursor()
    insert_sql = """
        INSERT INTO file_interactions
        (file_path, timestamp, session_id, action, chunk_id, project)
        VALUES (?, ?, ?, ?, ?, ?)
        """
    stored = 0
    for stored, record in enumerate(interactions, start=1):
        cursor.execute(
            insert_sql,
            (
                record["file_path"],
                record.get("timestamp"),
                record["session_id"],
                record.get("action", "unknown"),
                record.get("chunk_id"),
                record.get("project"),
            ),
        )
    return stored
1179
+
1180
def get_file_timeline(
    self,
    file_path: str,
    project: Optional[str] = None,
    limit: int = 50,
) -> List[Dict[str, Any]]:
    """Return the interactions recorded for a file, oldest first.

    ``file_path`` is matched as a substring of the stored path. The LIKE
    metacharacters ``%`` and ``_`` in ``file_path`` are escaped so that they
    match literally (otherwise ``my_file.py`` would also match
    ``myXfile.py``).

    Args:
        file_path: Path fragment to search for.
        project: When given, only interactions of this project are returned.
        limit: Maximum number of rows.

    Returns:
        A list of dicts with file_path, timestamp, session_id, action,
        project, plus branch and pr_number from the joined session_context
        row (None when the session has no context row).
    """
    cursor = self.conn.cursor()

    # Escape the escape character first, then the two LIKE wildcards.
    literal = file_path.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")

    query = """
        SELECT fi.file_path, fi.timestamp, fi.session_id, fi.action,
        fi.project, sc.branch, sc.pr_number
        FROM file_interactions fi
        LEFT JOIN session_context sc ON fi.session_id = sc.session_id
        WHERE fi.file_path LIKE ? ESCAPE '\\'
        """
    params: list = [f"%{literal}%"]
    if project:
        query += " AND fi.project = ?"
        params.append(project)
    query += " ORDER BY fi.timestamp ASC LIMIT ?"
    params.append(limit)

    columns = ("file_path", "timestamp", "session_id", "action", "project", "branch", "pr_number")
    return [dict(zip(columns, row)) for row in cursor.execute(query, params)]
1216
+
1217
def get_session_context(self, session_id: str) -> Optional[Dict[str, Any]]:
    """Load the git context row of a session.

    Returns:
        None when no row exists. Otherwise a dict of the first nine
        columns (commit_shas and files_changed decoded through
        ``_safe_json_loads``); plan_name, plan_phase and story_id are
        added when the row has more than nine columns.
    """
    cursor = self.conn.cursor()
    matches = list(cursor.execute("SELECT * FROM session_context WHERE session_id = ?", (session_id,)))
    if not matches:
        return None

    record = matches[0]
    base_keys = (
        "session_id",
        "project",
        "branch",
        "pr_number",
        "commit_shas",
        "files_changed",
        "started_at",
        "ended_at",
        "created_at",
    )
    context = {key: record[idx] for idx, key in enumerate(base_keys)}
    # The two list columns are stored as JSON text.
    context["commit_shas"] = _safe_json_loads(record[4])
    context["files_changed"] = _safe_json_loads(record[5])

    # Plan linking columns (may not exist in old DBs)
    if len(record) > 9:
        context.update(
            plan_name=record[9],
            plan_phase=record[10],
            story_id=record[11],
        )
    return context
1241
+
1242
def update_session_plan(
    self,
    session_id: str,
    plan_name: Optional[str] = None,
    plan_phase: Optional[str] = None,
    story_id: Optional[str] = None,
) -> bool:
    """Overwrite the plan-link columns of an existing session row.

    All three columns are written exactly as passed, so an argument left
    at None stores NULL.

    Returns:
        True when the session exists (and was updated), False otherwise.
    """
    cursor = self.conn.cursor()
    found = list(
        cursor.execute(
            "SELECT 1 FROM session_context WHERE session_id = ?",
            (session_id,),
        )
    )
    if found:
        cursor.execute(
            """
            UPDATE session_context
            SET plan_name = ?, plan_phase = ?, story_id = ?
            WHERE session_id = ?
            """,
            (plan_name, plan_phase, story_id, session_id),
        )
        return True
    return False
1271
+
1272
def get_sessions_by_plan(
    self,
    plan_name: Optional[str] = None,
    project: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """List sessions that have a plan link, ordered by ``started_at``.

    Args:
        plan_name: When given, restrict to sessions linked to this plan.
        project: When given, restrict to sessions of this project.

    Returns:
        A list of dicts with session_id, project, branch, pr_number,
        started_at, ended_at, plan_name, plan_phase and story_id.
    """
    cursor = self.conn.cursor()

    columns = (
        "session_id",
        "project",
        "branch",
        "pr_number",
        "started_at",
        "ended_at",
        "plan_name",
        "plan_phase",
        "story_id",
    )
    conditions = ["plan_name IS NOT NULL"]
    bindings: list = []
    if plan_name:
        conditions.append("plan_name = ?")
        bindings.append(plan_name)
    if project:
        conditions.append("project = ?")
        bindings.append(project)

    query = (
        "SELECT session_id, project, branch, pr_number,"
        " started_at, ended_at, plan_name, plan_phase, story_id"
        " FROM session_context"
        " WHERE " + " AND ".join(conditions) + " ORDER BY started_at ASC"
    )
    return [dict(zip(columns, row)) for row in cursor.execute(query, bindings)]
1310
+
1311
def get_plan_linking_stats(self) -> Dict[str, Any]:
    """Count sessions with and without a plan link.

    Returns:
        A dict with total_sessions, linked_sessions (plan_name not NULL),
        unlinked_sessions (the difference) and ``plans`` mapping each plan
        name to its session count.
    """
    cursor = self.conn.cursor()

    total_rows = list(cursor.execute("SELECT COUNT(*) FROM session_context"))
    linked_rows = list(cursor.execute("SELECT COUNT(*) FROM session_context WHERE plan_name IS NOT NULL"))
    plan_rows = list(
        cursor.execute(
            "SELECT plan_name, COUNT(*) FROM session_context"
            " WHERE plan_name IS NOT NULL"
            " GROUP BY plan_name ORDER BY COUNT(*) DESC"
        )
    )

    total = total_rows[0][0]
    linked = linked_rows[0][0]
    per_plan = {}
    for name, sessions in plan_rows:
        per_plan[name] = sessions

    return {
        "total_sessions": total,
        "linked_sessions": linked,
        "unlinked_sessions": total - linked,
        "plans": per_plan,
    }
1329
+
1330
def clear_plan_links(self, project: Optional[str] = None) -> int:
    """Reset plan_name, plan_phase and story_id to NULL.

    Args:
        project: When given, only rows of this project are reset;
            otherwise every row is reset.

    Returns:
        The number of rows that had a plan_name before the reset.
    """
    cursor = self.conn.cursor()

    if project:
        count_sql = "SELECT COUNT(*) FROM session_context WHERE plan_name IS NOT NULL AND project = ?"
        reset_sql = "UPDATE session_context SET plan_name = NULL, plan_phase = NULL, story_id = NULL WHERE project = ?"
        bindings: tuple = (project,)
    else:
        count_sql = "SELECT COUNT(*) FROM session_context WHERE plan_name IS NOT NULL"
        reset_sql = "UPDATE session_context SET plan_name = NULL, plan_phase = NULL, story_id = NULL"
        bindings = ()

    # Count first: after the UPDATE there is nothing left to count.
    counted = list(cursor.execute(count_sql, bindings))
    cursor.execute(reset_sql, bindings)

    if counted:
        return counted[0][0]
    return 0
1348
+
1349
def get_git_overlay_stats(self) -> Dict[str, Any]:
    """Return row counts for the git overlay tables.

    Returns:
        A dict with the number of session_context rows, the number of
        file_interactions rows and the number of distinct file paths in
        file_interactions.
    """
    cursor = self.conn.cursor()

    def scalar(sql: str) -> Any:
        # First column of the first row of a single-value query.
        return list(cursor.execute(sql))[0][0]

    session_count = scalar("SELECT COUNT(*) FROM session_context")
    interaction_count = scalar("SELECT COUNT(*) FROM file_interactions")
    distinct_files = scalar("SELECT COUNT(DISTINCT file_path) FROM file_interactions")

    return {
        "sessions_with_context": session_count,
        "file_interactions": interaction_count,
        "unique_files": distinct_files,
    }
1360
+
1361
def clear_session_git_data(self, session_id: str) -> None:
    """Delete a session's rows from session_context and file_interactions.

    Intended for re-processing a session from scratch.
    """
    cursor = self.conn.cursor()
    # Same order as before: context row first, then the interactions.
    for table in ("session_context", "file_interactions"):
        cursor.execute(f"DELETE FROM {table} WHERE session_id = ?", (session_id,))
1366
+
1367
def store_operations(
    self,
    operations: List[Dict[str, Any]],
) -> int:
    """Insert or replace operation group rows.

    Args:
        operations: Dicts that must contain ``id`` and ``session_id`` and
            may contain operation_type, chunk_ids, summary, outcome,
            started_at, ended_at and step_count. ``chunk_ids`` defaults to
            an empty list and is stored as JSON text; ``step_count``
            defaults to 0.

    Returns:
        Number of operations written (0 for an empty input).
    """
    if not operations:
        return 0

    from datetime import timezone

    cursor = self.conn.cursor()
    # One shared creation timestamp (UTC, ISO format) for the whole batch.
    created_at = datetime.now(timezone.utc).isoformat()

    upsert_sql = """INSERT OR REPLACE INTO operations
        (id, session_id, operation_type, chunk_ids,
        summary, outcome, started_at, ended_at,
        step_count, created_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""

    written = 0
    for written, op in enumerate(operations, start=1):
        encoded_chunks = json.dumps(op.get("chunk_ids", []))
        cursor.execute(
            upsert_sql,
            (
                op["id"],
                op["session_id"],
                op.get("operation_type"),
                encoded_chunks,
                op.get("summary"),
                op.get("outcome"),
                op.get("started_at"),
                op.get("ended_at"),
                op.get("step_count", 0),
                created_at,
            ),
        )
    return written
1411
+
1412
def get_session_operations(
    self,
    session_id: str,
) -> List[Dict[str, Any]]:
    """Return the operations of a session ordered by ``started_at``.

    ``chunk_ids`` is decoded from JSON; a NULL/empty or undecodable value
    yields an empty list.
    """

    def decode_chunk_ids(raw: Any) -> Any:
        # Best-effort JSON decode; bad data degrades to "no chunks".
        if not raw:
            return []
        try:
            return json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            return []

    cursor = self.conn.cursor()
    fetched = list(
        cursor.execute(
            """SELECT id, session_id, operation_type,
            chunk_ids, summary, outcome,
            started_at, ended_at, step_count
            FROM operations
            WHERE session_id = ?
            ORDER BY started_at""",
            (session_id,),
        )
    )

    return [
        {
            "id": op_id,
            "session_id": op_session,
            "operation_type": op_type,
            "chunk_ids": decode_chunk_ids(raw_chunks),
            "summary": summary,
            "outcome": outcome,
            "started_at": started,
            "ended_at": ended,
            "step_count": steps,
        }
        for op_id, op_session, op_type, raw_chunks, summary, outcome, started, ended, steps in fetched
    ]
1451
+
1452
def get_operations_stats(self) -> Dict[str, Any]:
    """Return totals for the operations table.

    Returns:
        A dict with total_operations, sessions_with_operations (distinct
        session ids) and ``by_type`` mapping each operation_type to its
        count, a NULL/empty type being reported as ``"unknown"``.
    """
    cursor = self.conn.cursor()

    total = list(cursor.execute("SELECT COUNT(*) FROM operations"))[0][0]
    type_rows = list(
        cursor.execute(
            """SELECT operation_type, COUNT(*)
            FROM operations
            GROUP BY operation_type
            ORDER BY COUNT(*) DESC"""
        )
    )
    session_total = list(
        cursor.execute(
            """SELECT COUNT(DISTINCT session_id)
            FROM operations"""
        )
    )[0][0]

    by_type = {}
    for op_type, occurrences in type_rows:
        label = op_type if op_type else "unknown"
        by_type[label] = occurrences

    return {
        "total_operations": total,
        "sessions_with_operations": session_total,
        "by_type": by_type,
    }
1475
+
1476
def clear_session_operations(self, session_id: str) -> None:
    """Delete every operations row that belongs to ``session_id``."""
    delete_sql = "DELETE FROM operations WHERE session_id = ?"
    self.conn.cursor().execute(delete_sql, (session_id,))
1483
+
1484
def store_topic_chains(
    self,
    chains: List[Dict[str, Any]],
) -> int:
    """Insert topic chain rows.

    Each chain must contain ``file_path``, ``session_a`` and ``session_b``;
    ``shared_actions`` defaults to 0, ``time_delta_hours`` and ``project``
    to None. All rows of one call share the same UTC ``created_at``.

    Returns:
        Number of chains inserted (0 for an empty input).
    """
    if not chains:
        return 0

    from datetime import timezone

    cursor = self.conn.cursor()
    created_at = datetime.now(timezone.utc).isoformat()

    insert_sql = """INSERT INTO topic_chains
        (file_path, session_a, session_b,
        shared_actions, time_delta_hours,
        project, created_at)
        VALUES (?, ?, ?, ?, ?, ?, ?)"""

    inserted = 0
    for inserted, link in enumerate(chains, start=1):
        cursor.execute(
            insert_sql,
            (
                link["file_path"],
                link["session_a"],
                link["session_b"],
                link.get("shared_actions", 0),
                link.get("time_delta_hours"),
                link.get("project"),
                created_at,
            ),
        )
    return inserted
1515
+
1516
def get_file_chains(
    self,
    file_path: str,
    limit: int = 20,
) -> List[Dict[str, Any]]:
    """Return topic chains (pairs of sessions linked by a file) for a path.

    ``file_path`` is matched as a substring of the stored path. The LIKE
    metacharacters ``%`` and ``_`` in ``file_path`` are escaped so they
    match literally instead of acting as wildcards.

    Args:
        file_path: Path fragment to search for.
        limit: Maximum number of chains returned.

    Returns:
        A list of dicts ordered by ``time_delta_hours`` with file_path,
        session_a, session_b, shared_actions, time_delta_hours, project,
        and the branch of each session (None when the session has no
        session_context row).
    """
    cursor = self.conn.cursor()

    # Escape the escape character first, then the two LIKE wildcards.
    literal = file_path.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")

    rows = cursor.execute(
        """SELECT tc.file_path, tc.session_a,
        tc.session_b, tc.shared_actions,
        tc.time_delta_hours, tc.project,
        sa.branch AS branch_a,
        sb.branch AS branch_b
        FROM topic_chains tc
        LEFT JOIN session_context sa
        ON tc.session_a = sa.session_id
        LEFT JOIN session_context sb
        ON tc.session_b = sb.session_id
        WHERE tc.file_path LIKE ? ESCAPE '\\'
        ORDER BY tc.time_delta_hours
        LIMIT ?""",
        (f"%{literal}%", limit),
    )

    columns = (
        "file_path",
        "session_a",
        "session_b",
        "shared_actions",
        "time_delta_hours",
        "project",
        "branch_a",
        "branch_b",
    )
    return [dict(zip(columns, row)) for row in rows]
1554
+
1555
def get_file_regression(
    self,
    file_path: str,
    project: Optional[str] = None,
) -> Dict[str, Any]:
    """Return regression info for a file.

    Builds the file's interaction timeline (ordered by timestamp), then
    walks it backwards to find the most recent interaction whose *session*
    has at least one operation with outcome ``'success'``. Note that the
    success check is per session, not per file. Everything in the timeline
    with a later timestamp than that interaction is reported as
    ``changes_after``.

    ``file_path`` is matched as a substring of the stored path; the LIKE
    metacharacters ``%`` and ``_`` are escaped so they match literally.

    Args:
        file_path: Path fragment to search for.
        project: When given, only interactions of this project are used.

    Returns:
        Dict with file_path, timeline, last_success (a timeline entry or
        None) and changes_after (empty when there is no last_success or it
        has no timestamp).
    """
    cursor = self.conn.cursor()

    # Escape the escape character first, then the two LIKE wildcards.
    literal = file_path.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")

    # Get all interactions for this file, ordered by time
    query = """
        SELECT fi.file_path, fi.timestamp,
        fi.session_id, fi.action,
        fi.project,
        sc.branch, sc.pr_number
        FROM file_interactions fi
        LEFT JOIN session_context sc
        ON fi.session_id = sc.session_id
        WHERE fi.file_path LIKE ? ESCAPE '\\'
        """
    params: list = [f"%{literal}%"]
    if project:
        query += " AND fi.project = ?"
        params.append(project)
    query += " ORDER BY fi.timestamp"

    columns = ("file_path", "timestamp", "session_id", "action", "project", "branch", "pr_number")
    timeline = [dict(zip(columns, row)) for row in cursor.execute(query, params)]

    if not timeline:
        return {
            "file_path": file_path,
            "timeline": [],
            "last_success": None,
            "changes_after": [],
        }

    # Walk newest-to-oldest. A session usually appears many times in the
    # timeline, so remember each session's answer instead of re-querying.
    session_succeeded: Dict[Any, bool] = {}
    last_success = None
    for entry in reversed(timeline):
        sid = entry["session_id"]
        if not sid:
            continue
        if sid not in session_succeeded:
            hits = list(
                cursor.execute(
                    """SELECT outcome FROM operations
                    WHERE session_id = ?
                    AND outcome = 'success'
                    LIMIT 1""",
                    (sid,),
                )
            )
            session_succeeded[sid] = bool(hits)
        if session_succeeded[sid]:
            last_success = entry
            break

    # Get all entries after last success (missing timestamps sort as "").
    changes_after = []
    if last_success and last_success.get("timestamp"):
        cutoff = last_success["timestamp"]
        changes_after = [e for e in timeline if (e.get("timestamp") or "") > cutoff]

    return {
        "file_path": file_path,
        "timeline": timeline,
        "last_success": last_success,
        "changes_after": changes_after,
    }
1645
+
1646
def get_topic_chain_stats(self) -> Dict[str, Any]:
    """Return the number of topic chains and of distinct files they cover."""
    cursor = self.conn.cursor()

    chain_rows = list(cursor.execute("SELECT COUNT(*) FROM topic_chains"))
    file_rows = list(
        cursor.execute(
            """SELECT COUNT(DISTINCT file_path)
            FROM topic_chains"""
        )
    )

    stats: Dict[str, Any] = {}
    stats["total_chains"] = chain_rows[0][0]
    stats["unique_files"] = file_rows[0][0]
    return stats
1660
+
1661
def clear_topic_chains(self, project: Optional[str] = None) -> None:
    """Delete topic chains: those of ``project`` if given, otherwise all."""
    cursor = self.conn.cursor()
    if not project:
        cursor.execute("DELETE FROM topic_chains")
        return
    cursor.execute(
        "DELETE FROM topic_chains WHERE project = ?",
        (project,),
    )
1671
+
1672
+ # --- Phase 7: Session Enrichment CRUD ---
1673
+
1674
def upsert_session_enrichment(self, enrichment: Dict[str, Any]) -> None:
    """Insert a session enrichment record, or update it if one exists.

    ``enrichment`` must contain ``session_id``; every other key is
    optional. The list/dict-valued fields (decisions_made, corrections,
    learnings, mistakes, patterns, topic_tags, tool_usage_stats) are stored
    as JSON text and default to ``"[]"`` when absent. The caller's dict is
    not modified.

    On a session_id conflict every column except session_id and file_path
    is overwritten and ``enrichment_timestamp`` is refreshed. The session's
    row in ``session_enrichments_fts`` is then replaced; it is only
    re-inserted when at least one of session_summary, what_worked or
    what_failed is truthy.
    """
    cursor = self.conn.cursor()

    # Work on a copy to avoid mutating caller's dict
    data = dict(enrichment)
    session_id = data["session_id"]

    # Serialize JSON fields (values that are already strings pass through)
    for name in (
        "decisions_made",
        "corrections",
        "learnings",
        "mistakes",
        "patterns",
        "topic_tags",
        "tool_usage_stats",
    ):
        if name in data and not isinstance(data[name], str):
            data[name] = json.dumps(data[name])

    # (key, default) pairs in the exact order of the INSERT column list,
    # session_id excluded.
    column_defaults = (
        ("file_path", None),
        ("enrichment_version", "1.0"),
        ("enrichment_model", None),
        ("session_start_time", None),
        ("session_end_time", None),
        ("duration_seconds", None),
        ("message_count", 0),
        ("user_message_count", 0),
        ("assistant_message_count", 0),
        ("tool_call_count", 0),
        ("session_summary", None),
        ("primary_intent", None),
        ("outcome", None),
        ("complexity_score", None),
        ("session_quality_score", None),
        ("decisions_made", "[]"),
        ("corrections", "[]"),
        ("learnings", "[]"),
        ("mistakes", "[]"),
        ("patterns", "[]"),
        ("topic_tags", "[]"),
        ("tool_usage_stats", "[]"),
        ("what_worked", None),
        ("what_failed", None),
        ("summary_embedding", None),
    )
    bindings = (session_id,) + tuple(data.get(key, default) for key, default in column_defaults)

    cursor.execute(
        """
        INSERT INTO session_enrichments (
        session_id, file_path, enrichment_version, enrichment_model,
        session_start_time, session_end_time, duration_seconds,
        message_count, user_message_count, assistant_message_count, tool_call_count,
        session_summary, primary_intent, outcome, complexity_score,
        session_quality_score,
        decisions_made, corrections, learnings, mistakes, patterns,
        topic_tags, tool_usage_stats,
        what_worked, what_failed,
        summary_embedding
        ) VALUES (
        ?, ?, ?, ?,
        ?, ?, ?,
        ?, ?, ?, ?,
        ?, ?, ?, ?,
        ?,
        ?, ?, ?, ?, ?,
        ?, ?,
        ?, ?,
        ?
        )
        ON CONFLICT(session_id) DO UPDATE SET
        enrichment_version = excluded.enrichment_version,
        enrichment_model = excluded.enrichment_model,
        enrichment_timestamp = strftime('%Y-%m-%dT%H:%M:%fZ','now'),
        session_start_time = excluded.session_start_time,
        session_end_time = excluded.session_end_time,
        duration_seconds = excluded.duration_seconds,
        message_count = excluded.message_count,
        user_message_count = excluded.user_message_count,
        assistant_message_count = excluded.assistant_message_count,
        tool_call_count = excluded.tool_call_count,
        session_summary = excluded.session_summary,
        primary_intent = excluded.primary_intent,
        outcome = excluded.outcome,
        complexity_score = excluded.complexity_score,
        session_quality_score = excluded.session_quality_score,
        decisions_made = excluded.decisions_made,
        corrections = excluded.corrections,
        learnings = excluded.learnings,
        mistakes = excluded.mistakes,
        patterns = excluded.patterns,
        topic_tags = excluded.topic_tags,
        tool_usage_stats = excluded.tool_usage_stats,
        what_worked = excluded.what_worked,
        what_failed = excluded.what_failed,
        summary_embedding = excluded.summary_embedding
        """,
        bindings,
    )

    # Update FTS5: drop the old row, re-add only if there is text to index.
    cursor.execute(
        "DELETE FROM session_enrichments_fts WHERE session_id = ?",
        (session_id,),
    )
    text_keys = ("session_summary", "what_worked", "what_failed")
    if any(data.get(key) for key in text_keys):
        cursor.execute(
            """INSERT INTO session_enrichments_fts
            (session_summary, what_worked, what_failed, session_id)
            VALUES (?, ?, ?, ?)""",
            tuple(data.get(key, "") for key in text_keys) + (session_id,),
        )
1792
+
1793
# Column names for session_enrichments (must match CREATE TABLE order).
# get_session_enrichment() zips this list against the tuple returned by
# ``SELECT *``, so names are bound to values purely by position: adding,
# removing or reordering a table column requires the same change here.
_SESSION_ENRICHMENT_COLS = [
    "id",
    "session_id",
    "file_path",
    "enrichment_version",
    "enrichment_model",
    "enrichment_timestamp",
    "session_start_time",
    "session_end_time",
    "duration_seconds",
    "message_count",
    "user_message_count",
    "assistant_message_count",
    "tool_call_count",
    "session_summary",
    "primary_intent",
    "outcome",
    "complexity_score",
    "session_quality_score",
    "decisions_made",
    "corrections",
    "learnings",
    "mistakes",
    "patterns",
    "topic_tags",
    "tool_usage_stats",
    "what_worked",
    "what_failed",
    "summary_embedding",
]
1824
+
1825
def get_session_enrichment(self, session_id: str) -> Optional[Dict[str, Any]]:
    """Load the enrichment record of a session.

    Returns:
        None when the session has no record. Otherwise a dict keyed by
        ``_SESSION_ENRICHMENT_COLS`` (paired positionally with the
        ``SELECT *`` row), with the JSON-text fields decoded through
        ``_safe_json_loads``.
    """
    cursor = self.conn.cursor()
    matches = list(
        cursor.execute(
            "SELECT * FROM session_enrichments WHERE session_id = ?",
            (session_id,),
        )
    )
    if not matches:
        return None

    record = dict(zip(self._SESSION_ENRICHMENT_COLS, matches[0]))

    # Parse JSON fields
    json_columns = (
        "decisions_made",
        "corrections",
        "learnings",
        "mistakes",
        "patterns",
        "topic_tags",
        "tool_usage_stats",
    )
    for name in json_columns:
        record[name] = _safe_json_loads(record.get(name))
    return record
1850
+
1851
def list_enriched_sessions(self) -> List[str]:
    """Return the session_id of every row in session_enrichments."""
    session_ids: List[str] = []
    for (session_id,) in self.conn.cursor().execute("SELECT session_id FROM session_enrichments"):
        session_ids.append(session_id)
    return session_ids
1855
+
1856
def get_session_enrichment_stats(self) -> Dict[str, Any]:
    """Return aggregate statistics over session_enrichments.

    Returns:
        A dict with:

        * ``total_enriched_sessions`` – row count
        * ``by_outcome`` – count per non-NULL outcome
        * ``by_intent`` – count per non-NULL primary_intent
        * ``avg_quality_score`` – mean of the non-NULL quality scores
          rounded to one decimal, or None when no row has a score. A
          genuine average of 0 is reported as ``0.0``, not None.
    """
    cursor = self.conn.cursor()
    total = list(cursor.execute("SELECT COUNT(*) FROM session_enrichments"))[0][0]
    # Each query yields (label, count) pairs, which dict() consumes directly.
    by_outcome = dict(
        cursor.execute(
            "SELECT outcome, COUNT(*) FROM session_enrichments WHERE outcome IS NOT NULL GROUP BY outcome"
        )
    )
    by_intent = dict(
        cursor.execute(
            "SELECT primary_intent, COUNT(*) FROM session_enrichments WHERE primary_intent IS NOT NULL GROUP BY primary_intent"
        )
    )
    # AVG() is NULL only when there are no non-NULL scores at all.
    avg_quality = list(
        cursor.execute(
            "SELECT AVG(session_quality_score) FROM session_enrichments WHERE session_quality_score IS NOT NULL"
        )
    )[0][0]
    return {
        "total_enriched_sessions": total,
        "by_outcome": by_outcome,
        "by_intent": by_intent,
        # Compare against None explicitly: 0.0 is a valid average.
        "avg_quality_score": round(avg_quality, 1) if avg_quality is not None else None,
    }
1881
+
1882
def close(self) -> None:
    """Close the database connection, if this object has one.

    Does nothing when no ``conn`` attribute was ever set.
    """
    if not hasattr(self, "conn"):
        return
    self.conn.close()
1886
+
1887
def __enter__(self):
    """Enter a ``with`` block; the object itself is the context value."""
    return self
1889
+
1890
def __exit__(self, exc_type, exc_val, exc_tb):
    """Close the connection when the ``with`` block ends.

    Returns None, so an exception raised inside the block is not
    suppressed.
    """
    self.close()