code-data-ark 2.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cda/pipeline/embed.py ADDED
@@ -0,0 +1,694 @@
1
+ #!/usr/bin/env python3
2
+ """embed.py — Semantic intelligence for Code Data Ark.
3
+
4
+ This stage builds semantic embeddings and mini-intelligence artifacts:
5
+ - embeddings for sessions, exchanges and memory files
6
+ - session summaries and topic tags
7
+ - anomaly alerts for high-heat or recovery sessions
8
+ - review recommendations for session follow-up
9
+ """
10
+
11
+ import json
12
+ import sqlite3
13
+ from pathlib import Path
14
+ from typing import Dict, List, Optional, Tuple
15
+
16
# Filesystem layout: this file lives at <root>/cda/pipeline/embed.py, so four
# .parent hops climb out of the package to the project root.
# NOTE(review): verify the hop count matches the installed package layout.
ROOT_DIR = Path(__file__).resolve().parent.parent.parent.parent
# Working directory for locally generated artifacts.
LOCAL_DIR = ROOT_DIR / "local"
# SQLite database shared with the other pipeline stages.
DB_PATH = LOCAL_DIR / "data" / "cda.db"
# Small sentence-transformers checkpoint used for all embeddings.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Hard cap (in characters) on any text sent to the embedding model.
MAX_EMBED_TEXT = 1400

# Lazily loaded SentenceTransformer instance; populated by get_model().
MODEL = None
23
+
24
+
25
def get_model():
    """Return the shared SentenceTransformer, loading it on first use.

    The model is cached in the module-level MODEL global so repeated calls
    are cheap.

    Raises:
        RuntimeError: if sentence-transformers is not installed.
    """
    global MODEL
    if MODEL is None:
        try:
            from sentence_transformers import SentenceTransformer
        except ImportError as exc:
            raise RuntimeError(
                "Install sentence-transformers to use semantic intelligence: "
                "pip install sentence-transformers"
            ) from exc
        MODEL = SentenceTransformer(MODEL_NAME)
    return MODEL
38
+
39
+
40
def db():
    """Open the CDA SQLite database with performance-oriented PRAGMAs applied."""
    conn = sqlite3.connect(str(DB_PATH), timeout=30)
    conn.row_factory = sqlite3.Row
    # Tuning: WAL journaling, relaxed sync, 2 MB page cache, mmap, RAM temp store.
    for pragma in (
        "PRAGMA journal_mode=WAL",
        "PRAGMA synchronous=NORMAL",
        "PRAGMA cache_size=-2000",
        "PRAGMA mmap_size=268435456",
        "PRAGMA temp_store=MEMORY",
    ):
        conn.execute(pragma)
    return conn
49
+
50
+
51
+ def _serialize_embedding(vector):
52
+ import numpy as np
53
+
54
+ if vector is None:
55
+ return None
56
+ return np.asarray(vector, dtype="float32").tobytes()
57
+
58
+
59
+ def _deserialize_embedding(blob):
60
+ import numpy as np
61
+
62
+ if blob is None:
63
+ return None
64
+ return np.frombuffer(blob, dtype="float32")
65
+
66
+
67
def _truncate_text(text: str, length: int = MAX_EMBED_TEXT) -> str:
    """Flatten newlines to spaces, strip surrounding whitespace, cap at *length* chars."""
    if not text:
        return ""
    return text.replace("\n", " ").strip()[:length]
72
+
73
+
74
def ensure_tables(conn):
    """Create the semantic-intelligence schema if it does not exist yet.

    Tables:
      - embeddings: one vector per (entity_type, entity_id) with the source
        text, JSON metadata and a float32 BLOB embedding.
      - fts_embeddings: FTS5 mirror of embedding text/metadata used as a
        keyword pre-filter (see semantic_search / upsert_embedding).
      - session_summaries / anomaly_alerts / recommendations: derived
        per-session intelligence artifacts.

    Idempotent (IF NOT EXISTS everywhere); commits the DDL.
    """
    conn.executescript("""
    CREATE TABLE IF NOT EXISTS embeddings (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        entity_type TEXT NOT NULL,
        entity_id TEXT NOT NULL,
        workspace_id TEXT,
        session_id TEXT,
        exchange_index INTEGER,
        content_type TEXT,
        content_text TEXT,
        metadata TEXT,
        embedding BLOB,
        created_at TEXT DEFAULT (datetime('now')),
        UNIQUE(entity_type, entity_id)
    );
    CREATE INDEX IF NOT EXISTS idx_embeddings_entity ON embeddings(entity_type, entity_id);
    CREATE INDEX IF NOT EXISTS idx_embeddings_session ON embeddings(session_id);

    CREATE VIRTUAL TABLE IF NOT EXISTS fts_embeddings USING fts5(
        entity_type UNINDEXED,
        entity_id UNINDEXED,
        session_id UNINDEXED,
        exchange_index UNINDEXED,
        content_text,
        metadata
    );

    CREATE TABLE IF NOT EXISTS session_summaries (
        session_id TEXT PRIMARY KEY,
        summary_text TEXT,
        topic_tags TEXT,
        updated_at TEXT DEFAULT (datetime('now'))
    );

    CREATE TABLE IF NOT EXISTS anomaly_alerts (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        session_id TEXT,
        alert_type TEXT,
        severity TEXT,
        message TEXT,
        created_at TEXT DEFAULT (datetime('now'))
    );

    CREATE TABLE IF NOT EXISTS recommendations (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        session_id TEXT,
        recommendation_text TEXT,
        source TEXT,
        created_at TEXT DEFAULT (datetime('now'))
    );
    """)
    conn.commit()
127
+
128
+
129
+ def _fetch_session_title(conn, session_id: str) -> str:
130
+ row = conn.execute("SELECT title FROM sessions WHERE session_id=?", (session_id,)).fetchone()
131
+ return row[0] if row else ""
132
+
133
+
134
def _get_session_content(conn, session_id: str) -> str:
    """Concatenate up to 50 exchanges of a session into one embed-ready string.

    Each non-empty field is rendered as "LABEL: text"; when the session has
    no exchange text at all, the session title is used instead.
    """
    rows = conn.execute(
        "SELECT user_message, reasoning_text, response_text, tool_calls "
        "FROM exchanges WHERE session_id=? ORDER BY user_ts LIMIT 50",
        (session_id,),
    ).fetchall()
    labelled_fields = (
        ("user_message", "USER"),
        ("reasoning_text", "ASSISTANT_THINK"),
        ("response_text", "ASSISTANT"),
        ("tool_calls", "TOOL"),
    )
    pieces = [
        f"{label}: {row[field]}"
        for row in rows
        for field, label in labelled_fields
        if row[field]
    ]
    if pieces:
        return _truncate_text(" \n ".join(pieces))
    return _truncate_text(_fetch_session_title(conn, session_id))
153
+
154
+
155
def _get_exchange_content(row) -> str:
    """Join the four text fields of an exchange row into one truncated string."""
    field_names = ("user_message", "reasoning_text", "response_text", "tool_calls")
    parts = [str(row[name] or "") for name in field_names]
    return _truncate_text(" ".join(parts))
162
+
163
+
164
def _get_memory_content(row) -> str:
    """Return the truncated text content of a memory-file row."""
    content = row["content"]
    return _truncate_text(str(content or ""))
166
+
167
+
168
def _embed_texts(texts: List[str]):
    """Encode *texts* into a list of L2-normalized float32 vectors."""
    encoded = get_model().encode(
        texts,
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=False,
    )
    return [vec.astype("float32") for vec in encoded]
172
+
173
+
174
def upsert_embedding(
    conn,
    entity_type: str,
    entity_id: str,
    workspace_id: Optional[str],
    session_id: Optional[str],
    exchange_index: Optional[int],
    content_type: str,
    content_text: str,
    metadata: Dict,
):
    """Insert or refresh the embedding row for one entity and mirror it to FTS.

    No-ops on empty content. (entity_type, entity_id) is unique, so an
    existing row is updated in place and keeps its id — the FTS mirror row
    shares that id as its rowid, so the two tables stay in sync.

    Fix: metadata was JSON-serialized twice (once per table); it is now
    serialized once and reused.
    """
    if not content_text:
        return
    vector = _embed_texts([content_text])[0]
    blob = _serialize_embedding(vector)
    # Serialize once; the same JSON payload feeds both tables.
    metadata_json = json.dumps(metadata, ensure_ascii=False)

    existing = conn.execute(
        "SELECT id FROM embeddings WHERE entity_type=? AND entity_id=?",
        (entity_type, entity_id),
    ).fetchone()
    if existing:
        rowid = existing[0]
        conn.execute(
            """
            UPDATE embeddings SET
                workspace_id=?,
                session_id=?,
                exchange_index=?,
                content_type=?,
                content_text=?,
                metadata=?,
                embedding=?,
                created_at=datetime('now')
            WHERE id=?
            """,
            (
                workspace_id,
                session_id,
                exchange_index,
                content_type,
                content_text,
                metadata_json,
                blob,
                rowid,
            ),
        )
    else:
        cur = conn.execute(
            """
            INSERT INTO embeddings
            (entity_type, entity_id, workspace_id, session_id, exchange_index, content_type, content_text, metadata, embedding)
            VALUES (?,?,?,?,?,?,?,?,?)
            """,
            (
                entity_type,
                entity_id,
                workspace_id,
                session_id,
                exchange_index,
                content_type,
                content_text,
                metadata_json,
                blob,
            ),
        )
        rowid = cur.lastrowid

    # Maintain a fast FTS index for embedding content and metadata,
    # keyed by the embeddings row id.
    conn.execute(
        "INSERT OR REPLACE INTO fts_embeddings(rowid, entity_type, entity_id, session_id, exchange_index, content_text, metadata) VALUES (?,?,?,?,?,?,?)",  # noqa: E501
        (
            rowid,
            entity_type,
            entity_id,
            session_id,
            exchange_index,
            content_text,
            metadata_json,
        ),
    )
253
+
254
+
255
def build_session_summaries(conn, session_id: Optional[str] = None):
    """Generate or refresh summary text, topic tags, alerts and
    recommendations for every session (or one session when *session_id*
    is given).

    Sessions without a session_analysis row are skipped — the analyze
    stage has not produced metrics for them yet.

    Fix: commit once after the loop so a multi-session rebuild runs as a
    single transaction instead of one commit per session.
    """
    session_query = "SELECT session_id FROM sessions"
    args: Tuple = ()
    if session_id:
        session_query += " WHERE session_id=?"
        args = (session_id,)

    for row in conn.execute(session_query, args).fetchall():
        sid = row[0]
        analytics = conn.execute(
            "SELECT * FROM session_analysis WHERE session_id=?", (sid,)
        ).fetchone()
        if not analytics:
            continue
        title = _fetch_session_title(conn, sid)

        parts = []
        if title:
            parts.append(title)

        heat = analytics["heat_score"] or 0
        corrections = analytics["total_corrections"] or 0
        frustrations = analytics["total_frustrations"] or 0
        redirects = analytics["total_redirects"] or 0
        tools = analytics["total_tool_calls"] or 0
        compactions = analytics["compaction_count"] or 0
        saved = bool(analytics["saved_session"])
        clean = bool(analytics["clean_run"])

        # Headline sentence derived from the heat profile.
        if heat >= 75:
            parts.append("High-heat session with friction and corrections.")
        elif heat >= 40:
            parts.append("Moderate-heat session with some friction.")
        elif clean:
            parts.append("Clean session with few corrections and stable flow.")
        else:
            parts.append("Session with a normal effort profile.")

        # One sentence per observed signal.
        if frustrations:
            parts.append(f"{frustrations} frustration signal(s) detected.")
        if corrections:
            parts.append(f"{corrections} correction signal(s) detected.")
        if redirects:
            parts.append(f"{redirects} scope-change signal(s) detected.")
        if tools:
            parts.append(f"{tools} tool call(s) were used.")
        if compactions:
            parts.append(f"{compactions} context compaction event(s) occurred.")
        if saved:
            parts.append("The session recovered after friction.")

        summary_text = " ".join(parts)
        topic_tags = _infer_topic_tags(title, analytics)
        conn.execute(
            """
            INSERT INTO session_summaries(session_id, summary_text, topic_tags)
            VALUES (?,?,?)
            ON CONFLICT(session_id) DO UPDATE SET
                summary_text=excluded.summary_text,
                topic_tags=excluded.topic_tags,
                updated_at=datetime('now')
            """,
            (sid, summary_text, ",".join(topic_tags)),
        )
        build_anomaly_alerts(conn, sid, analytics)
        build_recommendations(conn, sid, analytics, topic_tags)
    # Single commit keeps the whole rebuild atomic and fast.
    conn.commit()
322
+
323
+
324
+ def _infer_topic_tags(title: str, analytics) -> List[str]:
325
+ tags: List[str] = []
326
+ title_lower = (title or "").lower()
327
+ heat = analytics["heat_score"] or 0
328
+ corrections = analytics["total_corrections"] or 0
329
+ frustrations = analytics["total_frustrations"] or 0
330
+ redirects = analytics["total_redirects"] or 0
331
+ tools = analytics["total_tool_calls"] or 0
332
+ compactions = analytics["compaction_count"] or 0
333
+ saved = bool(analytics["saved_session"])
334
+ clean = bool(analytics["clean_run"])
335
+
336
+ if heat >= 70:
337
+ tags.append("high-heat")
338
+ elif heat >= 40:
339
+ tags.append("medium-heat")
340
+ else:
341
+ tags.append("low-heat")
342
+
343
+ if corrections >= 3:
344
+ tags.append("correction-heavy")
345
+ if frustrations >= 1:
346
+ tags.append("frustration")
347
+ if redirects >= 2:
348
+ tags.append("scope-change")
349
+ if tools:
350
+ tags.append("tool-driven")
351
+ if compactions:
352
+ tags.append("self-summary")
353
+ if saved:
354
+ tags.append("recovery")
355
+ if clean:
356
+ tags.append("clean-run")
357
+ if any(k in title_lower for k in ["git", "branch", "commit", "merge"]):
358
+ tags.append("git")
359
+ if any(k in title_lower for k in ["error", "fail", "exception", "crash"]):
360
+ tags.append("bug")
361
+ if any(k in title_lower for k in ["refactor", "cleanup", "format", "optimize"]):
362
+ tags.append("refactor")
363
+ if any(k in title_lower for k in ["deploy", "publish", "release"]):
364
+ tags.append("deployment")
365
+
366
+ return sorted(set(tags))
367
+
368
+
369
def build_anomaly_alerts(conn, session_id: str, analytics=None):
    """Replace the anomaly alerts for a session based on its analysis row.

    Fetches the session_analysis row when *analytics* is not supplied and
    no-ops when none exists. Existing alerts for the session are deleted
    before the fresh set is inserted.
    """
    if analytics is None:
        analytics = conn.execute(
            "SELECT * FROM session_analysis WHERE session_id=?", (session_id,)
        ).fetchone()
    if not analytics:
        return
    conn.execute("DELETE FROM anomaly_alerts WHERE session_id=?", (session_id,))

    heat = analytics["heat_score"] or 0
    corrections = analytics["total_corrections"] or 0
    frustrations = analytics["total_frustrations"] or 0
    saved = bool(analytics["saved_session"])

    alerts = []
    if heat >= 80:
        alerts.append(("high_heat", "high", "Session has very high heat and may indicate repeated failure or stuck troubleshooting."))
    elif heat >= 55:
        alerts.append(("elevated_heat", "medium", "Session shows elevated heat and may warrant review."))
    if frustrations >= 2:
        alerts.append(("multiple_frustrations", "medium", "Multiple frustration signals were detected."))
    if corrections >= 4 and not saved:
        alerts.append(("corrective_cycle", "high", "Multiple corrections without clear recovery were detected."))
    if saved and heat >= 25:
        alerts.append(("recovery", "low", "Session recovered from friction, worth studying for successful resolution patterns."))

    conn.executemany(
        "INSERT INTO anomaly_alerts(session_id, alert_type, severity, message) VALUES (?,?,?,?)",
        [(session_id, alert_type, severity, message) for alert_type, severity, message in alerts],
    )
402
+
403
+
404
def build_recommendations(conn, session_id: str, analytics, topic_tags: List[str]):
    """Replace the follow-up recommendations for a session.

    Existing recommendations are deleted first; when no rule fires a single
    "no_action" placeholder is stored so the session is never empty.
    """
    conn.execute("DELETE FROM recommendations WHERE session_id=?", (session_id,))

    heat = analytics["heat_score"] or 0
    corrections = analytics["total_corrections"] or 0
    tools = analytics["total_tool_calls"] or 0
    saved = bool(analytics["saved_session"])

    recs: List[Tuple[str, str]] = []
    if heat >= 70 and not saved:
        recs.append(("followup", "Review this session for stuck issue patterns and possible unresolved errors."))
    if saved and heat >= 40:
        recs.append(("review_recovery", "Inspect the recovery path and tool outputs for best-practice behavior."))
    if tools >= 2:
        recs.append(("inspect_tools", "Confirm tool call outputs and any file changes associated with this session."))
    if corrections >= 3:
        recs.append(("focus_scope", "Review the session scope and prompts to reduce correction cycles."))
    if not recs:
        recs.append(("no_action", "No immediate recommendations; session appears stable."))

    conn.executemany(
        "INSERT INTO recommendations(session_id, recommendation_text, source) VALUES (?,?,?)",
        [(session_id, text, source) for source, text in recs],
    )
428
+
429
+
430
def build_session_embedding(conn, session_id: str):
    """Embed the concatenated content of one session; no-op when it has no text."""
    content = _get_session_content(conn, session_id)
    if content:
        upsert_embedding(
            conn,
            entity_type="session",
            entity_id=session_id,
            workspace_id=None,
            session_id=session_id,
            exchange_index=None,
            content_type="session",
            content_text=content,
            metadata={"stage": "session"},
        )
445
+
446
+
447
def build_exchange_embeddings(conn):
    """Embed every exchange, keyed '<session_id>:<exchange_index>'; commits at the end."""
    all_rows = conn.execute(
        "SELECT id, session_id, workspace_id, exchange_index, user_message, reasoning_text, response_text, tool_calls "
        "FROM exchanges ORDER BY session_id, exchange_index"
    ).fetchall()
    for exchange in all_rows:
        body = _get_exchange_content(exchange)
        if not body:
            continue
        upsert_embedding(
            conn,
            entity_type="exchange",
            entity_id=f"{exchange['session_id']}:{exchange['exchange_index']}",
            workspace_id=exchange["workspace_id"],
            session_id=exchange["session_id"],
            exchange_index=exchange["exchange_index"],
            content_type="exchange",
            content_text=body,
            metadata={"stage": "exchange"},
        )
    conn.commit()
468
+
469
+
470
def build_memory_embeddings(conn):
    """Embed every memory file under the 'memory:<id>' key; commits at the end."""
    for mem in conn.execute(
        "SELECT id, scope, workspace_id, session_id, filename, content FROM memory_files"
    ).fetchall():
        body = _get_memory_content(mem)
        if not body:
            continue
        upsert_embedding(
            conn,
            entity_type="memory",
            entity_id=f"memory:{mem['id']}",
            workspace_id=mem["workspace_id"],
            session_id=mem["session_id"],
            exchange_index=None,
            content_type=mem["scope"],
            content_text=body,
            metadata={"filename": mem["filename"] or "", "scope": mem["scope"]},
        )
    conn.commit()
490
+
491
+
492
+ def _session_behavior_score(conn, base_session_id: str, candidate_session_id: str) -> float:
493
+ base = conn.execute(
494
+ "SELECT heat_score, total_tool_calls, saved_session, clean_run FROM session_analysis WHERE session_id=?",
495
+ (base_session_id,),
496
+ ).fetchone()
497
+ cand = conn.execute(
498
+ "SELECT heat_score, total_tool_calls, saved_session, clean_run FROM session_analysis WHERE session_id=?",
499
+ (candidate_session_id,),
500
+ ).fetchone()
501
+ if not base or not cand:
502
+ return 0.0
503
+
504
+ score = 0.0
505
+ score += 1.0 if base[2] == cand[2] else 0.0
506
+ score += 0.5 if base[3] == cand[3] else 0.0
507
+
508
+ def heat_bucket(value):
509
+ if value is None:
510
+ return -1
511
+ if value < 40:
512
+ return 0
513
+ if value < 70:
514
+ return 1
515
+ return 2
516
+
517
+ score += 0.5 if heat_bucket(base[0]) == heat_bucket(cand[0]) else 0.0
518
+ if base[1] is not None and cand[1] is not None:
519
+ tool_diff = abs(base[1] - cand[1])
520
+ max_tools = max(base[1], cand[1], 1)
521
+ score += max(0.0, 0.5 * (1.0 - (tool_diff / max_tools)))
522
+ return score
523
+
524
+
525
def find_similar_sessions(conn, session_id: str, top_k: int = 5):
    """Rank other sessions by cosine similarity blended with behavior overlap.

    Semantic score is the dot product of normalized embeddings; the
    behavioral score (see _session_behavior_score) is weighted by 0.15.
    Returns up to *top_k* (row, score) pairs, best first; [] when the
    session has no stored embedding.
    """
    anchor = conn.execute(
        "SELECT embedding FROM embeddings WHERE entity_type='session' AND entity_id=?",
        (session_id,),
    ).fetchone()
    if not anchor or not anchor[0]:
        return []

    import numpy as np

    target = _deserialize_embedding(anchor[0])
    scored = []
    for item in conn.execute(
        "SELECT entity_type, entity_id, session_id, exchange_index, content_text, metadata, embedding FROM embeddings WHERE entity_type='session' AND entity_id!=?",  # noqa: E501
        (session_id,),
    ).fetchall():
        vec = _deserialize_embedding(item[6])
        if vec is None or vec.shape != target.shape:
            continue
        semantic = float(np.dot(target, vec))
        behavioral = _session_behavior_score(conn, session_id, item["session_id"] or item["entity_id"])
        scored.append((semantic + behavioral * 0.15, item))
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [(item, score) for score, item in scored[:top_k]]
551
+
552
+
553
def find_similar_entities(conn, entity_type: str, entity_id: str, top_k: int = 5):
    """Find the nearest neighbours of any embedded entity.

    Sessions delegate to find_similar_sessions (behavior-blended ranking);
    every other entity type is compared by pure cosine similarity against
    the pooled session and exchange embeddings.

    Fix: the candidate query previously carried a ternary whose
    entity_type == "session" branch was unreachable (sessions return
    early above); the dead branch is removed.

    Returns up to *top_k* (row, score) pairs, best first.
    """
    if entity_type == 'session':
        return find_similar_sessions(conn, entity_id, top_k)

    row = conn.execute(
        "SELECT embedding FROM embeddings WHERE entity_type=? AND entity_id=?",
        (entity_type, entity_id),
    ).fetchone()
    if not row or not row[0]:
        return []
    import numpy as np

    target = _deserialize_embedding(row[0])
    # Only the generic session+exchange candidate pool is reachable here.
    rows = conn.execute(
        "SELECT entity_type, entity_id, session_id, exchange_index, content_text, metadata, embedding "
        "FROM embeddings WHERE entity_type IN ('session','exchange') AND entity_id!=?",
        (entity_id,),
    ).fetchall()
    candidates = []
    for item in rows:
        emb = _deserialize_embedding(item[6])
        if emb is None or emb.shape != target.shape:
            continue
        candidates.append((float(np.dot(target, emb)), item))
    candidates.sort(key=lambda x: x[0], reverse=True)
    return [(item, score) for score, item in candidates[:top_k]]
578
+
579
+
580
def semantic_search(conn, query_text: str, top_k: int = 5):
    """Semantic search over embedded sessions, exchanges and memory files.

    Candidates are first narrowed with the FTS index (top_k * 5 keyword
    hits); when FTS matching raises (e.g. query-syntax errors) or yields
    nothing, every embedding is scanned instead. Returns up to *top_k*
    (row, score) pairs, best first.
    """
    if not query_text:
        return []
    import numpy as np

    query_vec = get_model().encode(
        [query_text], convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False
    )[0].astype("float32")

    rows = []
    try:
        hits = conn.execute(
            "SELECT rowid FROM fts_embeddings WHERE fts_embeddings MATCH ? LIMIT ?",
            (query_text, top_k * 5),
        ).fetchall()
        hit_ids = [hit[0] for hit in hits]
        if hit_ids:
            marks = ",".join("?" for _ in hit_ids)
            rows = conn.execute(
                f"SELECT entity_type, entity_id, session_id, exchange_index, content_text, metadata, embedding FROM embeddings WHERE id IN ({marks})",  # noqa: E501
                hit_ids,
            ).fetchall()
    except Exception:
        # FTS5 raises on malformed MATCH syntax; fall back to a full scan.
        rows = []

    if not rows:
        rows = conn.execute(
            "SELECT entity_type, entity_id, session_id, exchange_index, content_text, metadata, embedding "
            "FROM embeddings WHERE entity_type IN ('session','exchange','memory')"
        ).fetchall()

    ranked = []
    for item in rows:
        emb = _deserialize_embedding(item[6])
        if emb is None or emb.shape != query_vec.shape:
            continue
        ranked.append((float(np.dot(query_vec, emb)), item))
    ranked.sort(key=lambda pair: pair[0], reverse=True)
    return [(item, score) for score, item in ranked[:top_k]]
617
+
618
+
619
def get_session_summary(conn, session_id: str) -> Optional[sqlite3.Row]:
    """Fetch the stored summary row for a session, or None when absent."""
    cursor = conn.execute(
        "SELECT session_id, summary_text, topic_tags, updated_at FROM session_summaries WHERE session_id=?",
        (session_id,),
    )
    return cursor.fetchone()
624
+
625
+
626
def get_session_alerts(conn, session_id: str) -> List[sqlite3.Row]:
    """Return the session's anomaly alerts in insertion (id) order."""
    cursor = conn.execute(
        "SELECT alert_type, severity, message, created_at FROM anomaly_alerts WHERE session_id=? ORDER BY id",
        (session_id,),
    )
    return cursor.fetchall()
631
+
632
+
633
def get_session_recommendations(conn, session_id: str) -> List[sqlite3.Row]:
    """Return the session's recommendations in insertion (id) order."""
    cursor = conn.execute(
        "SELECT recommendation_text, source, created_at FROM recommendations WHERE session_id=? ORDER BY id",
        (session_id,),
    )
    return cursor.fetchall()
638
+
639
+
640
def get_topic_counts(conn, limit: int = 20) -> List[Tuple[str, int]]:
    """Count topic-tag occurrences across all session summaries.

    Returns up to *limit* (tag, count) pairs, most frequent first.
    Uses collections.Counter instead of the original hand-rolled dict;
    Counter.most_common orders ties by first insertion, matching the
    original stable sort.
    """
    from collections import Counter

    counts: Counter = Counter()
    for row in conn.execute(
        "SELECT topic_tags FROM session_summaries WHERE topic_tags != ''"
    ).fetchall():
        counts.update(tag.strip() for tag in (row[0] or "").split(",") if tag.strip())
    return counts.most_common(limit)
651
+
652
+
653
def build(conn: Optional[sqlite3.Connection] = None):
    """Build all semantic artifacts: summaries, then session, exchange and
    memory embeddings.

    Opens its own connection when none is supplied; a caller-supplied
    connection is left open. Fix: the self-opened connection is now closed
    in a finally block, so it no longer leaks when a build step raises.
    """
    own_conn = conn is None
    if conn is None:
        conn = db()
    try:
        ensure_tables(conn)
        build_session_summaries(conn)
        print("Building session embeddings...")
        session_ids = [r[0] for r in conn.execute("SELECT session_id FROM sessions").fetchall()]
        for i, sid in enumerate(session_ids, 1):
            build_session_embedding(conn, sid)
            # Commit periodically so a crash loses at most 20 sessions of work.
            if i % 20 == 0:
                conn.commit()
                print(f" [{i}/{len(session_ids)}] sessions embedded")
        conn.commit()
        print("Building exchange embeddings...")
        build_exchange_embeddings(conn)
        print("Building memory embeddings...")
        build_memory_embeddings(conn)
    finally:
        if own_conn:
            conn.close()
673
+
674
+
675
def build_session_intelligence(conn, session_id: str):
    """Incrementally refresh intelligence artifacts for one session.

    Rebuilds the summary, topic tags, alerts and recommendations, then
    tries to refresh the session embedding. The embedding step is
    best-effort: it needs the optional sentence-transformers model, so any
    failure there is deliberately swallowed rather than failing the whole
    refresh.
    """
    ensure_tables(conn)
    build_session_summaries(conn, session_id)
    try:
        build_session_embedding(conn, session_id)
    except Exception:
        # Best-effort: embedding depends on the optional ML dependency.
        pass
    conn.commit()
683
+
684
+
685
def run():
    """CLI entry point: build every semantic artifact against the default DB."""
    connection = db()
    ensure_tables(connection)
    build(connection)
    connection.close()
    print("Semantic intelligence build complete.")


if __name__ == "__main__":
    run()