code-data-ark 2.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1064 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ extract.py — Signal and token extraction pass.
4
+
5
+ Processes all chat sessions in cda.db and populates:
6
+ - token_usage : per-request token accounting
7
+ - compactions : context window compaction events
8
+ - exchange_signals : behavioral signals (corrections, affirmations, etc.)
9
+ - session_analysis : per-session rollup
10
+
11
+ Signal taxonomy:
12
+ correction — user said stop / pause / wrong / jumping ahead / etc.
13
+ redirect — user pivoting direction mid-session
14
+ affirmation — user approved / confirmed / "yes" / "lets do it" / "perfect"
15
+ question — user asking conceptual question (zoom out / meta / think)
16
+ approval — explicit build approval ("build it", "go", "lets do that")
17
+ """
18
+
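+ # Usage (a sketch; run() at the bottom drives everything):
+ #     $ python extract.py
+ # This wipes and repopulates token_usage, compactions, exchange_signals,
+ # session_analysis, tool_calls, and symbols in local/data/cda.db.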
19
+ import sqlite3
20
+ import gzip
21
+ import json
22
+ import re
23
+ import ast
24
+ from pathlib import Path
25
+ from datetime import datetime, timezone
26
+ from typing import Dict, List, Tuple, DefaultDict
27
+ from collections import defaultdict
28
+
29
+ ROOT_DIR = Path(__file__).resolve().parent.parent.parent.parent
30
+ LOCAL_DIR = ROOT_DIR / "local"
31
+ DB_PATH = LOCAL_DIR / "data" / "cda.db"
32
+
33
+ # ─────────────────────────────────────────────────────────
34
+ # Signal patterns
35
+ # ─────────────────────────────────────────────────────────
36
+
37
+ SIGNAL_PATTERNS = [
38
+ # (signal_type, [keywords], description)
39
+ ("correction", [
40
+ "stop,", "stop.", "stop ", "pause", "wrong", "jumping ahead",
41
+ "not listening", "thats not", "that's not", "nope,", "nope.",
42
+ "incorrect", "you're off", "youre off", "missed the point",
43
+ "not what i", "didn't ask", "didnt ask", "too much", "slow down",
44
+ "hold on", "wait,", "wait.", "no,", "no.", "actually, no",
45
+ "you missed", "thats wrong", "that's wrong", "bad habit",
46
+ "you are jumping", "don't do that", "dont do that",
47
+ "i said", "do what was asked", "stay focused",
48
+ ], "Model correction — user redirecting agent behavior"),
49
+
50
+ ("redirect", [
51
+ "actually", "pivot", "change direction", "lets change",
52
+ "let's change", "forget that", "scratch that", "instead,",
53
+ "different approach", "new direction", "zoom out",
54
+ "step back", "big picture", "meta moment", "meta perspective",
55
+ ], "Session redirect — user changing scope or direction"),
56
+
57
+ ("affirmation", [
58
+ "perfect", "exactly", "yes,", "yes.", "correct", "thats right",
59
+ "that's right", "great", "nice", "good", "love it", "love that",
60
+ "well done", "solid", "clean", "nailed it", "exactly right",
61
+ "thats it", "that's it", "yes!", "boom", "beautiful", "brilliant",
62
+ ], "Affirmation — user confirming agent is on track"),
63
+
64
+ ("approval", [
65
+ "lets do it", "let's do it", "lets build", "let's build",
66
+ "go ahead", "build it", "start implementation", "do it",
67
+ "proceed", "run it", "execute", "ship it", "make it",
68
+ "yes lets", "yes let's", "go!", "go.", "implement",
69
+ ], "Build approval — user authorizing execution"),
70
+
71
+ ("question", [
72
+ "what do you think", "your thoughts", "zoom out", "meta",
73
+ "think about", "can you think", "what is", "how does",
74
+ "why does", "explain", "show me", "tell me", "what are",
75
+ "understand", "curious", "wonder if",
76
+ ], "Conceptual question — user probing for analysis"),
77
+
78
+ # ── Frustration: explicit irritation, swearing, all-caps ──
79
+ ("frustration", [
80
+ "pissing me off", "pisses me off", "pissed off", "piss off",
81
+ "are you kidding", "are you serious", "you're kidding",
82
+ "wtf", "wth", "what the hell", "what the fuck", "what the f",
83
+ "are you stupid", "this is stupid", "this is ridiculous",
84
+ "omg", "oh my god", "jesus", "jesus christ", "ffs",
85
+ "for fuck's sake", "for fucks sake", "goddamn", "god damn",
86
+ "seriously?", "seriously!", "come on!", "come on,",
87
+ "give me a break", "unbelievable", "unreal",
88
+ "you broke it", "you broke", "its broken", "it's broken",
89
+ "i'm done", "im done", "i give up", "forget it",
90
+ "this is a mess", "what a mess", "disaster",
91
+ ], "Frustration — explicit irritation signal"),
92
+
93
+ # ── Pre-correction: rising tone, about to redirect ──
94
+ ("pre_correction", [
95
+ "listen,", "listen.", "ok no", "ok wait", "ok stop",
96
+ "alright stop", "alright no", "alright wait",
97
+ "hey,", "look,", "look.", "no no", "nono",
98
+ "read the", "re-read", "read it again",
99
+ "i just said", "i just told you", "i literally",
100
+ "why did you", "why are you", "why would you",
101
+ "you just", "you literally just",
102
+ "thats not what i", "that's not what i",
103
+ "not again", "again?", "again.", "every time",
104
+ "you keep", "you always", "you never",
105
+ "i've told you", "ive told you", "told you",
106
+ "this is the", "how many times",
107
+ ], "Pre-correction — rising tone before a correction"),
108
+ ]
109
+
110
+ # Swear words as standalone detection (for any message, not keyword-anchored)
111
+ PROFANITY_PATTERNS = re.compile(
112
+ r'\b(fuck|shit|ass|bitch|damn|crap|hell|bastard|bullshit|motherfuck|dumbass|idiot|moron)\b',
113
+ re.IGNORECASE
114
+ )
115
+
116
+ # ALL-CAPS detection: 3+ consecutive uppercase words (≥2 letters each), or a single run of ≥4 uppercase letters
117
+ ALL_CAPS_PATTERN = re.compile(r'(?:[A-Z]{2,}\s+){2,}[A-Z]{2,}|[A-Z]{4,}')
118
+
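+ # Illustrative matches for ALL_CAPS_PATTERN (assumed examples, not from logs):
+ #     "READ THE FILE AGAIN"  -> matches the 3-uppercase-word branch
+ #     "ASAP"                 -> matches the [A-Z]{4,} branch on its own,
+ #                               which is why classify_message additionally
+ #                               requires >= 8 uppercase characters overall.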
119
+
120
+ def classify_message(text):
121
+ """Return list of (signal_type, matched_keyword) for a user message."""
122
+ tl = text.lower().strip()
123
+ signals = []
124
+ seen_types = set()
125
+ for sig_type, keywords, _ in SIGNAL_PATTERNS:
126
+ if sig_type in seen_types:
127
+ continue
128
+ for kw in keywords:
129
+ if kw in tl:
130
+ signals.append((sig_type, kw))
131
+ seen_types.add(sig_type)
132
+ break
133
+
134
+ # Profanity detection (adds frustration signal if not already caught)
135
+ if 'frustration' not in seen_types:
136
+ m = PROFANITY_PATTERNS.search(text)
137
+ if m:
138
+ signals.append(('frustration', m.group(0).lower()))
139
+ seen_types.add('frustration')
140
+
141
+ # All-caps detection (≥4 uppercase chars or 3+ uppercase words)
142
+ if 'frustration' not in seen_types:
143
+ # Strip URLs, code blocks, and known tool output artifacts before checking
144
+ clean = re.sub(r'https?://\S+|`[^`]*`', '', text)
145
+         # Skip if it looks like tool output (contains PREVIOUS OUTPUT, TRUNCATED, or similar markers)
146
+ skip_phrases = ['PREVIOUS OUTPUT', 'TRUNCATED', 'EXIT CODE', 'CWD:', 'TERMINAL:']
147
+ if not any(p in clean for p in skip_phrases):
148
+ if ALL_CAPS_PATTERN.search(clean):
149
+                 # Skip short acronyms: require at least 8 uppercase characters in total
150
+ caps_count = sum(1 for c in clean if c.isupper())
151
+ if caps_count >= 8:
152
+ m2 = ALL_CAPS_PATTERN.search(clean)
153
+ signals.append(('frustration', 'ALL_CAPS:' + m2.group(0)[:20]))
154
+ seen_types.add('frustration')
155
+
156
+ return signals
157
+
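+ # Illustrative call (first matching keyword per signal type wins, so each
+ # type appears at most once):
+ #     >>> classify_message("stop, that's wrong. lets do it differently")
+ #     [('correction', 'stop,'), ('approval', 'lets do it')]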
158
+
159
+ def extract_requests_from_chat(lines):
160
+ """
161
+     Walk JSONL lines from a chat session blob.
162
+     Returns:
163
+         requests_map: dict of request_id -> normalized request record
164
+             with message_text, model_id, turn_index, ts, token counts,
165
+             compaction_summary, and compaction_meta (filled by kind=1 patches).
166
+ """
167
+ # Build a snapshot from kind=0 + patches from kind=1/2
168
+ # kind=0: initial snapshot (has requests[])
169
+ # kind=2: patches with new request arrays
170
+ # kind=1: result patches (timings, metadata, usage)
171
+
172
+ requests_map = {} # request_id -> dict
173
+ turn_index = 0
174
+
175
+ for line in lines:
176
+ try:
177
+ obj = json.loads(line)
178
+ except Exception:
179
+ continue
180
+
181
+ kind = obj.get('kind')
182
+
183
+ # kind=0: initial snapshot
184
+ if kind == 0:
185
+ v = obj.get('v', {})
186
+ for req in (v.get('requests') or []):
187
+ rid = req.get('requestId', '')
188
+ if rid:
189
+ requests_map[rid] = _parse_request(req, turn_index)
190
+ turn_index += 1
191
+
192
+ # kind=2: delta patches — new requests appended
193
+ elif kind == 2:
194
+ k = obj.get('k', [])
195
+ v = obj.get('v')
196
+ # ['requests'] with a list value = new batch of requests
197
+ if k == ['requests'] and isinstance(v, list):
198
+ for req in v:
199
+ rid = req.get('requestId', '')
200
+ if rid and rid not in requests_map:
201
+ requests_map[rid] = _parse_request(req, turn_index)
202
+ turn_index += 1
203
+ # ['requests', N, field] = patch to existing request
204
+ elif len(k) >= 3 and k[0] == 'requests' and isinstance(k[1], int):
205
+                 pass  # field-level kind=2 patches are ignored; usage arrives via kind=1 result patches
206
+
207
+ # kind=1: result patches — contains usage, timings, metadata
208
+ elif kind == 1:
209
+ k = obj.get('k', [])
210
+ v = obj.get('v', {})
211
+ # ['requests', N, 'result'] — usage is here
212
+ if len(k) >= 3 and k[0] == 'requests' and k[2] == 'result' and isinstance(v, dict):
213
+ idx = k[1]
214
+ # Find the request at that index
215
+ req_at_idx = _find_request_by_index(requests_map, idx)
216
+ if req_at_idx:
217
+ _apply_result_patch(req_at_idx, v)
218
+
219
+ return requests_map
220
+
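+ # Minimal sketch of the JSONL shapes this walker expects (field names taken
+ # from the parsing above; values are illustrative):
+ #     {"kind": 0, "v": {"requests": [{"requestId": "r1", ...}]}}
+ #     {"kind": 2, "k": ["requests"], "v": [{"requestId": "r2", ...}]}
+ #     {"kind": 1, "k": ["requests", 0, "result"], "v": {"metadata": {...}}}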
221
+
222
+ def _parse_request(req, turn_index):
223
+ """Parse a raw request dict into our normalized form."""
224
+ msg = req.get('message', {})
225
+ text = msg.get('text', '') if isinstance(msg, dict) else ''
226
+ # Model
227
+ model_id = req.get('modelId', '')
228
+ if not model_id and isinstance(req.get('modelState'), dict):
229
+ model_id = req['modelState'].get('modelId', '')
230
+ # Response — check for compaction summary in response parts
231
+ response = req.get('response') or []
232
+ compaction_summary = ''
233
+ if isinstance(response, list):
234
+ for part in response:
235
+ if isinstance(part, dict):
236
+ ptext = part.get('value', '') or part.get('content', '')
237
+ if isinstance(ptext, str) and 'conversation-summary' in ptext.lower():
238
+ m = re.search(r'<conversation-summary>(.*?)</conversation-summary>', ptext, re.DOTALL | re.IGNORECASE)
239
+ if m:
240
+ compaction_summary = m.group(1).strip()
241
+ return {
242
+ 'request_id': req.get('requestId', ''),
243
+ 'ts': req.get('timestamp', 0),
244
+ 'turn_index': turn_index,
245
+ 'message_text': text,
246
+ 'model_id': model_id,
247
+ 'compaction_summary': compaction_summary,
248
+ # filled by result patch:
249
+ 'prompt_tokens': 0,
250
+ 'completion_tokens': 0,
251
+ 'cached_tokens': 0,
252
+ 'total_tokens': 0,
253
+ 'output_tokens': 0,
254
+ 'rendered_context': '',
255
+ 'compaction_meta': {},
256
+ }
257
+
258
+
259
+ def _find_request_by_index(requests_map, idx):
260
+ """Find request at position idx (by insertion order)."""
261
+ items = list(requests_map.values())
262
+ if 0 <= idx < len(items):
263
+ return items[idx]
264
+ return None
265
+
266
+
267
+ def _apply_result_patch(req, result):
268
+ """Apply a result patch (timings, metadata, usage) to a request record."""
269
+ meta = result.get('metadata', {}) or {}
270
+
271
+ # Token usage — directly in metadata (promptTokens / outputTokens)
272
+ pt = meta.get('promptTokens')
273
+ ot = meta.get('outputTokens')
274
+ if pt is not None:
275
+ req['prompt_tokens'] = pt
276
+ if ot is not None:
277
+ req['output_tokens'] = ot
278
+ req['completion_tokens'] = ot # outputTokens IS completion tokens here
279
+
280
+ # Model
281
+ resolved = meta.get('resolvedModel', '')
282
+ if resolved and not req['model_id']:
283
+ req['model_id'] = resolved if isinstance(resolved, str) else str(resolved)
284
+
285
+ # Compaction summaries — in metadata.summaries list
286
+ summaries = meta.get('summaries', []) or []
287
+ if isinstance(summaries, list) and summaries:
288
+ # Take the first (most recent) summary entry
289
+ s = summaries[0]
290
+ if isinstance(s, dict) and s.get('text') and not req['compaction_summary']:
291
+ req['compaction_summary'] = s['text']
292
+ # Store rich compaction metadata on the request for use in build step
293
+ req['compaction_meta'] = {
294
+ 'tool_call_round_id': s.get('toolCallRoundId', ''),
295
+ 'model': s.get('model', ''),
296
+ 'summarization_mode': s.get('summarizationMode', ''),
297
+ 'num_rounds': s.get('numRounds', 0),
298
+ 'context_length_before': s.get('contextLengthBefore', 0),
299
+ 'duration_ms': s.get('durationMs', 0),
300
+ 'outcome': s.get('outcome', ''),
301
+ 'usage': s.get('usage', {}),
302
+ }
303
+
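+ # Example result payload consumed above (shape inferred from the reads here;
+ # the values are made up):
+ #     {"metadata": {"promptTokens": 1200, "outputTokens": 350,
+ #                   "resolvedModel": "model-x",
+ #                   "summaries": [{"text": "...", "numRounds": 4}]}}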
304
+
305
+ def _decode_vfs_text(blob: bytes) -> str:
306
+ if not blob:
307
+ return ""
308
+ try:
309
+ raw = gzip.decompress(blob)
310
+ except Exception:
311
+ raw = blob
312
+ if isinstance(raw, str):
313
+ return raw
314
+ for encoding in ('utf-8', 'latin-1'):
315
+ try:
316
+ return raw.decode(encoding)
317
+ except Exception:
318
+ continue
319
+ return ""
320
+
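+ # Behavior sketch: try gzip first, then fall back to plain text, e.g.
+ #     _decode_vfs_text(gzip.compress(b"hi")) == "hi" == _decode_vfs_text(b"hi")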
321
+
322
+ def _symbol_context(content: str, lineno: int, radius: int = 2) -> str:
323
+ lines = content.splitlines()
324
+ if lineno is None or lineno <= 0:
325
+ return ""
326
+ start = max(0, lineno - 1 - radius)
327
+ end = min(len(lines), lineno + radius)
328
+ return "\n".join(lines[start:end]).strip()
329
+
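+ # e.g. _symbol_context("a\nb\nc\nd\ne", lineno=3, radius=1) -> "b\nc\nd"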
330
+
331
+ def _extract_python_symbols(file_path: str, content: str) -> List[dict]:
332
+ symbols: List[dict] = []
333
+ if not content.strip():
334
+ return symbols
335
+
336
+ class SymbolVisitor(ast.NodeVisitor):
337
+ def __init__(self):
338
+ self.stack: List[str] = []
339
+ self.found: List[Tuple[str, str, int]] = []
340
+
341
+ def _push(self, name: str):
342
+ self.stack.append(name)
343
+
344
+ def _pop(self):
345
+ if self.stack:
346
+ self.stack.pop()
347
+
348
+ def _qualname(self, name: str) -> str:
349
+ return ".".join(self.stack + [name]) if self.stack else name
350
+
351
+ def visit_ClassDef(self, node):
352
+ self.found.append(('class', self._qualname(node.name), node.lineno))
353
+ self._push(node.name)
354
+ self.generic_visit(node)
355
+ self._pop()
356
+
357
+ def visit_FunctionDef(self, node):
358
+ kind = 'method' if self.stack else 'function'
359
+ self.found.append((kind, self._qualname(node.name), node.lineno))
360
+ self.generic_visit(node)
361
+
362
+ def visit_AsyncFunctionDef(self, node):
363
+ kind = 'method' if self.stack else 'function'
364
+ self.found.append((kind, self._qualname(node.name), node.lineno))
365
+ self.generic_visit(node)
366
+
367
+ def visit_AnnAssign(self, node):
368
+ target = node.target
369
+ if isinstance(target, ast.Name) and not target.id.startswith('_') and not self.stack:
370
+ self.found.append(('variable', target.id, node.lineno))
371
+ self.generic_visit(node)
372
+
373
+ def visit_Assign(self, node):
374
+ if not self.stack:
375
+ for target in node.targets:
376
+ if isinstance(target, ast.Name) and not target.id.startswith('_'):
377
+ self.found.append(('variable', target.id, node.lineno))
378
+ self.generic_visit(node)
379
+
380
+ try:
381
+ tree = ast.parse(content, filename=file_path)
382
+ visitor = SymbolVisitor()
383
+ visitor.visit(tree)
384
+ except Exception:
385
+ return symbols
386
+
387
+ for kind, name, lineno in visitor.found:
388
+ symbols.append({
389
+ 'symbol_type': kind,
390
+ 'symbol_name': name,
391
+ 'line_number': lineno,
392
+ 'context': _symbol_context(content, lineno),
393
+ })
394
+ return symbols
395
+
396
+
397
+ def _extract_generic_symbols(file_path: str, content: str) -> List[dict]:
398
+ symbols: List[dict] = []
399
+ if not content.strip():
400
+ return symbols
401
+
402
+ patterns = [
403
+ (r'^\s*(?:export\s+)?(?:default\s+)?function\s+([A-Za-z_][\w]*)\b', 'function'),
404
+ (r'^\s*(?:export\s+)?(?:default\s+)?class\s+([A-Za-z_][\w]*)\b', 'class'),
405
+ (r'^\s*(?:export\s+)?(?:const|let|var)\s+([A-Za-z_][\w]*)\s*=', 'variable'),
406
+ (r'^\s*(?:interface|enum|struct|type)\s+([A-Za-z_][\w]*)\b', 'type'),
407
+ (r'^\s*def\s+([A-Za-z_][\w]*)\b', 'function'),
408
+ (r'^\s*func\s+([A-Za-z_][\w]*)\b', 'function'),
409
+ ]
410
+
411
+ for line_number, line in enumerate(content.splitlines(), start=1):
412
+ for pattern, kind in patterns:
413
+ match = re.match(pattern, line)
414
+ if match:
415
+ name = match.group(1)
416
+ symbols.append({
417
+ 'symbol_type': kind,
418
+ 'symbol_name': name,
419
+ 'line_number': line_number,
420
+ 'context': _symbol_context(content, line_number),
421
+ })
422
+ break
423
+ return symbols
424
+
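+ # e.g. the line `export const makeId = () => {` is tagged as variable `makeId`
+ # by the (const|let|var) pattern above; `def` and `func` lines fall through to
+ # the function patterns.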
425
+
426
+ def extract_code_symbols(file_path: str, content: str) -> List[dict]:
427
+ ext = Path(file_path).suffix.lower()
428
+ if not ext:
429
+ return []
430
+ if ext in ('.py', '.pyi'):
431
+ return _extract_python_symbols(file_path, content)
432
+ if ext in ('.js', '.jsx', '.ts', '.tsx', '.go', '.rs', '.java', '.c', '.cpp', '.h', '.cs', '.swift', '.rb'):
433
+ return _extract_generic_symbols(file_path, content)
434
+ return []
435
+
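+ # Illustrative: extract_code_symbols("m.py", "class A:\n    def f(self): pass")
+ # yields ('class', 'A', line 1) and ('method', 'A.f', line 2).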
436
+
437
+ def _is_code_file(source_path: str) -> bool:
438
+ if not source_path:
439
+ return False
440
+ ext = Path(source_path).suffix.lower()
441
+ return ext in {
442
+ '.py', '.pyi', '.js', '.jsx', '.ts', '.tsx', '.go', '.rs', '.java',
443
+ '.c', '.cpp', '.h', '.cs', '.swift', '.rb'
444
+ }
445
+
446
+
447
+ def build_symbol_index(conn):
448
+ print("\nBuilding code symbol index...")
449
+ conn.execute("DELETE FROM symbols")
450
+ rows = conn.execute(
451
+ "SELECT workspace_id, source_path, content FROM vfs"
452
+ ).fetchall()
453
+ symbols = []
454
+     indexed_at = int(datetime.now(timezone.utc).timestamp() * 1000)  # naive utcnow().timestamp() would shift by the local UTC offset
455
+ for workspace_id, source_path, content_blob in rows:
456
+ if not _is_code_file(source_path or ""):
457
+ continue
458
+ text = _decode_vfs_text(content_blob)
459
+ if not text:
460
+ continue
461
+ for sym in extract_code_symbols(source_path, text):
462
+ symbols.append((
463
+ workspace_id,
464
+ source_path,
465
+ sym['symbol_name'],
466
+ sym['symbol_type'],
467
+ sym['line_number'],
468
+ sym['context'],
469
+ indexed_at,
470
+ ))
471
+ if symbols:
472
+ conn.executemany(
473
+ "INSERT INTO symbols(workspace_id, file_path, symbol_name, symbol_type, line_number, context, indexed_at) VALUES (?,?,?,?,?,?,?)",
474
+ symbols
475
+ )
476
+ conn.commit()
477
+
478
+
479
+ def ensure_schema(conn):
480
+ """Ensure extract-related tables and indexes exist for watcher and analysis passes."""
481
+ conn.executescript("""
482
+ CREATE TABLE IF NOT EXISTS exchange_signals (
483
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
484
+ session_id TEXT NOT NULL,
485
+ exchange_index INTEGER,
486
+ request_id TEXT,
487
+ ts INTEGER,
488
+ signal_type TEXT NOT NULL,
489
+ signal_text TEXT,
490
+ matched_keyword TEXT,
491
+ user_message TEXT,
492
+ ingested_at TEXT DEFAULT (datetime('now'))
493
+ );
494
+ CREATE INDEX IF NOT EXISTS idx_signals_session ON exchange_signals(session_id);
495
+ CREATE INDEX IF NOT EXISTS idx_signals_type ON exchange_signals(signal_type);
496
+
497
+ CREATE TABLE IF NOT EXISTS session_analysis (
498
+ session_id TEXT PRIMARY KEY,
499
+ total_corrections INTEGER DEFAULT 0,
500
+ total_redirects INTEGER DEFAULT 0,
501
+ total_affirmations INTEGER DEFAULT 0,
502
+ total_tool_calls INTEGER DEFAULT 0,
503
+ total_tokens_prompt INTEGER DEFAULT 0,
504
+ total_tokens_completion INTEGER DEFAULT 0,
505
+ total_tokens_cached INTEGER DEFAULT 0,
506
+ compaction_count INTEGER DEFAULT 0,
507
+ session_duration_min REAL,
508
+ first_ts INTEGER,
509
+ last_ts INTEGER,
510
+ model_ids TEXT,
511
+ clean_run INTEGER DEFAULT 0,
512
+ analyzed_at TEXT DEFAULT (datetime('now')),
513
+ total_frustrations INTEGER DEFAULT 0,
514
+ total_pre_corrections INTEGER DEFAULT 0,
515
+ heat_score INTEGER DEFAULT 0,
516
+ peak_heat INTEGER DEFAULT 0,
517
+ final_heat INTEGER DEFAULT 0,
518
+ saved_session INTEGER DEFAULT 0,
519
+ turning_point_ts INTEGER,
520
+ turning_point_text TEXT
521
+ );
522
+
523
+ CREATE VIRTUAL TABLE IF NOT EXISTS fts_exchanges USING fts5(
524
+ session_id UNINDEXED,
525
+ workspace_id UNINDEXED,
526
+ exchange_index UNINDEXED,
527
+ user_ts UNINDEXED,
528
+ user_message,
529
+ reasoning_text,
530
+ response_text,
531
+ tool_calls,
532
+ content=exchanges,
533
+ content_rowid=id
534
+ );
535
+
536
+ CREATE TABLE IF NOT EXISTS symbols (
537
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
538
+ workspace_id TEXT,
539
+ file_path TEXT,
540
+ symbol_name TEXT,
541
+ symbol_type TEXT,
542
+ line_number INTEGER,
543
+ context TEXT,
544
+ indexed_at INTEGER
545
+ );
546
+ CREATE INDEX IF NOT EXISTS idx_symbols_workspace ON symbols(workspace_id);
547
+ CREATE INDEX IF NOT EXISTS idx_symbols_type ON symbols(symbol_type);
548
+ CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(symbol_name);
549
+
550
+ CREATE TABLE IF NOT EXISTS tool_calls (
551
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
552
+ session_id TEXT NOT NULL,
553
+ exchange_index INTEGER,
554
+ request_id TEXT,
555
+ tool_call_id TEXT,
556
+ tool_name TEXT NOT NULL,
557
+ file_path TEXT,
558
+ arguments_json TEXT,
559
+ has_output INTEGER DEFAULT 0,
560
+ ingested_at TEXT DEFAULT (datetime('now'))
561
+ );
562
+ CREATE INDEX IF NOT EXISTS idx_tool_calls_session ON tool_calls(session_id);
563
+ CREATE INDEX IF NOT EXISTS idx_tool_calls_name ON tool_calls(tool_name);
564
+ CREATE INDEX IF NOT EXISTS idx_tool_calls_file ON tool_calls(file_path);
565
+ """)
566
+ try:
567
+ conn.execute("ALTER TABLE session_analysis ADD COLUMN clean_run INTEGER DEFAULT 0")
568
+ except sqlite3.OperationalError:
569
+ pass
570
+ conn.commit()
571
+
572
+
573
+ # ─────────────────────────────────────────────────────────
574
+ # Main processing
575
+ # ─────────────────────────────────────────────────────────
576
+
577
+ def process_session(conn, session_id, blob):
578
+ """Process one chat session blob and write rows to all tables."""
579
+ raw = gzip.decompress(blob).decode('utf-8', errors='replace')
580
+ lines = [ln for ln in raw.splitlines() if ln.strip()]
581
+
582
+ requests_map = extract_requests_from_chat(lines)
583
+ if not requests_map:
584
+ return 0, 0, 0
585
+
586
+ token_rows = []
587
+ signal_rows = []
588
+ compaction_rows = []
589
+
590
+ for req in requests_map.values():
591
+ rid = req['request_id']
592
+ ts = req['ts']
593
+ ti = req['turn_index']
594
+ mid = req['model_id']
595
+
596
+ # Token usage row (only if we have real data)
597
+ if req['prompt_tokens'] or req['output_tokens']:
598
+ token_rows.append((
599
+ session_id, rid, ti, ts,
600
+ req['prompt_tokens'], req['completion_tokens'],
601
+ req['cached_tokens'], req['total_tokens'],
602
+ req['output_tokens'], mid
603
+ ))
604
+
605
+ # Compaction row
606
+ if req['compaction_summary']:
607
+ trigger = req['message_text'][:200] if req['message_text'] else ''
608
+ cmeta = req.get('compaction_meta', {})
609
+ compaction_rows.append((
610
+ session_id, rid, ti, ts,
611
+ req['compaction_summary'],
612
+ len(req['compaction_summary']),
613
+ trigger,
614
+ cmeta.get('context_length_before', 0),
615
+ cmeta.get('num_rounds', 0),
616
+ cmeta.get('model', ''),
617
+ cmeta.get('duration_ms', 0),
618
+ ))
619
+
620
+ # Signal rows
621
+ if req['message_text']:
622
+ signals = classify_message(req['message_text'])
623
+ for sig_type, matched_kw in signals:
624
+ signal_rows.append((
625
+ session_id, None, rid, ts,
626
+ sig_type, req['message_text'][:500],
627
+ matched_kw, req['message_text'][:200]
628
+ ))
629
+
630
+ # Insert
631
+ conn.executemany(
632
+ """INSERT OR IGNORE INTO token_usage
633
+ (session_id, request_id, turn_index, ts,
634
+ prompt_tokens, completion_tokens, cached_tokens,
635
+ total_tokens, output_tokens, model_id)
636
+ VALUES (?,?,?,?,?,?,?,?,?,?)""",
637
+ token_rows
638
+ )
639
+ conn.executemany(
640
+ """INSERT OR IGNORE INTO compactions
641
+ (session_id, request_id, turn_index, ts,
642
+ summary_text, summary_length, trigger_text,
643
+ context_length_before, num_rounds, summary_model, duration_ms)
644
+ VALUES (?,?,?,?,?,?,?,?,?,?,?)""",
645
+ compaction_rows
646
+ )
647
+ conn.executemany(
648
+ """INSERT OR IGNORE INTO exchange_signals
649
+ (session_id, exchange_index, request_id, ts,
650
+ signal_type, signal_text, matched_keyword, user_message)
651
+ VALUES (?,?,?,?,?,?,?,?)""",
652
+ signal_rows
653
+ )
654
+
655
+ return len(token_rows), len(signal_rows), len(compaction_rows)
656
+
657
+
658
+ def build_session_analysis(conn, session_id):
659
+ """Compute and upsert session_analysis row."""
660
+ tok = conn.execute(
661
+ """SELECT SUM(prompt_tokens), SUM(completion_tokens), SUM(cached_tokens)
662
+ FROM token_usage WHERE session_id=?""", (session_id,)
663
+ ).fetchone()
664
+
665
+ sigs = conn.execute(
666
+ """SELECT signal_type, COUNT(*) FROM exchange_signals
667
+ WHERE session_id=? GROUP BY signal_type""", (session_id,)
668
+ ).fetchall()
669
+ sig_map = {r[0]: r[1] for r in sigs}
670
+
671
+ comp = conn.execute(
672
+ "SELECT COUNT(*) FROM compactions WHERE session_id=?", (session_id,)
673
+ ).fetchone()[0]
674
+
675
+ exc = conn.execute(
676
+ """SELECT SUM(tool_call_count), MIN(user_ts), MAX(user_ts)
677
+ FROM exchanges WHERE session_id=?""", (session_id,)
678
+ ).fetchone()
679
+
680
+ models = conn.execute(
681
+ """SELECT DISTINCT model_id FROM token_usage
682
+ WHERE session_id=? AND model_id != ''""", (session_id,)
683
+ ).fetchall()
684
+ model_ids = ','.join(r[0] for r in models)
685
+
686
+ first_ts = exc[1]
687
+ last_ts = exc[2]
688
+ duration = None
689
+ if first_ts and last_ts:
690
+ try:
692
+ f = datetime.fromisoformat(str(first_ts).replace('Z', '+00:00'))
693
+ ln = datetime.fromisoformat(str(last_ts).replace('Z', '+00:00'))
694
+ duration = (ln - f).total_seconds() / 60
695
+ except Exception:
696
+ pass
697
+
698
+ total_corrections = sig_map.get('correction', 0)
699
+ total_frustrations = sig_map.get('frustration', 0)
700
+ total_pre_corrections = sig_map.get('pre_correction', 0)
701
+ # Clean run = no corrections and at least 3 exchanges
702
+ exc_count = conn.execute(
703
+ "SELECT COUNT(*) FROM exchanges WHERE session_id=?", (session_id,)
704
+ ).fetchone()[0]
705
+ clean_run = 1 if total_corrections == 0 and exc_count >= 3 else 0
706
+
707
+ # Heat score: weighted sum of negative signals
708
+ # corrections=3pts, pre_correction=2pts, frustration=5pts, redirects=1pt
709
+     # Capped at 100 (a hard cap, not a normalization by session length)
710
+ HEAT_WEIGHT = {
711
+ 'correction': 3,
712
+ 'pre_correction': 2,
713
+ 'frustration': 5,
714
+ 'redirect': 1,
715
+ }
716
+ raw_heat = (
717
+         total_corrections * HEAT_WEIGHT['correction'] +
718
+         total_pre_corrections * HEAT_WEIGHT['pre_correction'] +
719
+         total_frustrations * HEAT_WEIGHT['frustration'] +
720
+         sig_map.get('redirect', 0) * HEAT_WEIGHT['redirect']
721
+ )
722
+ heat_score = min(100, raw_heat)
723
+
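+     # Worked example: 2 corrections, 1 frustration, 3 redirects gives
+     # raw_heat = 2*3 + 1*5 + 3*1 = 14 -> heat_score 14 (well under the cap).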
724
+ # ── Per-turn heat timeline ─────────────────────────────────────
725
+ # Group signals by ts, compute heat contribution per turn,
726
+ # find: peak_heat, final_heat (last 5 turns), turning_point
727
+ signals_ordered = conn.execute(
728
+ """SELECT ts, signal_type, user_message FROM exchange_signals
729
+ WHERE session_id=? ORDER BY ts NULLS LAST""",
730
+ (session_id,)
731
+ ).fetchall()
732
+
733
+ heat_by_ts: DefaultDict[int, int] = defaultdict(int) # ts -> heat contribution
734
+     types_by_ts: DefaultDict[int, List[str]] = defaultdict(list)  # ts -> [signal_types] (populated but not yet read)
735
+ msg_by_ts: Dict[int, str] = {} # ts -> first message at that ts
736
+ for s in signals_ordered:
737
+ ts_val = s[0] or 0
738
+ st = s[1]
739
+ heat_by_ts[ts_val] += HEAT_WEIGHT.get(st, 0)
740
+ types_by_ts[ts_val].append(st)
741
+ if ts_val not in msg_by_ts and s[2]:
742
+ msg_by_ts[ts_val] = s[2]
743
+
744
+ sorted_ts = sorted(heat_by_ts.keys())
745
+
746
+ # Cumulative heat timeline → peak_heat = heat_score (total is the peak)
747
+ peak_heat = heat_score # heat only accumulates, so peak == total
748
+
749
+ # final_heat: heat contributed by last 5 turns
750
+     last_5_ts = sorted_ts[-5:]  # slicing already handles lists shorter than 5
751
+ final_heat = sum(heat_by_ts[ts] for ts in last_5_ts)
752
+
753
+ # Turning point: ts of the LAST heat-generating signal (the "Antidote")
754
+ # This is the correction/frustration that preceded recovery
755
+ turning_point_ts = None
756
+ turning_point_text = None
757
+ for ts_val in reversed(sorted_ts):
758
+ if heat_by_ts[ts_val] > 0:
759
+ turning_point_ts = ts_val
760
+ turning_point_text = (msg_by_ts.get(ts_val) or '')[:500]
761
+ break
762
+
763
+ # Saved session: had significant heat AND recovered
764
+     # Recovery = last-5-turn heat dropped to <= 40% of peak AND at least one post-peak affirmation
765
+ total_affirmations = sig_map.get('affirmation', 0) + sig_map.get('approval', 0) # noqa: F841
766
+ post_peak_affirmations = 0
767
+ if turning_point_ts is not None:
768
+ post_peak_affirmations = conn.execute(
769
+ """SELECT COUNT(*) FROM exchange_signals
770
+ WHERE session_id=? AND ts > ? AND signal_type IN ('affirmation','approval')""",
771
+ (session_id, turning_point_ts)
772
+ ).fetchone()[0]
773
+ saved_session = 1 if (peak_heat >= 25 and final_heat <= peak_heat * 0.4 and post_peak_affirmations >= 1) else 0
774
+
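+     # e.g. peak_heat=30 (>= 25), final_heat=10 (<= 30 * 0.4) and one
+     # post-peak affirmation -> saved_session=1.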
775
+ conn.execute("""
776
+ INSERT INTO session_analysis
777
+ (session_id, total_corrections, total_redirects, total_affirmations,
778
+ total_tool_calls, total_tokens_prompt, total_tokens_completion,
779
+ total_tokens_cached, compaction_count, session_duration_min,
780
+ first_ts, last_ts, model_ids, clean_run,
781
+ total_frustrations, total_pre_corrections, heat_score,
782
+ peak_heat, final_heat, saved_session,
783
+ turning_point_ts, turning_point_text)
784
+ VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
785
+ ON CONFLICT(session_id) DO UPDATE SET
786
+ total_corrections=excluded.total_corrections,
787
+ total_redirects=excluded.total_redirects,
788
+ total_affirmations=excluded.total_affirmations,
789
+ total_tool_calls=excluded.total_tool_calls,
790
+ total_tokens_prompt=excluded.total_tokens_prompt,
791
+ total_tokens_completion=excluded.total_tokens_completion,
792
+ total_tokens_cached=excluded.total_tokens_cached,
793
+ compaction_count=excluded.compaction_count,
794
+ session_duration_min=excluded.session_duration_min,
795
+ first_ts=excluded.first_ts,
796
+ last_ts=excluded.last_ts,
797
+ model_ids=excluded.model_ids,
798
+ clean_run=excluded.clean_run,
799
+ total_frustrations=excluded.total_frustrations,
800
+ total_pre_corrections=excluded.total_pre_corrections,
801
+ heat_score=excluded.heat_score,
802
+ peak_heat=excluded.peak_heat,
803
+ final_heat=excluded.final_heat,
804
+ saved_session=excluded.saved_session,
805
+ turning_point_ts=excluded.turning_point_ts,
806
+ turning_point_text=excluded.turning_point_text,
807
+ analyzed_at=datetime('now')
808
+ """, (
809
+ session_id,
810
+ sig_map.get('correction', 0),
811
+ sig_map.get('redirect', 0),
812
+ sig_map.get('affirmation', 0),
813
+ exc[0] or 0,
814
+ tok[0] or 0, tok[1] or 0, tok[2] or 0,
815
+ comp,
816
+ duration,
817
+ first_ts, last_ts,
818
+ model_ids,
819
+ clean_run,
820
+ total_frustrations,
821
+ total_pre_corrections,
822
+ heat_score,
823
+ peak_heat,
824
+ final_heat,
825
+ saved_session,
826
+ turning_point_ts,
827
+ turning_point_text,
828
+ ))
829
+
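+ # Illustrative follow-up query over the rollup this writes (assumes run()
+ # has completed):
+ #     SELECT session_id, heat_score, saved_session
+ #     FROM session_analysis ORDER BY heat_score DESC LIMIT 10;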
830
+
831
+ def run():
832
+ conn = sqlite3.connect(str(DB_PATH), timeout=30)
833
+ conn.execute("PRAGMA journal_mode=WAL")
834
+ conn.execute("PRAGMA synchronous=NORMAL")
835
+ conn.execute("PRAGMA cache_size=-2000")
836
+ conn.execute("PRAGMA mmap_size=268435456")
837
+ conn.execute("PRAGMA temp_store=MEMORY")
838
+
839
+ # Ensure analysis tables exist
840
+ conn.executescript("""
841
+ CREATE TABLE IF NOT EXISTS token_usage (
842
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
843
+ session_id TEXT NOT NULL,
844
+ request_id TEXT,
845
+ turn_index INTEGER,
846
+ ts INTEGER,
847
+ prompt_tokens INTEGER DEFAULT 0,
848
+ completion_tokens INTEGER DEFAULT 0,
849
+ cached_tokens INTEGER DEFAULT 0,
850
+ total_tokens INTEGER DEFAULT 0,
851
+ output_tokens INTEGER DEFAULT 0,
852
+ model_id TEXT,
853
+ ingested_at TEXT DEFAULT (datetime('now'))
854
+ );
855
+ CREATE INDEX IF NOT EXISTS idx_token_usage_session ON token_usage(session_id);
856
+
857
+ CREATE TABLE IF NOT EXISTS compactions (
858
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
859
+ session_id TEXT NOT NULL,
860
+ request_id TEXT,
861
+ turn_index INTEGER,
862
+ ts INTEGER,
863
+ summary_text TEXT,
864
+ summary_length INTEGER,
865
+ trigger_text TEXT,
866
+ ingested_at TEXT DEFAULT (datetime('now')),
867
+ context_length_before INTEGER DEFAULT 0,
868
+ num_rounds INTEGER DEFAULT 0,
869
+ summary_model TEXT,
870
+ duration_ms INTEGER DEFAULT 0
871
+ );
872
+ CREATE INDEX IF NOT EXISTS idx_compactions_session ON compactions(session_id);
873
+     """)
874
+
875
+     # exchange_signals, session_analysis, fts_exchanges, symbols, and
876
+     # tool_calls share their DDL with the watcher pass; reuse
877
+     # ensure_schema() rather than repeating those statements verbatim.
878
+     ensure_schema(conn)
965
+ conn.commit()
966
+
967
+ # Clear existing extracted data for a clean re-run
968
+ conn.execute("DELETE FROM token_usage")
969
+ conn.execute("DELETE FROM compactions")
970
+ conn.execute("DELETE FROM exchange_signals")
971
+ conn.execute("DELETE FROM session_analysis")
972
+ conn.execute("DELETE FROM symbols")
973
+ conn.commit()
974
+
975
+ # Get all sessions that have a chat_session blob
976
+ blobs = conn.execute(
977
+ """SELECT v.session_id, v.content
978
+ FROM vfs v
979
+ WHERE v.source_type = 'chat_session'
980
+ ORDER BY v.session_id"""
981
+ ).fetchall()
982
+
983
+ print(f"Processing {len(blobs)} chat sessions...")
984
+ total_tok = total_sig = total_comp = 0
985
+ errors = 0
986
+
987
+ for i, (sid, content) in enumerate(blobs):
988
+ try:
989
+ t, s, c = process_session(conn, sid, content)
990
+ total_tok += t
991
+ total_sig += s
992
+ total_comp += c
993
+ build_session_analysis(conn, sid)
994
+ if i % 20 == 0:
995
+ conn.commit()
996
+ print(f" [{i+1}/{len(blobs)}] tokens={total_tok} signals={total_sig} compactions={total_comp}")
997
+ except Exception as e:
998
+ errors += 1
999
+ if errors <= 5:
1000
+ print(f" ERROR {sid[:16]}: {e}")
1001
+
1002
+ conn.commit()
1003
+
1004
+ # ── Populate tool_calls from exchanges ──────────────────────────────────
1005
+ print("\nBuilding tool_calls index from exchanges...")
1006
+ conn.execute("DELETE FROM tool_calls")
1007
+ tc_rows = []
1008
+ exch_rows = conn.execute(
1009
+ "SELECT session_id, exchange_index, request_id, tool_calls FROM exchanges WHERE tool_call_count > 0"
1010
+ ).fetchall()
1011
+ for sid, ex_idx, req_id, tc_json in exch_rows:
1012
+ try:
1013
+ tool_calls_list = json.loads(tc_json or '[]')
1014
+ except Exception:
1015
+ continue
1016
+ for tc in tool_calls_list:
1017
+ if not isinstance(tc, dict):
1018
+ continue
1019
+ name = tc.get('name', '') or ''
1020
+ tc_id = tc.get('toolCallId', '') or ''
1021
+ args = tc.get('arguments', {}) or {}
1022
+ has_out = 1 if tc.get('output') else 0
1023
+ # Extract file path from common argument patterns
1024
+ file_path = ''
1025
+ if isinstance(args, dict):
1026
+ file_path = (
1027
+ args.get('filePath') or args.get('file_path') or
1028
+ args.get('path') or args.get('uri') or ''
1029
+ )
1030
+ if not file_path:
1031
+ # For read_file / grep_search / replace_string_in_file
1032
+ for k in ('filePath', 'file_path', 'path', 'uri', 'includePattern', 'query'):
1033
+ v = args.get(k)
1034
+ if isinstance(v, str) and ('/' in v or '\\' in v):
1035
+ file_path = v
1036
+ break
1037
+ tc_rows.append((
1038
+ sid, ex_idx, req_id, tc_id, name,
1039
+ str(file_path)[:500] if file_path else '',
1040
+ json.dumps(args)[:1000], has_out,
1041
+ ))
1042
+
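+     # e.g. arguments {"filePath": "src/app.py"} -> file_path "src/app.py";
+     # {"query": "foo/bar"} also qualifies via the '/' heuristic above.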
1043
+ conn.executemany("""
1044
+ INSERT INTO tool_calls
1045
+ (session_id, exchange_index, request_id, tool_call_id,
1046
+ tool_name, file_path, arguments_json, has_output)
1047
+ VALUES (?,?,?,?,?,?,?,?)
1048
+ """, tc_rows)
1049
+ conn.commit()
1050
+ n_tc = conn.execute("SELECT COUNT(*) FROM tool_calls").fetchone()[0]
1051
+ print(f" tool_calls rows: {n_tc}")
1052
+
1053
+ build_symbol_index(conn)
1054
+ conn.close()
1055
+
1056
+ print("\nDone.")
1057
+ print(f" token_usage rows: {total_tok}")
1058
+     print(f"  exchange_signals rows: {total_sig}")
1059
+ print(f" compaction rows: {total_comp}")
1060
+ print(f" errors: {errors}")
1061
+
1062
+
1063
+ if __name__ == "__main__":
1064
+ run()