code-data-ark 2.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cda/__init__.py +3 -0
- cda/kernel/__init__.py +0 -0
- cda/kernel/control_db.py +151 -0
- cda/kernel/pmf_kernel.py +364 -0
- cda/kernel/selfcheck.py +299 -0
- cda/pipeline/__init__.py +0 -0
- cda/pipeline/embed.py +694 -0
- cda/pipeline/extract.py +1064 -0
- cda/pipeline/ingest.py +673 -0
- cda/pipeline/parse_edits.py +250 -0
- cda/pipeline/reconstruct.py +536 -0
- cda/pipeline/watcher.py +783 -0
- cda/ui/__init__.py +0 -0
- cda/ui/cli.py +2587 -0
- cda/ui/web.py +2848 -0
- code_data_ark-2.0.2.dist-info/METADATA +495 -0
- code_data_ark-2.0.2.dist-info/RECORD +20 -0
- code_data_ark-2.0.2.dist-info/WHEEL +4 -0
- code_data_ark-2.0.2.dist-info/entry_points.txt +2 -0
- code_data_ark-2.0.2.dist-info/licenses/license +21 -0
cda/pipeline/extract.py
ADDED
|
@@ -0,0 +1,1064 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
extract.py — Signal and token extraction pass.
|
|
4
|
+
|
|
5
|
+
Processes all chat sessions in cda.db and populates:
|
|
6
|
+
- token_usage : per-request token accounting
|
|
7
|
+
- compactions : context window compaction events
|
|
8
|
+
- exchange_signals : behavioral signals (corrections, affirmations, etc.)
|
|
9
|
+
- session_analysis : per-session rollup
|
|
10
|
+
|
|
11
|
+
Signal taxonomy:
|
|
12
|
+
correction — user said stop / pause / wrong / jumping ahead / etc.
|
|
13
|
+
redirect — user pivoting direction mid-session
|
|
14
|
+
affirmation — user approved / confirmed / "yes" / "lets do it" / "perfect"
|
|
15
|
+
question — user asking conceptual question (zoom out / meta / think)
|
|
16
|
+
approval — explicit build approval ("build it", "go", "lets do that")
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import sqlite3
|
|
20
|
+
import gzip
|
|
21
|
+
import json
|
|
22
|
+
import re
|
|
23
|
+
import ast
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
from datetime import datetime
|
|
26
|
+
from typing import Dict, List, Tuple, DefaultDict
|
|
27
|
+
from collections import defaultdict
|
|
28
|
+
|
|
29
|
+
# Project root: four `.parent` hops up from this file
# (extract.py -> pipeline -> cda -> site-packages/src dir -> its parent).
# NOTE(review): presumably the checkout/install root — confirm against layout.
ROOT_DIR = Path(__file__).resolve().parent.parent.parent.parent
# Working directory for locally generated artifacts.
LOCAL_DIR = ROOT_DIR / "local"
# SQLite database produced by the ingest pass; this module reads and extends it.
DB_PATH = LOCAL_DIR / "data" / "cda.db"
|
|
32
|
+
|
|
33
|
+
# ─────────────────────────────────────────────────────────
|
|
34
|
+
# Signal patterns
|
|
35
|
+
# ─────────────────────────────────────────────────────────
|
|
36
|
+
|
|
37
|
+
# Ordered taxonomy of behavioral signals. Each entry is
# (signal_type, [substring keywords], human-readable description).
# classify_message scans entries in this order and, per signal type,
# records only the FIRST keyword found in the lowercased message.
SIGNAL_PATTERNS = [
    # (signal_type, [keywords], description)
    ("correction", [
        "stop,", "stop.", "stop ", "pause", "wrong", "jumping ahead",
        "not listening", "thats not", "that's not", "nope,", "nope.",
        "incorrect", "you're off", "youre off", "missed the point",
        "not what i", "didn't ask", "didnt ask", "too much", "slow down",
        "hold on", "wait,", "wait.", "no,", "no.", "actually, no",
        "you missed", "thats wrong", "that's wrong", "bad habit",
        "you are jumping", "don't do that", "dont do that",
        "i said", "do what was asked", "stay focused",
    ], "Model correction — user redirecting agent behavior"),

    ("redirect", [
        "actually", "pivot", "change direction", "lets change",
        "let's change", "forget that", "scratch that", "instead,",
        "different approach", "new direction", "zoom out",
        "step back", "big picture", "meta moment", "meta perspective",
    ], "Session redirect — user changing scope or direction"),

    ("affirmation", [
        "perfect", "exactly", "yes,", "yes.", "correct", "thats right",
        "that's right", "great", "nice", "good", "love it", "love that",
        "well done", "solid", "clean", "nailed it", "exactly right",
        "thats it", "that's it", "yes!", "boom", "beautiful", "brilliant",
    ], "Affirmation — user confirming agent is on track"),

    ("approval", [
        "lets do it", "let's do it", "lets build", "let's build",
        "go ahead", "build it", "start implementation", "do it",
        "proceed", "run it", "execute", "ship it", "make it",
        "yes lets", "yes let's", "go!", "go.", "implement",
    ], "Build approval — user authorizing execution"),

    ("question", [
        "what do you think", "your thoughts", "zoom out", "meta",
        "think about", "can you think", "what is", "how does",
        "why does", "explain", "show me", "tell me", "what are",
        "understand", "curious", "wonder if",
    ], "Conceptual question — user probing for analysis"),

    # ── Frustration: explicit irritation, swearing, all-caps ──
    ("frustration", [
        "pissing me off", "pisses me off", "pissed off", "piss off",
        "are you kidding", "are you serious", "you're kidding",
        "wtf", "wth", "what the hell", "what the fuck", "what the f",
        "are you stupid", "this is stupid", "this is ridiculous",
        "omg", "oh my god", "jesus", "jesus christ", "ffs",
        "for fuck's sake", "for fucks sake", "goddamn", "god damn",
        "seriously?", "seriously!", "come on!", "come on,",
        "give me a break", "unbelievable", "unreal",
        "you broke it", "you broke", "its broken", "it's broken",
        "i'm done", "im done", "i give up", "forget it",
        "this is a mess", "what a mess", "disaster",
    ], "Frustration — explicit irritation signal"),

    # ── Pre-correction: rising tone, about to redirect ──
    ("pre_correction", [
        "listen,", "listen.", "ok no", "ok wait", "ok stop",
        "alright stop", "alright no", "alright wait",
        "hey,", "look,", "look.", "no no", "nono",
        "read the", "re-read", "read it again",
        "i just said", "i just told you", "i literally",
        "why did you", "why are you", "why would you",
        "you just", "you literally just",
        "thats not what i", "that's not what i",
        "not again", "again?", "again.", "every time",
        "you keep", "you always", "you never",
        "i've told you", "ive told you", "told you",
        "this is the", "how many times",
    ], "Pre-correction — rising tone before a correction"),
]
|
|
109
|
+
|
|
110
|
+
# Swear words as standalone detection (for any message, not keyword-anchored)
# Used by classify_message as a fallback frustration detector on the raw text.
PROFANITY_PATTERNS = re.compile(
    r'\b(fuck|shit|ass|bitch|damn|crap|hell|bastard|bullshit|motherfuck|dumbass|idiot|moron)\b',
    re.IGNORECASE
)

# ALL CAPS detection: ≥3 consecutive uppercase words = signal
# Matches either three 2+-letter uppercase words in a row, or any single run
# of 4+ uppercase letters; classify_message applies extra filtering on top
# (URL/code stripping, tool-output skip phrases, a minimum total caps count).
ALL_CAPS_PATTERN = re.compile(r'(?:[A-Z]{2,}\s+){2,}[A-Z]{2,}|[A-Z]{4,}')
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def classify_message(text):
    """Classify a user message into behavioral signals.

    Returns a list of (signal_type, matched_keyword) tuples with at most one
    entry per signal type — the first matching keyword per type wins.
    """
    lowered = text.lower().strip()
    found = []
    matched_types = set()

    # Keyword-anchored signals, scanned in SIGNAL_PATTERNS order.
    for signal_type, keywords, _desc in SIGNAL_PATTERNS:
        if signal_type in matched_types:
            continue
        hit = next((kw for kw in keywords if kw in lowered), None)
        if hit is not None:
            found.append((signal_type, hit))
            matched_types.add(signal_type)

    # Profanity acts as a fallback frustration detector on the raw text.
    if 'frustration' not in matched_types:
        profanity = PROFANITY_PATTERNS.search(text)
        if profanity:
            found.append(('frustration', profanity.group(0).lower()))
            matched_types.add('frustration')

    # Sustained ALL CAPS (shouting) also counts as frustration.
    if 'frustration' not in matched_types:
        # Drop URLs and inline code spans, which are legitimately caps-heavy.
        stripped = re.sub(r'https?://\S+|`[^`]*`', '', text)
        # Pasted tool output is caps-heavy too; skip messages containing it.
        skip_phrases = ['PREVIOUS OUTPUT', 'TRUNCATED', 'EXIT CODE', 'CWD:', 'TERMINAL:']
        if not any(phrase in stripped for phrase in skip_phrases):
            shouting = ALL_CAPS_PATTERN.search(stripped)
            # Require >= 8 uppercase chars overall so acronyms don't trigger.
            if shouting is not None and sum(c.isupper() for c in stripped) >= 8:
                found.append(('frustration', 'ALL_CAPS:' + shouting.group(0)[:20]))
                matched_types.add('frustration')

    return found
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def extract_requests_from_chat(lines):
    """
    Walk JSONL lines from a chat session blob and replay them into request records.

    Returns:
        requests_map — insertion-ordered dict of request_id -> normalized
        request record (see _parse_request for the record shape). Token
        counts and compaction metadata are merged into the records in place
        from kind=1 result patches via _apply_result_patch.
    """
    # Build a snapshot from kind=0 + patches from kind=1/2
    # kind=0: initial snapshot (has requests[])
    # kind=2: patches with new request arrays
    # kind=1: result patches (timings, metadata, usage)

    requests_map = {}  # request_id -> dict
    turn_index = 0

    for line in lines:
        try:
            obj = json.loads(line)
        except Exception:
            continue  # skip malformed JSONL lines

        kind = obj.get('kind')

        # kind=0: initial snapshot
        if kind == 0:
            v = obj.get('v', {})
            for req in (v.get('requests') or []):
                rid = req.get('requestId', '')
                if rid:
                    requests_map[rid] = _parse_request(req, turn_index)
                    turn_index += 1

        # kind=2: delta patches — new requests appended
        elif kind == 2:
            k = obj.get('k', [])
            v = obj.get('v')
            # ['requests'] with a list value = new batch of requests
            if k == ['requests'] and isinstance(v, list):
                for req in v:
                    rid = req.get('requestId', '')
                    if rid and rid not in requests_map:
                        requests_map[rid] = _parse_request(req, turn_index)
                        turn_index += 1
            # ['requests', N, field] = patch to existing request
            elif len(k) >= 3 and k[0] == 'requests' and isinstance(k[1], int):
                pass  # handled below in result patches

        # kind=1: result patches — contains usage, timings, metadata
        elif kind == 1:
            k = obj.get('k', [])
            v = obj.get('v', {})
            # ['requests', N, 'result'] — usage is here
            if len(k) >= 3 and k[0] == 'requests' and k[2] == 'result' and isinstance(v, dict):
                idx = k[1]
                # Find the request at that index (insertion order == index)
                req_at_idx = _find_request_by_index(requests_map, idx)
                if req_at_idx:
                    _apply_result_patch(req_at_idx, v)

    return requests_map
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def _parse_request(req, turn_index):
|
|
223
|
+
"""Parse a raw request dict into our normalized form."""
|
|
224
|
+
msg = req.get('message', {})
|
|
225
|
+
text = msg.get('text', '') if isinstance(msg, dict) else ''
|
|
226
|
+
# Model
|
|
227
|
+
model_id = req.get('modelId', '')
|
|
228
|
+
if not model_id and isinstance(req.get('modelState'), dict):
|
|
229
|
+
model_id = req['modelState'].get('modelId', '')
|
|
230
|
+
# Response — check for compaction summary in response parts
|
|
231
|
+
response = req.get('response') or []
|
|
232
|
+
compaction_summary = ''
|
|
233
|
+
if isinstance(response, list):
|
|
234
|
+
for part in response:
|
|
235
|
+
if isinstance(part, dict):
|
|
236
|
+
ptext = part.get('value', '') or part.get('content', '')
|
|
237
|
+
if isinstance(ptext, str) and 'conversation-summary' in ptext.lower():
|
|
238
|
+
m = re.search(r'<conversation-summary>(.*?)</conversation-summary>', ptext, re.DOTALL | re.IGNORECASE)
|
|
239
|
+
if m:
|
|
240
|
+
compaction_summary = m.group(1).strip()
|
|
241
|
+
return {
|
|
242
|
+
'request_id': req.get('requestId', ''),
|
|
243
|
+
'ts': req.get('timestamp', 0),
|
|
244
|
+
'turn_index': turn_index,
|
|
245
|
+
'message_text': text,
|
|
246
|
+
'model_id': model_id,
|
|
247
|
+
'compaction_summary': compaction_summary,
|
|
248
|
+
# filled by result patch:
|
|
249
|
+
'prompt_tokens': 0,
|
|
250
|
+
'completion_tokens': 0,
|
|
251
|
+
'cached_tokens': 0,
|
|
252
|
+
'total_tokens': 0,
|
|
253
|
+
'output_tokens': 0,
|
|
254
|
+
'rendered_context': '',
|
|
255
|
+
'compaction_meta': {},
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def _find_request_by_index(requests_map, idx):
|
|
260
|
+
"""Find request at position idx (by insertion order)."""
|
|
261
|
+
items = list(requests_map.values())
|
|
262
|
+
if 0 <= idx < len(items):
|
|
263
|
+
return items[idx]
|
|
264
|
+
return None
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
def _apply_result_patch(req, result):
|
|
268
|
+
"""Apply a result patch (timings, metadata, usage) to a request record."""
|
|
269
|
+
meta = result.get('metadata', {}) or {}
|
|
270
|
+
|
|
271
|
+
# Token usage — directly in metadata (promptTokens / outputTokens)
|
|
272
|
+
pt = meta.get('promptTokens')
|
|
273
|
+
ot = meta.get('outputTokens')
|
|
274
|
+
if pt is not None:
|
|
275
|
+
req['prompt_tokens'] = pt
|
|
276
|
+
if ot is not None:
|
|
277
|
+
req['output_tokens'] = ot
|
|
278
|
+
req['completion_tokens'] = ot # outputTokens IS completion tokens here
|
|
279
|
+
|
|
280
|
+
# Model
|
|
281
|
+
resolved = meta.get('resolvedModel', '')
|
|
282
|
+
if resolved and not req['model_id']:
|
|
283
|
+
req['model_id'] = resolved if isinstance(resolved, str) else str(resolved)
|
|
284
|
+
|
|
285
|
+
# Compaction summaries — in metadata.summaries list
|
|
286
|
+
summaries = meta.get('summaries', []) or []
|
|
287
|
+
if isinstance(summaries, list) and summaries:
|
|
288
|
+
# Take the first (most recent) summary entry
|
|
289
|
+
s = summaries[0]
|
|
290
|
+
if isinstance(s, dict) and s.get('text') and not req['compaction_summary']:
|
|
291
|
+
req['compaction_summary'] = s['text']
|
|
292
|
+
# Store rich compaction metadata on the request for use in build step
|
|
293
|
+
req['compaction_meta'] = {
|
|
294
|
+
'tool_call_round_id': s.get('toolCallRoundId', ''),
|
|
295
|
+
'model': s.get('model', ''),
|
|
296
|
+
'summarization_mode': s.get('summarizationMode', ''),
|
|
297
|
+
'num_rounds': s.get('numRounds', 0),
|
|
298
|
+
'context_length_before': s.get('contextLengthBefore', 0),
|
|
299
|
+
'duration_ms': s.get('durationMs', 0),
|
|
300
|
+
'outcome': s.get('outcome', ''),
|
|
301
|
+
'usage': s.get('usage', {}),
|
|
302
|
+
}
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
def _decode_vfs_text(blob: bytes) -> str:
|
|
306
|
+
if not blob:
|
|
307
|
+
return ""
|
|
308
|
+
try:
|
|
309
|
+
raw = gzip.decompress(blob)
|
|
310
|
+
except Exception:
|
|
311
|
+
raw = blob
|
|
312
|
+
if isinstance(raw, str):
|
|
313
|
+
return raw
|
|
314
|
+
for encoding in ('utf-8', 'latin-1'):
|
|
315
|
+
try:
|
|
316
|
+
return raw.decode(encoding)
|
|
317
|
+
except Exception:
|
|
318
|
+
continue
|
|
319
|
+
return ""
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def _symbol_context(content: str, lineno: int, radius: int = 2) -> str:
|
|
323
|
+
lines = content.splitlines()
|
|
324
|
+
if lineno is None or lineno <= 0:
|
|
325
|
+
return ""
|
|
326
|
+
start = max(0, lineno - 1 - radius)
|
|
327
|
+
end = min(len(lines), lineno + radius)
|
|
328
|
+
return "\n".join(lines[start:end]).strip()
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def _extract_python_symbols(file_path: str, content: str) -> List[dict]:
    """Extract symbols from Python source via the ast module.

    Collects classes, functions (async included), methods (functions nested
    under a class), and module-level variable assignments whose name does not
    start with '_'. Returns an empty list for blank or unparseable content.
    """
    if not content.strip():
        return []

    class _Collector(ast.NodeVisitor):
        """Walks the tree, tracking the enclosing class-name stack."""

        def __init__(self):
            self.scope: List[str] = []
            self.hits: List[Tuple[str, str, int]] = []

        def _qual(self, name: str) -> str:
            return ".".join(self.scope + [name]) if self.scope else name

        def visit_ClassDef(self, node):
            self.hits.append(('class', self._qual(node.name), node.lineno))
            self.scope.append(node.name)
            self.generic_visit(node)
            if self.scope:
                self.scope.pop()

        # Sync and async defs are recorded identically; a def inside a class
        # scope is a 'method', otherwise a 'function'.
        def _visit_func(self, node):
            kind = 'method' if self.scope else 'function'
            self.hits.append((kind, self._qual(node.name), node.lineno))
            self.generic_visit(node)

        visit_FunctionDef = _visit_func
        visit_AsyncFunctionDef = _visit_func

        def visit_AnnAssign(self, node):
            tgt = node.target
            if isinstance(tgt, ast.Name) and not tgt.id.startswith('_') and not self.scope:
                self.hits.append(('variable', tgt.id, node.lineno))
            self.generic_visit(node)

        def visit_Assign(self, node):
            if not self.scope:
                for tgt in node.targets:
                    if isinstance(tgt, ast.Name) and not tgt.id.startswith('_'):
                        self.hits.append(('variable', tgt.id, node.lineno))
            self.generic_visit(node)

    try:
        collector = _Collector()
        collector.visit(ast.parse(content, filename=file_path))
    except Exception:
        return []  # syntax errors etc. — no symbols rather than a crash

    return [
        {
            'symbol_type': kind,
            'symbol_name': name,
            'line_number': lineno,
            'context': _symbol_context(content, lineno),
        }
        for kind, name, lineno in collector.hits
    ]
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
def _extract_generic_symbols(file_path: str, content: str) -> List[dict]:
    """Regex-based symbol scan for non-Python source (JS/TS, Go, Rust, ...).

    Each line is tested against a small pattern table in order; the first
    pattern that matches claims the line. Blank content yields no symbols.
    """
    if not content.strip():
        return []

    pattern_table = (
        (r'^\s*(?:export\s+)?(?:default\s+)?function\s+([A-Za-z_][\w]*)\b', 'function'),
        (r'^\s*(?:export\s+)?(?:default\s+)?class\s+([A-Za-z_][\w]*)\b', 'class'),
        (r'^\s*(?:export\s+)?(?:const|let|var)\s+([A-Za-z_][\w]*)\s*=', 'variable'),
        (r'^\s*(?:interface|enum|struct|type)\s+([A-Za-z_][\w]*)\b', 'type'),
        (r'^\s*def\s+([A-Za-z_][\w]*)\b', 'function'),
        (r'^\s*func\s+([A-Za-z_][\w]*)\b', 'function'),
    )

    found: List[dict] = []
    for lineno, raw_line in enumerate(content.splitlines(), start=1):
        for regex, sym_kind in pattern_table:
            hit = re.match(regex, raw_line)
            if hit is None:
                continue
            found.append({
                'symbol_type': sym_kind,
                'symbol_name': hit.group(1),
                'line_number': lineno,
                'context': _symbol_context(content, lineno),
            })
            break  # one symbol per line at most
    return found
|
|
424
|
+
|
|
425
|
+
|
|
426
|
+
def extract_code_symbols(file_path: str, content: str) -> List[dict]:
    """Dispatch symbol extraction by file extension.

    Python files get a full AST pass; other known source extensions get the
    generic regex scan; anything else (including no extension) yields [].
    """
    suffix = Path(file_path).suffix.lower()
    if suffix in ('.py', '.pyi'):
        return _extract_python_symbols(file_path, content)
    generic_exts = ('.js', '.jsx', '.ts', '.tsx', '.go', '.rs', '.java', '.c', '.cpp', '.h', '.cs', '.swift', '.rb')
    if suffix in generic_exts:
        return _extract_generic_symbols(file_path, content)
    return []
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
def _is_code_file(source_path: str) -> bool:
|
|
438
|
+
if not source_path:
|
|
439
|
+
return False
|
|
440
|
+
ext = Path(source_path).suffix.lower()
|
|
441
|
+
return ext in {
|
|
442
|
+
'.py', '.pyi', '.js', '.jsx', '.ts', '.tsx', '.go', '.rs', '.java',
|
|
443
|
+
'.c', '.cpp', '.h', '.cs', '.swift', '.rb'
|
|
444
|
+
}
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
def build_symbol_index(conn):
    """Rebuild the `symbols` table from every code file stored in `vfs`.

    Clears existing rows, extracts symbols from each decodable code file,
    and bulk-inserts them, all stamped with a single epoch-ms indexed_at.

    Args:
        conn: open sqlite3 connection with `vfs` and `symbols` tables.
    """
    print("\nBuilding code symbol index...")
    conn.execute("DELETE FROM symbols")
    rows = conn.execute(
        "SELECT workspace_id, source_path, content FROM vfs"
    ).fetchall()
    symbols = []
    # Epoch milliseconds. BUGFIX: the previous datetime.utcnow().timestamp()
    # interpreted a naive UTC datetime as LOCAL time, skewing the stamp by
    # the machine's UTC offset; datetime.now().timestamp() yields the
    # correct epoch value.
    indexed_at = int(datetime.now().timestamp() * 1000)
    for workspace_id, source_path, content_blob in rows:
        if not _is_code_file(source_path or ""):
            continue
        text = _decode_vfs_text(content_blob)
        if not text:
            continue  # undecodable or empty blob
        for sym in extract_code_symbols(source_path, text):
            symbols.append((
                workspace_id,
                source_path,
                sym['symbol_name'],
                sym['symbol_type'],
                sym['line_number'],
                sym['context'],
                indexed_at,
            ))
    if symbols:
        conn.executemany(
            "INSERT INTO symbols(workspace_id, file_path, symbol_name, symbol_type, line_number, context, indexed_at) VALUES (?,?,?,?,?,?,?)",
            symbols
        )
    conn.commit()
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
def ensure_schema(conn):
    """Ensure extract-related tables and indexes exist for watcher and analysis passes.

    Idempotent: every statement is CREATE ... IF NOT EXISTS, so this is safe
    to call on each startup. The trailing ALTER TABLE is a best-effort
    migration for databases created before clean_run existed.
    """
    # Tables: exchange_signals (per-signal rows), session_analysis (rollup),
    # fts_exchanges (external-content FTS5 over exchanges), symbols (code
    # index), tool_calls (per-call rows).
    conn.executescript("""
    CREATE TABLE IF NOT EXISTS exchange_signals (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        session_id TEXT NOT NULL,
        exchange_index INTEGER,
        request_id TEXT,
        ts INTEGER,
        signal_type TEXT NOT NULL,
        signal_text TEXT,
        matched_keyword TEXT,
        user_message TEXT,
        ingested_at TEXT DEFAULT (datetime('now'))
    );
    CREATE INDEX IF NOT EXISTS idx_signals_session ON exchange_signals(session_id);
    CREATE INDEX IF NOT EXISTS idx_signals_type ON exchange_signals(signal_type);

    CREATE TABLE IF NOT EXISTS session_analysis (
        session_id TEXT PRIMARY KEY,
        total_corrections INTEGER DEFAULT 0,
        total_redirects INTEGER DEFAULT 0,
        total_affirmations INTEGER DEFAULT 0,
        total_tool_calls INTEGER DEFAULT 0,
        total_tokens_prompt INTEGER DEFAULT 0,
        total_tokens_completion INTEGER DEFAULT 0,
        total_tokens_cached INTEGER DEFAULT 0,
        compaction_count INTEGER DEFAULT 0,
        session_duration_min REAL,
        first_ts INTEGER,
        last_ts INTEGER,
        model_ids TEXT,
        clean_run INTEGER DEFAULT 0,
        analyzed_at TEXT DEFAULT (datetime('now')),
        total_frustrations INTEGER DEFAULT 0,
        total_pre_corrections INTEGER DEFAULT 0,
        heat_score INTEGER DEFAULT 0,
        peak_heat INTEGER DEFAULT 0,
        final_heat INTEGER DEFAULT 0,
        saved_session INTEGER DEFAULT 0,
        turning_point_ts INTEGER,
        turning_point_text TEXT
    );

    CREATE VIRTUAL TABLE IF NOT EXISTS fts_exchanges USING fts5(
        session_id UNINDEXED,
        workspace_id UNINDEXED,
        exchange_index UNINDEXED,
        user_ts UNINDEXED,
        user_message,
        reasoning_text,
        response_text,
        tool_calls,
        content=exchanges,
        content_rowid=id
    );

    CREATE TABLE IF NOT EXISTS symbols (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        workspace_id TEXT,
        file_path TEXT,
        symbol_name TEXT,
        symbol_type TEXT,
        line_number INTEGER,
        context TEXT,
        indexed_at INTEGER
    );
    CREATE INDEX IF NOT EXISTS idx_symbols_workspace ON symbols(workspace_id);
    CREATE INDEX IF NOT EXISTS idx_symbols_type ON symbols(symbol_type);
    CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(symbol_name);

    CREATE TABLE IF NOT EXISTS tool_calls (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        session_id TEXT NOT NULL,
        exchange_index INTEGER,
        request_id TEXT,
        tool_call_id TEXT,
        tool_name TEXT NOT NULL,
        file_path TEXT,
        arguments_json TEXT,
        has_output INTEGER DEFAULT 0,
        ingested_at TEXT DEFAULT (datetime('now'))
    );
    CREATE INDEX IF NOT EXISTS idx_tool_calls_session ON tool_calls(session_id);
    CREATE INDEX IF NOT EXISTS idx_tool_calls_name ON tool_calls(tool_name);
    CREATE INDEX IF NOT EXISTS idx_tool_calls_file ON tool_calls(file_path);
    """)
    # Migration: clean_run was added after session_analysis first shipped.
    # On databases that already have the column, the ALTER raises
    # OperationalError (duplicate column), which we deliberately swallow.
    try:
        conn.execute("ALTER TABLE session_analysis ADD COLUMN clean_run INTEGER DEFAULT 0")
    except sqlite3.OperationalError:
        pass
    conn.commit()
|
|
571
|
+
|
|
572
|
+
|
|
573
|
+
# ─────────────────────────────────────────────────────────
|
|
574
|
+
# Main processing
|
|
575
|
+
# ─────────────────────────────────────────────────────────
|
|
576
|
+
|
|
577
|
+
def process_session(conn, session_id, blob):
    """Process one chat session blob and write rows to all tables.

    Decompresses the gzipped JSONL *blob*, replays it into normalized
    per-request records, then derives and inserts:
      - token_usage rows      (one per request that reported token counts)
      - compactions rows      (one per request carrying a compaction summary)
      - exchange_signals rows (one per behavioral signal in the user message)

    Returns (token_row_count, signal_row_count, compaction_row_count).
    """
    raw = gzip.decompress(blob).decode('utf-8', errors='replace')
    lines = [ln for ln in raw.splitlines() if ln.strip()]

    requests_map = extract_requests_from_chat(lines)
    if not requests_map:
        return 0, 0, 0

    token_rows = []
    signal_rows = []
    compaction_rows = []

    for req in requests_map.values():
        rid = req['request_id']
        ts = req['ts']
        ti = req['turn_index']
        mid = req['model_id']

        # Token usage row (only if we have real data)
        if req['prompt_tokens'] or req['output_tokens']:
            token_rows.append((
                session_id, rid, ti, ts,
                req['prompt_tokens'], req['completion_tokens'],
                req['cached_tokens'], req['total_tokens'],
                req['output_tokens'], mid
            ))

        # Compaction row
        if req['compaction_summary']:
            # Trigger = the (truncated) user message that preceded the compaction.
            trigger = req['message_text'][:200] if req['message_text'] else ''
            cmeta = req.get('compaction_meta', {})
            compaction_rows.append((
                session_id, rid, ti, ts,
                req['compaction_summary'],
                len(req['compaction_summary']),
                trigger,
                cmeta.get('context_length_before', 0),
                cmeta.get('num_rounds', 0),
                cmeta.get('model', ''),
                cmeta.get('duration_ms', 0),
            ))

        # Signal rows
        if req['message_text']:
            signals = classify_message(req['message_text'])
            # exchange_index is not known at this stage, hence None.
            for sig_type, matched_kw in signals:
                signal_rows.append((
                    session_id, None, rid, ts,
                    sig_type, req['message_text'][:500],
                    matched_kw, req['message_text'][:200]
                ))

    # Insert
    conn.executemany(
        """INSERT OR IGNORE INTO token_usage
           (session_id, request_id, turn_index, ts,
            prompt_tokens, completion_tokens, cached_tokens,
            total_tokens, output_tokens, model_id)
           VALUES (?,?,?,?,?,?,?,?,?,?)""",
        token_rows
    )
    conn.executemany(
        """INSERT OR IGNORE INTO compactions
           (session_id, request_id, turn_index, ts,
            summary_text, summary_length, trigger_text,
            context_length_before, num_rounds, summary_model, duration_ms)
           VALUES (?,?,?,?,?,?,?,?,?,?,?)""",
        compaction_rows
    )
    conn.executemany(
        """INSERT OR IGNORE INTO exchange_signals
           (session_id, exchange_index, request_id, ts,
            signal_type, signal_text, matched_keyword, user_message)
           VALUES (?,?,?,?,?,?,?,?)""",
        signal_rows
    )

    return len(token_rows), len(signal_rows), len(compaction_rows)
|
|
656
|
+
|
|
657
|
+
|
|
658
|
+
def build_session_analysis(conn, session_id):
    """Compute and upsert the session_analysis row for one session.

    Aggregates token usage, behavioral signals, compactions and exchange
    metadata for ``session_id``, derives the heat/recovery metrics, and
    upserts the rollup into ``session_analysis``.

    Args:
        conn: open sqlite3 connection to cda.db (tables must exist).
        session_id: the session to summarize.
    """
    # Single source of truth for how strongly each negative signal heats
    # up a session; used both for the session total and the per-turn
    # timeline below.
    HEAT_WEIGHT = {
        'correction': 3,
        'pre_correction': 2,
        'frustration': 5,
        'redirect': 1,
    }

    tok = conn.execute(
        """SELECT SUM(prompt_tokens), SUM(completion_tokens), SUM(cached_tokens)
           FROM token_usage WHERE session_id=?""", (session_id,)
    ).fetchone()

    sigs = conn.execute(
        """SELECT signal_type, COUNT(*) FROM exchange_signals
           WHERE session_id=? GROUP BY signal_type""", (session_id,)
    ).fetchall()
    sig_map = {r[0]: r[1] for r in sigs}

    comp = conn.execute(
        "SELECT COUNT(*) FROM compactions WHERE session_id=?", (session_id,)
    ).fetchone()[0]

    exc = conn.execute(
        """SELECT SUM(tool_call_count), MIN(user_ts), MAX(user_ts)
           FROM exchanges WHERE session_id=?""", (session_id,)
    ).fetchone()

    models = conn.execute(
        """SELECT DISTINCT model_id FROM token_usage
           WHERE session_id=? AND model_id != ''""", (session_id,)
    ).fetchall()
    model_ids = ','.join(r[0] for r in models)

    first_ts = exc[1]
    last_ts = exc[2]
    duration = None
    if first_ts and last_ts:
        try:
            from datetime import datetime
            f = datetime.fromisoformat(str(first_ts).replace('Z', '+00:00'))
            ln = datetime.fromisoformat(str(last_ts).replace('Z', '+00:00'))
            duration = (ln - f).total_seconds() / 60
        except Exception:
            pass  # unparseable timestamps -> duration stays NULL

    total_corrections = sig_map.get('correction', 0)
    total_frustrations = sig_map.get('frustration', 0)
    total_pre_corrections = sig_map.get('pre_correction', 0)

    # Clean run = no corrections and at least 3 exchanges
    exc_count = conn.execute(
        "SELECT COUNT(*) FROM exchanges WHERE session_id=?", (session_id,)
    ).fetchone()[0]
    clean_run = 1 if total_corrections == 0 and exc_count >= 3 else 0

    # Heat score: weighted sum of negative signals, capped at 100.
    # Computed from HEAT_WEIGHT so the weights cannot drift from the
    # per-turn timeline below (signal types without a weight contribute 0).
    raw_heat = sum(HEAT_WEIGHT.get(st, 0) * n for st, n in sig_map.items())
    heat_score = min(100, raw_heat)

    # ── Per-turn heat timeline ─────────────────────────────────────
    # Group signals by ts, compute heat contribution per turn,
    # find: peak_heat, final_heat (last 5 turns), turning_point
    signals_ordered = conn.execute(
        """SELECT ts, signal_type, user_message FROM exchange_signals
           WHERE session_id=? ORDER BY ts NULLS LAST""",
        (session_id,)
    ).fetchall()

    heat_by_ts: DefaultDict[int, int] = defaultdict(int)  # ts -> heat contribution
    msg_by_ts: Dict[int, str] = {}  # ts -> first message at that ts
    for s in signals_ordered:
        ts_val = s[0] or 0  # NULL ts bucketed as 0
        heat_by_ts[ts_val] += HEAT_WEIGHT.get(s[1], 0)
        if ts_val not in msg_by_ts and s[2]:
            msg_by_ts[ts_val] = s[2]

    sorted_ts = sorted(heat_by_ts.keys())

    # Cumulative heat timeline → peak_heat = heat_score (total is the peak)
    peak_heat = heat_score  # heat only accumulates, so peak == total

    # final_heat: heat contributed by the last 5 turns
    last_5_ts = sorted_ts[-5:]
    final_heat = sum(heat_by_ts[ts] for ts in last_5_ts)

    # Turning point: ts of the LAST heat-generating signal (the "Antidote")
    # This is the correction/frustration that preceded recovery
    turning_point_ts = None
    turning_point_text = None
    for ts_val in reversed(sorted_ts):
        if heat_by_ts[ts_val] > 0:
            turning_point_ts = ts_val
            turning_point_text = (msg_by_ts.get(ts_val) or '')[:500]
            break

    # Saved session: had significant heat (peak >= 25) AND recovered:
    # the last 5 turns cooled to <= 40% of peak heat AND at least one
    # affirmation/approval landed after the turning point.
    post_peak_affirmations = 0
    if turning_point_ts is not None:
        post_peak_affirmations = conn.execute(
            """SELECT COUNT(*) FROM exchange_signals
               WHERE session_id=? AND ts > ? AND signal_type IN ('affirmation','approval')""",
            (session_id, turning_point_ts)
        ).fetchone()[0]
    saved_session = 1 if (peak_heat >= 25 and final_heat <= peak_heat * 0.4 and post_peak_affirmations >= 1) else 0

    conn.execute("""
        INSERT INTO session_analysis
            (session_id, total_corrections, total_redirects, total_affirmations,
             total_tool_calls, total_tokens_prompt, total_tokens_completion,
             total_tokens_cached, compaction_count, session_duration_min,
             first_ts, last_ts, model_ids, clean_run,
             total_frustrations, total_pre_corrections, heat_score,
             peak_heat, final_heat, saved_session,
             turning_point_ts, turning_point_text)
        VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
        ON CONFLICT(session_id) DO UPDATE SET
            total_corrections=excluded.total_corrections,
            total_redirects=excluded.total_redirects,
            total_affirmations=excluded.total_affirmations,
            total_tool_calls=excluded.total_tool_calls,
            total_tokens_prompt=excluded.total_tokens_prompt,
            total_tokens_completion=excluded.total_tokens_completion,
            total_tokens_cached=excluded.total_tokens_cached,
            compaction_count=excluded.compaction_count,
            session_duration_min=excluded.session_duration_min,
            first_ts=excluded.first_ts,
            last_ts=excluded.last_ts,
            model_ids=excluded.model_ids,
            clean_run=excluded.clean_run,
            total_frustrations=excluded.total_frustrations,
            total_pre_corrections=excluded.total_pre_corrections,
            heat_score=excluded.heat_score,
            peak_heat=excluded.peak_heat,
            final_heat=excluded.final_heat,
            saved_session=excluded.saved_session,
            turning_point_ts=excluded.turning_point_ts,
            turning_point_text=excluded.turning_point_text,
            analyzed_at=datetime('now')
    """, (
        session_id,
        sig_map.get('correction', 0),
        sig_map.get('redirect', 0),
        sig_map.get('affirmation', 0),
        exc[0] or 0,
        tok[0] or 0, tok[1] or 0, tok[2] or 0,
        comp,
        duration,
        first_ts, last_ts,
        model_ids,
        clean_run,
        total_frustrations,
        total_pre_corrections,
        heat_score,
        peak_heat,
        final_heat,
        saved_session,
        turning_point_ts,
        turning_point_text,
    ))
|
|
829
|
+
|
|
830
|
+
|
|
831
|
+
def run():
    """Full extraction pass over every chat session stored in cda.db.

    Ensures all analysis tables exist, wipes previously extracted rows for
    a clean re-run, processes every 'chat_session' blob in vfs (populating
    token_usage / exchange_signals / compactions and the per-session
    session_analysis rollup), rebuilds the tool_calls index from the
    exchanges table, and finally rebuilds the symbol index.

    NOTE(review): destructive — this DELETEs and regenerates all extracted
    tables on every invocation.
    """
    conn = sqlite3.connect(str(DB_PATH), timeout=30)
    # WAL + NORMAL synchronous: fewer fsyncs during bulk writes while
    # keeping readers unblocked.
    conn.execute("PRAGMA journal_mode=WAL")
    conn.execute("PRAGMA synchronous=NORMAL")
    conn.execute("PRAGMA cache_size=-2000")     # negative value = KiB, ~2 MB page cache
    conn.execute("PRAGMA mmap_size=268435456")  # 256 MB memory-mapped I/O
    conn.execute("PRAGMA temp_store=MEMORY")

    # Ensure analysis tables exist
    conn.executescript("""
    CREATE TABLE IF NOT EXISTS token_usage (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        session_id TEXT NOT NULL,
        request_id TEXT,
        turn_index INTEGER,
        ts INTEGER,
        prompt_tokens INTEGER DEFAULT 0,
        completion_tokens INTEGER DEFAULT 0,
        cached_tokens INTEGER DEFAULT 0,
        total_tokens INTEGER DEFAULT 0,
        output_tokens INTEGER DEFAULT 0,
        model_id TEXT,
        ingested_at TEXT DEFAULT (datetime('now'))
    );
    CREATE INDEX IF NOT EXISTS idx_token_usage_session ON token_usage(session_id);

    CREATE TABLE IF NOT EXISTS compactions (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        session_id TEXT NOT NULL,
        request_id TEXT,
        turn_index INTEGER,
        ts INTEGER,
        summary_text TEXT,
        summary_length INTEGER,
        trigger_text TEXT,
        ingested_at TEXT DEFAULT (datetime('now')),
        context_length_before INTEGER DEFAULT 0,
        num_rounds INTEGER DEFAULT 0,
        summary_model TEXT,
        duration_ms INTEGER DEFAULT 0
    );
    CREATE INDEX IF NOT EXISTS idx_compactions_session ON compactions(session_id);

    CREATE TABLE IF NOT EXISTS exchange_signals (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        session_id TEXT NOT NULL,
        exchange_index INTEGER,
        request_id TEXT,
        ts INTEGER,
        signal_type TEXT NOT NULL,
        signal_text TEXT,
        matched_keyword TEXT,
        user_message TEXT,
        ingested_at TEXT DEFAULT (datetime('now'))
    );
    CREATE INDEX IF NOT EXISTS idx_signals_session ON exchange_signals(session_id);
    CREATE INDEX IF NOT EXISTS idx_signals_type ON exchange_signals(signal_type);

    CREATE TABLE IF NOT EXISTS session_analysis (
        session_id TEXT PRIMARY KEY,
        total_corrections INTEGER DEFAULT 0,
        total_redirects INTEGER DEFAULT 0,
        total_affirmations INTEGER DEFAULT 0,
        total_tool_calls INTEGER DEFAULT 0,
        total_tokens_prompt INTEGER DEFAULT 0,
        total_tokens_completion INTEGER DEFAULT 0,
        total_tokens_cached INTEGER DEFAULT 0,
        compaction_count INTEGER DEFAULT 0,
        session_duration_min REAL,
        first_ts INTEGER,
        last_ts INTEGER,
        model_ids TEXT,
        clean_run INTEGER DEFAULT 0,
        analyzed_at TEXT DEFAULT (datetime('now')),
        total_frustrations INTEGER DEFAULT 0,
        total_pre_corrections INTEGER DEFAULT 0,
        heat_score INTEGER DEFAULT 0,
        peak_heat INTEGER DEFAULT 0,
        final_heat INTEGER DEFAULT 0,
        saved_session INTEGER DEFAULT 0,
        turning_point_ts INTEGER,
        turning_point_text TEXT
    );

    CREATE VIRTUAL TABLE IF NOT EXISTS fts_exchanges USING fts5(
        session_id UNINDEXED,
        workspace_id UNINDEXED,
        exchange_index UNINDEXED,
        user_ts UNINDEXED,
        user_message,
        reasoning_text,
        response_text,
        tool_calls,
        content=exchanges,
        content_rowid=id
    );

    CREATE TABLE IF NOT EXISTS symbols (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        workspace_id TEXT,
        file_path TEXT,
        symbol_name TEXT,
        symbol_type TEXT, -- function, class, method, variable, etc.
        line_number INTEGER,
        context TEXT, -- surrounding code context
        indexed_at INTEGER
    );
    CREATE INDEX IF NOT EXISTS idx_symbols_workspace ON symbols(workspace_id);
    CREATE INDEX IF NOT EXISTS idx_symbols_type ON symbols(symbol_type);
    CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(symbol_name);
    """)

    # Ensure tool_calls table exists
    conn.executescript("""
    CREATE TABLE IF NOT EXISTS tool_calls (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        session_id TEXT NOT NULL,
        exchange_index INTEGER,
        request_id TEXT,
        tool_call_id TEXT,
        tool_name TEXT NOT NULL,
        file_path TEXT,
        arguments_json TEXT,
        has_output INTEGER DEFAULT 0,
        ingested_at TEXT DEFAULT (datetime('now'))
    );
    CREATE INDEX IF NOT EXISTS idx_tool_calls_session ON tool_calls(session_id);
    CREATE INDEX IF NOT EXISTS idx_tool_calls_name ON tool_calls(tool_name);
    CREATE INDEX IF NOT EXISTS idx_tool_calls_file ON tool_calls(file_path);
    """)
    # Migration for DBs created before clean_run was in the CREATE TABLE;
    # the ALTER fails harmlessly when the column already exists.
    try:
        conn.execute("ALTER TABLE session_analysis ADD COLUMN clean_run INTEGER DEFAULT 0")
    except sqlite3.OperationalError:
        pass
    conn.commit()

    # Clear existing extracted data for a clean re-run
    conn.execute("DELETE FROM token_usage")
    conn.execute("DELETE FROM compactions")
    conn.execute("DELETE FROM exchange_signals")
    conn.execute("DELETE FROM session_analysis")
    conn.execute("DELETE FROM symbols")
    conn.commit()

    # Get all sessions that have a chat_session blob
    blobs = conn.execute(
        """SELECT v.session_id, v.content
           FROM vfs v
           WHERE v.source_type = 'chat_session'
           ORDER BY v.session_id"""
    ).fetchall()

    print(f"Processing {len(blobs)} chat sessions...")
    total_tok = total_sig = total_comp = 0
    errors = 0

    for i, (sid, content) in enumerate(blobs):
        try:
            # process_session returns (token_rows, signal_rows, compaction_rows)
            t, s, c = process_session(conn, sid, content)
            total_tok += t
            total_sig += s
            total_comp += c
            build_session_analysis(conn, sid)
            # Commit and report progress every 20 sessions.
            if i % 20 == 0:
                conn.commit()
                print(f" [{i+1}/{len(blobs)}] tokens={total_tok} signals={total_sig} compactions={total_comp}")
        except Exception as e:
            # Best-effort pass: one bad blob must not abort the run.
            # Only the first 5 errors are printed to keep output readable.
            errors += 1
            if errors <= 5:
                print(f" ERROR {sid[:16]}: {e}")

    conn.commit()

    # ── Populate tool_calls from exchanges ──────────────────────────────────
    print("\nBuilding tool_calls index from exchanges...")
    conn.execute("DELETE FROM tool_calls")
    tc_rows = []
    exch_rows = conn.execute(
        "SELECT session_id, exchange_index, request_id, tool_calls FROM exchanges WHERE tool_call_count > 0"
    ).fetchall()
    for sid, ex_idx, req_id, tc_json in exch_rows:
        try:
            tool_calls_list = json.loads(tc_json or '[]')
        except Exception:
            # Malformed JSON blob: skip this exchange entirely.
            continue
        for tc in tool_calls_list:
            if not isinstance(tc, dict):
                continue
            name = tc.get('name', '') or ''
            tc_id = tc.get('toolCallId', '') or ''
            args = tc.get('arguments', {}) or {}
            has_out = 1 if tc.get('output') else 0
            # Extract file path from common argument patterns
            file_path = ''
            if isinstance(args, dict):
                file_path = (
                    args.get('filePath') or args.get('file_path') or
                    args.get('path') or args.get('uri') or ''
                )
                if not file_path:
                    # For read_file / grep_search / replace_string_in_file
                    # fall back to any path-looking string argument.
                    for k in ('filePath', 'file_path', 'path', 'uri', 'includePattern', 'query'):
                        v = args.get(k)
                        if isinstance(v, str) and ('/' in v or '\\' in v):
                            file_path = v
                            break
            # Truncate path/args so a pathological tool call can't bloat the index.
            tc_rows.append((
                sid, ex_idx, req_id, tc_id, name,
                str(file_path)[:500] if file_path else '',
                json.dumps(args)[:1000], has_out,
            ))

    conn.executemany("""
        INSERT INTO tool_calls
        (session_id, exchange_index, request_id, tool_call_id,
         tool_name, file_path, arguments_json, has_output)
        VALUES (?,?,?,?,?,?,?,?)
    """, tc_rows)
    conn.commit()
    n_tc = conn.execute("SELECT COUNT(*) FROM tool_calls").fetchone()[0]
    print(f" tool_calls rows: {n_tc}")

    build_symbol_index(conn)
    conn.close()

    print("\nDone.")
    print(f" token_usage rows: {total_tok}")
    print(f" exchange_signals rows:{total_sig}")
    print(f" compaction rows: {total_comp}")
    print(f" errors: {errors}")
|
|
1061
|
+
|
|
1062
|
+
|
|
1063
|
+
if __name__ == "__main__":
    # Allow running the extraction pass directly as a script.
    run()
|