code-data-ark 2.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,536 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ cda/reconstruct.py
4
+
5
+ Walks transcript_events for every session and builds fully-structured
6
+ request/response exchanges, joining tool outputs from the VFS.
7
+
8
+ Output table: exchanges
9
+ - One row per user→assistant cycle (request/response pair)
10
+ - exchange_json contains the full structured object
11
+
12
+ Schema added: exchanges
13
+ """
14
+
15
+ import sqlite3
16
+ import json
17
+ import gzip
18
+ import time
19
+ from typing import Optional
20
+ from pathlib import Path
21
+
22
# Repo layout: this file lives four directories below the repository root
# (…/cda/reconstruct.py), hence the four .parent hops.
ROOT_DIR = Path(__file__).resolve().parent.parent.parent.parent
LOCAL_DIR = ROOT_DIR / "local"
# SQLite database produced by the ingestion pipeline; read and written here.
DB_PATH = LOCAL_DIR / "data" / "cda.db"
# Single run timestamp (ms since epoch), captured at import time so every
# row written by this run shares the same ingested_at value.
NOW_MS = int(time.time() * 1000)

# DDL for the output table; executed idempotently via executescript() in main().
# The UNIQUE(session_id, exchange_index) constraint makes the INSERT OR IGNORE
# writes below safe to re-run.
EXCHANGES_SCHEMA = """
CREATE TABLE IF NOT EXISTS exchanges (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    session_id TEXT,
    workspace_id TEXT,
    exchange_index INTEGER,     -- 0-based turn index within session
    request_id TEXT,            -- event id of the user.message
    user_ts TEXT,               -- ISO timestamp of user message
    assistant_ts TEXT,          -- ISO timestamp of first assistant.turn_start
    user_message TEXT,          -- plain text of user prompt
    attachments TEXT,           -- JSON array of attachments
    reasoning_text TEXT,        -- concatenated reasoningText from all assistant.message events
    response_text TEXT,         -- concatenated content from all assistant.message events
    tool_calls TEXT,            -- JSON array of {toolCallId, name, arguments, output, success}
    tool_call_count INTEGER,
    has_tool_output INTEGER,    -- 1 if any tool call has a VFS payload
    session_meta TEXT,          -- from session.start event (versions, producer)
    ingested_at INTEGER,
    UNIQUE(session_id, exchange_index)
);
CREATE INDEX IF NOT EXISTS ex_session ON exchanges(session_id);
CREATE INDEX IF NOT EXISTS ex_workspace ON exchanges(workspace_id);
CREATE INDEX IF NOT EXISTS ex_ts ON exchanges(user_ts);
"""
51
+
52
+
53
def decompress_vfs(blob: bytes) -> bytes:
    """Best-effort gzip decompression of a VFS payload.

    Returns the inflated bytes, or *blob* unchanged when it is oversized
    (>100MB, guarded against decompression bombs) or not valid gzip data.
    """
    size_cap = 100 * 1024 * 1024  # refuse to inflate anything over 100MB
    if len(blob) > size_cap:
        print(f"Warning: Skipping decompression of large blob ({len(blob)} bytes)")
        return blob

    try:
        inflated = gzip.decompress(blob)
    except Exception:
        # Not gzip (or corrupt) — treat the payload as raw content.
        return blob
    return inflated
64
+
65
+
66
def build_tool_output_index(conn, session_id: str) -> dict:
    """Map toolCallId -> decoded output text for one session.

    Reads VFS rows of source_type 'tool_output' and recovers the toolCallId
    from the path segment directly below the session_id directory
    (.../chat-session-resources/<session_id>/<toolCallId>__vscode-.../content.txt).
    """
    cursor = conn.execute(
        "SELECT source_path, content FROM vfs WHERE session_id=? AND source_type='tool_output'",
        (session_id,)
    )
    outputs: dict = {}
    for source_path, blob in cursor.fetchall():
        segments = Path(source_path).parts
        try:
            # Locate the session_id component; the next path segment names
            # the tool-call directory '<toolCallId>__vscode-...'.
            for pos, segment in enumerate(segments):
                if segment == session_id and pos + 1 < len(segments):
                    call_id = segments[pos + 1].split('__vscode-')[0]
                    outputs[call_id] = decompress_vfs(blob).decode('utf-8', errors='replace')
                    break
        except Exception:
            # Best-effort: a malformed path or payload skips this row only.
            pass
    return outputs
92
+
93
+
94
def reconstruct_session(conn, session_id: str, workspace_id: str) -> int:
    """
    Reconstructs all exchanges for a session from transcript_events.

    An exchange is one user.message plus everything that follows it up to
    (but not including) the next user.message: assistant turns, assistant
    messages, and tool executions.  Tool outputs are joined in from the
    VFS via build_tool_output_index().  Rows are written with
    INSERT OR IGNORE, so re-running on an existing (session_id,
    exchange_index) pair is a no-op.

    Returns number of exchanges written.
    """
    # Load all events ordered by timestamp (rowid breaks ties for events
    # sharing the same ts).
    rows = conn.execute(
        """SELECT event_type, data_json, id, ts
           FROM transcript_events
           WHERE session_id=?
           ORDER BY ts ASC, rowid ASC""",
        (session_id,)
    ).fetchall()

    if not rows:
        return 0

    # Build event list, tolerating rows whose data_json does not parse.
    events = []
    for row in rows:
        try:
            d = json.loads(row[1])
        except Exception:
            d = {}  # unparseable payload — keep the event with empty data
        events.append({
            "type": row[0],
            "data": d.get("data", {}),
            "event_id": d.get("id"),
            "timestamp": d.get("timestamp"),
            # NOTE(review): row[2] is the 'id' column of the SELECT above,
            # not 'ts', so the "ts_ms" label looks wrong — the value is
            # never read below, so behavior is unaffected. Confirm intent.
            "ts_ms": row[2],
        })

    # Extract session.start metadata (versions, producer, ...)
    session_meta = {}
    for e in events:
        if e["type"] == "session.start":
            session_meta = e["data"]
            break

    # Build tool output index: toolCallId → output text
    tool_output_index = build_tool_output_index(conn, session_id)

    # Walk events and group into exchanges
    # An exchange = one user.message + everything until the next user.message
    exchanges = []
    current: Optional[dict] = None
    current_turn: Optional[dict] = None  # tracks the active assistant turn

    def flush_turn():
        # Append the active turn (if any) to the current exchange, then reset.
        nonlocal current_turn
        if current_turn and current:
            current["turns"].append(current_turn)
        current_turn = None

    def new_turn():
        # Close out any in-flight turn and open a fresh, empty one.
        nonlocal current_turn
        flush_turn()
        current_turn = {"messages": [], "tool_calls": []}

    for e in events:
        etype = e["type"]
        data = e["data"]

        if etype == "user.message":
            # Flush previous exchange
            if current is not None:
                flush_turn()
                exchanges.append(current)
            current = {
                "request_id": e["event_id"],
                "user_ts": e["timestamp"],
                "user_message": data.get("content", ""),
                "attachments": data.get("attachments", []),
                "assistant_ts": None,
                "turns": [],
            }
            current_turn = None

        elif etype == "assistant.turn_start":
            if current is None:
                # Turn before any user message — session-level assistant intro
                current = {
                    "request_id": None,
                    "user_ts": None,
                    "user_message": "",
                    "attachments": [],
                    "assistant_ts": e["timestamp"],
                    "turns": [],
                }
            # First turn_start of the exchange stamps assistant_ts.
            if current["assistant_ts"] is None:
                current["assistant_ts"] = e["timestamp"]
            new_turn()

        elif etype == "assistant.message":
            # Message without a preceding turn_start — synthesize a turn.
            if current_turn is None:
                new_turn()
            assert current_turn is not None
            current_turn["messages"].append({
                "message_id": data.get("messageId"),
                "content": data.get("content", ""),
                "reasoning": data.get("reasoningText", ""),
                "tool_requests": data.get("toolRequests", []),
                "timestamp": e["timestamp"],
            })

        elif etype == "tool.execution_start":
            if current_turn is None:
                new_turn()
            assert current_turn is not None
            tool_call_id = data.get("toolCallId", "")
            current_turn["tool_calls"].append({
                "toolCallId": tool_call_id,
                "name": data.get("toolName", ""),
                "arguments": data.get("arguments", {}),
                # Output joined from the VFS index; None when no payload exists.
                "output": tool_output_index.get(tool_call_id),
                "success": None,
                "timestamp": e["timestamp"],
            })

        elif etype == "tool.execution_complete":
            # Patch success onto the matching tool call in current turn
            tool_call_id = data.get("toolCallId", "")
            if current_turn:
                for tc in current_turn["tool_calls"]:
                    if tc["toolCallId"] == tool_call_id:
                        tc["success"] = data.get("success")
                        break

        elif etype == "assistant.turn_end":
            flush_turn()

    # Flush final exchange
    if current is not None:
        flush_turn()
        exchanges.append(current)

    # Write to DB
    written = 0
    for idx, ex in enumerate(exchanges):
        # Flatten turns into top-level fields
        reasoning_parts = []
        response_parts = []
        all_tool_calls = []

        for turn in ex.get("turns", []):
            for msg in turn.get("messages", []):
                if msg.get("reasoning"):
                    reasoning_parts.append(msg["reasoning"])
                if msg.get("content"):
                    response_parts.append(msg["content"])
            all_tool_calls.extend(turn.get("tool_calls", []))

        has_output = any(tc.get("output") is not None for tc in all_tool_calls)

        conn.execute(
            """INSERT OR IGNORE INTO exchanges(
                session_id, workspace_id, exchange_index, request_id,
                user_ts, assistant_ts,
                user_message, attachments,
                reasoning_text, response_text,
                tool_calls, tool_call_count, has_tool_output,
                session_meta, ingested_at
            ) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""",
            (
                session_id, workspace_id, idx,
                ex.get("request_id"),
                ex.get("user_ts"),
                ex.get("assistant_ts"),
                ex.get("user_message", ""),
                json.dumps(ex.get("attachments", [])),
                "\n\n".join(reasoning_parts),
                "\n\n".join(response_parts),
                json.dumps(all_tool_calls),
                len(all_tool_calls),
                1 if has_output else 0,
                json.dumps(session_meta),
                NOW_MS,
            )
        )
        written += 1

    return written
276
+
277
+
278
def _parse_chat_request(req, turn_index):
    """Parse a raw chat-session request dict into a normalized exchange record."""
    message = req.get('message', {})
    user_text = message.get('text', '') if isinstance(message, dict) else ''

    model = req.get('modelId', '')
    if not model:
        state = req.get('modelState')
        if isinstance(state, dict):
            model = state.get('modelId', '')

    # Response content — array of parts in kind=0 snapshot
    parts = []
    raw_response = req.get('response') or []
    if isinstance(raw_response, list):
        for piece in raw_response:
            if not isinstance(piece, dict):
                continue
            value = piece.get('value', '') or piece.get('content', '') or ''
            if isinstance(value, str) and value and 'conversation-summary' not in value.lower():
                parts.append(value)

    # Tool requests
    calls = []
    for entry in (req.get('toolRequests') or req.get('toolResults') or []):
        if isinstance(entry, dict):
            calls.append({
                'toolCallId': entry.get('toolCallId', ''),
                'name': entry.get('toolName', '') or entry.get('name', ''),
                'arguments': entry.get('arguments', {}),
                'success': entry.get('success'),
            })

    return {
        'request_id': req.get('requestId', ''),
        'ts': req.get('timestamp', 0),
        'turn_index': turn_index,
        'message_text': user_text,
        'model_id': model,
        'response_text': '\n\n'.join(parts),
        'tool_requests': calls,
    }
317
+
318
+
319
def reconstruct_from_chat_blob(conn, session_id, workspace_id, content):
    """
    Reconstruct exchanges from a chat_session VFS blob.
    Used for sessions that have no transcript_events (chat-only sessions).

    The blob is gzip-compressed JSON-lines, each line a patch object:
      kind=0 — initial snapshot carrying a full 'requests' array
      kind=2 — delta patch appending new requests
      kind=1 — result patch filling in response/model metadata by index

    Returns number of exchanges written.

    NOTE(review): unlike decompress_vfs(), gzip.decompress() here raises on
    non-gzip content; the caller catches the exception and logs an ERROR,
    so a plain-text blob skips the session rather than crashing the run.
    """
    raw = gzip.decompress(content).decode('utf-8', errors='replace')
    lines = [ln for ln in raw.splitlines() if ln.strip()]

    requests_map = {}  # request_id -> parsed dict (ordered by insertion)
    turn_index = 0

    for line in lines:
        try:
            obj = json.loads(line)
        except Exception:
            continue  # skip malformed JSON lines

        kind = obj.get('kind')

        # kind=0: initial snapshot
        if kind == 0:
            for req in (obj.get('v', {}).get('requests') or []):
                rid = req.get('requestId', '')
                if rid and rid not in requests_map:
                    requests_map[rid] = _parse_chat_request(req, turn_index)
                    turn_index += 1

        # kind=2: delta patches — new requests appended
        elif kind == 2:
            k = obj.get('k', [])
            v = obj.get('v')
            if k == ['requests'] and isinstance(v, list):
                for req in v:
                    rid = req.get('requestId', '')
                    if rid and rid not in requests_map:
                        requests_map[rid] = _parse_chat_request(req, turn_index)
                        turn_index += 1

        # kind=1: result patches — response content, model, usage
        elif kind == 1:
            k = obj.get('k', [])
            v = obj.get('v', {})
            # Path shape: ['requests', <index>, 'result', ...]
            if len(k) >= 3 and k[0] == 'requests' and k[2] == 'result' and isinstance(v, dict):
                idx = k[1]
                # NOTE(review): k[1] indexes the producer's raw requests
                # array, while requests_map drops entries lacking a
                # requestId — positions can drift apart if such entries
                # exist. Confirm every raw request carries a requestId.
                items = list(requests_map.values())
                if 0 <= idx < len(items):
                    req = items[idx]
                    meta = (v.get('metadata') or {})
                    # Fill model if missing
                    if not req['model_id']:
                        resolved = meta.get('resolvedModel', '')
                        if resolved:
                            req['model_id'] = resolved if isinstance(resolved, str) else str(resolved)
                    # Backfill response text from result if not already present
                    if not req['response_text']:
                        # result.value or result.output
                        for key in ('value', 'output', 'content', 'text'):
                            rv = v.get(key)
                            if isinstance(rv, str) and rv:
                                req['response_text'] = rv[:5000]  # cap stored size
                                break

    if not requests_map:
        return 0

    tool_output_index = build_tool_output_index(conn, session_id)

    written = 0
    for idx, req in enumerate(requests_map.values()):
        # Skip requests with no user message text (system/empty entries)
        if not req['message_text']:
            continue

        tool_calls = []
        for tr in req.get('tool_requests', []):
            tc_id = tr.get('toolCallId', '')
            tool_calls.append({
                'toolCallId': tc_id,
                'name': tr.get('name', ''),
                'arguments': tr.get('arguments', {}),
                # Joined from the VFS index; None when no payload exists.
                'output': tool_output_index.get(tc_id),
                'success': tr.get('success'),
            })

        has_output = any(tc.get('output') is not None for tc in tool_calls)

        # 'written' (not idx) is the exchange_index so skipped entries leave
        # no gaps in the stored sequence.
        conn.execute(
            """INSERT OR IGNORE INTO exchanges(
                session_id, workspace_id, exchange_index, request_id,
                user_ts, assistant_ts,
                user_message, attachments,
                reasoning_text, response_text,
                tool_calls, tool_call_count, has_tool_output,
                session_meta, ingested_at
            ) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""",
            (
                session_id, workspace_id, written,
                req.get('request_id'),
                req.get('ts'),
                None,
                req.get('message_text', ''),
                json.dumps([]),
                '',
                req.get('response_text', ''),
                json.dumps(tool_calls),
                len(tool_calls),
                1 if has_output else 0,
                json.dumps({}),
                NOW_MS,
            )
        )
        written += 1

    return written
434
+
435
+
436
def main():
    """Entry point: (re)build the exchanges table, then print diagnostics.

    Steps:
      1. Apply the exchanges schema (idempotent) and wipe existing rows so
         the script can be re-run from scratch.
      2. Reconstruct exchanges for every session that has transcript_events.
      3. Reconstruct chat-only sessions from their chat_session VFS blobs.
      4. Print summary counts plus a spot-check of one known session.
    """
    conn = sqlite3.connect(str(DB_PATH))
    # Pragmas tune for bulk writes: WAL journal, relaxed sync, mmap I/O,
    # in-memory temp storage.
    conn.execute("PRAGMA journal_mode=WAL")
    conn.execute("PRAGMA synchronous=NORMAL")
    conn.execute("PRAGMA cache_size=-2000")
    conn.execute("PRAGMA mmap_size=268435456")
    conn.execute("PRAGMA temp_store=MEMORY")

    # Add schema
    conn.executescript(EXCHANGES_SCHEMA)
    conn.commit()

    # Wipe existing exchanges to allow re-run
    conn.execute("DELETE FROM exchanges")
    conn.commit()

    # Get all sessions that have transcript events
    sessions = conn.execute(
        """SELECT DISTINCT te.session_id, s.workspace_id
           FROM transcript_events te
           LEFT JOIN sessions s USING(session_id)"""
    ).fetchall()

    print(f"Reconstructing {len(sessions)} sessions with transcript data...")

    total_exchanges = 0
    for session_id, workspace_id in sessions:
        # workspace_id may be NULL when the session row is missing (LEFT JOIN).
        n = reconstruct_session(conn, session_id, workspace_id or "unknown")
        total_exchanges += n
        print(f"  [transcript] {session_id[:16]} {n} exchanges")
        conn.commit()

    # ── Chat-only sessions: reconstruct from chat_session blob ──────────────
    # Sessions that have a chat_session blob but no transcript events
    chat_only = conn.execute(
        """SELECT v.session_id, s.workspace_id, v.content
           FROM vfs v
           LEFT JOIN sessions s ON s.session_id = v.session_id
           WHERE v.source_type = 'chat_session'
             AND v.session_id NOT IN (
                 SELECT DISTINCT session_id FROM transcript_events
             )
           ORDER BY v.session_id"""
    ).fetchall()

    print(f"\nReconstructing {len(chat_only)} chat-only sessions (no transcript)...")
    chat_total = 0
    for session_id, workspace_id, content in chat_only:
        try:
            n = reconstruct_from_chat_blob(conn, session_id, workspace_id or "unknown", content)
            chat_total += n
            if n > 0:
                print(f"  [chat-blob] {session_id[:16]} {n} exchanges")
            conn.commit()
        except Exception as e:
            # Best-effort: a bad blob (e.g. not gzip) skips just this session.
            print(f"  [chat-blob] {session_id[:16]} ERROR: {e}")

    total_exchanges += chat_total

    print()
    print("=== RECONSTRUCTION COMPLETE ===")
    print(f"  From transcripts: {total_exchanges - chat_total}")
    print(f"  From chat blobs: {chat_total}")
    print(f"  Total exchanges: {total_exchanges}")

    # Spot-check this session
    this_session = 'f274fb87-77f8-477a-993e-ed6e73d930ff'
    print()
    print(f"=== Spot-check: {this_session[:16]}... ===")
    rows = conn.execute(
        """SELECT exchange_index, user_ts, user_message, tool_call_count, has_tool_output,
                  LENGTH(reasoning_text) reasoning_len, LENGTH(response_text) response_len
           FROM exchanges WHERE session_id=? ORDER BY exchange_index""",
        (this_session,)
    ).fetchall()
    for r in rows:
        # One-line preview per exchange; newlines flattened for display.
        user_preview = (r[2] or "")[:60].replace('\n', ' ')
        print(f"  [{r[0]:>2}] {r[1] or '':>25} tools={r[3]} has_output={r[4]} "
              f"reasoning={r[5]}b response={r[6]}b user='{user_preview}'")

    # Show one full exchange as sample
    print()
    print("=== Sample exchange [1] full structure ===")
    row = conn.execute(
        "SELECT * FROM exchanges WHERE session_id=? AND exchange_index=1",
        (this_session,)
    ).fetchone()
    if row:
        # LIMIT 0 query yields column names without fetching data.
        cols = [d[0] for d in conn.execute("SELECT * FROM exchanges LIMIT 0").description]
        d = dict(zip(cols, row))
        # Truncate large fields for display
        for field in ['tool_calls', 'session_meta', 'reasoning_text', 'response_text']:
            if d.get(field):
                d[field] = d[field][:300] + ('...' if len(str(d[field])) > 300 else '')
        print(json.dumps(d, indent=2)[:2000])

    conn.close()


if __name__ == "__main__":
    main()