code-data-ark 2.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cda/pipeline/ingest.py ADDED
@@ -0,0 +1,673 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ cda/ingest.py
4
+
5
+ Extracts all VSCode/Copilot session data into a local SQLite database.
6
+
7
+ Storage locations ingested per workspace/session:
8
+ 1. transcripts/*.jsonl — Copilot transcript event stream
9
+ 2. chatSessions/*.jsonl — VS Code chat UI state (kind 0/1/2)
10
+ 3. chatEditingSessions/*/state.json — file edit checkpoints
11
+ 4. chatEditingSessions/*/contents/* — versioned file content blobs
12
+ 5. chat-session-resources/*/*/content.txt — tool output payloads
13
+ 6. debug-logs/*/models.json — model catalog at session start
14
+ 7. debug-logs/*/main.jsonl — minimal debug events
15
+ 8. state.vscdb ItemTable — VS Code workspace state (parsed, not blobbed)
16
+ 9. memory-tool/ (workspace) — workspace-scoped memory files
17
+ 10. globalStorage/.../memories/ — global memory files (once, not per-workspace)
18
+ """
19
+
20
+ import os
21
+ import json
22
+ import sqlite3
23
+ import gzip
24
+ import hashlib
25
+ import time
26
+ import logging
27
+ from pathlib import Path
28
+
29
# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-7s %(message)s",
    datefmt="%H:%M:%S",
)
log = logging.getLogger("ark-ingest")

HOME = Path.home()
# Allow override via env var for portability; the default is the macOS VS Code user dir.
VSCODE_DATA_DIR = Path(os.environ.get("VSCODE_DATA_DIR", HOME / "Library/Application Support/Code/User"))
VS_STORAGE = VSCODE_DATA_DIR / "workspaceStorage"
GLOBAL_MEM = VSCODE_DATA_DIR / "globalStorage/github.copilot-chat/memory-tool/memories"
# Repo root: four levels up from this file (cda/pipeline/ingest.py inside the package).
ROOT_DIR = Path(__file__).resolve().parent.parent.parent.parent
LOCAL_DIR = ROOT_DIR / "local"
DB_PATH = LOCAL_DIR / "data" / "cda.db"

# Large index DBs — too big to blob, record path only
SKIP_BLOB_PATTERNS = ["workspace-chunks.db", "local-index"]

# Single timestamp for the entire ingest run — every row written shares it.
NOW_MS = int(time.time() * 1000)
50
+
51
+
52
+ # ─────────────────────────────────────────────
53
+ # HELPERS
54
+ # ─────────────────────────────────────────────
55
+
56
def sha256_short(data: bytes) -> str:
    """Return the first 16 hex characters of the SHA-256 digest of *data*."""
    digest = hashlib.sha256(data)
    return digest.hexdigest()[:16]
58
+
59
+
60
def compress(data: bytes) -> bytes:
    """Gzip-compress *data*; level 6 balances speed against ratio."""
    compressed = gzip.compress(data, compresslevel=6)
    return compressed
62
+
63
+
64
def read_bytes(path):
    """Read a file's raw bytes; return None (after a warning) on any failure."""
    target = Path(path)
    try:
        return target.read_bytes()
    except Exception as e:
        log.warning(f"Failed to read bytes from {path}: {e}")
        return None
70
+
71
+
72
def read_json(path):
    """Load and parse JSON from *path*; return None (after a warning) on any failure."""
    try:
        raw = Path(path).read_text()
        return json.loads(raw)
    except Exception as e:
        log.warning(f"Failed to read JSON from {path}: {e}")
        return None
78
+
79
+
80
def log_ingest(conn, workspace_id, session_id, source_type, status, message=""):
    """Append one row to the ingest audit trail (status: 'ok' | 'skip' | 'error')."""
    row = (workspace_id, session_id, source_type, status, message, NOW_MS)
    conn.execute(
        "INSERT INTO ingest_log(workspace_id, session_id, source_type, status, message, at) VALUES(?,?,?,?,?,?)",
        row,
    )
85
+
86
+
87
+ # ─────────────────────────────────────────────
88
+ # SCHEMA
89
+ # ─────────────────────────────────────────────
90
+
91
# Idempotent DDL (CREATE ... IF NOT EXISTS throughout) — executed once per run
# via conn.executescript() in main(). The SQL text is runtime data; the inline
# SQL comments document each column's meaning.
SCHEMA = """
CREATE TABLE IF NOT EXISTS workspaces (
    workspace_id TEXT PRIMARY KEY,
    uri TEXT,
    name TEXT,
    type TEXT, -- 'workspace' | 'folder' | 'unknown'
    session_count INTEGER DEFAULT 0,
    ingested_at INTEGER
);

CREATE TABLE IF NOT EXISTS sessions (
    session_id TEXT PRIMARY KEY,
    workspace_id TEXT,
    title TEXT,
    created_at INTEGER,
    last_message_at INTEGER,
    request_count INTEGER DEFAULT 0,
    response_state INTEGER,
    initial_location TEXT,
    ingested_at INTEGER,
    FOREIGN KEY (workspace_id) REFERENCES workspaces(workspace_id)
);

-- Which of the 14 locations exist for each session + sizes
CREATE TABLE IF NOT EXISTS session_storage (
    session_id TEXT PRIMARY KEY,
    workspace_id TEXT,
    has_transcript INTEGER DEFAULT 0,
    transcript_size INTEGER DEFAULT 0,
    has_chat_session INTEGER DEFAULT 0,
    chat_session_size INTEGER DEFAULT 0,
    has_edit_session INTEGER DEFAULT 0,
    edit_state_size INTEGER DEFAULT 0,
    edit_content_count INTEGER DEFAULT 0,
    has_tool_outputs INTEGER DEFAULT 0,
    tool_output_count INTEGER DEFAULT 0,
    has_debug_log INTEGER DEFAULT 0,
    debug_models_size INTEGER DEFAULT 0,
    in_state_vscdb INTEGER DEFAULT 0,
    has_workspace_memory INTEGER DEFAULT 0,
    workspace_memory_count INTEGER DEFAULT 0,
    semantic_index_path TEXT,
    fulltext_index_path TEXT
);

-- Blob VFS — raw file content, gzip-compressed
CREATE TABLE IF NOT EXISTS vfs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    workspace_id TEXT,
    session_id TEXT,
    source_type TEXT, -- transcript | chat_session | edit_state | edit_content |
                      -- tool_output | debug_models | debug_main | memory_global | memory_workspace
    source_path TEXT, -- original path on disk
    filename TEXT, -- basename
    content_type TEXT, -- jsonl | json | text | binary
    content BLOB, -- gzip-compressed raw bytes
    size_bytes INTEGER, -- original uncompressed size
    sha256 TEXT,
    ingested_at INTEGER
);
CREATE INDEX IF NOT EXISTS vfs_session ON vfs(session_id);
CREATE INDEX IF NOT EXISTS vfs_type ON vfs(source_type);
CREATE INDEX IF NOT EXISTS vfs_workspace ON vfs(workspace_id);

-- Parsed transcript events (from transcripts/*.jsonl)
CREATE TABLE IF NOT EXISTS transcript_events (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    session_id TEXT,
    workspace_id TEXT,
    event_type TEXT,
    request_id TEXT,
    turn_index INTEGER,
    ts INTEGER,
    data_json TEXT
);
CREATE INDEX IF NOT EXISTS te_session ON transcript_events(session_id);
CREATE INDEX IF NOT EXISTS te_type ON transcript_events(event_type);
CREATE INDEX IF NOT EXISTS te_request ON transcript_events(request_id);

-- Parsed chat messages (from chatSessions kind=1 user text + kind=2 request entries)
CREATE TABLE IF NOT EXISTS chat_messages (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    session_id TEXT,
    workspace_id TEXT,
    request_id TEXT,
    ts INTEGER,
    role TEXT, -- 'user' | 'assistant' | 'request_meta'
    content TEXT,
    agent_id TEXT,
    kind INTEGER -- original chatSessions kind
);
CREATE INDEX IF NOT EXISTS cm_session ON chat_messages(session_id);
CREATE INDEX IF NOT EXISTS cm_request ON chat_messages(request_id);

-- state.vscdb ItemTable rows per workspace
CREATE TABLE IF NOT EXISTS state_items (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    workspace_id TEXT,
    key TEXT,
    value TEXT,
    UNIQUE(workspace_id, key)
);
CREATE INDEX IF NOT EXISTS si_workspace ON state_items(workspace_id);
CREATE INDEX IF NOT EXISTS si_key ON state_items(key);

-- Memory files (global + workspace-scoped)
CREATE TABLE IF NOT EXISTS memory_files (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    scope TEXT, -- 'global' | 'workspace' | 'session' | 'repo'
    workspace_id TEXT,
    session_id TEXT,
    filename TEXT,
    content TEXT,
    size_bytes INTEGER,
    ingested_at INTEGER
);

-- Ingest audit trail
CREATE TABLE IF NOT EXISTS ingest_log (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    workspace_id TEXT,
    session_id TEXT,
    source_type TEXT,
    status TEXT, -- 'ok' | 'skip' | 'error'
    message TEXT,
    at INTEGER
);
"""
219
+
220
+
221
+ # ─────────────────────────────────────────────
222
+ # VFS INSERT
223
+ # ─────────────────────────────────────────────
224
+
225
def vfs_insert(conn, workspace_id, session_id, source_type, source_path, content_type, raw: bytes):
    """Store one raw file in the blob VFS: gzip-compressed, content-hashed, sized."""
    record = (
        workspace_id,
        session_id,
        source_type,
        str(source_path),
        Path(source_path).name,
        content_type,
        compress(raw),
        len(raw),
        sha256_short(raw),
        NOW_MS,
    )
    conn.execute(
        """INSERT INTO vfs(workspace_id, session_id, source_type, source_path, filename,
           content_type, content, size_bytes, sha256, ingested_at)
           VALUES(?,?,?,?,?,?,?,?,?,?)""",
        record,
    )
236
+
237
+
238
+ # ─────────────────────────────────────────────
239
+ # INGEST: TRANSCRIPT
240
+ # ─────────────────────────────────────────────
241
+
242
def ingest_transcript(conn, workspace_id, session_id, path: Path):
    """Blob a Copilot transcript file and parse its JSONL events.

    Each parseable line becomes a transcript_events row; turn_index advances on
    'assistant.turn_start' and 'user.message' events. Returns the number of
    events inserted (0 if the file was unreadable).
    """
    raw = read_bytes(path)
    if raw is None:
        return 0
    vfs_insert(conn, workspace_id, session_id, "transcript", path, "jsonl", raw)

    inserted = 0
    turn = 0
    for entry in raw.decode("utf-8", errors="replace").splitlines():
        entry = entry.strip()
        if not entry:
            continue
        try:
            evt = json.loads(entry)
        except Exception:
            # Skip malformed lines; the raw blob above preserves them anyway.
            continue

        etype = evt.get("type", "unknown")
        rid = evt.get("requestId") or evt.get("request_id")
        ts = evt.get("timestamp") or evt.get("ts")
        if etype in ("assistant.turn_start", "user.message"):
            turn += 1

        conn.execute(
            """INSERT INTO transcript_events(session_id, workspace_id, event_type, request_id, turn_index, ts, data_json)
               VALUES(?,?,?,?,?,?,?)""",
            (session_id, workspace_id, etype, rid, turn, ts, entry),
        )
        inserted += 1
    return inserted
269
+
270
+
271
+ # ─────────────────────────────────────────────
272
+ # INGEST: CHAT SESSIONS
273
+ # ─────────────────────────────────────────────
274
+
275
def ingest_chat_session(conn, workspace_id, session_id, path: Path):
    """Blob a chatSessions JSONL file and extract user text and request metadata.

    kind=1 rows carry a raw user message string; kind=2 rows carry a list of
    incremental request entries (stored verbatim as JSON with role
    'request_meta'). Returns the number of chat_messages rows inserted.
    """
    raw = read_bytes(path)
    if raw is None:
        return 0
    vfs_insert(conn, workspace_id, session_id, "chat_session", path, "jsonl", raw)

    inserted = 0
    for entry in raw.decode("utf-8", errors="replace").splitlines():
        entry = entry.strip()
        if not entry:
            continue
        try:
            record = json.loads(entry)
        except Exception:
            continue

        kind = record.get("kind")
        payload = record.get("v")

        if kind == 1 and isinstance(payload, str):
            # Raw user message text
            conn.execute(
                "INSERT INTO chat_messages(session_id, workspace_id, request_id, ts, role, content, kind) VALUES(?,?,?,?,?,?,?)",
                (session_id, workspace_id, None, None, "user", payload, 1),
            )
            inserted += 1
        elif kind == 2 and isinstance(payload, list):
            # Incremental request entries
            for req in payload:
                if not isinstance(req, dict):
                    continue
                agent = req.get("agent")
                agent_id = agent.get("id") if isinstance(agent, dict) else None
                conn.execute(
                    "INSERT INTO chat_messages(session_id, workspace_id, request_id, ts, role, content, agent_id, kind) VALUES(?,?,?,?,?,?,?,?)",
                    (session_id, workspace_id, req.get("requestId"), req.get("timestamp"),
                     "request_meta", json.dumps(req), agent_id, 2),
                )
                inserted += 1
    return inserted
317
+
318
+
319
+ # ─────────────────────────────────────────────
320
+ # INGEST: CHAT EDITING SESSIONS
321
+ # ─────────────────────────────────────────────
322
+
323
def ingest_edit_session(conn, workspace_id, session_id, session_dir: Path):
    """Blob an edit session's state.json plus every versioned content blob.

    Returns the number of content blobs stored under contents/.
    """
    state_path = session_dir / "state.json"
    state_raw = read_bytes(state_path)
    if state_raw:
        vfs_insert(conn, workspace_id, session_id, "edit_state", state_path, "json", state_raw)

    stored = 0
    contents_dir = session_dir / "contents"
    if contents_dir.is_dir():
        for blob in (p for p in contents_dir.iterdir() if p.is_file()):
            data = read_bytes(blob)
            if data:
                vfs_insert(conn, workspace_id, session_id, "edit_content", blob, "binary", data)
                stored += 1
    return stored
340
+
341
+
342
+ # ─────────────────────────────────────────────
343
+ # INGEST: TOOL OUTPUTS
344
+ # ─────────────────────────────────────────────
345
+
346
def ingest_tool_outputs(conn, workspace_id, session_id, session_dir: Path):
    """Blob every per-tool content.txt payload under a chat-session-resources dir.

    Returns the number of payloads stored.
    """
    if not session_dir.is_dir():
        return 0
    stored = 0
    for tool_dir in (d for d in session_dir.iterdir() if d.is_dir()):
        target = tool_dir / "content.txt"
        payload = read_bytes(target)
        if payload:
            vfs_insert(conn, workspace_id, session_id, "tool_output", target, "text", payload)
            stored += 1
    return stored
359
+
360
+
361
+ # ─────────────────────────────────────────────
362
+ # INGEST: DEBUG LOGS
363
+ # ─────────────────────────────────────────────
364
+
365
def ingest_debug_log(conn, workspace_id, session_id, session_dir: Path):
    """Blob models.json and main.jsonl from a debug-logs session directory.

    Returns the uncompressed size of models.json (0 when absent).
    """
    models_size = 0
    targets = (
        ("models.json", "json", "debug_models"),
        ("main.jsonl", "jsonl", "debug_main"),
    )
    for fname, ctype, stype in targets:
        path = session_dir / fname
        data = read_bytes(path)
        if not data:
            continue
        vfs_insert(conn, workspace_id, session_id, stype, path, ctype, data)
        if fname == "models.json":
            models_size = len(data)
    return models_size
376
+
377
+
378
+ # ─────────────────────────────────────────────
379
+ # INGEST: STATE.VSCDB
380
+ # ─────────────────────────────────────────────
381
+
382
def ingest_state_vscdb(conn, workspace_id, ws_path: Path):
    """Copy every ItemTable row from a workspace's state.vscdb into state_items.

    Opens the source DB read-style, logs an 'error' audit row if it cannot be
    read, and upserts rows one at a time so a single bad row cannot abort the
    workspace. Returns the number of rows upserted (0 if DB absent/unreadable).
    """
    db_path = ws_path / "state.vscdb"
    if not db_path.exists():
        return 0
    src = None
    try:
        src = sqlite3.connect(str(db_path))
        src.row_factory = sqlite3.Row
        rows = src.execute("SELECT key, value FROM ItemTable").fetchall()
    except Exception as e:
        log_ingest(conn, workspace_id, None, "state_vscdb", "error", str(e))
        return 0
    finally:
        # BUGFIX: the source connection previously leaked when the SELECT raised
        # (src.close() was only on the success path).
        if src is not None:
            src.close()
    count = 0
    for row in rows:
        try:
            conn.execute(
                "INSERT OR REPLACE INTO state_items(workspace_id, key, value) VALUES(?,?,?)",
                (workspace_id, row["key"], row["value"])
            )
            count += 1
        except Exception:
            # Best-effort: a single malformed row must not abort the ingest.
            pass
    return count
405
+
406
+
407
+ # ─────────────────────────────────────────────
408
+ # INGEST: MEMORY FILES
409
+ # ─────────────────────────────────────────────
410
+
411
def ingest_memory_dir(conn, scope, workspace_id, session_id, mem_dir: Path):
    """Recursively ingest every readable file under *mem_dir* into memory_files.

    Returns the number of files stored; unreadable files are silently skipped.
    """
    if not mem_dir.is_dir():
        return 0
    stored = 0
    for path in mem_dir.rglob("*"):
        if not path.is_file():
            continue
        try:
            body = path.read_text(errors="replace")
            conn.execute(
                """INSERT INTO memory_files(scope, workspace_id, session_id, filename, content, size_bytes, ingested_at)
                   VALUES(?,?,?,?,?,?,?)""",
                (scope, workspace_id, session_id, path.name, body, path.stat().st_size, NOW_MS),
            )
            stored += 1
        except Exception:
            # Best-effort: skip files that vanish mid-walk or cannot be read.
            pass
    return stored
429
+
430
+
431
+ # ─────────────────────────────────────────────
432
+ # INGEST: ONE WORKSPACE
433
+ # ─────────────────────────────────────────────
434
+
435
def ingest_workspace(conn, ws_id: str):
    """Ingest a single workspaceStorage/<ws_id> directory.

    Discovers sessions from every on-disk location (transcripts, chatSessions,
    chatEditingSessions, chat-session-resources, debug-logs, and the
    state.vscdb session index), registers the workspace and each session, blobs
    all per-session files, and writes one session_storage presence/size row per
    session. Returns the number of distinct sessions found (0 is normal for
    workspaces that never used Copilot).
    """
    ws_path = VS_STORAGE / ws_id

    # Resolve workspace URI from workspace.json ('workspace' for .code-workspace
    # windows, 'folder'/'folderUri' for single-folder windows).
    ws_json_path = ws_path / "workspace.json"
    ws_data = read_json(ws_json_path) or {}
    uri = ws_data.get("workspace") or ws_data.get("folder") or ws_data.get("folderUri") or "unknown"
    ws_type = "workspace" if "workspace" in ws_data else ("folder" if "folder" in ws_data else "unknown")

    # Derive name: last path component of the URI, else a ws_id prefix.
    name = Path(str(uri).rstrip("/").replace("file://", "")).name or ws_id[:12]

    # Collect session IDs from transcripts dir (most reliable).
    # sessions_found maps session_id -> {source-kind: path-or-metadata}.
    sessions_found: dict[str, dict] = {}
    copilot_dir = ws_path / "GitHub.copilot-chat"

    transcripts_dir = copilot_dir / "transcripts"
    if transcripts_dir.is_dir():
        for f in transcripts_dir.glob("*.jsonl"):
            sid = f.stem  # filename stem is the session id
            sessions_found.setdefault(sid, {})["transcript"] = f

    chat_sessions_dir = ws_path / "chatSessions"
    if chat_sessions_dir.is_dir():
        for f in chat_sessions_dir.glob("*.jsonl"):
            sid = f.stem
            sessions_found.setdefault(sid, {})["chat_session"] = f

    # Get session metadata (titles, timings) from state.vscdb's session index.
    session_meta: dict[str, dict] = {}
    state_db_path = ws_path / "state.vscdb"
    if state_db_path.exists():
        try:
            src = sqlite3.connect(str(state_db_path))
            row = src.execute("SELECT value FROM ItemTable WHERE key='chat.ChatSessionStore.index'").fetchone()
            if row:
                idx = json.loads(row[0])
                for sid, entry in (idx.get("entries") or {}).items():
                    session_meta[sid] = entry
            src.close()
        except Exception:
            # Best-effort: a locked/corrupt state DB must not sink the workspace.
            pass

    # Merge sessions from all sources
    for sid, entry in session_meta.items():
        sessions_found.setdefault(sid, {})["meta"] = entry

    edit_sessions_dir = ws_path / "chatEditingSessions"
    if edit_sessions_dir.is_dir():
        for d in edit_sessions_dir.iterdir():
            if d.is_dir():
                sessions_found.setdefault(d.name, {})["edit_session_dir"] = d

    tool_resources_dir = copilot_dir / "chat-session-resources"
    if tool_resources_dir.is_dir():
        for d in tool_resources_dir.iterdir():
            if d.is_dir():
                sessions_found.setdefault(d.name, {})["tool_outputs_dir"] = d

    debug_logs_dir = copilot_dir / "debug-logs"
    if debug_logs_dir.is_dir():
        for d in debug_logs_dir.iterdir():
            if d.is_dir():
                sessions_found.setdefault(d.name, {})["debug_log_dir"] = d

    # Register workspace
    conn.execute(
        "INSERT OR REPLACE INTO workspaces(workspace_id, uri, name, type, session_count, ingested_at) VALUES(?,?,?,?,?,?)",
        (ws_id, str(uri), name, ws_type, len(sessions_found), NOW_MS)
    )

    # Ingest state.vscdb
    ingest_state_vscdb(conn, ws_id, ws_path)

    # Ingest workspace memory files
    ws_mem_dir = copilot_dir / "memory-tool" / "memories"
    ingest_memory_dir(conn, "workspace", ws_id, None, ws_mem_dir)

    # Process each session
    for sid, sources in sessions_found.items():
        meta = sources.get("meta", {})
        # Index entries nest timings under "timing"; older flat keys are the fallback.
        title = meta.get("title") or meta.get("customTitle") or "untitled"
        created_at = (meta.get("timing") or {}).get("created") or meta.get("creationDate")
        last_msg = (meta.get("timing") or {}).get("lastRequestStarted") or meta.get("lastMessageDate")
        response_state = meta.get("lastResponseState")
        initial_location = meta.get("initialLocation")

        conn.execute(
            """INSERT OR REPLACE INTO sessions(session_id, workspace_id, title, created_at,
               last_message_at, response_state, initial_location, ingested_at)
               VALUES(?,?,?,?,?,?,?,?)""",
            (sid, ws_id, title, created_at, last_msg, response_state, initial_location, NOW_MS)
        )

        # Presence/size bookkeeping, filled in per-location below and inserted
        # at the end via named-parameter SQL.
        storage_row = {
            "session_id": sid, "workspace_id": ws_id,
            "has_transcript": 0, "transcript_size": 0,
            "has_chat_session": 0, "chat_session_size": 0,
            "has_edit_session": 0, "edit_state_size": 0, "edit_content_count": 0,
            "has_tool_outputs": 0, "tool_output_count": 0,
            "has_debug_log": 0, "debug_models_size": 0,
            "in_state_vscdb": 1 if sid in session_meta else 0,
            "has_workspace_memory": 0, "workspace_memory_count": 0,
        }

        # 1. Transcript
        if "transcript" in sources:
            p = sources["transcript"]
            evt_count = ingest_transcript(conn, ws_id, sid, p)
            storage_row.update(has_transcript=1, transcript_size=p.stat().st_size)
            # NOTE(review): evt_count is the total transcript *event* count, not
            # the number of user requests — confirm this is intended for request_count.
            conn.execute("UPDATE sessions SET request_count=? WHERE session_id=?",
                         (evt_count, sid))

        # 2. Chat session
        if "chat_session" in sources:
            p = sources["chat_session"]
            ingest_chat_session(conn, ws_id, sid, p)
            storage_row.update(has_chat_session=1, chat_session_size=p.stat().st_size)

        # 3. Edit session
        if "edit_session_dir" in sources:
            d = sources["edit_session_dir"]
            content_count = ingest_edit_session(conn, ws_id, sid, d)
            state_size = 0
            sp = d / "state.json"
            if sp.exists():
                state_size = sp.stat().st_size
            storage_row.update(has_edit_session=1, edit_state_size=state_size,
                               edit_content_count=content_count)

        # 4. Tool outputs
        if "tool_outputs_dir" in sources:
            d = sources["tool_outputs_dir"]
            count = ingest_tool_outputs(conn, ws_id, sid, d)
            storage_row.update(has_tool_outputs=1 if count > 0 else 0, tool_output_count=count)

        # 5. Debug logs
        if "debug_log_dir" in sources:
            d = sources["debug_log_dir"]
            models_size = ingest_debug_log(conn, ws_id, sid, d)
            storage_row.update(has_debug_log=1, debug_models_size=models_size)

        # 6. Large index DBs — path only (cf. SKIP_BLOB_PATTERNS)
        semantic = ws_path / "GitHub.copilot-chat" / "workspace-chunks.db"
        fulltext_candidates = list(ws_path.glob("local-index*"))
        storage_row["semantic_index_path"] = str(semantic) if semantic.exists() else None
        storage_row["fulltext_index_path"] = str(fulltext_candidates[0]) if fulltext_candidates else None

        conn.execute(
            """INSERT OR REPLACE INTO session_storage(
                session_id, workspace_id,
                has_transcript, transcript_size,
                has_chat_session, chat_session_size,
                has_edit_session, edit_state_size, edit_content_count,
                has_tool_outputs, tool_output_count,
                has_debug_log, debug_models_size,
                in_state_vscdb,
                has_workspace_memory, workspace_memory_count,
                semantic_index_path, fulltext_index_path
            ) VALUES(
                :session_id, :workspace_id,
                :has_transcript, :transcript_size,
                :has_chat_session, :chat_session_size,
                :has_edit_session, :edit_state_size, :edit_content_count,
                :has_tool_outputs, :tool_output_count,
                :has_debug_log, :debug_models_size,
                :in_state_vscdb,
                :has_workspace_memory, :workspace_memory_count,
                :semantic_index_path, :fulltext_index_path
            )""",
            storage_row
        )

    return len(sessions_found)
609
+
610
+
611
+ # ─────────────────────────────────────────────
612
+ # MAIN
613
+ # ─────────────────────────────────────────────
614
+
615
def main():
    """Rebuild the cda SQLite database from scratch.

    Drops any existing DB (plus stale WAL sidecars), initializes the schema,
    ingests global memory files once, then walks every workspaceStorage
    directory, committing every 10 workspaces to bound transaction size.
    """
    print(f"cda ingest → {DB_PATH}")

    # BUGFIX: sqlite3.connect cannot create missing parent directories, so a
    # fresh checkout without local/data/ crashed before the first connect.
    DB_PATH.parent.mkdir(parents=True, exist_ok=True)

    if DB_PATH.exists():
        DB_PATH.unlink()
        print(" dropped existing DB")
    # BUGFIX: WAL mode leaves -wal/-shm sidecar files; stale ones from a prior
    # run would otherwise be associated with the freshly created DB.
    for suffix in ("-wal", "-shm"):
        sidecar = DB_PATH.with_name(DB_PATH.name + suffix)
        if sidecar.exists():
            sidecar.unlink()

    conn = sqlite3.connect(str(DB_PATH))
    conn.execute("PRAGMA journal_mode=WAL")
    conn.execute("PRAGMA synchronous=NORMAL")
    conn.execute("PRAGMA cache_size=-2000")
    conn.execute("PRAGMA mmap_size=268435456")
    conn.execute("PRAGMA temp_store=MEMORY")
    conn.executescript(SCHEMA)
    conn.commit()
    print(" schema initialized")

    # Global memory files (ingested once, not per-workspace)
    global_mem_count = ingest_memory_dir(conn, "global", None, None, GLOBAL_MEM)
    print(f" global memory: {global_mem_count} files")
    conn.commit()

    # Walk all workspaces.
    # BUGFIX: a missing workspaceStorage dir previously raised FileNotFoundError.
    if VS_STORAGE.is_dir():
        workspace_dirs = [d for d in VS_STORAGE.iterdir() if d.is_dir()]
    else:
        workspace_dirs = []
    print(f" found {len(workspace_dirs)} workspace dirs")

    total_sessions = 0
    for i, ws_dir in enumerate(sorted(workspace_dirs), 1):
        ws_id = ws_dir.name
        try:
            n = ingest_workspace(conn, ws_id)
            total_sessions += n
            if n > 0:
                print(f" [{i:3}] {ws_id[:16]}... {n} session(s)")
        except Exception as e:
            # One broken workspace must not abort the run; record it in the audit log.
            print(f" [{i:3}] {ws_id[:16]}... ERROR: {e}")
            log_ingest(conn, ws_id, None, "workspace", "error", str(e))
        if i % 10 == 0:
            conn.commit()  # batch commits to keep transactions bounded

    conn.commit()

    # Summary
    print()
    print("=== INGEST COMPLETE ===")
    for table in ["workspaces", "sessions", "session_storage", "vfs", "transcript_events",
                  "chat_messages", "state_items", "memory_files", "ingest_log"]:
        # Table names come from the hard-coded list above, so the f-string SQL is safe.
        count = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
        print(f" {table:<25} {count:>8} rows")

    db_size = DB_PATH.stat().st_size
    print(f"\n DB size: {db_size / 1024 / 1024:.1f} MB")
    print(f" workspaces: {len(workspace_dirs)}")
    print(f" sessions: {total_sessions}")
    conn.close()
670
+
671
+
672
# Script entry point — keeps the module importable without side effects.
if __name__ == "__main__":
    main()