code-data-ark 2.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cda/__init__.py +3 -0
- cda/kernel/__init__.py +0 -0
- cda/kernel/control_db.py +151 -0
- cda/kernel/pmf_kernel.py +364 -0
- cda/kernel/selfcheck.py +299 -0
- cda/pipeline/__init__.py +0 -0
- cda/pipeline/embed.py +694 -0
- cda/pipeline/extract.py +1064 -0
- cda/pipeline/ingest.py +673 -0
- cda/pipeline/parse_edits.py +250 -0
- cda/pipeline/reconstruct.py +536 -0
- cda/pipeline/watcher.py +783 -0
- cda/ui/__init__.py +0 -0
- cda/ui/cli.py +2587 -0
- cda/ui/web.py +2848 -0
- code_data_ark-2.0.2.dist-info/METADATA +495 -0
- code_data_ark-2.0.2.dist-info/RECORD +20 -0
- code_data_ark-2.0.2.dist-info/WHEEL +4 -0
- code_data_ark-2.0.2.dist-info/entry_points.txt +2 -0
- code_data_ark-2.0.2.dist-info/licenses/license +21 -0
cda/pipeline/ingest.py
ADDED
@@ -0,0 +1,673 @@
#!/usr/bin/env python3
"""
cda/ingest.py

Extracts all VSCode/Copilot session data into a local SQLite database.

Storage locations ingested per workspace/session:
1. transcripts/*.jsonl — Copilot transcript event stream
2. chatSessions/*.jsonl — VS Code chat UI state (kind 0/1/2)
3. chatEditingSessions/*/state.json — file edit checkpoints
4. chatEditingSessions/*/contents/* — versioned file content blobs
5. chat-session-resources/*/*/content.txt — tool output payloads
6. debug-logs/*/models.json — model catalog at session start
7. debug-logs/*/main.jsonl — minimal debug events
8. state.vscdb ItemTable — VS Code workspace state (parsed, not blobbed)
9. memory-tool/ (workspace) — workspace-scoped memory files
10. globalStorage/.../memories/ — global memory files (once, not per-workspace)
"""

import os
import json
import sqlite3
import gzip
import hashlib
import time
import logging
from pathlib import Path

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-7s %(message)s",
    datefmt="%H:%M:%S",
)
log = logging.getLogger("ark-ingest")

HOME = Path.home()
# Allow override via env var for portability
VSCODE_DATA_DIR = Path(os.environ.get("VSCODE_DATA_DIR", HOME / "Library/Application Support/Code/User"))
VS_STORAGE = VSCODE_DATA_DIR / "workspaceStorage"
GLOBAL_MEM = VSCODE_DATA_DIR / "globalStorage/github.copilot-chat/memory-tool/memories"
ROOT_DIR = Path(__file__).resolve().parent.parent.parent.parent
LOCAL_DIR = ROOT_DIR / "local"
DB_PATH = LOCAL_DIR / "data" / "cda.db"

# Large index DBs — too big to blob, record path only
SKIP_BLOB_PATTERNS = ["workspace-chunks.db", "local-index"]

NOW_MS = int(time.time() * 1000)


# ─────────────────────────────────────────────
# HELPERS
# ─────────────────────────────────────────────

def sha256_short(data: bytes) -> str:
    return hashlib.sha256(data).hexdigest()[:16]


def compress(data: bytes) -> bytes:
    return gzip.compress(data, compresslevel=6)

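# Round-trip note (added commentary, an assumption rather than original code):
# blobs written via compress() are standard gzip streams, so a reader can
# recover the raw bytes with gzip.decompress(blob) and verify integrity by
# comparing sha256_short(raw) against the stored sha256 column.
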
def read_bytes(path):
    try:
        return Path(path).read_bytes()
    except Exception as e:
        log.warning(f"Failed to read bytes from {path}: {e}")
        return None


def read_json(path):
    try:
        return json.loads(Path(path).read_text())
    except Exception as e:
        log.warning(f"Failed to read JSON from {path}: {e}")
        return None


def log_ingest(conn, workspace_id, session_id, source_type, status, message=""):
    conn.execute(
        "INSERT INTO ingest_log(workspace_id, session_id, source_type, status, message, at) VALUES(?,?,?,?,?,?)",
        (workspace_id, session_id, source_type, status, message, NOW_MS)
    )


# ─────────────────────────────────────────────
# SCHEMA
# ─────────────────────────────────────────────

SCHEMA = """
CREATE TABLE IF NOT EXISTS workspaces (
    workspace_id TEXT PRIMARY KEY,
    uri TEXT,
    name TEXT,
    type TEXT,                  -- 'workspace' | 'folder' | 'unknown'
    session_count INTEGER DEFAULT 0,
    ingested_at INTEGER
);

CREATE TABLE IF NOT EXISTS sessions (
    session_id TEXT PRIMARY KEY,
    workspace_id TEXT,
    title TEXT,
    created_at INTEGER,
    last_message_at INTEGER,
    request_count INTEGER DEFAULT 0,
    response_state INTEGER,
    initial_location TEXT,
    ingested_at INTEGER,
    FOREIGN KEY (workspace_id) REFERENCES workspaces(workspace_id)
);

-- Which of the storage locations above exist for each session + sizes
CREATE TABLE IF NOT EXISTS session_storage (
    session_id TEXT PRIMARY KEY,
    workspace_id TEXT,
    has_transcript INTEGER DEFAULT 0,
    transcript_size INTEGER DEFAULT 0,
    has_chat_session INTEGER DEFAULT 0,
    chat_session_size INTEGER DEFAULT 0,
    has_edit_session INTEGER DEFAULT 0,
    edit_state_size INTEGER DEFAULT 0,
    edit_content_count INTEGER DEFAULT 0,
    has_tool_outputs INTEGER DEFAULT 0,
    tool_output_count INTEGER DEFAULT 0,
    has_debug_log INTEGER DEFAULT 0,
    debug_models_size INTEGER DEFAULT 0,
    in_state_vscdb INTEGER DEFAULT 0,
    has_workspace_memory INTEGER DEFAULT 0,
    workspace_memory_count INTEGER DEFAULT 0,
    semantic_index_path TEXT,
    fulltext_index_path TEXT
);

-- Blob VFS — raw file content, gzip-compressed
CREATE TABLE IF NOT EXISTS vfs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    workspace_id TEXT,
    session_id TEXT,
    source_type TEXT,           -- transcript | chat_session | edit_state | edit_content |
                                -- tool_output | debug_models | debug_main | memory_global | memory_workspace
    source_path TEXT,           -- original path on disk
    filename TEXT,              -- basename
    content_type TEXT,          -- jsonl | json | text | binary
    content BLOB,               -- gzip-compressed raw bytes
    size_bytes INTEGER,         -- original uncompressed size
    sha256 TEXT,
    ingested_at INTEGER
);
CREATE INDEX IF NOT EXISTS vfs_session ON vfs(session_id);
CREATE INDEX IF NOT EXISTS vfs_type ON vfs(source_type);
CREATE INDEX IF NOT EXISTS vfs_workspace ON vfs(workspace_id);

-- Parsed transcript events (from transcripts/*.jsonl)
CREATE TABLE IF NOT EXISTS transcript_events (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    session_id TEXT,
    workspace_id TEXT,
    event_type TEXT,
    request_id TEXT,
    turn_index INTEGER,
    ts INTEGER,
    data_json TEXT
);
CREATE INDEX IF NOT EXISTS te_session ON transcript_events(session_id);
CREATE INDEX IF NOT EXISTS te_type ON transcript_events(event_type);
CREATE INDEX IF NOT EXISTS te_request ON transcript_events(request_id);

-- Parsed chat messages (from chatSessions kind=1 user text + kind=2 request entries)
CREATE TABLE IF NOT EXISTS chat_messages (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    session_id TEXT,
    workspace_id TEXT,
    request_id TEXT,
    ts INTEGER,
    role TEXT,                  -- 'user' | 'assistant' | 'request_meta'
    content TEXT,
    agent_id TEXT,
    kind INTEGER                -- original chatSessions kind
);
CREATE INDEX IF NOT EXISTS cm_session ON chat_messages(session_id);
CREATE INDEX IF NOT EXISTS cm_request ON chat_messages(request_id);

-- state.vscdb ItemTable rows per workspace
CREATE TABLE IF NOT EXISTS state_items (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    workspace_id TEXT,
    key TEXT,
    value TEXT,
    UNIQUE(workspace_id, key)
);
CREATE INDEX IF NOT EXISTS si_workspace ON state_items(workspace_id);
CREATE INDEX IF NOT EXISTS si_key ON state_items(key);

-- Memory files (global + workspace-scoped)
CREATE TABLE IF NOT EXISTS memory_files (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    scope TEXT,                 -- 'global' | 'workspace' | 'session' | 'repo'
    workspace_id TEXT,
    session_id TEXT,
    filename TEXT,
    content TEXT,
    size_bytes INTEGER,
    ingested_at INTEGER
);

-- Ingest audit trail
CREATE TABLE IF NOT EXISTS ingest_log (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    workspace_id TEXT,
    session_id TEXT,
    source_type TEXT,
    status TEXT,                -- 'ok' | 'skip' | 'error'
    message TEXT,
    at INTEGER
);
"""
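
# Example downstream query (an illustrative sketch only; the tables and
# columns come from SCHEMA above, but this query is not in the original):
#   SELECT s.title, COUNT(m.id) AS msgs
#   FROM sessions s LEFT JOIN chat_messages m USING (session_id)
#   GROUP BY s.session_id
#   ORDER BY msgs DESC LIMIT 10;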


# ─────────────────────────────────────────────
# VFS INSERT
# ─────────────────────────────────────────────

def vfs_insert(conn, workspace_id, session_id, source_type, source_path, content_type, raw: bytes):
    compressed = compress(raw)
    h = sha256_short(raw)
    filename = Path(source_path).name
    conn.execute(
        """INSERT INTO vfs(workspace_id, session_id, source_type, source_path, filename,
                           content_type, content, size_bytes, sha256, ingested_at)
           VALUES(?,?,?,?,?,?,?,?,?,?)""",
        (workspace_id, session_id, source_type, str(source_path), filename,
         content_type, compressed, len(raw), h, NOW_MS)
    )
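
# Read-back sketch (assumed usage mirroring vfs_insert above; not part of the
# original module, and `sid` here is a hypothetical session id):
#   row = conn.execute(
#       "SELECT content FROM vfs WHERE session_id=? AND source_type=?",
#       (sid, "transcript"),
#   ).fetchone()
#   raw = gzip.decompress(row[0])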


# ─────────────────────────────────────────────
# INGEST: TRANSCRIPT
# ─────────────────────────────────────────────

def ingest_transcript(conn, workspace_id, session_id, path: Path):
    raw = read_bytes(path)
    if raw is None:
        return 0
    vfs_insert(conn, workspace_id, session_id, "transcript", path, "jsonl", raw)
    count = 0
    turn_index = 0
    for line in raw.decode("utf-8", errors="replace").splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            evt = json.loads(line)
        except Exception:
            continue
        event_type = evt.get("type", "unknown")
        request_id = evt.get("requestId") or evt.get("request_id")
        ts = evt.get("timestamp") or evt.get("ts")
        if event_type in ("assistant.turn_start", "user.message"):
            turn_index += 1
        conn.execute(
            """INSERT INTO transcript_events(session_id, workspace_id, event_type, request_id, turn_index, ts, data_json)
               VALUES(?,?,?,?,?,?,?)""",
            (session_id, workspace_id, event_type, request_id, turn_index, ts, line)
        )
        count += 1
    return count
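
# Shape of a transcript line this parser expects (inferred from the .get()
# calls above; the actual Copilot event schema may carry more fields):
#   {"type": "user.message", "requestId": "abc123",
#    "timestamp": 1700000000000, ...}
# Every event row keeps the raw line in data_json, so nothing is lost even
# when type/requestId/timestamp are absent.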


# ─────────────────────────────────────────────
# INGEST: CHAT SESSIONS
# ─────────────────────────────────────────────

def ingest_chat_session(conn, workspace_id, session_id, path: Path):
    raw = read_bytes(path)
    if raw is None:
        return 0
    vfs_insert(conn, workspace_id, session_id, "chat_session", path, "jsonl", raw)
    count = 0
    for line in raw.decode("utf-8", errors="replace").splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            obj = json.loads(line)
        except Exception:
            continue
        kind = obj.get("kind")
        v = obj.get("v")

        if kind == 1 and isinstance(v, str):
            # Raw user message text
            conn.execute(
                "INSERT INTO chat_messages(session_id, workspace_id, request_id, ts, role, content, kind) VALUES(?,?,?,?,?,?,?)",
                (session_id, workspace_id, None, None, "user", v, 1)
            )
            count += 1

        elif kind == 2 and isinstance(v, list):
            # Incremental request entries
            for req in v:
                if not isinstance(req, dict):
                    continue
                request_id = req.get("requestId")
                ts = req.get("timestamp")
                agent_id = None
                if isinstance(req.get("agent"), dict):
                    agent_id = req["agent"].get("id")
                conn.execute(
                    "INSERT INTO chat_messages(session_id, workspace_id, request_id, ts, role, content, agent_id, kind) VALUES(?,?,?,?,?,?,?,?)",
                    (session_id, workspace_id, request_id, ts, "request_meta",
                     json.dumps(req), agent_id, 2)
                )
                count += 1
    return count
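
# Line shapes this parser handles (assumed examples based on the kind checks
# above; kind 0 lines appear in chatSessions files but are skipped here):
#   {"kind": 1, "v": "refactor the ingest loop"}
#   {"kind": 2, "v": [{"requestId": "req-1", "timestamp": 1700000000000,
#                      "agent": {"id": "github.copilot"}, ...}]}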


# ─────────────────────────────────────────────
# INGEST: CHAT EDITING SESSIONS
# ─────────────────────────────────────────────

def ingest_edit_session(conn, workspace_id, session_id, session_dir: Path):
    state_path = session_dir / "state.json"
    raw = read_bytes(state_path)
    if raw:
        vfs_insert(conn, workspace_id, session_id, "edit_state", state_path, "json", raw)

    contents_dir = session_dir / "contents"
    content_count = 0
    if contents_dir.is_dir():
        for blob_file in contents_dir.iterdir():
            if blob_file.is_file():
                raw_blob = read_bytes(blob_file)
                if raw_blob:
                    vfs_insert(conn, workspace_id, session_id, "edit_content", blob_file, "binary", raw_blob)
                    content_count += 1

    return content_count


# ─────────────────────────────────────────────
# INGEST: TOOL OUTPUTS
# ─────────────────────────────────────────────

def ingest_tool_outputs(conn, workspace_id, session_id, session_dir: Path):
    count = 0
    if not session_dir.is_dir():
        return 0
    for tool_dir in session_dir.iterdir():
        if not tool_dir.is_dir():
            continue
        content_file = tool_dir / "content.txt"
        raw = read_bytes(content_file)
        if raw:
            vfs_insert(conn, workspace_id, session_id, "tool_output", content_file, "text", raw)
            count += 1
    return count


# ─────────────────────────────────────────────
# INGEST: DEBUG LOGS
# ─────────────────────────────────────────────

def ingest_debug_log(conn, workspace_id, session_id, session_dir: Path):
    models_size = 0
    for name, ctype in [("models.json", "json"), ("main.jsonl", "jsonl")]:
        path = session_dir / name
        raw = read_bytes(path)
        if raw:
            vfs_insert(conn, workspace_id, session_id,
                       "debug_models" if name == "models.json" else "debug_main",
                       path, ctype, raw)
            if name == "models.json":
                models_size = len(raw)
    return models_size


# ─────────────────────────────────────────────
# INGEST: STATE.VSCDB
# ─────────────────────────────────────────────

def ingest_state_vscdb(conn, workspace_id, ws_path: Path):
    db_path = ws_path / "state.vscdb"
    if not db_path.exists():
        return 0
    try:
        src = sqlite3.connect(str(db_path))
        src.row_factory = sqlite3.Row
        rows = src.execute("SELECT key, value FROM ItemTable").fetchall()
        src.close()
    except Exception as e:
        log_ingest(conn, workspace_id, None, "state_vscdb", "error", str(e))
        return 0
    count = 0
    for row in rows:
        try:
            conn.execute(
                "INSERT OR REPLACE INTO state_items(workspace_id, key, value) VALUES(?,?,?)",
                (workspace_id, row["key"], row["value"])
            )
            count += 1
        except Exception:
            pass
    return count
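
# ItemTable is VS Code's per-workspace key/value store; values are typically
# JSON strings. One key this ingester relies on later (see ingest_workspace)
# is 'chat.ChatSessionStore.index', which holds the session metadata entries.
# Hypothetical inspection snippet, not part of the original module:
#   conn.execute("SELECT key FROM state_items WHERE key LIKE 'chat.%'")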


# ─────────────────────────────────────────────
# INGEST: MEMORY FILES
# ─────────────────────────────────────────────

def ingest_memory_dir(conn, scope, workspace_id, session_id, mem_dir: Path):
    count = 0
    if not mem_dir.is_dir():
        return 0
    for f in mem_dir.rglob("*"):
        if not f.is_file():
            continue
        try:
            content = f.read_text(errors="replace")
            conn.execute(
                """INSERT INTO memory_files(scope, workspace_id, session_id, filename, content, size_bytes, ingested_at)
                   VALUES(?,?,?,?,?,?,?)""",
                (scope, workspace_id, session_id, f.name, content, f.stat().st_size, NOW_MS)
            )
            count += 1
        except Exception:
            pass
    return count


# ─────────────────────────────────────────────
# INGEST: ONE WORKSPACE
# ─────────────────────────────────────────────

def ingest_workspace(conn, ws_id: str):
    ws_path = VS_STORAGE / ws_id

    # Resolve workspace URI
    ws_json_path = ws_path / "workspace.json"
    ws_data = read_json(ws_json_path) or {}
    uri = ws_data.get("workspace") or ws_data.get("folder") or ws_data.get("folderUri") or "unknown"
    ws_type = "workspace" if "workspace" in ws_data else ("folder" if "folder" in ws_data else "unknown")

    # Derive name
    name = Path(str(uri).rstrip("/").replace("file://", "")).name or ws_id[:12]

    # Collect session IDs from transcripts dir (most reliable)
    sessions_found: dict[str, dict] = {}
    copilot_dir = ws_path / "GitHub.copilot-chat"

    transcripts_dir = copilot_dir / "transcripts"
    if transcripts_dir.is_dir():
        for f in transcripts_dir.glob("*.jsonl"):
            sid = f.stem
            sessions_found.setdefault(sid, {})["transcript"] = f

    chat_sessions_dir = ws_path / "chatSessions"
    if chat_sessions_dir.is_dir():
        for f in chat_sessions_dir.glob("*.jsonl"):
            sid = f.stem
            sessions_found.setdefault(sid, {})["chat_session"] = f

    # Get session metadata from state.vscdb
    session_meta: dict[str, dict] = {}
    state_db_path = ws_path / "state.vscdb"
    if state_db_path.exists():
        try:
            src = sqlite3.connect(str(state_db_path))
            row = src.execute("SELECT value FROM ItemTable WHERE key='chat.ChatSessionStore.index'").fetchone()
            if row:
                idx = json.loads(row[0])
                for sid, entry in (idx.get("entries") or {}).items():
                    session_meta[sid] = entry
            src.close()
        except Exception:
            pass

    # Merge sessions from all sources
    for sid, entry in session_meta.items():
        sessions_found.setdefault(sid, {})["meta"] = entry

    edit_sessions_dir = ws_path / "chatEditingSessions"
    if edit_sessions_dir.is_dir():
        for d in edit_sessions_dir.iterdir():
            if d.is_dir():
                sessions_found.setdefault(d.name, {})["edit_session_dir"] = d

    tool_resources_dir = copilot_dir / "chat-session-resources"
    if tool_resources_dir.is_dir():
        for d in tool_resources_dir.iterdir():
            if d.is_dir():
                sessions_found.setdefault(d.name, {})["tool_outputs_dir"] = d

    debug_logs_dir = copilot_dir / "debug-logs"
    if debug_logs_dir.is_dir():
        for d in debug_logs_dir.iterdir():
            if d.is_dir():
                sessions_found.setdefault(d.name, {})["debug_log_dir"] = d

    # Register workspace
    conn.execute(
        "INSERT OR REPLACE INTO workspaces(workspace_id, uri, name, type, session_count, ingested_at) VALUES(?,?,?,?,?,?)",
        (ws_id, str(uri), name, ws_type, len(sessions_found), NOW_MS)
    )

    # Ingest state.vscdb
    ingest_state_vscdb(conn, ws_id, ws_path)

    # Ingest workspace memory files
    ws_mem_dir = copilot_dir / "memory-tool" / "memories"
    ingest_memory_dir(conn, "workspace", ws_id, None, ws_mem_dir)

    # Process each session
    for sid, sources in sessions_found.items():
        meta = sources.get("meta", {})
        title = meta.get("title") or meta.get("customTitle") or "untitled"
        created_at = (meta.get("timing") or {}).get("created") or meta.get("creationDate")
        last_msg = (meta.get("timing") or {}).get("lastRequestStarted") or meta.get("lastMessageDate")
        response_state = meta.get("lastResponseState")
        initial_location = meta.get("initialLocation")

        conn.execute(
            """INSERT OR REPLACE INTO sessions(session_id, workspace_id, title, created_at,
                                               last_message_at, response_state, initial_location, ingested_at)
               VALUES(?,?,?,?,?,?,?,?)""",
            (sid, ws_id, title, created_at, last_msg, response_state, initial_location, NOW_MS)
        )

        storage_row = {
            "session_id": sid, "workspace_id": ws_id,
            "has_transcript": 0, "transcript_size": 0,
            "has_chat_session": 0, "chat_session_size": 0,
            "has_edit_session": 0, "edit_state_size": 0, "edit_content_count": 0,
            "has_tool_outputs": 0, "tool_output_count": 0,
            "has_debug_log": 0, "debug_models_size": 0,
            "in_state_vscdb": 1 if sid in session_meta else 0,
            "has_workspace_memory": 0, "workspace_memory_count": 0,
        }

        # 1. Transcript
        if "transcript" in sources:
            p = sources["transcript"]
            evt_count = ingest_transcript(conn, ws_id, sid, p)
            storage_row.update(has_transcript=1, transcript_size=p.stat().st_size)
            conn.execute("UPDATE sessions SET request_count=? WHERE session_id=?",
                         (evt_count, sid))

        # 2. Chat session
        if "chat_session" in sources:
            p = sources["chat_session"]
            ingest_chat_session(conn, ws_id, sid, p)
            storage_row.update(has_chat_session=1, chat_session_size=p.stat().st_size)

        # 3. Edit session
        if "edit_session_dir" in sources:
            d = sources["edit_session_dir"]
            content_count = ingest_edit_session(conn, ws_id, sid, d)
            state_size = 0
            sp = d / "state.json"
            if sp.exists():
                state_size = sp.stat().st_size
            storage_row.update(has_edit_session=1, edit_state_size=state_size,
                               edit_content_count=content_count)

        # 4. Tool outputs
        if "tool_outputs_dir" in sources:
            d = sources["tool_outputs_dir"]
            count = ingest_tool_outputs(conn, ws_id, sid, d)
            storage_row.update(has_tool_outputs=1 if count > 0 else 0, tool_output_count=count)

        # 5. Debug logs
        if "debug_log_dir" in sources:
            d = sources["debug_log_dir"]
            models_size = ingest_debug_log(conn, ws_id, sid, d)
            storage_row.update(has_debug_log=1, debug_models_size=models_size)

        # 6. Large index DBs — path only
        semantic = ws_path / "GitHub.copilot-chat" / "workspace-chunks.db"
        fulltext_candidates = list(ws_path.glob("local-index*"))
        storage_row["semantic_index_path"] = str(semantic) if semantic.exists() else None
        storage_row["fulltext_index_path"] = str(fulltext_candidates[0]) if fulltext_candidates else None

        conn.execute(
            """INSERT OR REPLACE INTO session_storage(
                   session_id, workspace_id,
                   has_transcript, transcript_size,
                   has_chat_session, chat_session_size,
                   has_edit_session, edit_state_size, edit_content_count,
                   has_tool_outputs, tool_output_count,
                   has_debug_log, debug_models_size,
                   in_state_vscdb,
                   has_workspace_memory, workspace_memory_count,
                   semantic_index_path, fulltext_index_path
               ) VALUES(
                   :session_id, :workspace_id,
                   :has_transcript, :transcript_size,
                   :has_chat_session, :chat_session_size,
                   :has_edit_session, :edit_state_size, :edit_content_count,
                   :has_tool_outputs, :tool_output_count,
                   :has_debug_log, :debug_models_size,
                   :in_state_vscdb,
                   :has_workspace_memory, :workspace_memory_count,
                   :semantic_index_path, :fulltext_index_path
               )""",
            storage_row
        )

    return len(sessions_found)


# ─────────────────────────────────────────────
# MAIN
# ─────────────────────────────────────────────

def main():
    print(f"cda ingest → {DB_PATH}")

    if DB_PATH.exists():
        DB_PATH.unlink()
        print(" dropped existing DB")
    # Ensure local/data exists on a first run, or connect() below fails
    DB_PATH.parent.mkdir(parents=True, exist_ok=True)

    conn = sqlite3.connect(str(DB_PATH))
    conn.execute("PRAGMA journal_mode=WAL")
    conn.execute("PRAGMA synchronous=NORMAL")
    conn.execute("PRAGMA cache_size=-2000")
    conn.execute("PRAGMA mmap_size=268435456")
    conn.execute("PRAGMA temp_store=MEMORY")
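    # PRAGMA notes (added commentary, not in the original source): a negative
    # cache_size is in KiB, so -2000 is roughly a 2 MB page cache; mmap_size
    # 268435456 maps up to 256 MiB. WAL plus synchronous=NORMAL favors bulk
    # insert speed, a fair trade for a database rebuilt from scratch each run.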
    conn.executescript(SCHEMA)
    conn.commit()
    print(" schema initialized")

    # Global memory files
    global_mem_count = ingest_memory_dir(conn, "global", None, None, GLOBAL_MEM)
    print(f" global memory: {global_mem_count} files")
    conn.commit()

    # Walk all workspaces
    if not VS_STORAGE.is_dir():
        # Guard added for portability: without it, iterdir() raises on machines
        # where the macOS-default path does not exist
        log.error(f"workspaceStorage not found: {VS_STORAGE}")
        conn.close()
        return
    workspace_dirs = [d for d in VS_STORAGE.iterdir() if d.is_dir()]
    print(f" found {len(workspace_dirs)} workspace dirs")

    total_sessions = 0
    for i, ws_dir in enumerate(sorted(workspace_dirs), 1):
        ws_id = ws_dir.name
        try:
            n = ingest_workspace(conn, ws_id)
            total_sessions += n
            if n > 0:
                print(f" [{i:3}] {ws_id[:16]}... {n} session(s)")
        except Exception as e:
            print(f" [{i:3}] {ws_id[:16]}... ERROR: {e}")
            log_ingest(conn, ws_id, None, "workspace", "error", str(e))
        if i % 10 == 0:
            conn.commit()

    conn.commit()

    # Summary
    print()
    print("=== INGEST COMPLETE ===")
    for table in ["workspaces", "sessions", "session_storage", "vfs", "transcript_events",
                  "chat_messages", "state_items", "memory_files", "ingest_log"]:
        count = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]
        print(f" {table:<25} {count:>8} rows")

    db_size = DB_PATH.stat().st_size
    print(f"\n DB size: {db_size / 1024 / 1024:.1f} MB")
    print(f" workspaces: {len(workspace_dirs)}")
    print(f" sessions: {total_sessions}")
    conn.close()
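
# Invocation sketch (an assumption: the wheel's entry_points.txt suggests a
# console command also exists, but running the module directly works too):
#   python -m cda.pipeline.ingest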


if __name__ == "__main__":
    main()