code-data-ark 2.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cda/__init__.py +3 -0
- cda/kernel/__init__.py +0 -0
- cda/kernel/control_db.py +151 -0
- cda/kernel/pmf_kernel.py +364 -0
- cda/kernel/selfcheck.py +299 -0
- cda/pipeline/__init__.py +0 -0
- cda/pipeline/embed.py +694 -0
- cda/pipeline/extract.py +1064 -0
- cda/pipeline/ingest.py +673 -0
- cda/pipeline/parse_edits.py +250 -0
- cda/pipeline/reconstruct.py +536 -0
- cda/pipeline/watcher.py +783 -0
- cda/ui/__init__.py +0 -0
- cda/ui/cli.py +2587 -0
- cda/ui/web.py +2848 -0
- code_data_ark-2.0.2.dist-info/METADATA +495 -0
- code_data_ark-2.0.2.dist-info/RECORD +20 -0
- code_data_ark-2.0.2.dist-info/WHEEL +4 -0
- code_data_ark-2.0.2.dist-info/entry_points.txt +2 -0
- code_data_ark-2.0.2.dist-info/licenses/license +21 -0
cda/pipeline/watcher.py
ADDED
@@ -0,0 +1,783 @@
#!/usr/bin/env python3
"""
cda/watcher.py

Live sync daemon. Watches all VSCode storage locations and streams
updates into cda.db as they happen during a session.

What it watches:
- chatSessions/*.jsonl — append-only, new lines → chat_messages + fts
- transcripts/*.jsonl — append-only, new lines → transcript_events
- chat-session-resources/ — new tool output files → vfs
- chatEditingSessions/*/state.json — rewrites → vfs update
- memory-tool/memories/** — new/changed files → memory_files
- state.vscdb — mtime change → state_items refresh

After any transcript change for a session, re-reconstructs exchanges
and refreshes fts_exchanges for that session only.

Runs as a foreground daemon. Writes its PID to watcher.pid.
"""
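# Overall flow: watchfiles emits (change_type, path) events → parse_path()
# classifies the path → the matching handle_*() writes rows/blobs into
# cda.db → Debouncer batches per-session changes and, after a quiet
# period, rebuilds exchanges + FTS for that session only.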
import os
import sys
import json
import gzip
import hashlib
import sqlite3
import time
import threading
import signal
import logging
from pathlib import Path
from typing import Optional

try:
    from watchfiles import watch
except ImportError:
    print("ERROR: watchfiles not installed. Run: pip install watchfiles")
    sys.exit(1)

ROOT_DIR = Path(__file__).resolve().parent.parent.parent.parent
LOCAL_DIR = ROOT_DIR / "local"
DB_PATH = LOCAL_DIR / "data" / "cda.db"
PID_FILE = LOCAL_DIR / "run" / "watcher.pid"
QUEUE_DIR = LOCAL_DIR / "queue"
# Allow override via env var for portability
VSCODE_DATA_DIR = Path(os.environ.get("VSCODE_DATA_DIR", Path.home() / "Library/Application Support/Code/User"))
VS_ROOT = VSCODE_DATA_DIR / "workspaceStorage"
GLOBAL_MEM = VSCODE_DATA_DIR / "globalStorage/github.copilot-chat/memory-tool/memories"

log_file = LOCAL_DIR / "logs" / "watcher.log"
log_file.parent.mkdir(parents=True, exist_ok=True)  # basicConfig cannot create the directory itself
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-7s %(message)s",
    datefmt="%H:%M:%S",
    filename=str(log_file),
    filemode='a',
)
log = logging.getLogger("ark-watcher")


# ─────────────────────────────────────────────
# DB helpers
# ─────────────────────────────────────────────

def get_conn():
    conn = sqlite3.connect(str(DB_PATH), timeout=10)
    conn.execute("PRAGMA journal_mode=WAL")
    conn.execute("PRAGMA synchronous=NORMAL")
    conn.execute("PRAGMA cache_size=-2000")
    conn.execute("PRAGMA mmap_size=268435456")
    conn.execute("PRAGMA temp_store=MEMORY")
    conn.row_factory = sqlite3.Row
    return conn


def compress(data: bytes) -> bytes:
    return gzip.compress(data, compresslevel=6)


def sha256_short(data: bytes) -> str:
    return hashlib.sha256(data).hexdigest()[:16]


def now_ms() -> int:
    return int(time.time() * 1000)


# ─────────────────────────────────────────────
# Offset tracking — so we only parse new bytes
# ─────────────────────────────────────────────

OFFSETS_SCHEMA = """
CREATE TABLE IF NOT EXISTS file_offsets (
    path TEXT PRIMARY KEY,
    byte_offset INTEGER DEFAULT 0,
    updated_at INTEGER
);
"""


def get_offset(conn, path: str) -> int:
    row = conn.execute("SELECT byte_offset FROM file_offsets WHERE path=?", (path,)).fetchone()
    return row[0] if row else 0


def set_offset(conn, path: str, offset: int):
    conn.execute(
        "INSERT OR REPLACE INTO file_offsets(path, byte_offset, updated_at) VALUES(?,?,?)",
        (path, offset, now_ms())
    )
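
# Illustrative sketch (not called anywhere in this module): how the offset
# table composes with read_new_lines() (defined further down) into an
# incremental tail over an append-only JSONL file.
def _tail_once_example(conn, path: Path) -> list:
    """Return only the lines appended to `path` since the previous call."""
    key = str(path)
    offset = get_offset(conn, key)                 # 0 the first time we see the file
    lines, new_offset = read_new_lines(path, offset)
    set_offset(conn, key, new_offset)              # remember how far we read
    return lines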


# ─────────────────────────────────────────────
# Extract workspace_id + session_id from a path
# ─────────────────────────────────────────────

def parse_path(path: Path):
    """
    Returns (workspace_id, session_id, file_type) or None.
    file_type: 'transcript' | 'chat_session' | 'tool_output' | 'edit_state' |
               'edit_content' | 'memory_workspace' | 'memory_global' | 'state_vscdb'
    """
    try:
        rel = path.relative_to(VS_ROOT)
        parts = rel.parts
        ws_id = parts[0]

        # chatSessions/<session_id>.jsonl
        if len(parts) == 3 and parts[1] == "chatSessions" and path.suffix == ".jsonl":
            return ws_id, parts[2].replace(".jsonl", ""), "chat_session"

        # GitHub.copilot-chat/transcripts/<session_id>.jsonl
        if len(parts) == 4 and parts[1] == "GitHub.copilot-chat" and parts[2] == "transcripts" and path.suffix == ".jsonl":
            return ws_id, parts[3].replace(".jsonl", ""), "transcript"

        # GitHub.copilot-chat/chat-session-resources/<session_id>/<tool_dir>/content.txt
        if len(parts) == 6 and parts[1] == "GitHub.copilot-chat" and parts[2] == "chat-session-resources" and parts[5] == "content.txt":
            return ws_id, parts[3], "tool_output"

        # chatEditingSessions/<session_id>/state.json
        if len(parts) == 4 and parts[1] == "chatEditingSessions" and parts[3] == "state.json":
            return ws_id, parts[2], "edit_state"

        # chatEditingSessions/<session_id>/contents/<blob_file>
        if len(parts) == 5 and parts[1] == "chatEditingSessions" and parts[3] == "contents":
            return ws_id, parts[2], "edit_content"

        # GitHub.copilot-chat/memory-tool/memories/**
        if len(parts) >= 5 and parts[1] == "GitHub.copilot-chat" and parts[2] == "memory-tool" and parts[3] == "memories":
            return ws_id, None, "memory_workspace"

        # state.vscdb
        if len(parts) == 2 and parts[1] == "state.vscdb":
            return ws_id, None, "state_vscdb"

    except ValueError:
        pass

    # Global memory
    try:
        path.relative_to(GLOBAL_MEM)
        return None, None, "memory_global"
    except ValueError:
        pass

    return None
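
# For example (workspace and session IDs hypothetical):
#   <VS_ROOT>/ab12cd/chatSessions/sess-1.jsonl
#       → ("ab12cd", "sess-1", "chat_session")
#   <VS_ROOT>/ab12cd/GitHub.copilot-chat/transcripts/sess-1.jsonl
#       → ("ab12cd", "sess-1", "transcript")
#   <VS_ROOT>/ab12cd/state.vscdb
#       → ("ab12cd", None, "state_vscdb")
# Anything that matches no pattern returns None and is ignored by the
# main loop.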

# ─────────────────────────────────────────────
# Persistent Queue for Resilience
# ─────────────────────────────────────────────

def init_queue():
    """Initialize the queue directory."""
    QUEUE_DIR.mkdir(parents=True, exist_ok=True)


def queue_operation(op_type: str, data: dict):
    """Write an operation to the persistent queue before executing."""
    timestamp = now_ms()
    queue_file = QUEUE_DIR / f"{timestamp}_{op_type}.json"
    try:
        queue_file.write_text(json.dumps({
            "timestamp": timestamp,
            "type": op_type,
            "data": data,
            "status": "pending"
        }))
        log.debug(f"Queued operation: {op_type}")
    except Exception as e:
        log.error(f"Failed to queue operation {op_type}: {e}")


def dequeue_operation(queue_file: Path):
    """Mark a queued operation as completed."""
    try:
        data = json.loads(queue_file.read_text())
        data["status"] = "completed"
        queue_file.write_text(json.dumps(data))
        # Rename to .completed extension
        completed_file = queue_file.with_suffix(".completed")
        queue_file.rename(completed_file)
        log.debug(f"Dequeued operation: {queue_file.name}")
    except Exception as e:
        log.error(f"Failed to dequeue {queue_file}: {e}")


def replay_queue(conn):
    """Replay any pending operations from the queue on startup."""
    if not QUEUE_DIR.exists():
        return

    pending_files = list(QUEUE_DIR.glob("*.json"))
    if not pending_files:
        return

    log.info(f"Replaying {len(pending_files)} queued operations...")

    for queue_file in sorted(pending_files):
        try:
            data = json.loads(queue_file.read_text())
            if data.get("status") == "pending":
                op_type = data["type"]
                op_data = data["data"]

                if op_type == "vfs_insert":
                    _insert_vfs(conn, op_data["path"], op_data["ws_id"], op_data["session_id"],
                                op_data["source_type"], None, op_data["filename"])
                elif op_type == "transcript_event":
                    _insert_transcript_events(conn, op_data["ws_id"], op_data["session_id"],
                                              op_data["events"])
                elif op_type in ("chat_message", "exchange_rebuild"):
                    log.warning(f"Skipping unsupported queue op type on replay: {op_type}")

            dequeue_operation(queue_file)
        except Exception as e:
            log.error(f"Failed to replay {queue_file}: {e}")


def cleanup_old_queue_files():
    """Clean up completed queue files older than 7 days."""
    if not QUEUE_DIR.exists():
        return

    cutoff = now_ms() - (7 * 24 * 60 * 60 * 1000)  # 7 days ago

    for completed_file in QUEUE_DIR.glob("*.completed"):
        try:
            data = json.loads(completed_file.read_text())
            if data.get("timestamp", 0) < cutoff:
                completed_file.unlink()
        except Exception:
            completed_file.unlink()  # Remove corrupted files
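
# On disk a queue entry looks like (timestamp and paths hypothetical):
#   local/queue/1717171717000_vfs_insert.json
#   {"timestamp": 1717171717000, "type": "vfs_insert",
#    "data": {"path": "...", "ws_id": "...", "session_id": "...",
#             "source_type": "transcript", "filename": "..."},
#    "status": "pending"}
# dequeue_operation() flips status to "completed" and renames the file to
# *.completed, which cleanup_old_queue_files() prunes after 7 days.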

def _insert_vfs(conn, path: str, ws_id: str, session_id: str, source_type: str, content: Optional[bytes], filename: str):
    """Insert VFS blob - used by queue replay."""
    if content is None:
        try:
            content = Path(path).read_bytes()
        except Exception as e:
            raise RuntimeError(f"Failed to read queued VFS content from {path}: {e}") from e

    conn.execute(
        """INSERT INTO vfs(workspace_id, session_id, source_type, source_path, filename,
                           content_type, content, size_bytes, sha256, ingested_at)
           VALUES(?,?,?,?,?,?,?,?,?,?)""",
        (ws_id, session_id, source_type, path, filename,
         "jsonl", compress(content), len(content), sha256_short(content), now_ms())
    )


def _insert_transcript_events(conn, ws_id: str, session_id: str, events: list):
    """Insert transcript events - used by queue replay."""
    for event_data in events:
        conn.execute(
            """INSERT INTO transcript_events(session_id, workspace_id, event_type, request_id, turn_index, ts, data_json)
               VALUES(?,?,?,?,?,?,?)""",
            event_data
        )


# ─────────────────────────────────────────────
# Incremental JSONL parse
# ─────────────────────────────────────────────

def read_new_lines(path: Path, from_offset: int):
    """Returns (new_lines, new_offset)."""
    try:
        raw = path.read_bytes()
    except Exception:
        return [], from_offset
    # Note: offsets only grow. If a file is truncated and rewritten shorter,
    # raw[from_offset:] is empty, the stale offset is kept, and the rewritten
    # content is not re-ingested.
    new_bytes = raw[from_offset:]
    if not new_bytes:
        return [], from_offset
    text = new_bytes.decode("utf-8", errors="replace")
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    return lines, len(raw)
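
# The handlers below assume roughly these line shapes (only the fields the
# code actually reads are shown; everything else is stored verbatim):
#   transcript line:   {"type": "user.message", "requestId": "...",
#                       "timestamp": 1717171717000, ...}
#   chat-session line: {"kind": 1, "v": "<user text>"}  or
#                      {"kind": 2, "v": [{"requestId": ..., "timestamp": ...,
#                                         "agent": {"id": ...}, ...}, ...]}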

# ─────────────────────────────────────────────
# Handlers
# ─────────────────────────────────────────────

def handle_transcript(conn, ws_id, session_id, path: Path):
    path_str = str(path)
    offset = get_offset(conn, path_str)
    lines, new_offset = read_new_lines(path, offset)
    if not lines:
        return 0

    count = 0
    turn_index = conn.execute(
        "SELECT COALESCE(MAX(turn_index),0) FROM transcript_events WHERE session_id=?",
        (session_id,)
    ).fetchone()[0]

    events_data = []
    for line in lines:
        try:
            evt = json.loads(line)
        except Exception:
            continue
        event_type = evt.get("type", "unknown")
        request_id = evt.get("requestId") or evt.get("request_id")
        ts = evt.get("timestamp") or evt.get("ts")
        if event_type in ("assistant.turn_start", "user.message"):
            turn_index += 1
        events_data.append((session_id, ws_id, event_type, request_id, turn_index, ts, line))
        count += 1

    # Queue the transcript events operation
    queue_operation("transcript_event", {
        "ws_id": ws_id,
        "session_id": session_id,
        "events": events_data
    })

    # Execute the operations
    for event_data in events_data:
        conn.execute(
            """INSERT INTO transcript_events(session_id, workspace_id, event_type, request_id, turn_index, ts, data_json)
               VALUES(?,?,?,?,?,?,?)""",
            event_data
        )

    set_offset(conn, path_str, new_offset)

    # Queue VFS update
    queue_operation("vfs_insert", {
        "path": path_str,
        "ws_id": ws_id,
        "session_id": session_id,
        "source_type": "transcript",
        "filename": path.name
    })

    # Update VFS blob
    conn.execute(
        "DELETE FROM vfs WHERE session_id=? AND source_type='transcript'", (session_id,)
    )
    _insert_vfs(conn, path_str, ws_id, session_id, "transcript", path.read_bytes(), path.name)

    log.info(f"transcript +{count} events {session_id[:16]} (total offset {new_offset})")
    return count

def handle_chat_session(conn, ws_id, session_id, path: Path):
    path_str = str(path)
    offset = get_offset(conn, path_str)
    lines, new_offset = read_new_lines(path, offset)
    if not lines:
        return 0

    count = 0
    for line in lines:
        try:
            obj = json.loads(line)
        except Exception:
            continue
        kind = obj.get("kind")
        v = obj.get("v")
        if kind == 1 and isinstance(v, str):
            conn.execute(
                "INSERT INTO chat_messages(session_id, workspace_id, role, content, kind) VALUES(?,?,?,?,?)",
                (session_id, ws_id, "user", v, 1)
            )
            count += 1
        elif kind == 2 and isinstance(v, list):
            for req in v:
                if not isinstance(req, dict):
                    continue
                request_id = req.get("requestId")
                ts = req.get("timestamp")
                agent_id = (req.get("agent") or {}).get("id")
                conn.execute(
                    """INSERT INTO chat_messages(session_id, workspace_id, request_id, ts, role, content, agent_id, kind)
                       VALUES(?,?,?,?,?,?,?,?)""",
                    (session_id, ws_id, request_id, ts, "request_meta",
                     json.dumps(req), agent_id, 2)
                )
                count += 1

    # Update VFS blob
    raw = path.read_bytes()
    conn.execute("DELETE FROM vfs WHERE session_id=? AND source_type='chat_session'", (session_id,))
    conn.execute(
        """INSERT INTO vfs(workspace_id, session_id, source_type, source_path, filename,
                           content_type, content, size_bytes, sha256, ingested_at)
           VALUES(?,?,?,?,?,?,?,?,?,?)""",
        (ws_id, session_id, "chat_session", str(path), path.name,
         "jsonl", compress(raw), len(raw), sha256_short(raw), now_ms())
    )

    set_offset(conn, path_str, new_offset)
    log.info(f"chat_session +{count} msgs {session_id[:16]}")
    return count


def handle_tool_output(conn, ws_id, session_id, path: Path):
    try:
        raw = path.read_bytes()
    except Exception:
        return
    # Check if already in VFS by path
    exists = conn.execute(
        "SELECT id FROM vfs WHERE source_path=? AND source_type='tool_output'", (str(path),)
    ).fetchone()
    if exists:
        return
    conn.execute(
        """INSERT INTO vfs(workspace_id, session_id, source_type, source_path, filename,
                           content_type, content, size_bytes, sha256, ingested_at)
           VALUES(?,?,?,?,?,?,?,?,?,?)""",
        (ws_id, session_id, "tool_output", str(path), path.name,
         "text", compress(raw), len(raw), sha256_short(raw), now_ms())
    )
    log.info(f"tool_output +1 {session_id[:16]} ({len(raw)} bytes)")


def handle_edit_state(conn, ws_id, session_id, path: Path):
    try:
        raw = path.read_bytes()
    except Exception:
        return
    conn.execute(
        "DELETE FROM vfs WHERE session_id=? AND source_type='edit_state'", (session_id,)
    )
    conn.execute(
        """INSERT INTO vfs(workspace_id, session_id, source_type, source_path, filename,
                           content_type, content, size_bytes, sha256, ingested_at)
           VALUES(?,?,?,?,?,?,?,?,?,?)""",
        (ws_id, session_id, "edit_state", str(path), path.name,
         "json", compress(raw), len(raw), sha256_short(raw), now_ms())
    )
    log.info(f"edit_state updated {session_id[:16]}")


def handle_edit_content(conn, ws_id, session_id, path: Path):
    try:
        raw = path.read_bytes()
    except Exception:
        # File is gone — drop the stale blob
        conn.execute(
            "DELETE FROM vfs WHERE source_path=? AND source_type='edit_content'", (str(path),)
        )
        log.info(f"edit_content removed {session_id[:16]} {path.name}")
        return
    conn.execute(
        "DELETE FROM vfs WHERE source_path=? AND source_type='edit_content'", (str(path),)
    )
    conn.execute(
        """INSERT INTO vfs(workspace_id, session_id, source_type, source_path, filename,
                           content_type, content, size_bytes, sha256, ingested_at)
           VALUES(?,?,?,?,?,?,?,?,?,?)""",
        (ws_id, session_id, "edit_content", str(path), path.name,
         "binary", compress(raw), len(raw), sha256_short(raw), now_ms())
    )
    log.info(f"edit_content updated {session_id[:16]} {path.name}")


def handle_memory(conn, scope, ws_id, path: Path):
    try:
        content = path.read_text(errors="replace")
    except Exception:
        return
    conn.execute(
        """INSERT OR REPLACE INTO memory_files(scope, workspace_id, filename, content, size_bytes, ingested_at)
           VALUES(?,?,?,?,?,?)""",
        (scope, ws_id, path.name, content, path.stat().st_size, now_ms())
    )
    log.info(f"memory updated [{scope}] {path.name}")


def handle_state_vscdb(conn, ws_id, path: Path):
    try:
        src = sqlite3.connect(str(path), timeout=3)
        rows = src.execute("SELECT key, value FROM ItemTable").fetchall()
        src.close()
    except Exception as e:
        log.warning(f"state_vscdb read error {ws_id[:16]}: {e}")
        return
    for key, value in rows:
        conn.execute(
            "INSERT OR REPLACE INTO state_items(workspace_id, key, value) VALUES(?,?,?)",
            (ws_id, key, value)
        )
    log.info(f"state_vscdb refreshed {ws_id[:16]} ({len(rows)} rows)")

# ─────────────────────────────────────────────
# Exchange reconstruction (incremental)
# ─────────────────────────────────────────────

from cda.pipeline.reconstruct import EXCHANGES_SCHEMA, reconstruct_session as _reconstruct_session


def rebuild_exchanges(conn, session_id: str, ws_id: str):
    """Delete and rebuild exchanges + FTS for one session."""
    conn.executescript(EXCHANGES_SCHEMA)
    # fts_exchanges is an external-content (content=) FTS5 table with no
    # triggers, so it must be synced by hand. The FTS5 'delete' command needs
    # the original row values, so clear the index entries while the old
    # exchange rows still exist, then delete and rebuild the rows.
    with conn:
        conn.execute(
            "INSERT INTO fts_exchanges(fts_exchanges, rowid, session_id, workspace_id, exchange_index, user_ts, user_message, reasoning_text, response_text, tool_calls) SELECT 'delete', id, session_id, workspace_id, exchange_index, user_ts, user_message, reasoning_text, response_text, tool_calls FROM exchanges WHERE session_id=?",  # noqa: E501
            (session_id,)
        )
        conn.execute("DELETE FROM exchanges WHERE session_id=?", (session_id,))

    n = _reconstruct_session(conn, session_id, ws_id or "unknown")

    # Re-insert the rebuilt rows; FTS5 content= tables need explicit sync
    # after content table changes. Use a transaction for atomicity.
    with conn:
        conn.execute(
            "INSERT INTO fts_exchanges(rowid, session_id, workspace_id, exchange_index, user_ts, user_message, reasoning_text, response_text, tool_calls) SELECT id, session_id, workspace_id, exchange_index, user_ts, user_message, reasoning_text, response_text, tool_calls FROM exchanges WHERE session_id=?",  # noqa: E501
            (session_id,)
        )

    log.info(f"exchanges rebuilt {session_id[:16]} ({n} exchanges)")
    return n
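
# The sync above is the standard FTS5 external-content pattern; on a toy
# table it looks like this (table and column names illustrative only):
#
#   CREATE VIRTUAL TABLE fts_notes USING fts5(body, content='notes', content_rowid='id');
#   -- 'delete' must be fed the values currently in the index, so run it
#   -- while the old content rows still exist:
#   INSERT INTO fts_notes(fts_notes, rowid, body)
#       SELECT 'delete', id, body FROM notes WHERE id = 42;
#   -- after changing the content table, re-index the row:
#   INSERT INTO fts_notes(rowid, body) SELECT id, body FROM notes WHERE id = 42;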

# ─────────────────────────────────────────────
# Debounce: batch changes per session
# ─────────────────────────────────────────────

class Debouncer:
    """Collect dirty sessions and flush after DELAY seconds of quiet."""
    DELAY = 2.0

    def __init__(self, flush_fn):
        self._dirty: dict[str, tuple] = {}  # session_id → (ws_id, deadline)
        self._lock = threading.Lock()
        self._flush_fn = flush_fn
        self._thread = threading.Thread(target=self._loop, daemon=True)
        self._thread.start()

    def mark(self, session_id: str, ws_id: str):
        with self._lock:
            self._dirty[session_id] = (ws_id, time.time() + self.DELAY)

    def _loop(self):
        while True:
            time.sleep(0.5)
            now = time.time()
            with self._lock:
                ready = [(sid, ws) for sid, (ws, deadline) in self._dirty.items() if now >= deadline]
                for sid, _ in ready:
                    del self._dirty[sid]
            for sid, ws in ready:
                try:
                    self._flush_fn(sid, ws)
                except Exception as e:
                    log.error(f"flush error {sid[:16]}: {e}")
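
# Usage sketch (IDs hypothetical) — the main loop below effectively does:
#
#   debouncer = Debouncer(flush_exchanges)
#   debouncer.mark("sess-1", "ws-a")   # called on every change event
#   debouncer.mark("sess-1", "ws-a")   # repeated marks just push the deadline
#   # ~2s after the last mark, flush_exchanges("sess-1", "ws-a") runs once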

# ─────────────────────────────────────────────
# Main loop
# ─────────────────────────────────────────────

def main():
    print("STARTING WATCHER", os.environ.get('PYTHONPATH'), file=sys.stderr)
    # Initialize persistent queue
    init_queue()
    cleanup_old_queue_files()

    PID_FILE.parent.mkdir(parents=True, exist_ok=True)
    PID_FILE.write_text(str(os.getpid()))
    log.info(f"cda watcher started pid={os.getpid()}")
    log.info(f"DB: {DB_PATH}")
    log.info(f"Queue: {QUEUE_DIR}")
    log.info(f"Watching: {VS_ROOT}")

    conn = get_conn()
    conn.executescript(OFFSETS_SCHEMA)
    conn.executescript(EXCHANGES_SCHEMA)
    conn.commit()

    # Ensure watcher-required schema exists before replaying operations.
    try:
        import importlib
        extract = importlib.import_module('cda.pipeline.extract')
        importlib.reload(extract)
        extract.ensure_schema(conn)
    except Exception as ex:
        log.warning(f"Failed to ensure extract schema: {ex}")

    # Replay any pending operations from queue
    replay_queue(conn)

    # Initialize offsets for all existing JSONL files so we don't re-ingest
    log.info("Initializing offsets for existing files...")
    for ws_dir in VS_ROOT.iterdir():
        if not ws_dir.is_dir():
            continue
        ws_id = ws_dir.name

        # chatSessions
        cs_dir = ws_dir / "chatSessions"
        if cs_dir.is_dir():
            for f in cs_dir.glob("*.jsonl"):
                if get_offset(conn, str(f)) == 0:
                    try:
                        set_offset(conn, str(f), f.stat().st_size)
                    except Exception:
                        pass

        # transcripts
        tr_dir = ws_dir / "GitHub.copilot-chat" / "transcripts"
        if tr_dir.is_dir():
            for f in tr_dir.glob("*.jsonl"):
                if get_offset(conn, str(f)) == 0:
                    try:
                        set_offset(conn, str(f), f.stat().st_size)
                    except Exception:
                        pass

    conn.commit()
    log.info("Offsets initialized — watching for new data only")

    # Debouncer: when transcript changes, rebuild exchanges after quiet period
    def flush_exchanges(session_id, ws_id):
        c = get_conn()
        try:
            rebuild_exchanges(c, session_id, ws_id)
            c.commit()
        finally:
            c.close()
        # Incremental extraction: run behavioral signals + session analysis
        try:
            import importlib
            extract = importlib.import_module('cda.pipeline.extract')
            importlib.reload(extract)
            c2 = get_conn()
            try:
                blob_row = c2.execute(
                    "SELECT content FROM vfs WHERE session_id=? AND source_type='chat_session'",
                    (session_id,)
                ).fetchone()
                if blob_row:
                    c2.execute("DELETE FROM token_usage WHERE session_id=?", (session_id,))
                    c2.execute("DELETE FROM compactions WHERE session_id=?", (session_id,))
                    c2.execute("DELETE FROM exchange_signals WHERE session_id=?", (session_id,))
                    extract.process_session(c2, session_id, blob_row[0])
                    extract.build_session_analysis(c2, session_id)
                    c2.commit()
                    try:
                        embed = importlib.import_module('cda.pipeline.embed')
                        importlib.reload(embed)
                        embed.build_session_intelligence(c2, session_id)
                        c2.commit()
                    except Exception as ex2:
                        log.warning(f"embed pass failed for {session_id[:8]}: {ex2}")
            finally:
                c2.close()
        except Exception as ex:
            log.warning(f"extract pass failed for {session_id[:8]}: {ex}")

    debouncer = Debouncer(flush_exchanges)

    # Track session→workspace for debouncer
    session_ws_map: dict[str, str] = {}

    def handle_shutdown(sig, frame):
        log.info("Shutting down...")
        try:
            PID_FILE.unlink()
        except Exception:
            pass
        sys.exit(0)

    signal.signal(signal.SIGINT, handle_shutdown)
    signal.signal(signal.SIGTERM, handle_shutdown)

    watch_paths = [str(VS_ROOT), str(GLOBAL_MEM)]
    log.info(f"Watch paths: {watch_paths}")

    # Build session→workspace map from DB
    c = get_conn()
    for row in c.execute("SELECT session_id, workspace_id FROM sessions"):
        session_ws_map[row[0]] = row[1]
    c.close()

    needs_exchange_rebuild = set()  # noqa: F841 — reserved for future use
    symbol_index_dirty = False

    for changes in watch(VS_ROOT, GLOBAL_MEM, watch_filter=lambda change, path: True, yield_on_timeout=True, rust_timeout=500):
        c = get_conn()
        try:
            for change_type, path_str in changes:
                path = Path(path_str)

                # Skip SQLite WAL/SHM side files (named e.g. state.vscdb-wal) and our own DB
                if path.name.endswith(("-wal", "-shm")) or path.suffix in ('.wal', '.shm') or path == DB_PATH:
                    continue
                if 'cda.db' in path_str:
                    continue

                result = parse_path(path)
                if result is None:
                    continue

                ws_id, session_id, file_type = result

                if file_type == "transcript":
                    session_ws_map[session_id] = ws_id
                    n = handle_transcript(c, ws_id, session_id, path)
                    if n > 0:
                        debouncer.mark(session_id, ws_id)

                elif file_type == "chat_session":
                    session_ws_map[session_id] = ws_id
                    n = handle_chat_session(c, ws_id, session_id, path)
                    if n > 0:
                        debouncer.mark(session_id, ws_id)

                elif file_type == "tool_output":
                    handle_tool_output(c, ws_id, session_id, path)
                    if session_id:
                        debouncer.mark(session_id, ws_id or session_ws_map.get(session_id, "unknown"))

                elif file_type == "edit_state":
                    handle_edit_state(c, ws_id, session_id, path)
                    symbol_index_dirty = True

                elif file_type == "edit_content":
                    handle_edit_content(c, ws_id, session_id, path)
                    symbol_index_dirty = True

                elif file_type == "memory_workspace":
                    if path.is_file():
                        handle_memory(c, "workspace", ws_id, path)

                elif file_type == "memory_global":
                    if path.is_file():
                        handle_memory(c, "global", None, path)

                elif file_type == "state_vscdb":
                    handle_state_vscdb(c, ws_id, path)

            c.commit()
            if symbol_index_dirty:
                try:
                    import importlib
                    extract = importlib.import_module('cda.pipeline.extract')
                    importlib.reload(extract)
                    c2 = get_conn()
                    try:
                        extract.build_symbol_index(c2)
                        c2.commit()
                    finally:
                        c2.close()
                except Exception as ex:
                    log.warning(f"symbol index rebuild failed: {ex}")
                symbol_index_dirty = False
        except Exception as e:
            log.error(f"handler error: {e}", exc_info=True)
        finally:
            c.close()


if __name__ == "__main__":
    main()