code-data-ark 2.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,783 @@
+ #!/usr/bin/env python3
+ """
+ cda/watcher.py
+
+ Live sync daemon. Watches all VSCode storage locations and streams
+ updates into cda.db as they happen during a session.
+
+ What it watches:
+ - chatSessions/*.jsonl — append-only, new lines → chat_messages + fts
+ - transcripts/*.jsonl — append-only, new lines → transcript_events
+ - chat-session-resources/ — new tool output files → vfs
+ - chatEditingSessions/*/state.json — rewrites → vfs update
+ - memory-tool/memories/** — new/changed files → memory_files
+ - state.vscdb — mtime change → state_items refresh
+
+ After any transcript change for a session, re-reconstructs exchanges
+ and refreshes fts_exchanges for that session only.
+
+ Runs as a foreground daemon and writes its PID to watcher.pid.
+ """
+
+ import os
+ import sys
+ import json
+ import gzip
+ import hashlib
+ import sqlite3
+ import time
+ import threading
+ import signal
+ import logging
+ from pathlib import Path
+ from typing import Optional
+
+ try:
+     from watchfiles import watch
+ except ImportError:
+     print("ERROR: watchfiles not installed. Run: pip install watchfiles", file=sys.stderr)
+     sys.exit(1)
+
+ # Four levels up from this file: the project root that holds local/
+ ROOT_DIR = Path(__file__).resolve().parent.parent.parent.parent
+ LOCAL_DIR = ROOT_DIR / "local"
+ DB_PATH = LOCAL_DIR / "data" / "cda.db"
+ PID_FILE = LOCAL_DIR / "run" / "watcher.pid"
+ QUEUE_DIR = LOCAL_DIR / "queue"
+ # Allow override via env var for portability (the default is the macOS location)
+ VSCODE_DATA_DIR = Path(os.environ.get("VSCODE_DATA_DIR", Path.home() / "Library/Application Support/Code/User"))
+ VS_ROOT = VSCODE_DATA_DIR / "workspaceStorage"
+ GLOBAL_MEM = VSCODE_DATA_DIR / "globalStorage/github.copilot-chat/memory-tool/memories"
+
+ log_file = LOCAL_DIR / "logs" / "watcher.log"
+ log_file.parent.mkdir(parents=True, exist_ok=True)  # logging fails if the directory is missing
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s %(levelname)-7s %(message)s",
+     datefmt="%H:%M:%S",
+     filename=str(log_file),
+     filemode="a",
+ )
+ log = logging.getLogger("ark-watcher")
+
+
+ # ─────────────────────────────────────────────
+ # DB helpers
+ # ─────────────────────────────────────────────
+
+ def get_conn():
+     conn = sqlite3.connect(str(DB_PATH), timeout=10)
+     conn.execute("PRAGMA journal_mode=WAL")      # readers don't block the writer
+     conn.execute("PRAGMA synchronous=NORMAL")    # safe with WAL, fewer fsyncs
+     conn.execute("PRAGMA cache_size=-2000")      # negative means KiB: a ~2 MiB page cache
+     conn.execute("PRAGMA mmap_size=268435456")   # 256 MiB memory-mapped I/O
+     conn.execute("PRAGMA temp_store=MEMORY")
+     conn.row_factory = sqlite3.Row
+     return conn
+
+
+ def compress(data: bytes) -> bytes:
+     return gzip.compress(data, compresslevel=6)
+
+
+ def sha256_short(data: bytes) -> str:
+     return hashlib.sha256(data).hexdigest()[:16]
+
+
+ def now_ms() -> int:
+     return int(time.time() * 1000)
+
+
+ # ─────────────────────────────────────────────
+ # Offset tracking — so we only parse new bytes
+ # ─────────────────────────────────────────────
+
+ OFFSETS_SCHEMA = """
+ CREATE TABLE IF NOT EXISTS file_offsets (
+     path TEXT PRIMARY KEY,
+     byte_offset INTEGER DEFAULT 0,
+     updated_at INTEGER
+ );
+ """
+
+
+ def get_offset(conn, path: str) -> int:
+     row = conn.execute("SELECT byte_offset FROM file_offsets WHERE path=?", (path,)).fetchone()
+     return row[0] if row else 0
+
+
+ def set_offset(conn, path: str, offset: int):
+     conn.execute(
+         "INSERT OR REPLACE INTO file_offsets(path, byte_offset, updated_at) VALUES(?,?,?)",
+         (path, offset, now_ms())
+     )
+
+
+ # ─────────────────────────────────────────────
+ # Extract workspace_id + session_id from a path
+ # ─────────────────────────────────────────────
+
+ def parse_path(path: Path):
+     """
+     Returns (workspace_id, session_id, file_type) or None.
+     file_type: 'transcript' | 'chat_session' | 'tool_output' |
+                'edit_state' | 'edit_content' | 'memory_workspace' |
+                'memory_global' | 'state_vscdb'
+     """
+     try:
+         rel = path.relative_to(VS_ROOT)
+         parts = rel.parts
+         ws_id = parts[0]
+
+         # chatSessions/<session_id>.jsonl
+         if len(parts) == 3 and parts[1] == "chatSessions" and path.suffix == ".jsonl":
+             return ws_id, parts[2].replace(".jsonl", ""), "chat_session"
+
+         # GitHub.copilot-chat/transcripts/<session_id>.jsonl
+         if len(parts) == 4 and parts[1] == "GitHub.copilot-chat" and parts[2] == "transcripts" and path.suffix == ".jsonl":
+             return ws_id, parts[3].replace(".jsonl", ""), "transcript"
+
+         # GitHub.copilot-chat/chat-session-resources/<session_id>/<tool_dir>/content.txt
+         if len(parts) == 6 and parts[1] == "GitHub.copilot-chat" and parts[2] == "chat-session-resources" and parts[5] == "content.txt":
+             return ws_id, parts[3], "tool_output"
+
+         # chatEditingSessions/<session_id>/state.json
+         if len(parts) == 4 and parts[1] == "chatEditingSessions" and parts[3] == "state.json":
+             return ws_id, parts[2], "edit_state"
+
+         # chatEditingSessions/<session_id>/contents/<blob_file>
+         if len(parts) == 5 and parts[1] == "chatEditingSessions" and parts[3] == "contents":
+             return ws_id, parts[2], "edit_content"
+
+         # GitHub.copilot-chat/memory-tool/memories/**
+         if len(parts) >= 5 and parts[1] == "GitHub.copilot-chat" and parts[2] == "memory-tool" and parts[3] == "memories":
+             return ws_id, None, "memory_workspace"
+
+         # state.vscdb
+         if len(parts) == 2 and parts[1] == "state.vscdb":
+             return ws_id, None, "state_vscdb"
+
+     except ValueError:
+         pass
+
+     # Global memory
+     try:
+         path.relative_to(GLOBAL_MEM)
+         return None, None, "memory_global"
+     except ValueError:
+         pass
+
+     return None
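+
+ # Illustrative mappings (ids shortened; real ones are long hex/UUID strings):
+ #   <VS_ROOT>/ab12/chatSessions/sess-1.jsonl
+ #       → ("ab12", "sess-1", "chat_session")
+ #   <VS_ROOT>/ab12/GitHub.copilot-chat/transcripts/sess-1.jsonl
+ #       → ("ab12", "sess-1", "transcript")
+ #   <VS_ROOT>/ab12/state.vscdb
+ #       → ("ab12", None, "state_vscdb")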
+
+
+ # ─────────────────────────────────────────────
+ # Persistent Queue for Resilience
+ # ─────────────────────────────────────────────
+
+ def init_queue():
+     """Initialize the queue directory."""
+     QUEUE_DIR.mkdir(parents=True, exist_ok=True)
+
+
+ def queue_operation(op_type: str, data: dict) -> Optional[Path]:
+     """Write an operation to the persistent queue before executing.
+
+     Returns the queue file path so the caller can mark it completed once
+     the operation has actually been applied, or None if journaling failed.
+     """
+     timestamp = now_ms()
+     queue_file = QUEUE_DIR / f"{timestamp}_{op_type}.json"
+     try:
+         queue_file.write_text(json.dumps({
+             "timestamp": timestamp,
+             "type": op_type,
+             "data": data,
+             "status": "pending"
+         }))
+         log.debug(f"Queued operation: {op_type}")
+         return queue_file
+     except Exception as e:
+         log.error(f"Failed to queue operation {op_type}: {e}")
+         return None
+
+
+ def dequeue_operation(queue_file: Path):
+     """Mark a queued operation as completed."""
+     try:
+         data = json.loads(queue_file.read_text())
+         data["status"] = "completed"
+         queue_file.write_text(json.dumps(data))
+         # Rename to .completed so replay_queue's *.json glob skips it
+         completed_file = queue_file.with_suffix(".completed")
+         queue_file.rename(completed_file)
+         log.debug(f"Dequeued operation: {queue_file.name}")
+     except Exception as e:
+         log.error(f"Failed to dequeue {queue_file}: {e}")
+
+
+ def replay_queue(conn):
+     """Replay any pending operations from the queue on startup.
+
+     Replay is at-least-once: if the daemon died after applying an
+     operation but before dequeuing it, the rows are applied again.
+     """
+     if not QUEUE_DIR.exists():
+         return
+
+     pending_files = list(QUEUE_DIR.glob("*.json"))
+     if not pending_files:
+         return
+
+     log.info(f"Replaying {len(pending_files)} queued operations...")
+
+     for queue_file in sorted(pending_files):
+         try:
+             data = json.loads(queue_file.read_text())
+             if data.get("status") == "pending":
+                 op_type = data["type"]
+                 op_data = data["data"]
+
+                 if op_type == "vfs_insert":
+                     # Mirror the handlers' delete-then-insert so a replay
+                     # doesn't duplicate an already-stored blob for this file
+                     conn.execute(
+                         "DELETE FROM vfs WHERE source_path=? AND source_type=?",
+                         (op_data["path"], op_data["source_type"])
+                     )
+                     _insert_vfs(conn, op_data["path"], op_data["ws_id"], op_data["session_id"],
+                                 op_data["source_type"], None, op_data["filename"])
+                 elif op_type == "transcript_event":
+                     _insert_transcript_events(conn, op_data["ws_id"], op_data["session_id"],
+                                               op_data["events"])
+                 elif op_type in ("chat_message", "exchange_rebuild"):
+                     log.warning(f"Skipping unsupported queue op type on replay: {op_type}")
+
+             dequeue_operation(queue_file)
+         except Exception as e:
+             log.error(f"Failed to replay {queue_file}: {e}")
+
+
+ def cleanup_old_queue_files():
+     """Clean up completed queue files older than 7 days."""
+     if not QUEUE_DIR.exists():
+         return
+
+     cutoff = now_ms() - (7 * 24 * 60 * 60 * 1000)  # 7 days ago
+
+     for completed_file in QUEUE_DIR.glob("*.completed"):
+         try:
+             data = json.loads(completed_file.read_text())
+             if data.get("timestamp", 0) < cutoff:
+                 completed_file.unlink()
+         except Exception:
+             completed_file.unlink()  # Remove corrupted files
+
+
+ def _insert_vfs(conn, path: str, ws_id: str, session_id: str, source_type: str, content: Optional[bytes], filename: str):
+     """Insert a VFS blob; also used by queue replay (content=None re-reads from disk)."""
+     if content is None:
+         try:
+             content = Path(path).read_bytes()
+         except Exception as e:
+             raise RuntimeError(f"Failed to read queued VFS content from {path}: {e}") from e
+
+     conn.execute(
+         """INSERT INTO vfs(workspace_id, session_id, source_type, source_path, filename,
+                            content_type, content, size_bytes, sha256, ingested_at)
+            VALUES(?,?,?,?,?,?,?,?,?,?)""",
+         (ws_id, session_id, source_type, path, filename,
+          "jsonl", compress(content), len(content), sha256_short(content), now_ms())
+     )
+
+
+ def _insert_transcript_events(conn, ws_id: str, session_id: str, events: list):
+     """Insert transcript events; also used by queue replay.
+
+     NOTE: each event tuple already carries session_id/workspace_id; the
+     parameters are kept for signature symmetry with the other helpers.
+     """
+     for event_data in events:
+         conn.execute(
+             """INSERT INTO transcript_events(session_id, workspace_id, event_type, request_id, turn_index, ts, data_json)
+                VALUES(?,?,?,?,?,?,?)""",
+             event_data
+         )
+
+
+ # ─────────────────────────────────────────────
+ # Incremental JSONL parse
+ # ─────────────────────────────────────────────
+
+ def read_new_lines(path: Path, from_offset: int):
+     """Returns (new_lines, new_offset)."""
+     try:
+         raw = path.read_bytes()
+     except Exception:
+         return [], from_offset
+     if from_offset > len(raw):
+         # File shrank (rotated or truncated): the stored offset is stale,
+         # so start over from the beginning rather than tailing garbage.
+         from_offset = 0
+     new_bytes = raw[from_offset:]
+     if not new_bytes:
+         return [], from_offset
+     text = new_bytes.decode("utf-8", errors="replace")
+     lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
+     return lines, len(raw)
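+
+ # Worked example of the offset contract (sizes illustrative): a file last
+ # synced at 1024 bytes grows to 1400. read_new_lines(path, 1024) decodes
+ # only bytes 1024..1400 and returns (lines, 1400); the caller stores 1400
+ # via set_offset, so the next wake-up parses nothing it has already seen.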
+
+
+ # ─────────────────────────────────────────────
+ # Handlers
+ # ─────────────────────────────────────────────
+
+ def handle_transcript(conn, ws_id, session_id, path: Path):
+     path_str = str(path)
+     offset = get_offset(conn, path_str)
+     lines, new_offset = read_new_lines(path, offset)
+     if not lines:
+         return 0
+
+     count = 0
+     turn_index = conn.execute(
+         "SELECT COALESCE(MAX(turn_index),0) FROM transcript_events WHERE session_id=?",
+         (session_id,)
+     ).fetchone()[0]
+
+     events_data = []
+     for line in lines:
+         try:
+             evt = json.loads(line)
+         except Exception:
+             continue
+         event_type = evt.get("type", "unknown")
+         request_id = evt.get("requestId") or evt.get("request_id")
+         ts = evt.get("timestamp") or evt.get("ts")
+         if event_type in ("assistant.turn_start", "user.message"):
+             turn_index += 1
+         events_data.append((session_id, ws_id, event_type, request_id, turn_index, ts, line))
+         count += 1
+
+     # Journal the batch before executing so it can be replayed after a crash
+     qf = queue_operation("transcript_event", {
+         "ws_id": ws_id,
+         "session_id": session_id,
+         "events": events_data
+     })
+
+     # Execute the operations
+     for event_data in events_data:
+         conn.execute(
+             """INSERT INTO transcript_events(session_id, workspace_id, event_type, request_id, turn_index, ts, data_json)
+                VALUES(?,?,?,?,?,?,?)""",
+             event_data
+         )
+
+     set_offset(conn, path_str, new_offset)
+     if qf:
+         dequeue_operation(qf)  # applied; don't re-insert these events on the next startup
+
+     # Journal + refresh the full-file VFS blob for this transcript
+     qf = queue_operation("vfs_insert", {
+         "path": path_str,
+         "ws_id": ws_id,
+         "session_id": session_id,
+         "source_type": "transcript",
+         "filename": path.name
+     })
+     conn.execute(
+         "DELETE FROM vfs WHERE session_id=? AND source_type='transcript'", (session_id,)
+     )
+     _insert_vfs(conn, path_str, ws_id, session_id, "transcript", path.read_bytes(), path.name)
+     if qf:
+         dequeue_operation(qf)
+
+     log.info(f"transcript +{count} events {session_id[:16]} (total offset {new_offset})")
+     return count
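+
+ # A hedged sketch of a transcript line, limited to the fields the parser
+ # above actually reads (the real event schema carries more, unknown here):
+ #   {"type": "user.message", "requestId": "req-1",
+ #    "timestamp": 1700000000000, "...": "..."}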
+
+
+ def handle_chat_session(conn, ws_id, session_id, path: Path):
+     path_str = str(path)
+     offset = get_offset(conn, path_str)
+     lines, new_offset = read_new_lines(path, offset)
+     if not lines:
+         return 0
+
+     count = 0
+     for line in lines:
+         try:
+             obj = json.loads(line)
+         except Exception:
+             continue
+         kind = obj.get("kind")
+         v = obj.get("v")
+         if kind == 1 and isinstance(v, str):
+             # kind 1: a plain user message string
+             conn.execute(
+                 "INSERT INTO chat_messages(session_id, workspace_id, role, content, kind) VALUES(?,?,?,?,?)",
+                 (session_id, ws_id, "user", v, 1)
+             )
+             count += 1
+         elif kind == 2 and isinstance(v, list):
+             # kind 2: a list of request-metadata objects
+             for req in v:
+                 if not isinstance(req, dict):
+                     continue
+                 request_id = req.get("requestId")
+                 ts = req.get("timestamp")
+                 agent_id = (req.get("agent") or {}).get("id")
+                 conn.execute(
+                     """INSERT INTO chat_messages(session_id, workspace_id, request_id, ts, role, content, agent_id, kind)
+                        VALUES(?,?,?,?,?,?,?,?)""",
+                     (session_id, ws_id, request_id, ts, "request_meta",
+                      json.dumps(req), agent_id, 2)
+                 )
+                 count += 1
+
+     # Replace the full-file VFS blob
+     raw = path.read_bytes()
+     conn.execute("DELETE FROM vfs WHERE session_id=? AND source_type='chat_session'", (session_id,))
+     conn.execute(
+         """INSERT INTO vfs(workspace_id, session_id, source_type, source_path, filename,
+                            content_type, content, size_bytes, sha256, ingested_at)
+            VALUES(?,?,?,?,?,?,?,?,?,?)""",
+         (ws_id, session_id, "chat_session", str(path), path.name,
+          "jsonl", compress(raw), len(raw), sha256_short(raw), now_ms())
+     )
+
+     set_offset(conn, path_str, new_offset)
+     log.info(f"chat_session +{count} msgs {session_id[:16]}")
+     return count
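+
+ # Sketches of the two line kinds handled above (field names come from the
+ # parser; real chatSessions records carry more fields than shown):
+ #   {"kind": 1, "v": "how do I tail a jsonl file?"}
+ #   {"kind": 2, "v": [{"requestId": "req-1", "timestamp": 1700000000000,
+ #                      "agent": {"id": "copilot"}}]}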
+
+
+ def handle_tool_output(conn, ws_id, session_id, path: Path):
+     try:
+         raw = path.read_bytes()
+     except Exception:
+         return
+     # Check if already in VFS by path
+     exists = conn.execute(
+         "SELECT id FROM vfs WHERE source_path=? AND source_type='tool_output'", (str(path),)
+     ).fetchone()
+     if exists:
+         return
+     conn.execute(
+         """INSERT INTO vfs(workspace_id, session_id, source_type, source_path, filename,
+                            content_type, content, size_bytes, sha256, ingested_at)
+            VALUES(?,?,?,?,?,?,?,?,?,?)""",
+         (ws_id, session_id, "tool_output", str(path), path.name,
+          "text", compress(raw), len(raw), sha256_short(raw), now_ms())
+     )
+     log.info(f"tool_output +1 {session_id[:16]} ({len(raw)} bytes)")
+
+
+ def handle_edit_state(conn, ws_id, session_id, path: Path):
+     try:
+         raw = path.read_bytes()
+     except Exception:
+         return
+     conn.execute(
+         "DELETE FROM vfs WHERE session_id=? AND source_type='edit_state'", (session_id,)
+     )
+     conn.execute(
+         """INSERT INTO vfs(workspace_id, session_id, source_type, source_path, filename,
+                            content_type, content, size_bytes, sha256, ingested_at)
+            VALUES(?,?,?,?,?,?,?,?,?,?)""",
+         (ws_id, session_id, "edit_state", str(path), path.name,
+          "json", compress(raw), len(raw), sha256_short(raw), now_ms())
+     )
+     log.info(f"edit_state updated {session_id[:16]}")
+
+
+ def handle_edit_content(conn, ws_id, session_id, path: Path):
+     try:
+         raw = path.read_bytes()
+     except Exception:
+         # Unreadable usually means the blob file was deleted: drop our copy
+         conn.execute(
+             "DELETE FROM vfs WHERE source_path=? AND source_type='edit_content'", (str(path),)
+         )
+         log.info(f"edit_content removed {session_id[:16]} {path.name}")
+         return
+     conn.execute(
+         "DELETE FROM vfs WHERE source_path=? AND source_type='edit_content'", (str(path),)
+     )
+     conn.execute(
+         """INSERT INTO vfs(workspace_id, session_id, source_type, source_path, filename,
+                            content_type, content, size_bytes, sha256, ingested_at)
+            VALUES(?,?,?,?,?,?,?,?,?,?)""",
+         (ws_id, session_id, "edit_content", str(path), path.name,
+          "binary", compress(raw), len(raw), sha256_short(raw), now_ms())
+     )
+     log.info(f"edit_content updated {session_id[:16]} {path.name}")
+
+
+ def handle_memory(conn, scope, ws_id, path: Path):
+     try:
+         content = path.read_text(errors="replace")
+     except Exception:
+         return
+     conn.execute(
+         """INSERT OR REPLACE INTO memory_files(scope, workspace_id, filename, content, size_bytes, ingested_at)
+            VALUES(?,?,?,?,?,?)""",
+         (scope, ws_id, path.name, content, path.stat().st_size, now_ms())
+     )
+     log.info(f"memory updated [{scope}] {path.name}")
+
+
+ def handle_state_vscdb(conn, ws_id, path: Path):
+     # state.vscdb is VSCode's own SQLite store; ItemTable is its key/value
+     # table. Short timeout because VSCode may be holding the write lock.
+     try:
+         src = sqlite3.connect(str(path), timeout=3)
+         rows = src.execute("SELECT key, value FROM ItemTable").fetchall()
+         src.close()
+     except Exception as e:
+         log.warning(f"state_vscdb read error {ws_id[:16]}: {e}")
+         return
+     for key, value in rows:
+         conn.execute(
+             "INSERT OR REPLACE INTO state_items(workspace_id, key, value) VALUES(?,?,?)",
+             (ws_id, key, value)
+         )
+     log.info(f"state_vscdb refreshed {ws_id[:16]} ({len(rows)} rows)")
+
+
+ # ─────────────────────────────────────────────
+ # Exchange reconstruction (incremental)
+ # ─────────────────────────────────────────────
+
+ from cda.pipeline.reconstruct import EXCHANGES_SCHEMA, reconstruct_session as _reconstruct_session
+
+
+ def rebuild_exchanges(conn, session_id: str, ws_id: str):
+     """Delete and rebuild exchanges + FTS for one session."""
+     conn.executescript(EXCHANGES_SCHEMA)
+     # fts_exchanges is an external-content (content=) FTS5 table without
+     # sync triggers, so stale index entries must be removed with the FTS5
+     # 'delete' command *before* the underlying rows change; the command
+     # only matches while the indexed values are still in the table.
+     # Use a transaction for atomicity.
+     with conn:
+         conn.execute(
+             "INSERT INTO fts_exchanges(fts_exchanges, rowid, session_id, workspace_id, exchange_index, user_ts, user_message, reasoning_text, response_text, tool_calls) SELECT 'delete', id, session_id, workspace_id, exchange_index, user_ts, user_message, reasoning_text, response_text, tool_calls FROM exchanges WHERE session_id=?",  # noqa: E501
+             (session_id,)
+         )
+         conn.execute("DELETE FROM exchanges WHERE session_id=?", (session_id,))
+         n = _reconstruct_session(conn, session_id, ws_id or "unknown")
+         # Re-index the freshly reconstructed rows
+         conn.execute(
+             "INSERT INTO fts_exchanges(rowid, session_id, workspace_id, exchange_index, user_ts, user_message, reasoning_text, response_text, tool_calls) SELECT id, session_id, workspace_id, exchange_index, user_ts, user_message, reasoning_text, response_text, tool_calls FROM exchanges WHERE session_id=?",  # noqa: E501
+             (session_id,)
+         )
+
+     log.info(f"exchanges rebuilt {session_id[:16]} ({n} exchanges)")
+     return n
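+
+ # Minimal consumer sketch, given a connection from get_conn() and assuming
+ # only the fts_exchanges columns named above (the full schema lives in
+ # cda.pipeline.reconstruct):
+ #   hits = conn.execute(
+ #       "SELECT session_id, user_message FROM fts_exchanges WHERE fts_exchanges MATCH ?",
+ #       ("debouncer",),
+ #   ).fetchall()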
+
+
+ # ─────────────────────────────────────────────
+ # Debounce: batch changes per session
+ # ─────────────────────────────────────────────
+
+ class Debouncer:
+     """Collect dirty sessions and flush after DELAY seconds of quiet."""
+     DELAY = 2.0
+
+     def __init__(self, flush_fn):
+         self._dirty: dict[str, tuple[str, float]] = {}  # session_id → (ws_id, deadline)
+         self._lock = threading.Lock()
+         self._flush_fn = flush_fn
+         self._thread = threading.Thread(target=self._loop, daemon=True)
+         self._thread.start()
+
+     def mark(self, session_id: str, ws_id: str):
+         with self._lock:
+             # Every new event pushes the deadline back, so a busy session
+             # flushes only once it has been quiet for DELAY seconds
+             self._dirty[session_id] = (ws_id, time.time() + self.DELAY)
+
+     def _loop(self):
+         while True:
+             time.sleep(0.5)
+             now = time.time()
+             with self._lock:
+                 ready = [(sid, ws) for sid, (ws, deadline) in self._dirty.items() if now >= deadline]
+                 for sid, _ in ready:
+                     del self._dirty[sid]
+             for sid, ws in ready:
+                 try:
+                     self._flush_fn(sid, ws)
+                 except Exception as e:
+                     log.error(f"flush error {sid[:16]}: {e}")
+
+
+ # ─────────────────────────────────────────────
+ # Main loop
+ # ─────────────────────────────────────────────
+
+ def main():
+     print("STARTING WATCHER", os.environ.get('PYTHONPATH'), file=sys.stderr)
+     # Initialize persistent queue
+     init_queue()
+     cleanup_old_queue_files()
+
+     PID_FILE.parent.mkdir(parents=True, exist_ok=True)
+     PID_FILE.write_text(str(os.getpid()))
+     log.info(f"cda watcher started pid={os.getpid()}")
+     log.info(f"DB: {DB_PATH}")
+     log.info(f"Queue: {QUEUE_DIR}")
+     log.info(f"Watching: {VS_ROOT}")
+
+     if not VS_ROOT.is_dir():
+         log.error(f"VSCode workspaceStorage not found: {VS_ROOT} (set VSCODE_DATA_DIR)")
+         sys.exit(1)
+
+     conn = get_conn()
+     conn.executescript(OFFSETS_SCHEMA)
+     conn.executescript(EXCHANGES_SCHEMA)
+     conn.commit()
+
+     # Ensure watcher-required schema exists before replaying operations.
+     try:
+         import importlib
+         extract = importlib.import_module('cda.extract')
+         importlib.reload(extract)
+         extract.ensure_schema(conn)
+     except Exception as ex:
+         log.warning(f"Failed to ensure extract schema: {ex}")
+
+     # Replay any pending operations from queue
+     replay_queue(conn)
+
+     # Initialize offsets for all existing JSONL files so we don't re-ingest
+     log.info("Initializing offsets for existing files...")
+     for ws_dir in VS_ROOT.iterdir():
+         if not ws_dir.is_dir():
+             continue
+
+         # chatSessions/*.jsonl and GitHub.copilot-chat/transcripts/*.jsonl
+         for sub in ("chatSessions", "GitHub.copilot-chat/transcripts"):
+             d = ws_dir / sub
+             if not d.is_dir():
+                 continue
+             for f in d.glob("*.jsonl"):
+                 if get_offset(conn, str(f)) == 0:
+                     try:
+                         set_offset(conn, str(f), f.stat().st_size)
+                     except Exception:
+                         pass
+
+     conn.commit()
+     log.info("Offsets initialized — watching for new data only")
+
+     # Debouncer: when a transcript changes, rebuild exchanges after a quiet period
+     def flush_exchanges(session_id, ws_id):
+         c = get_conn()
+         try:
+             rebuild_exchanges(c, session_id, ws_id)
+             c.commit()
+         finally:
+             c.close()
+         # Incremental extraction: run behavioral signals + session analysis
+         try:
+             import importlib
+             extract = importlib.import_module('cda.extract')
+             importlib.reload(extract)  # pick up edits to cda.extract without restarting
+             c2 = get_conn()
+             try:
+                 blob_row = c2.execute(
+                     "SELECT content FROM vfs WHERE session_id=? AND source_type='chat_session'",
+                     (session_id,)
+                 ).fetchone()
+                 if blob_row:
+                     # Derived tables are regenerated wholesale for this session
+                     c2.execute("DELETE FROM token_usage WHERE session_id=?", (session_id,))
+                     c2.execute("DELETE FROM compactions WHERE session_id=?", (session_id,))
+                     c2.execute("DELETE FROM exchange_signals WHERE session_id=?", (session_id,))
+                     extract.process_session(c2, session_id, blob_row[0])
+                     extract.build_session_analysis(c2, session_id)
+                     c2.commit()
+                     try:
+                         embed = importlib.import_module('cda.embed')
+                         importlib.reload(embed)
+                         embed.build_session_intelligence(c2, session_id)
+                         c2.commit()
+                     except Exception as ex2:
+                         log.warning(f"embed pass failed for {session_id[:8]}: {ex2}")
+             finally:
+                 c2.close()
+         except Exception as ex:
+             log.warning(f"extract pass failed for {session_id[:8]}: {ex}")
+
+     debouncer = Debouncer(flush_exchanges)
+
+     # Track session→workspace for debouncer
+     session_ws_map: dict[str, str] = {}
+
+     def handle_shutdown(sig, frame):
+         log.info("Shutting down...")
+         try:
+             PID_FILE.unlink()
+         except Exception:
+             pass
+         sys.exit(0)
+
+     signal.signal(signal.SIGINT, handle_shutdown)
+     signal.signal(signal.SIGTERM, handle_shutdown)
+
+     watch_paths = [str(VS_ROOT), str(GLOBAL_MEM)]
+     log.info(f"Watch paths: {watch_paths}")
+
+     # Build session→workspace map from DB
+     c = get_conn()
+     for row in c.execute("SELECT session_id, workspace_id FROM sessions"):
+         session_ws_map[row[0]] = row[1]
+     c.close()
+
+     needs_exchange_rebuild = set()  # noqa: F841 — reserved for future use
+     symbol_index_dirty = False
+
+     # watch_filter=True disables watchfiles' default filter so nothing is
+     # skipped; yield_on_timeout keeps the loop responsive between events
+     for changes in watch(VS_ROOT, GLOBAL_MEM, watch_filter=lambda change, path: True, yield_on_timeout=True, rust_timeout=500):
+         if not changes:
+             continue  # timeout tick with nothing to do
+         c = get_conn()
+         try:
+             for change_type, path_str in changes:
+                 path = Path(path_str)
+
+                 # Skip SQLite side files (named <db>-wal / <db>-shm) and our own DB
+                 if path.name.endswith(('-wal', '-shm')) or path == DB_PATH:
+                     continue
+                 if 'cda.db' in path_str:
+                     continue
+
+                 result = parse_path(path)
+                 if result is None:
+                     continue
+
+                 ws_id, session_id, file_type = result
+
+                 if file_type == "transcript":
+                     session_ws_map[session_id] = ws_id
+                     n = handle_transcript(c, ws_id, session_id, path)
+                     if n > 0:
+                         debouncer.mark(session_id, ws_id)
+
+                 elif file_type == "chat_session":
+                     session_ws_map[session_id] = ws_id
+                     n = handle_chat_session(c, ws_id, session_id, path)
+                     if n > 0:
+                         debouncer.mark(session_id, ws_id)
+
+                 elif file_type == "tool_output":
+                     handle_tool_output(c, ws_id, session_id, path)
+                     if session_id:
+                         debouncer.mark(session_id, ws_id or session_ws_map.get(session_id, "unknown"))
+
+                 elif file_type == "edit_state":
+                     handle_edit_state(c, ws_id, session_id, path)
+                     symbol_index_dirty = True
+
+                 elif file_type == "edit_content":
+                     handle_edit_content(c, ws_id, session_id, path)
+                     symbol_index_dirty = True
+
+                 elif file_type == "memory_workspace":
+                     if path.is_file():
+                         handle_memory(c, "workspace", ws_id, path)
+
+                 elif file_type == "memory_global":
+                     if path.is_file():
+                         handle_memory(c, "global", None, path)
+
+                 elif file_type == "state_vscdb":
+                     handle_state_vscdb(c, ws_id, path)
+
+             c.commit()
+             if symbol_index_dirty:
+                 try:
+                     import importlib
+                     extract = importlib.import_module('cda.extract')
+                     importlib.reload(extract)
+                     c2 = get_conn()
+                     try:
+                         extract.build_symbol_index(c2)
+                         c2.commit()
+                     finally:
+                         c2.close()
+                 except Exception as ex:
+                     log.warning(f"symbol index rebuild failed: {ex}")
+                 symbol_index_dirty = False
+         except Exception as e:
+             log.error(f"handler error: {e}", exc_info=True)
+         finally:
+             c.close()
+
+
+ if __name__ == "__main__":
+     main()