code-data-ark 2.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cda/__init__.py +3 -0
- cda/kernel/__init__.py +0 -0
- cda/kernel/control_db.py +151 -0
- cda/kernel/pmf_kernel.py +364 -0
- cda/kernel/selfcheck.py +299 -0
- cda/pipeline/__init__.py +0 -0
- cda/pipeline/embed.py +694 -0
- cda/pipeline/extract.py +1064 -0
- cda/pipeline/ingest.py +673 -0
- cda/pipeline/parse_edits.py +250 -0
- cda/pipeline/reconstruct.py +536 -0
- cda/pipeline/watcher.py +783 -0
- cda/ui/__init__.py +0 -0
- cda/ui/cli.py +2587 -0
- cda/ui/web.py +2848 -0
- code_data_ark-2.0.2.dist-info/METADATA +495 -0
- code_data_ark-2.0.2.dist-info/RECORD +20 -0
- code_data_ark-2.0.2.dist-info/WHEEL +4 -0
- code_data_ark-2.0.2.dist-info/entry_points.txt +2 -0
- code_data_ark-2.0.2.dist-info/licenses/license +21 -0
cda/kernel/selfcheck.py
ADDED
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
"""
|
|
2
|
+
cda selfcheck — the system knows itself.
|
|
3
|
+
|
|
4
|
+
Checks:
|
|
5
|
+
version — VERSION file exists, valid semver, matches __version__
|
|
6
|
+
install_path — editable install of cda resolves to this project dir
|
|
7
|
+
db_present — local/data/cda.db exists on disk
|
|
8
|
+
db_accessible — DB opens and WAL mode is confirmed
|
|
9
|
+
db_integrity — PRAGMA integrity_check passes
|
|
10
|
+
db_tables — all expected tables are present
|
|
11
|
+
db_counts — core tables have rows (non-empty)
|
|
12
|
+
db_wal — no abandoned WAL/SHM files blocking writes
|
|
13
|
+
watcher_state — watcher.pid present and process is alive (or cleanly absent)
|
|
14
|
+
queue_depth — local/queue/ exists and reports pending file count
|
|
15
|
+
data_gitignored — local/ is gitignored in git
|
|
16
|
+
cli_path — this binary is on PATH and resolves correctly
|
|
17
|
+
python_runtime — running on expected Python (3.9, not Homebrew 3.14+)
|
|
18
|
+
dependencies — all required imports load without error
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
import importlib
|
|
22
|
+
import os
|
|
23
|
+
import shutil
|
|
24
|
+
import sqlite3
|
|
25
|
+
import subprocess
|
|
26
|
+
import sys
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
|
|
29
|
+
# ── paths the system knows about itself ─────────────────────────────────────
|
|
30
|
+
# Directory containing this module.
PACKAGE_DIR = Path(__file__).resolve().parent
# Two levels above the package directory.
SOURCE_DIR = PACKAGE_DIR.parent.parent
# Three levels above the package directory; holds the local/ runtime tree.
PROJECT_DIR = SOURCE_DIR.parent

# Runtime state lives under <PROJECT_DIR>/local.
LOCAL_DIR = PROJECT_DIR / "local"
DB_PATH = LOCAL_DIR.joinpath("data", "cda.db")
PID_FILE = LOCAL_DIR.joinpath("run", "watcher.pid")
QUEUE_DIR = LOCAL_DIR.joinpath("queue")

# Plain-text version file read by check_version().
VERSION_FILE = SOURCE_DIR.joinpath("version")
|
|
38
|
+
|
|
39
|
+
# Tables that check_db_tables() requires to exist in cda.db.
REQUIRED_TABLES = [
    "sessions",
    "exchanges",
    "tool_calls",
    "vfs",
    "workspaces",
    "memory_files",
    "embeddings",
    "exchange_signals",
    "ingest_log",
    "transcript_events",
    "token_usage",
    "compactions",
    "session_analysis",
    "session_summaries",
    "recommendations",
    "anomaly_alerts",
    "symbols",
    "file_offsets",
    "state_items",
    "chat_messages",
]

# Tables that check_db_counts() requires to be non-empty.
CORE_COUNT_TABLES = [
    "sessions",
    "exchanges",
    "tool_calls",
    "vfs",
]

# Modules that check_dependencies() tries to import.
REQUIRED_IMPORTS = [
    "click",
    "sqlite3",
    "watchfiles",
    "pathlib",
    "json",
    "gzip",
]
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# ── result helpers ────────────────────────────────────────────────────────────
|
|
56
|
+
|
|
57
|
+
def _ok(name, message, details=None):
|
|
58
|
+
r = {"name": name, "passed": True, "message": message}
|
|
59
|
+
if details:
|
|
60
|
+
r["details"] = details
|
|
61
|
+
return r
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _fail(name, message, details=None):
|
|
65
|
+
r = {"name": name, "passed": False, "message": message}
|
|
66
|
+
if details:
|
|
67
|
+
r["details"] = details
|
|
68
|
+
return r
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
# ── individual checks ─────────────────────────────────────────────────────────
|
|
72
|
+
|
|
73
|
+
def check_version():
    """Validate the version file and compare it with ``cda.__version__``.

    Fails when the file is missing, when its stripped content is not exactly
    three dot-separated integers, or when ``cda.__version__`` is importable
    and differs. If ``__version__`` cannot be imported, only the file is
    checked.
    """
    import re

    if not VERSION_FILE.exists():
        return _fail("version", "VERSION file not found")

    file_version = VERSION_FILE.read_text().strip()
    if re.fullmatch(r"\d+\.\d+\.\d+", file_version) is None:
        return _fail("version", f"VERSION is not valid semver: {file_version!r}")

    try:
        from cda import __version__ as package_version
    except (ImportError, AttributeError):
        # __version__ not defined: the file check alone decides.
        pass
    else:
        if package_version != file_version:
            return _fail(
                "version",
                f"VERSION file ({file_version}) does not match __version__ ({package_version})",
            )

    return _ok("version", f"VERSION is valid semver: {file_version}")
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def check_install_path():
    """Check that ``import cda`` in a fresh interpreter resolves to SOURCE_DIR.

    Runs the current interpreter in a subprocess, prints the grandparent
    directory of ``cda.__file__`` and compares it with ``SOURCE_DIR``. Any
    exception raised along the way is reported as a failed check.
    """
    probe = (
        "import cda, pathlib; "
        "print(pathlib.Path(cda.__file__).parent.parent.resolve())"
    )
    try:
        proc = subprocess.run(
            [sys.executable, "-c", probe],
            capture_output=True,
            text=True,
        )
        if proc.returncode:
            return _fail("install_path", "cda not importable — editable install broken")

        found = Path(proc.stdout.strip()).resolve()
        if found != SOURCE_DIR:
            return _fail(
                "install_path",
                f"editable install points to wrong path: {found} (expected {SOURCE_DIR})",
            )
        return _ok("install_path", f"editable install → {found}")
    except Exception as exc:
        return _fail("install_path", f"install_path check error: {exc}")
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def check_db_present():
    """Report whether the database file exists, with its size in whole MB."""
    if DB_PATH.exists():
        megabytes = DB_PATH.stat().st_size / (1024 * 1024)
        return _ok("db_present", f"cda.db present ({megabytes:.0f} MB)")
    return _fail("db_present", f"cda.db not found at {DB_PATH}")
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def check_db_accessible():
    """Confirm the database opens read-only and reports WAL journal mode.

    Returns:
        dict: An ``_ok`` result when ``PRAGMA journal_mode`` reports ``wal``;
        a ``_fail`` result when the file is missing, cannot be opened or
        queried, or reports a different journal mode.
    """
    if not DB_PATH.exists():
        return _fail("db_accessible", "cda.db not found — skipping")

    conn = None
    try:
        # as_uri() percent-encodes characters such as '?', '#' and '%' that
        # would otherwise be parsed as part of the SQLite URI syntax.
        conn = sqlite3.connect(f"{DB_PATH.as_uri()}?mode=ro", uri=True, timeout=5)
        row = conn.execute("PRAGMA journal_mode").fetchone()
    except sqlite3.DatabaseError as exc:
        return _fail("db_accessible", f"DB is corrupt or unreadable: {exc}")
    finally:
        # Close even when the PRAGMA raises, so the handle is never leaked.
        if conn is not None:
            conn.close()

    mode = row[0] if row else "unknown"
    if mode != "wal":
        return _fail("db_accessible", f"DB is accessible but journal_mode={mode} (expected wal)")
    return _ok("db_accessible", "DB accessible, journal_mode=wal")
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def check_db_integrity():
    """Run ``PRAGMA integrity_check(1)`` against the database, read-only.

    The ``(1)`` argument limits the report to the first problem found, so a
    single row is enough to decide the outcome.

    Returns:
        dict: An ``_ok`` result when the pragma returns ``ok``; otherwise a
        ``_fail`` result carrying the reported problem or the SQLite error.
    """
    if not DB_PATH.exists():
        return _fail("db_integrity", "cda.db not found — skipping")

    conn = None
    try:
        # as_uri() percent-encodes characters such as '?', '#' and '%' that
        # would otherwise be parsed as part of the SQLite URI syntax.
        conn = sqlite3.connect(f"{DB_PATH.as_uri()}?mode=ro", uri=True, timeout=10)
        row = conn.execute("PRAGMA integrity_check(1)").fetchone()
    except sqlite3.DatabaseError as exc:
        return _fail("db_integrity", f"integrity_check failed: {exc}")
    finally:
        # Close even when the PRAGMA raises, so the handle is never leaked.
        if conn is not None:
            conn.close()

    result = row[0] if row else "unknown"
    if result == "ok":
        return _ok("db_integrity", "PRAGMA integrity_check: ok")
    return _fail("db_integrity", f"PRAGMA integrity_check: {result}")
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def check_db_tables():
    """Verify that every name in ``REQUIRED_TABLES`` exists in the database.

    Returns:
        dict: An ``_ok`` result when no required table is missing; a
        ``_fail`` result listing the missing tables (in ``REQUIRED_TABLES``
        order) or describing the SQLite error.
    """
    if not DB_PATH.exists():
        return _fail("db_tables", "cda.db not found — skipping")

    conn = None
    try:
        # as_uri() percent-encodes characters such as '?', '#' and '%' that
        # would otherwise be parsed as part of the SQLite URI syntax.
        conn = sqlite3.connect(f"{DB_PATH.as_uri()}?mode=ro", uri=True, timeout=5)
        rows = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table'"
        ).fetchall()
    except sqlite3.DatabaseError as exc:
        return _fail("db_tables", f"Table check failed: {exc}")
    finally:
        # Close even when the query raises, so the handle is never leaked.
        if conn is not None:
            conn.close()

    present = {r[0] for r in rows}
    missing = [t for t in REQUIRED_TABLES if t not in present]
    if missing:
        return _fail("db_tables", f"Missing tables: {', '.join(missing)}")
    return _ok("db_tables", f"All {len(REQUIRED_TABLES)} expected tables present")
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def check_db_counts():
    """Check that every table in ``CORE_COUNT_TABLES`` has at least one row.

    Returns:
        dict: An ``_ok`` result with the per-table counts in the message; a
        ``_fail`` result naming the empty tables (counts in ``details``) or
        describing the SQLite error, e.g. when a table does not exist.
    """
    if not DB_PATH.exists():
        return _fail("db_counts", "cda.db not found — skipping")

    conn = None
    counts = {}
    try:
        # as_uri() percent-encodes characters such as '?', '#' and '%' that
        # would otherwise be parsed as part of the SQLite URI syntax.
        conn = sqlite3.connect(f"{DB_PATH.as_uri()}?mode=ro", uri=True, timeout=5)
        for t in CORE_COUNT_TABLES:
            # Table names come from the module constant above, not from
            # user input, so interpolating them into the SQL is safe here.
            row = conn.execute(f"SELECT COUNT(*) FROM {t}").fetchone()
            counts[t] = row[0] if row else 0
    except sqlite3.DatabaseError as exc:
        return _fail("db_counts", f"Count check failed: {exc}")
    finally:
        # Close even when a query raises, so the handle is never leaked.
        if conn is not None:
            conn.close()

    empty = [t for t, c in counts.items() if c == 0]
    summary = ", ".join(f"{t}={c:,}" for t, c in counts.items())
    if empty:
        return _fail("db_counts", f"Empty core tables: {', '.join(empty)}", summary)
    return _ok("db_counts", f"Core table counts: {summary}")
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def check_db_wal():
    """Inspect the ``-wal`` / ``-shm`` sidecar files next to the database.

    Two conditions are flagged:

    * the WAL file is larger than 100 MB while no watcher process is alive
      (a live watcher is treated as a legitimate active writer);
    * an SHM file exists without a WAL file.

    Returns:
        dict: An ``_ok`` result when neither condition holds, otherwise a
        ``_fail`` result whose message joins the issues with ``"; "``.
    """
    wal = DB_PATH.with_suffix(".db-wal")
    shm = DB_PATH.with_suffix(".db-shm")
    issues = []

    if wal.exists():
        size_kb = wal.stat().st_size // 1024
        if size_kb > 100 * 1024:  # more than 100 MB
            watcher_active = False
            if PID_FILE.exists():
                try:
                    pid = int(PID_FILE.read_text().strip())
                    # PIDs <= 0 address process groups, not one process.
                    if pid > 0:
                        os.kill(pid, 0)  # signal 0: existence probe only
                        watcher_active = True
                except PermissionError:
                    # The process exists but belongs to another user.
                    watcher_active = True
                except (ValueError, OSError):
                    # Unparseable PID, dead process or unreadable PID file.
                    pass
            if not watcher_active:
                issues.append(f"WAL file is large ({size_kb // 1024} MB) — may indicate abandoned writer")

    if shm.exists() and not wal.exists():
        issues.append("SHM file present without WAL — possible unclean shutdown")

    if issues:
        return _fail("db_wal", "; ".join(issues))
    return _ok("db_wal", "WAL/SHM state looks healthy")
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def check_watcher_state():
    """Report whether the watcher process named in the PID file is alive.

    A missing PID file counts as a clean "not running" state and passes.

    Returns:
        dict: An ``_ok`` result when there is no PID file or the recorded
        process exists; a ``_fail`` result when the PID is not a positive
        integer or the process is gone (stale PID file).
    """
    if not PID_FILE.exists():
        return _ok("watcher_state", "watcher not running (no PID file)")

    try:
        pid = int(PID_FILE.read_text().strip())
    except FileNotFoundError:
        # The file vanished between exists() and read_text().
        return _ok("watcher_state", "watcher not running (no PID file)")
    except ValueError:
        return _fail("watcher_state", "watcher.pid contains invalid PID")

    # PIDs <= 0 make os.kill address a process group instead of one process.
    if pid <= 0:
        return _fail("watcher_state", "watcher.pid contains invalid PID")

    try:
        os.kill(pid, 0)  # signal 0 = existence check, no actual signal
    except ProcessLookupError:
        return _fail("watcher_state",
                     f"watcher.pid exists (PID {pid}) but process is dead — stale PID file")
    except PermissionError:
        # The process exists but is owned by another user: still alive.
        pass
    return _ok("watcher_state", f"watcher running (PID {pid})")
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def check_queue_depth():
    """Count pending entries in the queue directory.

    Every directory entry whose name does not end in ``.completed`` counts
    as pending. The check fails when the directory is missing or when more
    than 500 entries are pending.
    """
    if not QUEUE_DIR.exists():
        return _fail("queue_depth", f"watcher-queue/ not found at {QUEUE_DIR}")

    count = sum(
        1 for entry in QUEUE_DIR.iterdir()
        if not entry.name.endswith(".completed")
    )
    if count <= 500:
        return _ok("queue_depth", f"watcher-queue/ exists, {count} files pending")
    return _fail("queue_depth", f"queue backlog is high: {count} files pending")
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def check_data_gitignored():
    """Check that ``local`` is ignored by git, as seen from ``PROJECT_DIR``.

    ``git check-ignore -q`` exits 0 when the path is ignored, 1 when it is
    not, and with another status (128) on a fatal error such as running
    outside a repository. Only exit status 1 is reported as "NOT
    gitignored"; other failures are reported as a failed git invocation.

    Returns:
        dict: An ``_ok`` result when the path is ignored, otherwise a
        ``_fail`` result describing which of the cases above occurred.
    """
    try:
        result = subprocess.run(
            ["git", "check-ignore", "-q", "local"],
            cwd=PROJECT_DIR,
            capture_output=True,
        )
    except FileNotFoundError:
        return _fail("data_gitignored", "git not available")

    if result.returncode == 0:
        return _ok("data_gitignored", "local/ is gitignored")
    if result.returncode == 1:
        return _fail("data_gitignored", "local/ is NOT gitignored — sensitive data at risk")  # noqa: E501

    # Any other status means git itself failed, so ignore state is unknown.
    stderr_text = result.stderr.decode(errors="replace").strip()
    return _fail("data_gitignored",
                 f"git check-ignore failed (exit {result.returncode})",
                 stderr_text or None)
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def check_cli_path():
    """Look up the ``cda`` executable on PATH and report its resolved path."""
    location = shutil.which("cda")
    if location:
        return _ok("cli_path", f"cda found at {Path(location).resolve()}")
    return _fail("cli_path", "cda not found on PATH")
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def check_python_runtime():
    """Classify the running interpreter by its version.

    Python 3.9 passes with a "correct" note, Python 3.14 or newer fails,
    and every other version passes without comment.
    """
    info = sys.version_info
    version_str = f"{info[0]}.{info[1]}.{info[2]}"

    if info[0] == 3:
        if info[1] == 9:
            return _ok("python_runtime", f"Python {version_str} (system 3.9 — correct)")
        if info[1] >= 14:
            return _fail(
                "python_runtime",
                f"Python {version_str} — running under Homebrew Python. Use system Python 3.9.",
            )
    return _ok("python_runtime", f"Python {version_str}")
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def check_dependencies():
    """Try to import every module listed in ``REQUIRED_IMPORTS``.

    Any exception raised while importing a module, not only ``ImportError``,
    marks that module as failed, so a dependency that breaks at import time
    is reported instead of crashing the self-check.

    Returns:
        dict: An ``_ok`` result when every module imports; otherwise a
        ``_fail`` result naming the failed modules, with the per-module
        error text in ``details``.
    """
    failed = []
    errors = []
    for mod in REQUIRED_IMPORTS:
        try:
            importlib.import_module(mod)
        except Exception as exc:  # import-time code may raise anything
            failed.append(mod)
            errors.append(f"{mod}: {type(exc).__name__}: {exc}")

    if failed:
        return _fail("dependencies", f"Missing imports: {', '.join(failed)}", "; ".join(errors))
    return _ok("dependencies", f"All {len(REQUIRED_IMPORTS)} required imports available")
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
# ── public interface ──────────────────────────────────────────────────────────
|
|
276
|
+
|
|
277
|
+
# Checks in the order run_all() executes them.
CHECKS = [
    # package and install
    check_version,
    check_install_path,
    # database
    check_db_present,
    check_db_accessible,
    check_db_integrity,
    check_db_tables,
    check_db_counts,
    check_db_wal,
    # watcher and queue
    check_watcher_state,
    check_queue_depth,
    # environment
    check_data_gitignored,
    check_cli_path,
    check_python_runtime,
    check_dependencies,
]
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
def run_all():
    """Run all self-checks. Returns (passed: bool, results: list[dict]).

    Each entry in ``CHECKS`` is called in order. A check that raises an
    unexpected exception is recorded as a failed result (named after the
    function, minus its ``check_`` prefix) so the remaining checks still
    run and their results are not lost.
    """
    results = []
    for check in CHECKS:
        try:
            results.append(check())
        except Exception as exc:  # boundary: one broken check must not abort the run
            label = getattr(check, "__name__", "unknown")
            if label.startswith("check_"):
                label = label[len("check_"):]
            results.append(_fail(label, f"check raised {type(exc).__name__}: {exc}"))
    passed = all(r["passed"] for r in results)
    return passed, results
|
cda/pipeline/__init__.py
ADDED
|
File without changes
|