code-data-ark 2.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cda/__init__.py +3 -0
- cda/kernel/__init__.py +0 -0
- cda/kernel/control_db.py +151 -0
- cda/kernel/pmf_kernel.py +364 -0
- cda/kernel/selfcheck.py +299 -0
- cda/pipeline/__init__.py +0 -0
- cda/pipeline/embed.py +694 -0
- cda/pipeline/extract.py +1064 -0
- cda/pipeline/ingest.py +673 -0
- cda/pipeline/parse_edits.py +250 -0
- cda/pipeline/reconstruct.py +536 -0
- cda/pipeline/watcher.py +783 -0
- cda/ui/__init__.py +0 -0
- cda/ui/cli.py +2587 -0
- cda/ui/web.py +2848 -0
- code_data_ark-2.0.2.dist-info/METADATA +495 -0
- code_data_ark-2.0.2.dist-info/RECORD +20 -0
- code_data_ark-2.0.2.dist-info/WHEEL +4 -0
- code_data_ark-2.0.2.dist-info/entry_points.txt +2 -0
- code_data_ark-2.0.2.dist-info/licenses/license +21 -0
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
parse_edits.py — Edit session analysis.
|
|
4
|
+
|
|
5
|
+
Parses edit_state VFS blobs and populates:
|
|
6
|
+
- edit_sessions : per-session file edit summary
|
|
7
|
+
- edited_files : per-file-per-session record
|
|
8
|
+
|
|
9
|
+
edit_state schema (VSCode internal, version 2):
|
|
10
|
+
{
|
|
11
|
+
version: 2,
|
|
12
|
+
initialFileContents: [[fileUri, contentHash], ...],
|
|
13
|
+
timeline: {
|
|
14
|
+
checkpoints: [{checkpointId, requestId, epoch, label}, ...],
|
|
15
|
+
currentEpoch: N,
|
|
16
|
+
fileBaselines: ...
|
|
17
|
+
},
|
|
18
|
+
recentSnapshot: {
|
|
19
|
+
entries: [{resource, languageId, originalHash, currentHash, state}, ...]
|
|
20
|
+
}
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
State values (from VSCode source):
|
|
24
|
+
0 = Unmodified
|
|
25
|
+
1 = Modified (pending)
|
|
26
|
+
2 = Accepted
|
|
27
|
+
3 = Rejected
|
|
28
|
+
|
|
29
|
+
Modified files: originalHash != currentHash in snapshot entries
|
|
30
|
+
Edit rounds: len(checkpoints) - 1 (first is always "Initial State")
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
import gzip
import json
import sqlite3
from pathlib import Path
from urllib.parse import unquote
|
|
37
|
+
|
|
38
|
+
ROOT_DIR = Path(__file__).resolve().parent.parent.parent.parent
|
|
39
|
+
LOCAL_DIR = ROOT_DIR / "local"
|
|
40
|
+
DB_PATH = LOCAL_DIR / "data" / "cda.db"
|
|
41
|
+
|
|
42
|
+
SCHEMA = """
|
|
43
|
+
CREATE TABLE IF NOT EXISTS edit_sessions (
|
|
44
|
+
session_id TEXT PRIMARY KEY,
|
|
45
|
+
workspace_id TEXT,
|
|
46
|
+
total_files INTEGER DEFAULT 0, -- files in snapshot
|
|
47
|
+
modified_files INTEGER DEFAULT 0, -- files where hash changed
|
|
48
|
+
edit_rounds INTEGER DEFAULT 0, -- checkpoints minus initial
|
|
49
|
+
file_paths TEXT, -- JSON array of modified file paths
|
|
50
|
+
all_file_paths TEXT, -- JSON array of all file paths in session
|
|
51
|
+
ingested_at TEXT DEFAULT (datetime('now'))
|
|
52
|
+
);
|
|
53
|
+
|
|
54
|
+
CREATE TABLE IF NOT EXISTS edited_files (
|
|
55
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
56
|
+
session_id TEXT NOT NULL,
|
|
57
|
+
workspace_id TEXT,
|
|
58
|
+
file_uri TEXT NOT NULL,
|
|
59
|
+
file_path TEXT, -- path without file:// scheme
|
|
60
|
+
language_id TEXT,
|
|
61
|
+
original_hash TEXT,
|
|
62
|
+
current_hash TEXT,
|
|
63
|
+
was_modified INTEGER DEFAULT 0, -- 1 if original_hash != current_hash
|
|
64
|
+
final_state INTEGER, -- 0=unmodified,1=modified,2=accepted,3=rejected
|
|
65
|
+
ingested_at TEXT DEFAULT (datetime('now'))
|
|
66
|
+
);
|
|
67
|
+
|
|
68
|
+
CREATE INDEX IF NOT EXISTS idx_edit_sessions_workspace ON edit_sessions(workspace_id);
|
|
69
|
+
CREATE INDEX IF NOT EXISTS idx_edited_files_session ON edited_files(session_id);
|
|
70
|
+
CREATE INDEX IF NOT EXISTS idx_edited_files_path ON edited_files(file_path);
|
|
71
|
+
CREATE INDEX IF NOT EXISTS idx_edited_files_modified ON edited_files(was_modified);
|
|
72
|
+
"""
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def strip_scheme(uri):
|
|
76
|
+
"""Convert file:///path/to/file → /path/to/file"""
|
|
77
|
+
if uri.startswith('file://'):
|
|
78
|
+
return uri[7:]
|
|
79
|
+
return uri
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def parse_edit_state(conn, session_id, workspace_id, content):
|
|
83
|
+
"""Parse one edit_state blob and upsert rows."""
|
|
84
|
+
raw = gzip.decompress(content).decode('utf-8', errors='replace')
|
|
85
|
+
try:
|
|
86
|
+
obj = json.loads(raw)
|
|
87
|
+
except Exception:
|
|
88
|
+
return 0
|
|
89
|
+
|
|
90
|
+
if obj.get('version') != 2:
|
|
91
|
+
return 0
|
|
92
|
+
|
|
93
|
+
snapshot = obj.get('recentSnapshot', {})
|
|
94
|
+
entries = snapshot.get('entries', [])
|
|
95
|
+
|
|
96
|
+
timeline = obj.get('timeline', {})
|
|
97
|
+
checkpoints = timeline.get('checkpoints', [])
|
|
98
|
+
edit_rounds = max(0, len(checkpoints) - 1) # subtract initial state
|
|
99
|
+
|
|
100
|
+
total_files = len(entries)
|
|
101
|
+
modified_files = 0
|
|
102
|
+
file_paths_modified = []
|
|
103
|
+
all_file_paths = []
|
|
104
|
+
|
|
105
|
+
file_rows = []
|
|
106
|
+
for entry in entries:
|
|
107
|
+
if not isinstance(entry, dict):
|
|
108
|
+
continue
|
|
109
|
+
uri = entry.get('resource', '')
|
|
110
|
+
file_path = strip_scheme(uri)
|
|
111
|
+
lang = entry.get('languageId', '')
|
|
112
|
+
orig = entry.get('originalHash', '')
|
|
113
|
+
curr = entry.get('currentHash', '')
|
|
114
|
+
state = entry.get('state', 0)
|
|
115
|
+
was_mod = 1 if (orig and curr and orig != curr) else 0
|
|
116
|
+
|
|
117
|
+
if was_mod:
|
|
118
|
+
modified_files += 1
|
|
119
|
+
file_paths_modified.append(file_path)
|
|
120
|
+
if file_path:
|
|
121
|
+
all_file_paths.append(file_path)
|
|
122
|
+
|
|
123
|
+
file_rows.append((
|
|
124
|
+
session_id, workspace_id,
|
|
125
|
+
uri, file_path, lang,
|
|
126
|
+
orig, curr, was_mod, state,
|
|
127
|
+
))
|
|
128
|
+
|
|
129
|
+
# Upsert edit_sessions row
|
|
130
|
+
conn.execute("""
|
|
131
|
+
INSERT INTO edit_sessions
|
|
132
|
+
(session_id, workspace_id, total_files, modified_files,
|
|
133
|
+
edit_rounds, file_paths, all_file_paths)
|
|
134
|
+
VALUES (?,?,?,?,?,?,?)
|
|
135
|
+
ON CONFLICT(session_id) DO UPDATE SET
|
|
136
|
+
workspace_id=excluded.workspace_id,
|
|
137
|
+
total_files=excluded.total_files,
|
|
138
|
+
modified_files=excluded.modified_files,
|
|
139
|
+
edit_rounds=excluded.edit_rounds,
|
|
140
|
+
file_paths=excluded.file_paths,
|
|
141
|
+
all_file_paths=excluded.all_file_paths,
|
|
142
|
+
ingested_at=datetime('now')
|
|
143
|
+
""", (
|
|
144
|
+
session_id, workspace_id,
|
|
145
|
+
total_files, modified_files,
|
|
146
|
+
edit_rounds,
|
|
147
|
+
json.dumps(file_paths_modified),
|
|
148
|
+
json.dumps(all_file_paths),
|
|
149
|
+
))
|
|
150
|
+
|
|
151
|
+
# Delete existing file rows for this session (re-run safe)
|
|
152
|
+
conn.execute("DELETE FROM edited_files WHERE session_id=?", (session_id,))
|
|
153
|
+
|
|
154
|
+
# Insert file rows
|
|
155
|
+
conn.executemany("""
|
|
156
|
+
INSERT INTO edited_files
|
|
157
|
+
(session_id, workspace_id, file_uri, file_path, language_id,
|
|
158
|
+
original_hash, current_hash, was_modified, final_state)
|
|
159
|
+
VALUES (?,?,?,?,?,?,?,?,?)
|
|
160
|
+
""", file_rows)
|
|
161
|
+
|
|
162
|
+
return len(file_rows)
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def run():
|
|
166
|
+
conn = sqlite3.connect(str(DB_PATH), timeout=30)
|
|
167
|
+
conn.execute("PRAGMA journal_mode=WAL")
|
|
168
|
+
conn.execute("PRAGMA synchronous=NORMAL")
|
|
169
|
+
conn.execute("PRAGMA cache_size=-2000")
|
|
170
|
+
conn.execute("PRAGMA mmap_size=268435456")
|
|
171
|
+
conn.execute("PRAGMA temp_store=MEMORY")
|
|
172
|
+
conn.executescript(SCHEMA)
|
|
173
|
+
conn.commit()
|
|
174
|
+
|
|
175
|
+
blobs = conn.execute(
|
|
176
|
+
"""SELECT v.session_id, s.workspace_id, v.content
|
|
177
|
+
FROM vfs v
|
|
178
|
+
LEFT JOIN sessions s ON s.session_id = v.session_id
|
|
179
|
+
WHERE v.source_type = 'edit_state'
|
|
180
|
+
ORDER BY v.size_bytes DESC"""
|
|
181
|
+
).fetchall()
|
|
182
|
+
|
|
183
|
+
print(f"Parsing {len(blobs)} edit_state blobs...")
|
|
184
|
+
total_files = 0
|
|
185
|
+
total_modified = 0
|
|
186
|
+
errors = 0
|
|
187
|
+
|
|
188
|
+
# Deduplicate by session_id — use largest blob per session
|
|
189
|
+
seen = set()
|
|
190
|
+
deduped = []
|
|
191
|
+
for sid, wid, content in blobs:
|
|
192
|
+
if sid not in seen:
|
|
193
|
+
seen.add(sid)
|
|
194
|
+
deduped.append((sid, wid, content))
|
|
195
|
+
|
|
196
|
+
print(f" Unique sessions: {len(deduped)}")
|
|
197
|
+
|
|
198
|
+
for sid, wid, content in deduped:
|
|
199
|
+
try:
|
|
200
|
+
parse_edit_state(conn, sid, wid, content)
|
|
201
|
+
row = conn.execute(
|
|
202
|
+
"SELECT total_files, modified_files, edit_rounds FROM edit_sessions WHERE session_id=?",
|
|
203
|
+
(sid,)
|
|
204
|
+
).fetchone()
|
|
205
|
+
if row:
|
|
206
|
+
total_files += row[0]
|
|
207
|
+
total_modified += row[1]
|
|
208
|
+
if row[1] > 0:
|
|
209
|
+
print(f" {sid[:16]} files={row[0]} modified={row[1]} rounds={row[2]}")
|
|
210
|
+
except Exception as e:
|
|
211
|
+
errors += 1
|
|
212
|
+
if errors <= 5:
|
|
213
|
+
print(f" ERROR {sid[:16]}: {e}")
|
|
214
|
+
|
|
215
|
+
conn.commit()
|
|
216
|
+
|
|
217
|
+
# Summary stats
|
|
218
|
+
n_sessions = conn.execute("SELECT COUNT(*) FROM edit_sessions").fetchone()[0]
|
|
219
|
+
n_mod_sessions = conn.execute("SELECT COUNT(*) FROM edit_sessions WHERE modified_files > 0").fetchone()[0]
|
|
220
|
+
n_file_rows = conn.execute("SELECT COUNT(*) FROM edited_files").fetchone()[0]
|
|
221
|
+
n_mod_files = conn.execute("SELECT COUNT(*) FROM edited_files WHERE was_modified=1").fetchone()[0]
|
|
222
|
+
|
|
223
|
+
print()
|
|
224
|
+
print("=== EDIT PARSE COMPLETE ===")
|
|
225
|
+
print(f" edit_sessions rows: {n_sessions}")
|
|
226
|
+
print(f" sessions with changes: {n_mod_sessions}")
|
|
227
|
+
print(f" edited_files rows: {n_file_rows}")
|
|
228
|
+
print(f" modified files: {n_mod_files}")
|
|
229
|
+
print(f" errors: {errors}")
|
|
230
|
+
|
|
231
|
+
# Top modified files across all sessions
|
|
232
|
+
print()
|
|
233
|
+
print("=== TOP MODIFIED FILES ===")
|
|
234
|
+
rows = conn.execute("""
|
|
235
|
+
SELECT file_path, COUNT(DISTINCT session_id) sessions,
|
|
236
|
+
SUM(was_modified) times_modified
|
|
237
|
+
FROM edited_files
|
|
238
|
+
WHERE was_modified=1 AND file_path != ''
|
|
239
|
+
GROUP BY file_path
|
|
240
|
+
ORDER BY times_modified DESC
|
|
241
|
+
LIMIT 15
|
|
242
|
+
""").fetchall()
|
|
243
|
+
for r in rows:
|
|
244
|
+
print(f" {r[2]:>3}× {r[0][-70:]}")
|
|
245
|
+
|
|
246
|
+
conn.close()
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
if __name__ == "__main__":
|
|
250
|
+
run()
|