code-data-ark 2.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,250 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ parse_edits.py — Edit session analysis.
4
+
5
+ Parses edit_state VFS blobs and populates:
6
+ - edit_sessions : per-session file edit summary
7
+ - edited_files : per-file-per-session record
8
+
9
+ edit_state schema (VSCode internal, version 2):
10
+ {
11
+ version: 2,
12
+ initialFileContents: [[fileUri, contentHash], ...],
13
+ timeline: {
14
+ checkpoints: [{checkpointId, requestId, epoch, label}, ...],
15
+ currentEpoch: N,
16
+ fileBaselines: ...
17
+ },
18
+ recentSnapshot: {
19
+ entries: [{resource, languageId, originalHash, currentHash, state}, ...]
20
+ }
21
+ }
22
+
23
+ State values (from VSCode source):
24
+ 0 = Unmodified
25
+ 1 = Modified (pending)
26
+ 2 = Accepted
27
+ 3 = Rejected
28
+
29
+ Modified files: originalHash != currentHash in snapshot entries
30
+ Edit rounds: len(checkpoints) - 1 (first is always "Initial State")
31
+ """
32
+
33
+ import sqlite3
34
+ import gzip
35
+ import json
36
+ from pathlib import Path
37
+
38
# Repository layout: this file lives four directories below the repo root
# (e.g. <root>/src/.../parsers/parse_edits.py) — TODO confirm against tree.
ROOT_DIR = Path(__file__).resolve().parent.parent.parent.parent
LOCAL_DIR = ROOT_DIR / "local"
# Shared SQLite database; other ingest scripts presumably write here too.
DB_PATH = LOCAL_DIR / "data" / "cda.db"

# DDL executed on every run; idempotent via IF NOT EXISTS so re-runs are safe.
SCHEMA = """
CREATE TABLE IF NOT EXISTS edit_sessions (
    session_id TEXT PRIMARY KEY,
    workspace_id TEXT,
    total_files INTEGER DEFAULT 0, -- files in snapshot
    modified_files INTEGER DEFAULT 0, -- files where hash changed
    edit_rounds INTEGER DEFAULT 0, -- checkpoints minus initial
    file_paths TEXT, -- JSON array of modified file paths
    all_file_paths TEXT, -- JSON array of all file paths in session
    ingested_at TEXT DEFAULT (datetime('now'))
);

CREATE TABLE IF NOT EXISTS edited_files (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    session_id TEXT NOT NULL,
    workspace_id TEXT,
    file_uri TEXT NOT NULL,
    file_path TEXT, -- path without file:// scheme
    language_id TEXT,
    original_hash TEXT,
    current_hash TEXT,
    was_modified INTEGER DEFAULT 0, -- 1 if original_hash != current_hash
    final_state INTEGER, -- 0=unmodified,1=modified,2=accepted,3=rejected
    ingested_at TEXT DEFAULT (datetime('now'))
);

CREATE INDEX IF NOT EXISTS idx_edit_sessions_workspace ON edit_sessions(workspace_id);
CREATE INDEX IF NOT EXISTS idx_edited_files_session ON edited_files(session_id);
CREATE INDEX IF NOT EXISTS idx_edited_files_path ON edited_files(file_path);
CREATE INDEX IF NOT EXISTS idx_edited_files_modified ON edited_files(was_modified);
"""
73
+
74
+
75
def strip_scheme(uri):
    """Drop a leading ``file://`` scheme, e.g. file:///a/b -> /a/b.

    URIs with any other (or no) scheme are returned unchanged.
    """
    prefix = 'file://'
    return uri[len(prefix):] if uri.startswith(prefix) else uri
80
+
81
+
82
def parse_edit_state(conn, session_id, workspace_id, content):
    """Parse one gzipped edit_state JSON blob and upsert its DB rows.

    Args:
        conn: open sqlite3 connection (the caller commits).
        session_id: session the blob belongs to.
        workspace_id: owning workspace id (may be None).
        content: raw gzip-compressed JSON bytes.

    Returns:
        Number of edited_files rows written; 0 for malformed blobs or
        unsupported schema versions.
    """
    # Malformed blobs (not gzip, truncated stream, invalid JSON) are
    # skipped rather than raised: a corrupt blob carries no usable data
    # and the caller counts hard errors separately.
    try:
        raw = gzip.decompress(content).decode('utf-8', errors='replace')
        obj = json.loads(raw)
    except (OSError, EOFError, ValueError):
        # gzip raises BadGzipFile (an OSError) or EOFError; json's
        # JSONDecodeError is a ValueError subclass.
        return 0

    # Only schema version 2 is understood (see module docstring); a
    # non-dict top level (e.g. a bare list) is equally unparseable.
    if not isinstance(obj, dict) or obj.get('version') != 2:
        return 0

    # `or {}` / `or []` also covers explicit JSON nulls, which the old
    # .get(key, default) form did not.
    snapshot = obj.get('recentSnapshot') or {}
    entries = snapshot.get('entries') or []

    timeline = obj.get('timeline') or {}
    checkpoints = timeline.get('checkpoints') or []
    # First checkpoint is always "Initial State", so rounds = count - 1.
    edit_rounds = max(0, len(checkpoints) - 1)

    total_files = len(entries)
    modified_files = 0
    file_paths_modified = []
    all_file_paths = []

    file_rows = []
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        uri = entry.get('resource', '')
        file_path = strip_scheme(uri)
        lang = entry.get('languageId', '')
        orig = entry.get('originalHash', '')
        curr = entry.get('currentHash', '')
        state = entry.get('state', 0)
        # A file counts as modified only when BOTH hashes are present and
        # differ; a missing hash means "unknown", not "modified".
        was_mod = 1 if (orig and curr and orig != curr) else 0

        if was_mod:
            modified_files += 1
            file_paths_modified.append(file_path)
        if file_path:
            all_file_paths.append(file_path)

        file_rows.append((
            session_id, workspace_id,
            uri, file_path, lang,
            orig, curr, was_mod, state,
        ))

    # Upsert the per-session summary so re-runs refresh the row in place.
    conn.execute("""
        INSERT INTO edit_sessions
            (session_id, workspace_id, total_files, modified_files,
             edit_rounds, file_paths, all_file_paths)
        VALUES (?,?,?,?,?,?,?)
        ON CONFLICT(session_id) DO UPDATE SET
            workspace_id=excluded.workspace_id,
            total_files=excluded.total_files,
            modified_files=excluded.modified_files,
            edit_rounds=excluded.edit_rounds,
            file_paths=excluded.file_paths,
            all_file_paths=excluded.all_file_paths,
            ingested_at=datetime('now')
    """, (
        session_id, workspace_id,
        total_files, modified_files,
        edit_rounds,
        json.dumps(file_paths_modified),
        json.dumps(all_file_paths),
    ))

    # Replace (not append) this session's file rows so re-runs are idempotent.
    conn.execute("DELETE FROM edited_files WHERE session_id=?", (session_id,))

    conn.executemany("""
        INSERT INTO edited_files
            (session_id, workspace_id, file_uri, file_path, language_id,
             original_hash, current_hash, was_modified, final_state)
        VALUES (?,?,?,?,?,?,?,?,?)
    """, file_rows)

    return len(file_rows)
163
+
164
+
165
def run():
    """Entry point: parse every edit_state VFS blob into the edit tables.

    Opens the shared SQLite database, ensures the schema exists, parses
    the largest blob per session, then prints summary statistics.
    """
    conn = sqlite3.connect(str(DB_PATH), timeout=30)
    # try/finally fixes a connection leak: previously any exception in the
    # ingest loop or the summary queries left the connection open.
    try:
        # Bulk-ingest friendly pragmas (WAL + relaxed sync + larger cache).
        conn.execute("PRAGMA journal_mode=WAL")
        conn.execute("PRAGMA synchronous=NORMAL")
        conn.execute("PRAGMA cache_size=-2000")
        conn.execute("PRAGMA mmap_size=268435456")
        conn.execute("PRAGMA temp_store=MEMORY")
        conn.executescript(SCHEMA)
        conn.commit()

        # Ordered by size so the first row seen per session (below) is the
        # largest — assumed to be the most complete state snapshot.
        blobs = conn.execute(
            """SELECT v.session_id, s.workspace_id, v.content
               FROM vfs v
               LEFT JOIN sessions s ON s.session_id = v.session_id
               WHERE v.source_type = 'edit_state'
               ORDER BY v.size_bytes DESC"""
        ).fetchall()

        print(f"Parsing {len(blobs)} edit_state blobs...")
        total_files = 0
        total_modified = 0
        errors = 0

        # Deduplicate by session_id — use largest blob per session
        seen = set()
        deduped = []
        for sid, wid, content in blobs:
            if sid not in seen:
                seen.add(sid)
                deduped.append((sid, wid, content))

        print(f" Unique sessions: {len(deduped)}")

        for sid, wid, content in deduped:
            try:
                parse_edit_state(conn, sid, wid, content)
                row = conn.execute(
                    "SELECT total_files, modified_files, edit_rounds FROM edit_sessions WHERE session_id=?",
                    (sid,)
                ).fetchone()
                if row:
                    total_files += row[0]
                    total_modified += row[1]
                    if row[1] > 0:
                        print(f" {sid[:16]} files={row[0]} modified={row[1]} rounds={row[2]}")
            except Exception as e:
                # One corrupt blob must not abort the whole ingest run;
                # only the first few failures are echoed to keep output sane.
                errors += 1
                if errors <= 5:
                    print(f" ERROR {sid[:16]}: {e}")

        conn.commit()

        # Summary stats
        n_sessions = conn.execute("SELECT COUNT(*) FROM edit_sessions").fetchone()[0]
        n_mod_sessions = conn.execute("SELECT COUNT(*) FROM edit_sessions WHERE modified_files > 0").fetchone()[0]
        n_file_rows = conn.execute("SELECT COUNT(*) FROM edited_files").fetchone()[0]
        n_mod_files = conn.execute("SELECT COUNT(*) FROM edited_files WHERE was_modified=1").fetchone()[0]

        print()
        print("=== EDIT PARSE COMPLETE ===")
        print(f" edit_sessions rows: {n_sessions}")
        print(f" sessions with changes: {n_mod_sessions}")
        print(f" edited_files rows: {n_file_rows}")
        print(f" modified files: {n_mod_files}")
        print(f" errors: {errors}")

        # Top modified files across all sessions
        print()
        print("=== TOP MODIFIED FILES ===")
        rows = conn.execute("""
            SELECT file_path, COUNT(DISTINCT session_id) sessions,
                   SUM(was_modified) times_modified
            FROM edited_files
            WHERE was_modified=1 AND file_path != ''
            GROUP BY file_path
            ORDER BY times_modified DESC
            LIMIT 15
        """).fetchall()
        for r in rows:
            print(f" {r[2]:>3}× {r[0][-70:]}")
    finally:
        conn.close()
247
+
248
+
249
# Script entry point: run the full edit_state ingest when invoked directly.
if __name__ == "__main__":
    run()