codealmanac 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-2JJTTN7P.js +539 -0
- package/dist/chunk-2JJTTN7P.js.map +1 -0
- package/dist/chunk-3C5SY5SE.js +1239 -0
- package/dist/chunk-3C5SY5SE.js.map +1 -0
- package/dist/chunk-4CODZRHH.js +19 -0
- package/dist/chunk-4CODZRHH.js.map +1 -0
- package/dist/chunk-7JUX4ADQ.js +38 -0
- package/dist/chunk-7JUX4ADQ.js.map +1 -0
- package/dist/chunk-A6PUCAVJ.js +145 -0
- package/dist/chunk-A6PUCAVJ.js.map +1 -0
- package/dist/chunk-AXFPUHBN.js +227 -0
- package/dist/chunk-AXFPUHBN.js.map +1 -0
- package/dist/chunk-FM3VRDK7.js +20 -0
- package/dist/chunk-FM3VRDK7.js.map +1 -0
- package/dist/chunk-H6WU6PYH.js +441 -0
- package/dist/chunk-H6WU6PYH.js.map +1 -0
- package/dist/chunk-P3LDTCLB.js +34 -0
- package/dist/chunk-P3LDTCLB.js.map +1 -0
- package/dist/chunk-QHQ6YH7U.js +81 -0
- package/dist/chunk-QHQ6YH7U.js.map +1 -0
- package/dist/chunk-Z4MWLVS2.js +355 -0
- package/dist/chunk-Z4MWLVS2.js.map +1 -0
- package/dist/chunk-Z6MBJ3D2.js +203 -0
- package/dist/chunk-Z6MBJ3D2.js.map +1 -0
- package/dist/cli-AIH5QQ5H.js +393 -0
- package/dist/cli-AIH5QQ5H.js.map +1 -0
- package/dist/codealmanac.js +68 -5954
- package/dist/codealmanac.js.map +1 -1
- package/dist/doctor-6FN5JO5F.js +15 -0
- package/dist/doctor-6FN5JO5F.js.map +1 -0
- package/dist/hook-CRJMWSSO.js +12 -0
- package/dist/hook-CRJMWSSO.js.map +1 -0
- package/dist/register-commands-PZMQNGCH.js +2644 -0
- package/dist/register-commands-PZMQNGCH.js.map +1 -0
- package/dist/uninstall-NBEZNNKM.js +12 -0
- package/dist/uninstall-NBEZNNKM.js.map +1 -0
- package/dist/update-IL243I4E.js +10 -0
- package/dist/update-IL243I4E.js.map +1 -0
- package/dist/wiki-EHZ7LG7R.js +238 -0
- package/dist/wiki-EHZ7LG7R.js.map +1 -0
- package/guides/processing/claude-code.md +152 -0
- package/guides/processing/codex.md +214 -0
- package/guides/processing/generic.md +128 -0
- package/package.json +2 -2
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
|
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import {
|
|
3
|
+
ensureFreshIndex,
|
|
4
|
+
findEntry,
|
|
5
|
+
openIndex,
|
|
6
|
+
runHealth
|
|
7
|
+
} from "./chunk-3C5SY5SE.js";
|
|
8
|
+
import {
|
|
9
|
+
formatDuration
|
|
10
|
+
} from "./chunk-4CODZRHH.js";
|
|
11
|
+
import "./chunk-FM3VRDK7.js";
|
|
12
|
+
import {
|
|
13
|
+
findNearestAlmanacDir
|
|
14
|
+
} from "./chunk-7JUX4ADQ.js";
|
|
15
|
+
|
|
16
|
+
// src/commands/doctor-checks/wiki.ts
|
|
17
|
+
import { existsSync, readdirSync, statSync } from "fs";
|
|
18
|
+
import path from "path";
|
|
19
|
+
// Runs every wiki-level doctor probe for the repo owning `options.cwd` and
// returns the resulting check records ({ status, key, message, fix? }).
// When no .almanac directory is found, returns a single "info" check
// suggesting bootstrap.
async function gatherWikiChecks(options) {
  const repoRoot = findNearestAlmanacDir(options.cwd);
  if (repoRoot === null) {
    return [
      {
        status: "info",
        key: "wiki.none",
        message: "No wiki in current directory",
        fix: "run: almanac bootstrap (to create one in this repo)"
      }
    ];
  }
  const report = [
    {
      status: "info",
      key: "wiki.repo",
      message: `repo: ${repoRoot}`
    }
  ];
  try {
    await ensureFreshIndex({ repoRoot });
  } catch {
    // Best-effort refresh: a failure here surfaces via the count and
    // health checks below, so it is deliberately not fatal.
  }
  report.push(await describeRegistry(repoRoot));
  const almanacDir = path.join(repoRoot, ".almanac");
  const dbPath = path.join(almanacDir, "index.db");
  report.push(...describeCounts(dbPath));
  report.push(describeIndexFreshness(dbPath));
  report.push(describeLastCapture(almanacDir, options.now));
  report.push(await describeHealth(repoRoot, options));
  return report;
}
|
|
49
|
+
// Reports whether this repo appears in the global almanac registry.
// Always resolves to a single check record: a registry read failure is
// converted into a "problem" check instead of propagating.
async function describeRegistry(repoRoot) {
  try {
    const entry = await findEntry({ path: repoRoot });
    if (entry === null) {
      return {
        status: "info",
        key: "wiki.registered",
        message: "not yet registered (will register on first command)"
      };
    }
    return {
      status: "ok",
      key: "wiki.registered",
      message: `registered as '${entry.name}'`
    };
  } catch (err) {
    // Normalize non-Error throwables before embedding them in the message.
    const detail = err instanceof Error ? err.message : String(err);
    return {
      status: "problem",
      key: "wiki.registered",
      message: `could not read registry: ${detail}`,
      fix: "inspect ~/.almanac/registry.json; remove or fix the malformed entry"
    };
  }
}
|
|
74
|
+
// Counts pages and topics in the SQLite index at `dbPath` and returns zero,
// one, or two "info" checks (one per count that could be read). Missing
// database or any read failure yields no count checks at all.
function describeCounts(dbPath) {
  const checks = [];
  let pageCount = null;
  let topicCount = null;
  if (existsSync(dbPath)) {
    try {
      const db = openIndex(dbPath);
      try {
        pageCount = countRows(db, "pages");
        topicCount = countRows(db, "topics");
      } finally {
        db.close();
      }
    } catch {
      // Reset BOTH counts on failure. Previously only pageCount was
      // cleared, so an error raised after the topic count was read
      // (e.g. db.close() throwing in the finally block) reported a
      // topics check without a matching pages check.
      pageCount = null;
      topicCount = null;
    }
  }
  if (pageCount !== null) {
    checks.push({
      status: "info",
      key: "wiki.pages",
      message: `pages: ${pageCount}`
    });
  }
  if (topicCount !== null) {
    checks.push({
      status: "info",
      key: "wiki.topics",
      message: `topics: ${topicCount}`
    });
  }
  return checks;
}
|
|
107
|
+
// Returns the number of rows in `table`, or 0 when the driver yields no row
// (or a row without `n`). NOTE: `table` is interpolated into the SQL text;
// callers only ever pass the fixed names "pages" / "topics".
function countRows(db, table) {
  const statement = db.prepare(`SELECT COUNT(*) AS n FROM ${table}`);
  const row = statement.get();
  if (row === undefined || row === null) {
    return 0;
  }
  return row.n ?? 0;
}
|
|
111
|
+
// Describes how fresh the SQLite index at `dbPath` is, as a single "info"
// check. `nowFn` is an optional clock override (mirroring the one
// describeLastCapture already accepts) so callers and tests can inject a
// fixed time; when omitted the real clock is used, preserving the original
// Date.now()-based behavior.
function describeIndexFreshness(dbPath, nowFn) {
  if (!existsSync(dbPath)) {
    return {
      status: "info",
      key: "wiki.index",
      message: "index: not built yet (run any query command)"
    };
  }
  try {
    const dbMtime = statSync(dbPath).mtimeMs;
    const now = (nowFn?.() ?? new Date()).getTime();
    const age = now - dbMtime;
    return {
      status: "info",
      key: "wiki.index",
      message: `index: rebuilt ${formatDuration(age)} ago`
    };
  } catch {
    // stat (or duration formatting) failed; fall back to presence only.
    return {
      status: "info",
      key: "wiki.index",
      message: "index: present"
    };
  }
}
|
|
135
|
+
// Finds the newest `.capture-*.log` / `.capture-*.jsonl` file under
// `almanacDir` and reports how long ago it was written. `nowFn` optionally
// overrides the clock. Unreadable directories report "unknown"; a missing
// directory or no capture files report "never".
function describeLastCapture(almanacDir, nowFn) {
  const never = {
    status: "info",
    key: "wiki.capture",
    message: "last capture: never"
  };
  if (!existsSync(almanacDir)) {
    return never;
  }
  let names;
  try {
    names = readdirSync(almanacDir);
  } catch {
    return {
      status: "info",
      key: "wiki.capture",
      message: "last capture: unknown"
    };
  }
  const isCaptureFile = (name) =>
    name.startsWith(".capture-") && (name.endsWith(".log") || name.endsWith(".jsonl"));
  const captures = [];
  for (const name of names) {
    if (!isCaptureFile(name)) {
      continue;
    }
    try {
      captures.push({
        name,
        mtime: statSync(path.join(almanacDir, name)).mtimeMs
      });
    } catch {
      // File vanished between readdir and stat; skip it.
    }
  }
  if (captures.length === 0) {
    return never;
  }
  captures.sort((a, b) => b.mtime - a.mtime);
  const latest = captures[0];
  const now = (nowFn?.() ?? new Date()).getTime();
  const age = now - latest.mtime;
  return {
    status: "info",
    key: "wiki.capture",
    message: `last capture: ${formatDuration(age)} ago (${latest.name})`
  };
}
|
|
182
|
+
// Runs `almanac health --json` (or the `options.runHealthFn` override) and
// condenses its report into one check: "ok" for zero problems, "problem"
// otherwise. Failure to run health at all is reported as "info", not as an
// error.
async function describeHealth(repoRoot, options) {
  const healthFn = options.runHealthFn ?? runHealth;
  try {
    const result = await healthFn({
      cwd: repoRoot,
      json: true
    });
    const problemCount = countHealthProblems(result.stdout);
    if (problemCount > 0) {
      const plural = problemCount === 1 ? "" : "s";
      return {
        status: "problem",
        key: "wiki.health",
        message: `almanac health reports ${problemCount} problem${plural}`,
        fix: "run: almanac health"
      };
    }
    return {
      status: "ok",
      key: "wiki.health",
      message: "almanac health reports 0 problems"
    };
  } catch (err) {
    const detail = err instanceof Error ? err.message : String(err);
    return {
      status: "info",
      key: "wiki.health",
      message: `could not run almanac health: ${detail}`
    };
  }
}
|
|
212
|
+
// Health-report sections whose entries each count as one problem.
const HEALTH_PROBLEM_KEYS = [
  "orphans",
  "stale",
  "dead_refs",
  "broken_links",
  "broken_xwiki",
  "empty_topics",
  "empty_pages",
  "slug_collisions"
];
// Totals the problem entries across all known sections of the JSON emitted
// by `almanac health --json`. Unparseable or unexpectedly-shaped output
// counts as zero problems.
function countHealthProblems(jsonStdout) {
  try {
    const report = JSON.parse(jsonStdout);
    return HEALTH_PROBLEM_KEYS.reduce((total, key) => {
      const entries = report[key];
      return Array.isArray(entries) ? total + entries.length : total;
    }, 0);
  } catch {
    return 0;
  }
}
|
|
235
|
+
export {
|
|
236
|
+
gatherWikiChecks
|
|
237
|
+
};
|
|
238
|
+
//# sourceMappingURL=wiki-EHZ7LG7R.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/commands/doctor-checks/wiki.ts"],"sourcesContent":["import { existsSync, readdirSync, statSync } from \"node:fs\";\nimport path from \"node:path\";\n\nimport type Database from \"better-sqlite3\";\n\nimport { ensureFreshIndex } from \"../../indexer/index.js\";\nimport { openIndex } from \"../../indexer/schema.js\";\nimport { findNearestAlmanacDir } from \"../../paths.js\";\nimport { findEntry } from \"../../registry/index.js\";\nimport { runHealth, type HealthReport } from \"../health.js\";\nimport { formatDuration } from \"./duration.js\";\nimport type { Check, DoctorOptions } from \"./types.js\";\n\nexport async function gatherWikiChecks(options: DoctorOptions): Promise<Check[]> {\n const checks: Check[] = [];\n const repoRoot = findNearestAlmanacDir(options.cwd);\n\n if (repoRoot === null) {\n checks.push({\n status: \"info\",\n key: \"wiki.none\",\n message: \"No wiki in current directory\",\n fix: \"run: almanac bootstrap (to create one in this repo)\",\n });\n return checks;\n }\n\n checks.push({\n status: \"info\",\n key: \"wiki.repo\",\n message: `repo: ${repoRoot}`,\n });\n\n try {\n await ensureFreshIndex({ repoRoot });\n } catch {\n // non-fatal: counts below and the health probe report any real issue.\n }\n\n checks.push(await describeRegistry(repoRoot));\n\n const almanacDir = path.join(repoRoot, \".almanac\");\n const dbPath = path.join(almanacDir, \"index.db\");\n checks.push(...describeCounts(dbPath));\n checks.push(describeIndexFreshness(dbPath));\n checks.push(describeLastCapture(almanacDir, options.now));\n checks.push(await describeHealth(repoRoot, options));\n\n return checks;\n}\n\nasync function describeRegistry(repoRoot: string): Promise<Check> {\n try {\n const entry = await findEntry({ path: repoRoot });\n if (entry !== null) {\n return {\n status: \"ok\",\n key: \"wiki.registered\",\n message: `registered as '${entry.name}'`,\n };\n }\n return {\n status: \"info\",\n key: \"wiki.registered\",\n message: 
\"not yet registered (will register on first command)\",\n };\n } catch (err: unknown) {\n const msg = err instanceof Error ? err.message : String(err);\n return {\n status: \"problem\",\n key: \"wiki.registered\",\n message: `could not read registry: ${msg}`,\n fix: \"inspect ~/.almanac/registry.json; remove or fix the malformed entry\",\n };\n }\n}\n\nfunction describeCounts(dbPath: string): Check[] {\n const checks: Check[] = [];\n let pageCount: number | null = null;\n let topicCount: number | null = null;\n\n if (existsSync(dbPath)) {\n try {\n const db = openIndex(dbPath);\n try {\n pageCount = countRows(db, \"pages\");\n topicCount = countRows(db, \"topics\");\n } finally {\n db.close();\n }\n } catch {\n pageCount = null;\n }\n }\n\n if (pageCount !== null) {\n checks.push({\n status: \"info\",\n key: \"wiki.pages\",\n message: `pages: ${pageCount}`,\n });\n }\n if (topicCount !== null) {\n checks.push({\n status: \"info\",\n key: \"wiki.topics\",\n message: `topics: ${topicCount}`,\n });\n }\n\n return checks;\n}\n\nfunction countRows(db: Database.Database, table: string): number {\n const row = db\n .prepare<[], { n: number }>(`SELECT COUNT(*) AS n FROM ${table}`)\n .get();\n return row?.n ?? 
0;\n}\n\nfunction describeIndexFreshness(dbPath: string): Check {\n if (!existsSync(dbPath)) {\n return {\n status: \"info\",\n key: \"wiki.index\",\n message: \"index: not built yet (run any query command)\",\n };\n }\n try {\n const dbMtime = statSync(dbPath).mtimeMs;\n const age = Date.now() - dbMtime;\n return {\n status: \"info\",\n key: \"wiki.index\",\n message: `index: rebuilt ${formatDuration(age)} ago`,\n };\n } catch {\n return {\n status: \"info\",\n key: \"wiki.index\",\n message: \"index: present\",\n };\n }\n}\n\nfunction describeLastCapture(\n almanacDir: string,\n nowFn?: () => Date,\n): Check {\n if (!existsSync(almanacDir)) {\n return {\n status: \"info\",\n key: \"wiki.capture\",\n message: \"last capture: never\",\n };\n }\n let entries: string[];\n try {\n entries = readdirSync(almanacDir);\n } catch {\n return {\n status: \"info\",\n key: \"wiki.capture\",\n message: \"last capture: unknown\",\n };\n }\n const captures = entries\n .filter(\n (e) =>\n e.startsWith(\".capture-\") &&\n (e.endsWith(\".log\") || e.endsWith(\".jsonl\")),\n )\n .map((e) => {\n try {\n return {\n name: e,\n mtime: statSync(path.join(almanacDir, e)).mtimeMs,\n };\n } catch {\n return null;\n }\n })\n .filter((e): e is { name: string; mtime: number } => e !== null);\n if (captures.length === 0) {\n return {\n status: \"info\",\n key: \"wiki.capture\",\n message: \"last capture: never\",\n };\n }\n captures.sort((a, b) => b.mtime - a.mtime);\n const latest = captures[0]!;\n const now = (nowFn?.() ?? new Date()).getTime();\n const age = now - latest.mtime;\n return {\n status: \"info\",\n key: \"wiki.capture\",\n message: `last capture: ${formatDuration(age)} ago (${latest.name})`,\n };\n}\n\nasync function describeHealth(\n repoRoot: string,\n options: DoctorOptions,\n): Promise<Check> {\n const healthFn = options.runHealthFn ?? 
runHealth;\n try {\n const healthRes = await healthFn({\n cwd: repoRoot,\n json: true,\n });\n const problems = countHealthProblems(healthRes.stdout);\n if (problems === 0) {\n return {\n status: \"ok\",\n key: \"wiki.health\",\n message: \"almanac health reports 0 problems\",\n };\n }\n return {\n status: \"problem\",\n key: \"wiki.health\",\n message: `almanac health reports ${problems} problem${problems === 1 ? \"\" : \"s\"}`,\n fix: \"run: almanac health\",\n };\n } catch (err: unknown) {\n const msg = err instanceof Error ? err.message : String(err);\n return {\n status: \"info\",\n key: \"wiki.health\",\n message: `could not run almanac health: ${msg}`,\n };\n }\n}\n\nconst HEALTH_PROBLEM_KEYS: (keyof HealthReport)[] = [\n \"orphans\",\n \"stale\",\n \"dead_refs\",\n \"broken_links\",\n \"broken_xwiki\",\n \"empty_topics\",\n \"empty_pages\",\n \"slug_collisions\",\n];\n\nfunction countHealthProblems(jsonStdout: string): number {\n try {\n const report = JSON.parse(jsonStdout) as Partial<HealthReport>;\n let total = 0;\n for (const key of HEALTH_PROBLEM_KEYS) {\n const arr = report[key];\n if (Array.isArray(arr)) total += arr.length;\n }\n return total;\n } catch {\n return 0;\n 
}\n}\n"],"mappings":";;;;;;;;;;;;;;;;AAAA,SAAS,YAAY,aAAa,gBAAgB;AAClD,OAAO,UAAU;AAYjB,eAAsB,iBAAiB,SAA0C;AAC/E,QAAM,SAAkB,CAAC;AACzB,QAAM,WAAW,sBAAsB,QAAQ,GAAG;AAElD,MAAI,aAAa,MAAM;AACrB,WAAO,KAAK;AAAA,MACV,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,SAAS;AAAA,MACT,KAAK;AAAA,IACP,CAAC;AACD,WAAO;AAAA,EACT;AAEA,SAAO,KAAK;AAAA,IACV,QAAQ;AAAA,IACR,KAAK;AAAA,IACL,SAAS,SAAS,QAAQ;AAAA,EAC5B,CAAC;AAED,MAAI;AACF,UAAM,iBAAiB,EAAE,SAAS,CAAC;AAAA,EACrC,QAAQ;AAAA,EAER;AAEA,SAAO,KAAK,MAAM,iBAAiB,QAAQ,CAAC;AAE5C,QAAM,aAAa,KAAK,KAAK,UAAU,UAAU;AACjD,QAAM,SAAS,KAAK,KAAK,YAAY,UAAU;AAC/C,SAAO,KAAK,GAAG,eAAe,MAAM,CAAC;AACrC,SAAO,KAAK,uBAAuB,MAAM,CAAC;AAC1C,SAAO,KAAK,oBAAoB,YAAY,QAAQ,GAAG,CAAC;AACxD,SAAO,KAAK,MAAM,eAAe,UAAU,OAAO,CAAC;AAEnD,SAAO;AACT;AAEA,eAAe,iBAAiB,UAAkC;AAChE,MAAI;AACF,UAAM,QAAQ,MAAM,UAAU,EAAE,MAAM,SAAS,CAAC;AAChD,QAAI,UAAU,MAAM;AAClB,aAAO;AAAA,QACL,QAAQ;AAAA,QACR,KAAK;AAAA,QACL,SAAS,kBAAkB,MAAM,IAAI;AAAA,MACvC;AAAA,IACF;AACA,WAAO;AAAA,MACL,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,SAAS;AAAA,IACX;AAAA,EACF,SAAS,KAAc;AACrB,UAAM,MAAM,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC3D,WAAO;AAAA,MACL,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,SAAS,4BAA4B,GAAG;AAAA,MACxC,KAAK;AAAA,IACP;AAAA,EACF;AACF;AAEA,SAAS,eAAe,QAAyB;AAC/C,QAAM,SAAkB,CAAC;AACzB,MAAI,YAA2B;AAC/B,MAAI,aAA4B;AAEhC,MAAI,WAAW,MAAM,GAAG;AACtB,QAAI;AACF,YAAM,KAAK,UAAU,MAAM;AAC3B,UAAI;AACF,oBAAY,UAAU,IAAI,OAAO;AACjC,qBAAa,UAAU,IAAI,QAAQ;AAAA,MACrC,UAAE;AACA,WAAG,MAAM;AAAA,MACX;AAAA,IACF,QAAQ;AACN,kBAAY;AAAA,IACd;AAAA,EACF;AAEA,MAAI,cAAc,MAAM;AACtB,WAAO,KAAK;AAAA,MACV,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,SAAS,UAAU,SAAS;AAAA,IAC9B,CAAC;AAAA,EACH;AACA,MAAI,eAAe,MAAM;AACvB,WAAO,KAAK;AAAA,MACV,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,SAAS,WAAW,UAAU;AAAA,IAChC,CAAC;AAAA,EACH;AAEA,SAAO;AACT;AAEA,SAAS,UAAU,IAAuB,OAAuB;AAC/D,QAAM,MAAM,GACT,QAA2B,6BAA6B,KAAK,EAAE,EAC/D,IAAI;AACP,SAAO,KAAK,KAAK;AACnB;AAEA,SAAS,uBAAuB,QAAuB;AACrD,MAAI,CAAC,WAAW,MAAM,GAAG;AACvB,WAAO;AAAA,MACL,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,SAAS;AAAA,IACX;AAAA,EACF;AACA,MAAI;AACF,UAAM,UAAU,SAAS,MAAM,EAAE;AACjC,UAAM,MAAM,KAAK,IAAI
,IAAI;AACzB,WAAO;AAAA,MACL,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,SAAS,kBAAkB,eAAe,GAAG,CAAC;AAAA,IAChD;AAAA,EACF,QAAQ;AACN,WAAO;AAAA,MACL,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,SAAS;AAAA,IACX;AAAA,EACF;AACF;AAEA,SAAS,oBACP,YACA,OACO;AACP,MAAI,CAAC,WAAW,UAAU,GAAG;AAC3B,WAAO;AAAA,MACL,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,SAAS;AAAA,IACX;AAAA,EACF;AACA,MAAI;AACJ,MAAI;AACF,cAAU,YAAY,UAAU;AAAA,EAClC,QAAQ;AACN,WAAO;AAAA,MACL,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,SAAS;AAAA,IACX;AAAA,EACF;AACA,QAAM,WAAW,QACd;AAAA,IACC,CAAC,MACC,EAAE,WAAW,WAAW,MACvB,EAAE,SAAS,MAAM,KAAK,EAAE,SAAS,QAAQ;AAAA,EAC9C,EACC,IAAI,CAAC,MAAM;AACV,QAAI;AACF,aAAO;AAAA,QACL,MAAM;AAAA,QACN,OAAO,SAAS,KAAK,KAAK,YAAY,CAAC,CAAC,EAAE;AAAA,MAC5C;AAAA,IACF,QAAQ;AACN,aAAO;AAAA,IACT;AAAA,EACF,CAAC,EACA,OAAO,CAAC,MAA4C,MAAM,IAAI;AACjE,MAAI,SAAS,WAAW,GAAG;AACzB,WAAO;AAAA,MACL,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,SAAS;AAAA,IACX;AAAA,EACF;AACA,WAAS,KAAK,CAAC,GAAG,MAAM,EAAE,QAAQ,EAAE,KAAK;AACzC,QAAM,SAAS,SAAS,CAAC;AACzB,QAAM,OAAO,QAAQ,KAAK,oBAAI,KAAK,GAAG,QAAQ;AAC9C,QAAM,MAAM,MAAM,OAAO;AACzB,SAAO;AAAA,IACL,QAAQ;AAAA,IACR,KAAK;AAAA,IACL,SAAS,iBAAiB,eAAe,GAAG,CAAC,SAAS,OAAO,IAAI;AAAA,EACnE;AACF;AAEA,eAAe,eACb,UACA,SACgB;AAChB,QAAM,WAAW,QAAQ,eAAe;AACxC,MAAI;AACF,UAAM,YAAY,MAAM,SAAS;AAAA,MAC/B,KAAK;AAAA,MACL,MAAM;AAAA,IACR,CAAC;AACD,UAAM,WAAW,oBAAoB,UAAU,MAAM;AACrD,QAAI,aAAa,GAAG;AAClB,aAAO;AAAA,QACL,QAAQ;AAAA,QACR,KAAK;AAAA,QACL,SAAS;AAAA,MACX;AAAA,IACF;AACA,WAAO;AAAA,MACL,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,SAAS,0BAA0B,QAAQ,WAAW,aAAa,IAAI,KAAK,GAAG;AAAA,MAC/E,KAAK;AAAA,IACP;AAAA,EACF,SAAS,KAAc;AACrB,UAAM,MAAM,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC3D,WAAO;AAAA,MACL,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,SAAS,iCAAiC,GAAG;AAAA,IAC/C;AAAA,EACF;AACF;AAEA,IAAM,sBAA8C;AAAA,EAClD;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAEA,SAAS,oBAAoB,YAA4B;AACvD,MAAI;AACF,UAAM,SAAS,KAAK,MAAM,UAAU;AACpC,QAAI,QAAQ;AACZ,eAAW,OAAO,qBAAqB;AACrC,YAAM,MAAM,OAAO,GAAG;AACtB,UAAI,MAAM,QAAQ,GAAG,EAAG,UAAS,IAAI;AAAA,IACvC;AACA,WAAO;AAAA,EACT,QAAQ;AACN,WAAO;AAAA,EACT;AACF;","names"
:[]}
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
# Processing Claude Code Sessions
|
|
2
|
+
|
|
3
|
+
## Format overview
|
|
4
|
+
|
|
5
|
+
Claude Code stores sessions as JSONL files (one JSON object per line) at:
|
|
6
|
+
```
|
|
7
|
+
~/.claude/projects/<project-hash>/<session-uuid>.jsonl
|
|
8
|
+
```
|
|
9
|
+
|
|
10
|
+
The project hash is a path with slashes replaced by dashes (e.g., `-Users-rohan-Desktop-Projects-myrepo`). Each session file contains the full conversation history including tool calls, tool results, thinking blocks, and metadata.
|
|
11
|
+
|
|
12
|
+
Typical session sizes: 150KB (short Q&A) to 8MB+ (multi-hour coding session). Line counts range from ~75 to ~1,750.
|
|
13
|
+
|
|
14
|
+
## Record types
|
|
15
|
+
|
|
16
|
+
Each line is a JSON object with a `type` field at the top level:
|
|
17
|
+
|
|
18
|
+
| Type | Frequency | What it contains |
|
|
19
|
+
|------|-----------|-----------------|
|
|
20
|
+
| `assistant` | ~40% of records, ~15% of bytes | Model responses: text, tool_use, thinking blocks. Each content item is in `message.content[]` |
|
|
21
|
+
| `user` | ~35% of records, ~55% of bytes | Human messages OR tool results. Check `message.content[].type` to distinguish |
|
|
22
|
+
| `attachment` | ~5% of records, ~7% of bytes | System context injected by the harness: deferred tool lists, skill listings, memory, task reminders, edited file snippets |
|
|
23
|
+
| `file-history-snapshot` | ~5% of records, <1% of bytes | Checkpoint markers for undo/redo. Always tiny (~250 bytes) |
|
|
24
|
+
| `permission-mode` | ~3% of records, <1% of bytes | Records when permission mode changes (e.g., `bypassPermissions`) |
|
|
25
|
+
| `last-prompt` | ~3% of records, <1% of bytes | Marks turn boundaries. ~120 bytes each |
|
|
26
|
+
| `system` | ~2% of records, <1% of bytes | System messages injected mid-conversation. Often empty content |
|
|
27
|
+
| `ai-title` | rare | Auto-generated session title |
|
|
28
|
+
| `queue-operation` | rare | Queued follow-up commands |
|
|
29
|
+
|
|
30
|
+
## What to extract (signal)
|
|
31
|
+
|
|
32
|
+
### 1. Human messages (highest signal density)
|
|
33
|
+
- **Where:** Records with `type: "user"` where `message.content[]` contains items with `type: "text"`
|
|
34
|
+
- **Also check:** `userType` field -- `"external"` means the actual human typed this
|
|
35
|
+
- **What:** Intent, requirements, feedback, decisions, bug reports, design direction
|
|
36
|
+
- **Example pattern:** `{"type": "user", "message": {"content": [{"type": "text", "text": "What problems did you run into?"}]}}`
|
|
37
|
+
|
|
38
|
+
### 2. Assistant text responses
|
|
39
|
+
- **Where:** Records with `type: "assistant"`, then `message.content[]` items with `type: "text"`
|
|
40
|
+
- **What:** Explanations, decisions, summaries, architecture analysis, bug diagnoses
|
|
41
|
+
- **Typical size:** 100-3000 chars per text block
|
|
42
|
+
- **Example pattern:** The assistant explains a root cause, summarizes what was built, or describes a design decision
|
|
43
|
+
|
|
44
|
+
### 3. Subagent results (high-value signal hidden in noise)
|
|
45
|
+
- **Where:** Records with `type: "user"` that have a top-level `toolUseResult` field with `agentType` set
|
|
46
|
+
- **What:** Complete results from subagents (review, critic, pair, etc.). The `content` field contains the full subagent output, often multi-thousand-character analysis
|
|
47
|
+
- **Key fields:** `toolUseResult.agentType` (e.g., "general-purpose", "review", "critic", "pair"), `toolUseResult.content[].text`, `toolUseResult.prompt` (what the subagent was asked to do)
|
|
48
|
+
- **Why it matters:** These are often the densest signal in a session -- complete audit reports, code reviews, architecture analyses
|
|
49
|
+
|
|
50
|
+
### 4. Session metadata
|
|
51
|
+
- **Where:** First few records of the file
|
|
52
|
+
- **Key fields on user records:** `cwd`, `gitBranch`, `version`, `timestamp`, `sessionId`, `entrypoint` (cli vs other)
|
|
53
|
+
- **Attachment records** with `type: "nested_memory"` contain the project's memory/context
|
|
54
|
+
|
|
55
|
+
## What to skip (noise)
|
|
56
|
+
|
|
57
|
+
### 1. Tool results (~24% of total bytes) -- SKIP
|
|
58
|
+
- **Where:** `user` records where `message.content[]` has `type: "tool_result"`
|
|
59
|
+
- **Why skip:** These are file contents, grep results, build output, test output. The actual files are in the repo; the output is transient
|
|
60
|
+
- **The biggest offenders:** Read tool results can be 150KB+ (entire file contents dumped inline)
|
|
61
|
+
|
|
62
|
+
### 2. Tool calls (~8% of total bytes) -- SKIP or SUMMARIZE
|
|
63
|
+
- **Where:** `assistant` records where `message.content[]` has `type: "tool_use"`
|
|
64
|
+
- **Contains:** `name` (Bash, Read, Grep, Edit, Write, Glob) and `input` (command, file path, pattern)
|
|
65
|
+
- **Why skip:** The sequence of tool calls is operational, not knowledge. Exception: summarize the *pattern* of tool usage ("read 15 files in src/auth/")
|
|
66
|
+
|
|
67
|
+
### 3. Wrapper overhead (~38% of total bytes in large sessions) -- SKIP
|
|
68
|
+
- **Where:** Top-level fields on `user` records: `parentUuid`, `sourceToolAssistantUUID`, `toolUseResult` (when not a subagent), `slug`, `requestId`, `isMeta`, `isSidechain`
|
|
69
|
+
- **Why skip:** The `toolUseResult` field on user records DUPLICATES the tool result content that already appears in `message.content[]`. This is the single biggest source of bloat. In one 8MB session, `toolUseResult` alone was 2.7MB (33%)
|
|
70
|
+
- **EXCEPTION:** When `toolUseResult.agentType` is set, this is a subagent result and IS signal
|
|
71
|
+
|
|
72
|
+
### 4. Empty thinking blocks (~6% in some sessions) -- SKIP
|
|
73
|
+
- **Where:** `assistant` records, `message.content[]` with `type: "thinking"` but `thinking: ""`
|
|
74
|
+
- **Why:** Claude Code often records thinking blocks with empty content (the actual thinking happened but was not persisted). These are pure waste
|
|
75
|
+
|
|
76
|
+
### 5. Attachments (~7%) -- SKIP
|
|
77
|
+
- **Where:** `type: "attachment"` records
|
|
78
|
+
- **Contains:** `deferred_tools_delta` (tool availability lists), `skill_listing` (repeated skill menus), `task_reminder` (repeated TODO lists), `mcp_instructions_delta` (MCP setup)
|
|
79
|
+
- **Why skip:** Harness infrastructure, not knowledge. Repeated across turns
|
|
80
|
+
|
|
81
|
+
### 6. Metadata records (<1%) -- SKIP
|
|
82
|
+
- `file-history-snapshot`, `permission-mode`, `last-prompt`, `queue-operation`, `ai-title`
|
|
83
|
+
|
|
84
|
+
### 7. Base64 image data (~1-7% in some sessions) -- SKIP
|
|
85
|
+
- **Where:** `user` records with `message.content[]` containing `type: "image"` and `source.type: "base64"`
|
|
86
|
+
- **Why skip:** Screenshots pasted by the user. Can be 100KB+ of base64 per image. Not extractable as knowledge
|
|
87
|
+
|
|
88
|
+
## What to summarize
|
|
89
|
+
|
|
90
|
+
These patterns should be compressed rather than fully extracted or fully skipped:
|
|
91
|
+
|
|
92
|
+
| Pattern | Summarize as |
|
|
93
|
+
|---------|-------------|
|
|
94
|
+
| 10+ consecutive tool_use/tool_result pairs reading files | "Read N files in {directory pattern}" |
|
|
95
|
+
| grep/glob sequences searching for a pattern | "Searched for {pattern} across {scope}" |
|
|
96
|
+
| Edit tool calls modifying files | "Modified {file}: {description from the Edit input}" |
|
|
97
|
+
| Bash commands running tests | "Ran tests: {pass/fail summary from result}" |
|
|
98
|
+
| Bash commands running builds | "Built {target}: {success/failure}" |
|
|
99
|
+
|
|
100
|
+
## Extraction approach
|
|
101
|
+
|
|
102
|
+
1. **Parse the JSONL file** line by line. Each line is one JSON object.
|
|
103
|
+
2. **First pass -- extract metadata:** From the first `user` record, grab `cwd`, `gitBranch`, `version`, `timestamp`, `sessionId`.
|
|
104
|
+
3. **For each record, check `type`:**
|
|
105
|
+
- `type: "user"` with `message.content[].type == "text"` -> **extract as human message**
|
|
106
|
+
- `type: "user"` with `toolUseResult.agentType` set -> **extract subagent result** from `toolUseResult.content[].text`
|
|
107
|
+
- `type: "user"` with `message.content[].type == "tool_result"` -> **skip** (or summarize tool call patterns)
|
|
108
|
+
- `type: "assistant"` with `message.content[].type == "text"` -> **extract as assistant reasoning**
|
|
109
|
+
- `type: "assistant"` with `message.content[].type == "thinking"` and non-empty `thinking` field -> **extract as internal reasoning**
|
|
110
|
+
- `type: "assistant"` with `message.content[].type == "tool_use"` -> **skip or summarize**
|
|
111
|
+
- `type: "attachment"`, `type: "system"`, metadata types -> **skip**
|
|
112
|
+
4. **Second pass -- deduplicate:** The `toolUseResult` field on user records often duplicates content from `message.content[]`. Always prefer `message.content[]` and only use `toolUseResult` for subagent data.
|
|
113
|
+
5. **Third pass -- compress tool sequences:** Collapse consecutive tool_use + tool_result pairs into summaries.
|
|
114
|
+
|
|
115
|
+
## Example: signal extraction from a real session
|
|
116
|
+
|
|
117
|
+
**Human intent (from user record):**
|
|
118
|
+
> "Tell me about our ref token?"
|
|
119
|
+
Signal: User wants to understand the ref token system.
|
|
120
|
+
|
|
121
|
+
**Assistant reasoning (from assistant text):**
|
|
122
|
+
> "Ref Token: Opaque backend-signed token that binds a locally-edited .md file to a specific page in the DB. Backend signs on download, verifies on publish. No signing secret ever touches the client."
|
|
123
|
+
Signal: Architectural explanation of the ref token system.
|
|
124
|
+
|
|
125
|
+
**Assistant problem diagnosis (from assistant text):**
|
|
126
|
+
> "Root cause: the codealmanac CLI expects topics.yaml in the new list format but .almanac/topics.yaml is still in the old dict format"
|
|
127
|
+
Signal: Bug identification with root cause.
|
|
128
|
+
|
|
129
|
+
**Subagent audit (from toolUseResult with agentType):**
|
|
130
|
+
> agentType: "general-purpose", prompt: "You are auditing the hosted editor..."
|
|
131
|
+
> content: "Audit Report: Hosted Editor / Quill Co-Editing (Pre-Phase-5) ... 3 HIGH gaps found ... Fix malformed edit-result surfacing ... Fix Quill session isolation ..."
|
|
132
|
+
Signal: Complete code audit with findings, priorities, and recommendations.
|
|
133
|
+
|
|
134
|
+
**Noise skipped:** 522 tool results totaling 2MB of file contents, 466KB of empty thinking blocks, 408KB of metadata records, 2.7MB of duplicated toolUseResult data.
|
|
135
|
+
|
|
136
|
+
## Gotchas
|
|
137
|
+
|
|
138
|
+
1. **toolUseResult duplication is massive.** In one 8MB session, `toolUseResult` accounted for 33% of the file. It duplicates `message.content[]` tool results AND sometimes contains subagent results. Always check `agentType` before discarding.
|
|
139
|
+
|
|
140
|
+
2. **Thinking blocks are always empty in recent versions.** The `thinking` field in content items of type `"thinking"` is consistently empty string in observed sessions (95 empty out of 95 in one session). The thinking content is not persisted. Do not expect signal here despite the promising field name.
|
|
141
|
+
|
|
142
|
+
3. **assistant records contain message-level metadata.** The `message` object has `usage` (token counts), `model` (model name), `stop_reason` (why generation stopped). These can be useful for understanding session dynamics but are not knowledge signal.
|
|
143
|
+
|
|
144
|
+
4. **user records serve double duty.** A `user` record with `userType: "external"` and text content is a real human message. A `user` record with tool_result content is just the harness returning tool output. Always check `message.content[].type`.
|
|
145
|
+
|
|
146
|
+
5. **The `isSidechain` field** indicates branched conversations (user went back and tried a different approach). Sidechain records may represent abandoned approaches -- still potentially valuable as "what was tried and rejected."
|
|
147
|
+
|
|
148
|
+
6. **Base64 images can be huge.** A single screenshot paste can add 100KB+ of base64 data to a user record. These look like small records until you measure them.
|
|
149
|
+
|
|
150
|
+
7. **Attachment records repeat.** `skill_listing` and `task_reminder` attachments are re-injected at many turn boundaries. The same content appears 10-15 times in a long session.
|
|
151
|
+
|
|
152
|
+
8. **Subagent data structure:** The `toolUseResult` for subagents includes `toolStats` (readCount, searchCount, bashCount, editFileCount, linesAdded, linesRemoved) and `usage` (input_tokens, output_tokens, cache stats). These are useful metadata about the subagent's work.
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
# Processing Codex Sessions
|
|
2
|
+
|
|
3
|
+
## Format overview
|
|
4
|
+
|
|
5
|
+
Codex stores sessions as JSONL files at:
|
|
6
|
+
```
|
|
7
|
+
~/.codex/sessions/<year>/<month>/<day>/rollout-<timestamp>-<thread-uuid>.jsonl
|
|
8
|
+
```
|
|
9
|
+
|
|
10
|
+
A SQLite database at `~/.codex/state_5.sqlite` provides session metadata (title, cwd, model, tokens_used, git info) in the `threads` table.
|
|
11
|
+
|
|
12
|
+
Multiple rollout files for the same timestamp indicate subagent threads spawned by the parent session. The SQLite `source` column reveals this: `"vscode"` for top-level sessions, a JSON blob with `subagent.thread_spawn.parent_thread_id` for child threads.
|
|
13
|
+
|
|
14
|
+
Typical session sizes: 500KB (short task) to 12MB+ (multi-turn debugging session). Line counts range from ~175 to ~2,800.
|
|
15
|
+
|
|
16
|
+
## Record types
|
|
17
|
+
|
|
18
|
+
Each line is a JSON object with a `type` field:
|
|
19
|
+
|
|
20
|
+
| Type | % of records | % of bytes | What it contains |
|
|
21
|
+
|------|-------------|------------|-----------------|
|
|
22
|
+
| `response_item` | ~55% | ~36% | Model outputs: function calls, function call outputs, messages, reasoning |
|
|
23
|
+
| `event_msg` | ~43% | ~29% | Harness events: command execution results, token counts, agent messages, task lifecycle |
|
|
24
|
+
| `turn_context` | ~2% | ~5% | Per-turn context: model, cwd, instructions, settings. Repeated every turn |
|
|
25
|
+
| `session_meta` | 1 per file | ~1-3% | Session metadata: id, cwd, model, CLI version, base instructions, skills |
|
|
26
|
+
| `compacted` | rare (0-3) | 10-30% when present | Compressed conversation history from context window compaction |
|
|
27
|
+
|
|
28
|
+
### response_item subtypes (in `payload.type`)
|
|
29
|
+
|
|
30
|
+
| Subtype | What it contains |
|
|
31
|
+
|---------|-----------------|
|
|
32
|
+
| `function_call` | Tool invocations: `name` (always `exec_command`), `arguments` (JSON with cmd, workdir, yield_time_ms) |
|
|
33
|
+
| `function_call_output` | Tool results: `output` (command stdout/stderr as string) |
|
|
34
|
+
| `message` | Conversation messages. Check `payload.role`: `developer` (system prompts), `user` (human + context), `assistant` (model output) |
|
|
35
|
+
| `reasoning` | Model reasoning. Contains `encrypted_content` (unreadable) and `summary` (always empty in observed data) |
|
|
36
|
+
| `custom_tool_call` | File edit operations via `apply_patch`. Contains unified diff in `input` |
|
|
37
|
+
| `custom_tool_call_output` | Patch application results: success/failure + modified file list |
|
|
38
|
+
| `web_search_call` | Web search invocations (rare) |
|
|
39
|
+
|
|
40
|
+
### event_msg subtypes (in `payload.type`)
|
|
41
|
+
|
|
42
|
+
| Subtype | What it contains |
|
|
43
|
+
|---------|-----------------|
|
|
44
|
+
| `user_message` | Human input: `message` (text), `images` (base64 data URIs), `text_elements` |
|
|
45
|
+
| `agent_message` | Model commentary shown to user: `message`, `phase` (always "commentary") |
|
|
46
|
+
| `exec_command_end` | Command execution results (DUPLICATES `function_call_output`): stdout, stderr, aggregated_output, exit_code, duration, command |
|
|
47
|
+
| `token_count` | Token usage and rate limit info per turn |
|
|
48
|
+
| `task_started` | Turn lifecycle: turn_id, model_context_window, collaboration_mode |
|
|
49
|
+
| `task_complete` | Turn completion: `last_agent_message` (the final text shown to user) |
|
|
50
|
+
| `patch_apply_end` | File edit results: stdout, changes list, success boolean |
|
|
51
|
+
| `context_compacted` | Marker that context window was compacted |
|
|
52
|
+
| `turn_aborted` | Turn was cancelled |
|
|
53
|
+
|
|
54
|
+
## What to extract (signal)
|
|
55
|
+
|
|
56
|
+
### 1. Human messages (highest signal density)
|
|
57
|
+
- **Where:** `event_msg` records with `payload.type == "user_message"`
|
|
58
|
+
- **Field:** `payload.message`
|
|
59
|
+
- **Also in:** `response_item` with `payload.type == "message"` and `payload.role == "user"`, `payload.content[].type == "input_text"`. The event_msg version is cleaner
|
|
60
|
+
- **Watch for:** The `response_item` version also contains system/developer context injected alongside the real user message. Extract only `input_text` items where the text is NOT wrapped in XML tags like `<environment_context>`, `<permissions instructions>`, `<app-context>`, `<skills_instructions>`, `<collaboration_mode>`
|
|
61
|
+
|
|
62
|
+
### 2. Agent messages (model's user-facing commentary)
|
|
63
|
+
- **Where:** `event_msg` records with `payload.type == "agent_message"`
|
|
64
|
+
- **Fields:** `payload.message`, `payload.phase`
|
|
65
|
+
- **What:** Short status updates and reasoning the model shares with the user. These are the "thinking out loud" moments
|
|
66
|
+
- **Example:** "I'm inspecting the repo for category vs topic naming drift and I'll trace it through code, routes, API shapes, and copy so the discrepancies are concrete rather than guessed."
|
|
67
|
+
|
|
68
|
+
### 3. Task completion summaries
|
|
69
|
+
- **Where:** `event_msg` records with `payload.type == "task_complete"`
|
|
70
|
+
- **Field:** `payload.last_agent_message`
|
|
71
|
+
- **What:** The final, complete response for each turn. Often the densest signal -- the model's synthesized answer after all tool use. Can be multi-thousand characters of analysis
|
|
72
|
+
|
|
73
|
+
### 4. Assistant output text
|
|
74
|
+
- **Where:** `response_item` with `payload.type == "message"`, `payload.role == "assistant"`, content items with `type: "output_text"`
|
|
75
|
+
- **What:** Model's text responses interspersed with tool calls. Shorter than task_complete but captures incremental reasoning
|
|
76
|
+
|
|
77
|
+
### 5. File edits (apply_patch)
|
|
78
|
+
- **Where:** `response_item` with `payload.type == "custom_tool_call"` and `payload.name == "apply_patch"`
|
|
79
|
+
- **Field:** `payload.input` contains a unified diff
|
|
80
|
+
- **What:** Every code change the model made. Extract the file path and a summary of the change, not the full diff (the repo has the final state)
|
|
81
|
+
|
|
82
|
+
### 6. Session metadata
|
|
83
|
+
- **Where:** `session_meta` record (first line of file)
|
|
84
|
+
- **Key fields:** `payload.id`, `payload.cwd`, `payload.model_provider`, `payload.cli_version`, `payload.source`. Note: the actual model name is in `turn_context.model`, not here (see Gotcha 10)
|
|
85
|
+
- **SQLite enrichment:** Query `threads` table for `title`, `tokens_used`, `git_branch`, `first_user_message`, `source` (reveals if this is a subagent)
|
|
86
|
+
|
|
87
|
+
## What to skip (noise)
|
|
88
|
+
|
|
89
|
+
### 1. function_call_output records (~17% of bytes) -- SKIP
|
|
90
|
+
- **Where:** `response_item` with `payload.type == "function_call_output"`
|
|
91
|
+
- **Why:** Raw command output (file contents, grep results, build output). Already in the repo or transient
|
|
92
|
+
|
|
93
|
+
### 2. exec_command_end records (~15% of bytes) -- SKIP
|
|
94
|
+
- **Where:** `event_msg` with `payload.type == "exec_command_end"`
|
|
95
|
+
- **Why:** DUPLICATES `function_call_output` with the same `call_id`. Contains stdout, stderr, aggregated_output redundantly. In one 12MB session, 399 of these consumed 1.9MB
|
|
96
|
+
- **Note:** 100% overlap with `function_call_output` on shared `call_id`s
|
|
97
|
+
|
|
98
|
+
### 3. Reasoning records (~6% of bytes) -- SKIP
|
|
99
|
+
- **Where:** `response_item` with `payload.type == "reasoning"`
|
|
100
|
+
- **Why:** Contains `encrypted_content` (base64 blob, unreadable) and `summary` (consistently empty array in all observed sessions). No extractable signal
|
|
101
|
+
- **Do not confuse with:** `agent_message` records, which ARE readable model reasoning
|
|
102
|
+
|
|
103
|
+
### 4. turn_context records (~5-25% of bytes) -- SKIP
|
|
104
|
+
- **Where:** `type: "turn_context"`
|
|
105
|
+
- **Why:** Repeated every turn. Contains model name, cwd, instructions, sandbox policy, collaboration mode. Same content each time with minor variations
|
|
106
|
+
|
|
107
|
+
### 5. session_meta base_instructions (~3% of bytes) -- SKIP
|
|
108
|
+
- **Where:** `session_meta` record, `payload.base_instructions.text`
|
|
109
|
+
- **Why:** Codex's built-in system prompt. Same across all sessions. ~2000 chars of personality and behavior instructions
|
|
110
|
+
|
|
111
|
+
### 6. Developer instructions in message records -- SKIP
|
|
112
|
+
- **Where:** `response_item` messages with `payload.role == "developer"`
|
|
113
|
+
- **Contains:** `<permissions instructions>`, `<app-context>`, `<collaboration_mode>`, `<apps_instructions>`, `<skills_instructions>` XML blocks
|
|
114
|
+
- **Why:** Harness configuration, not user knowledge. Can be 10KB+ per occurrence
|
|
115
|
+
|
|
116
|
+
### 7. token_count records (~2%) -- SKIP
|
|
117
|
+
- Rate limit and token usage telemetry
|
|
118
|
+
|
|
119
|
+
### 8. function_call records (~1.5%) -- SKIP or SUMMARIZE
|
|
120
|
+
- **Where:** `response_item` with `payload.type == "function_call"`
|
|
121
|
+
- **Contains:** `name` (always `exec_command`), `arguments` (cmd, workdir)
|
|
122
|
+
- **Why:** Operational commands. Summarize the pattern, not individual calls
|
|
123
|
+
|
|
124
|
+
### 9. Base64 images in user_message (~4-7% when present) -- SKIP
|
|
125
|
+
- **Where:** `event_msg` with `payload.type == "user_message"`, `payload.images[]`
|
|
126
|
+
- **Format:** data URIs (`data:image/png;base64,...`), 350KB-550KB each
|
|
127
|
+
- **Also in:** `response_item` message content with `type: "input_image"` and `image_url`
|
|
128
|
+
- **Why:** Screenshots. Not extractable as text knowledge. A single image can be 550KB
|
|
129
|
+
|
|
130
|
+
### 10. Compacted records (10-30% when present) -- EXTRACT SELECTIVELY
|
|
131
|
+
- **Where:** `type: "compacted"` records
|
|
132
|
+
- **Contains:** `payload.replacement_history[]` -- a compressed version of earlier conversation
|
|
133
|
+
- **Treatment:** These contain summarized versions of earlier turns after context compaction. The `replacement_history` items have `role` and `content[]` with `input_text`/`output_text`. Extract output_text items (model summaries) but skip input_text (already captured from the original records earlier in the file)
|
|
134
|
+
|
|
135
|
+
## What to summarize
|
|
136
|
+
|
|
137
|
+
| Pattern | Summarize as |
|
|
138
|
+
|---------|-------------|
|
|
139
|
+
| N consecutive exec_command function_call/output pairs | "Ran N commands exploring {pattern}" |
|
|
140
|
+
| grep/find/sed sequences reading files | "Searched for {pattern} in {directory}" |
|
|
141
|
+
| apply_patch calls | "Modified {file}: {one-line description from diff}" |
|
|
142
|
+
| Multiple agent_message records saying similar things | Keep only the last one before task_complete |
|
|
143
|
+
|
|
144
|
+
## Extraction approach
|
|
145
|
+
|
|
146
|
+
1. **Check SQLite first** for session metadata: `SELECT id, title, cwd, model, tokens_used, git_branch, source, first_user_message FROM threads WHERE id = '<thread-id>'`. The `source` field tells you if this is a subagent session.
|
|
147
|
+
|
|
148
|
+
2. **Parse the JSONL file** line by line.
|
|
149
|
+
|
|
150
|
+
3. **Extract session_meta** (first record): grab `payload.id`, `payload.cwd`, `payload.source`, `payload.model_provider`.
|
|
151
|
+
|
|
152
|
+
4. **For each record, route by type and subtype:**
|
|
153
|
+
- `event_msg` + `user_message` -> **extract** `payload.message` as human input. Note `payload.images` count but skip the base64 data
|
|
154
|
+
- `event_msg` + `agent_message` -> **extract** `payload.message` as model reasoning
|
|
155
|
+
- `event_msg` + `task_complete` -> **extract** `payload.last_agent_message` as turn summary
|
|
156
|
+
- `event_msg` + `exec_command_end` -> **skip** (duplicated in response_item)
|
|
157
|
+
- `event_msg` + `token_count` / `task_started` / `context_compacted` -> **skip**
|
|
158
|
+
- `response_item` + `message` + `role: "assistant"` -> **extract** output_text content
|
|
159
|
+
- `response_item` + `message` + `role: "developer"` or `role: "user"` with XML-tagged content -> **skip**
|
|
160
|
+
- `response_item` + `message` + `role: "user"` with plain text -> **extract** (but deduplicate against event_msg user_message)
|
|
161
|
+
- `response_item` + `function_call_output` -> **skip**
|
|
162
|
+
- `response_item` + `function_call` -> **summarize** command pattern
|
|
163
|
+
- `response_item` + `reasoning` -> **skip** (encrypted, empty summary)
|
|
164
|
+
- `response_item` + `custom_tool_call` -> **summarize** file path and change description
|
|
165
|
+
- `response_item` + `custom_tool_call_output` -> **skip** (just success/fail)
|
|
166
|
+
- `turn_context` -> **skip**
|
|
167
|
+
- `compacted` -> **extract** output_text from `payload.replacement_history[]`
|
|
168
|
+
|
|
169
|
+
5. **Deduplicate across record types:** User messages appear in both `event_msg.user_message` AND `response_item.message` (role: user). Command output appears in both `response_item.function_call_output` AND `event_msg.exec_command_end`. Always prefer the event_msg version for user messages (cleaner), skip the duplicate command output entirely.
|
|
170
|
+
|
|
171
|
+
6. **Link subagent sessions:** Check the SQLite `source` column. If it contains `subagent.thread_spawn`, this session's knowledge should be attributed to the parent thread. The `parent_thread_id` field links them.
|
|
172
|
+
|
|
173
|
+
## Example: signal extraction from a real session
|
|
174
|
+
|
|
175
|
+
**Human intent (from event_msg.user_message):**
|
|
176
|
+
> "Look at our codebase, we have categories and topics, we want to unify everywhere to be called topics. Find all discrepancies."
|
|
177
|
+
Signal: User wants a category-to-topic naming audit.
|
|
178
|
+
|
|
179
|
+
**Agent reasoning (from event_msg.agent_message):**
|
|
180
|
+
> "I've isolated one concrete runtime mismatch already: the page-topic footer still talks about 'categories' in UI copy while the rest of the product model is 'topics.' I'm now separating first-party mismatches from external-source fields and old design docs so the final list is usable."
|
|
181
|
+
Signal: Agent's approach to categorizing the findings.
|
|
182
|
+
|
|
183
|
+
**Task completion (from event_msg.task_complete):**
|
|
184
|
+
> "Root Cause: This is not primarily a CSS-specificity problem. The break happens because the suggestion extension's renderHTML() emits bare <ins>/<del> tags without the CSS class..."
|
|
185
|
+
Signal: Complete diagnosis with root cause and fix path.
|
|
186
|
+
|
|
187
|
+
**File edit (from custom_tool_call):**
|
|
188
|
+
> name: apply_patch, file: SuggestionChangesExtension.ts
|
|
189
|
+
> Change: Added class attribute to renderHTML() output so CSS selectors match
|
|
190
|
+
Signal: What was actually changed and why.
|
|
191
|
+
|
|
192
|
+
**Noise skipped:** 442 function_call_output records (2.2MB), 399 duplicate exec_command_end records (1.9MB), 266 encrypted reasoning records (787KB), 67 repeated turn_context records (675KB), 368 token_count records (267KB).
|
|
193
|
+
|
|
194
|
+
## Gotchas
|
|
195
|
+
|
|
196
|
+
1. **exec_command_end duplicates function_call_output.** They share the same `call_id` and contain the same command output in different field names. 100% overlap observed. Skip exec_command_end entirely.
|
|
197
|
+
|
|
198
|
+
2. **Reasoning is unreadable.** Despite having `summary` and `content` fields, reasoning records contain only `encrypted_content` (opaque base64) and consistently empty `summary: []`. There is zero extractable signal from reasoning records.
|
|
199
|
+
|
|
200
|
+
3. **User messages appear twice.** Once in `event_msg.user_message` (clean, just the text + images) and again in `response_item.message` with `role: "user"` (mixed with system context). Use the event_msg version.
|
|
201
|
+
|
|
202
|
+
4. **Developer messages are system prompts, not human.** `response_item.message` with `role: "developer"` contains harness instructions (permissions, app context, collaboration mode, skills). These are NOT human messages. They are wrapped in XML tags like `<permissions instructions>`, `<app-context>`, etc.
|
|
203
|
+
|
|
204
|
+
5. **Images are massive.** User messages with screenshots contain base64 data URIs of 350-550KB each. A single image can make a 75-char message record balloon to 550KB. Check `payload.images` length but do not extract the base64 data.
|
|
205
|
+
|
|
206
|
+
6. **Compacted records contain earlier conversation.** When the context window fills up, Codex compacts history into `compacted` records. The `replacement_history` array contains summarized earlier turns. If you are processing the file start-to-finish, you will see the original records first and then the compacted summary later -- be careful not to double-count.
|
|
207
|
+
|
|
208
|
+
7. **Subagent sessions are separate files.** A parent session spawns subagent threads that are written to their own rollout files in the same date directory. The SQLite `source` column JSON identifies child threads. To get the complete picture of a multi-agent session, you must read all linked rollout files.
|
|
209
|
+
|
|
210
|
+
8. **Codex uses `exec_command` for everything.** Unlike Claude Code which has specialized tools (Read, Grep, Edit, Bash), Codex wraps all operations in `exec_command` with shell commands (`sed`, `rg`, `cat`, etc.) or `apply_patch` for file edits. This means function_call records are less informative about intent -- you need to parse the `cmd` field to understand what was done.
|
|
211
|
+
|
|
212
|
+
9. **task_complete contains the cleanest signal.** The `last_agent_message` in task_complete records is the final synthesized response after all tool use. If you can only extract one thing per turn, extract this.
|
|
213
|
+
|
|
214
|
+
10. **The model field is in turn_context, not session_meta.** Session_meta has `model_provider` ("openai") but the actual model name (e.g., "gpt-5.4") is in `turn_context.model`.
|