codealmanac 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. package/dist/chunk-2JJTTN7P.js +539 -0
  2. package/dist/chunk-2JJTTN7P.js.map +1 -0
  3. package/dist/chunk-3C5SY5SE.js +1239 -0
  4. package/dist/chunk-3C5SY5SE.js.map +1 -0
  5. package/dist/chunk-4CODZRHH.js +19 -0
  6. package/dist/chunk-4CODZRHH.js.map +1 -0
  7. package/dist/chunk-7JUX4ADQ.js +38 -0
  8. package/dist/chunk-7JUX4ADQ.js.map +1 -0
  9. package/dist/chunk-A6PUCAVJ.js +145 -0
  10. package/dist/chunk-A6PUCAVJ.js.map +1 -0
  11. package/dist/chunk-AXFPUHBN.js +227 -0
  12. package/dist/chunk-AXFPUHBN.js.map +1 -0
  13. package/dist/chunk-FM3VRDK7.js +20 -0
  14. package/dist/chunk-FM3VRDK7.js.map +1 -0
  15. package/dist/chunk-H6WU6PYH.js +441 -0
  16. package/dist/chunk-H6WU6PYH.js.map +1 -0
  17. package/dist/chunk-P3LDTCLB.js +34 -0
  18. package/dist/chunk-P3LDTCLB.js.map +1 -0
  19. package/dist/chunk-QHQ6YH7U.js +81 -0
  20. package/dist/chunk-QHQ6YH7U.js.map +1 -0
  21. package/dist/chunk-Z4MWLVS2.js +355 -0
  22. package/dist/chunk-Z4MWLVS2.js.map +1 -0
  23. package/dist/chunk-Z6MBJ3D2.js +203 -0
  24. package/dist/chunk-Z6MBJ3D2.js.map +1 -0
  25. package/dist/cli-AIH5QQ5H.js +393 -0
  26. package/dist/cli-AIH5QQ5H.js.map +1 -0
  27. package/dist/codealmanac.js +68 -5954
  28. package/dist/codealmanac.js.map +1 -1
  29. package/dist/doctor-6FN5JO5F.js +15 -0
  30. package/dist/doctor-6FN5JO5F.js.map +1 -0
  31. package/dist/hook-CRJMWSSO.js +12 -0
  32. package/dist/hook-CRJMWSSO.js.map +1 -0
  33. package/dist/register-commands-PZMQNGCH.js +2644 -0
  34. package/dist/register-commands-PZMQNGCH.js.map +1 -0
  35. package/dist/uninstall-NBEZNNKM.js +12 -0
  36. package/dist/uninstall-NBEZNNKM.js.map +1 -0
  37. package/dist/update-IL243I4E.js +10 -0
  38. package/dist/update-IL243I4E.js.map +1 -0
  39. package/dist/wiki-EHZ7LG7R.js +238 -0
  40. package/dist/wiki-EHZ7LG7R.js.map +1 -0
  41. package/guides/processing/claude-code.md +152 -0
  42. package/guides/processing/codex.md +214 -0
  43. package/guides/processing/generic.md +128 -0
  44. package/package.json +2 -2
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env node
2
+ import {
3
+ runUpdate
4
+ } from "./chunk-Z6MBJ3D2.js";
5
+ import "./chunk-AXFPUHBN.js";
6
+ import "./chunk-7JUX4ADQ.js";
7
+ export {
8
+ runUpdate
9
+ };
10
+ //# sourceMappingURL=update-IL243I4E.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}
@@ -0,0 +1,238 @@
1
+ #!/usr/bin/env node
2
+ import {
3
+ ensureFreshIndex,
4
+ findEntry,
5
+ openIndex,
6
+ runHealth
7
+ } from "./chunk-3C5SY5SE.js";
8
+ import {
9
+ formatDuration
10
+ } from "./chunk-4CODZRHH.js";
11
+ import "./chunk-FM3VRDK7.js";
12
+ import {
13
+ findNearestAlmanacDir
14
+ } from "./chunk-7JUX4ADQ.js";
15
+
16
+ // src/commands/doctor-checks/wiki.ts
17
+ import { existsSync, readdirSync, statSync } from "fs";
18
+ import path from "path";
19
// Collect all doctor checks for the wiki in (or above) options.cwd.
// Returns an ordered list of { status, key, message, fix? } records.
async function gatherWikiChecks(options) {
  const repoRoot = findNearestAlmanacDir(options.cwd);

  // No .almanac directory anywhere above cwd: nothing further to inspect.
  if (repoRoot === null) {
    return [
      {
        status: "info",
        key: "wiki.none",
        message: "No wiki in current directory",
        fix: "run: almanac bootstrap (to create one in this repo)"
      }
    ];
  }

  const results = [
    { status: "info", key: "wiki.repo", message: `repo: ${repoRoot}` }
  ];

  // Best-effort refresh; a stale/broken index surfaces via the checks below.
  try {
    await ensureFreshIndex({ repoRoot });
  } catch {
  }

  results.push(await describeRegistry(repoRoot));

  const almanacDir = path.join(repoRoot, ".almanac");
  const indexDbPath = path.join(almanacDir, "index.db");
  results.push(...describeCounts(indexDbPath));
  results.push(describeIndexFreshness(indexDbPath));
  results.push(describeLastCapture(almanacDir, options.now));
  results.push(await describeHealth(repoRoot, options));

  return results;
}
49
// Report whether this repo is present in the global almanac registry.
// Registry read failures produce a "problem" check with a suggested fix.
async function describeRegistry(repoRoot) {
  try {
    const entry = await findEntry({ path: repoRoot });
    if (entry === null) {
      return {
        status: "info",
        key: "wiki.registered",
        message: "not yet registered (will register on first command)"
      };
    }
    return {
      status: "ok",
      key: "wiki.registered",
      message: `registered as '${entry.name}'`
    };
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    return {
      status: "problem",
      key: "wiki.registered",
      message: `could not read registry: ${reason}`,
      fix: "inspect ~/.almanac/registry.json; remove or fix the malformed entry"
    };
  }
}
74
// Report page/topic counts from the SQLite index at dbPath.
// Returns zero, one, or two "info" checks; nothing is reported when the
// index is missing or unreadable.
function describeCounts(dbPath) {
  const checks = [];
  let pageCount = null;
  let topicCount = null;
  if (existsSync(dbPath)) {
    try {
      const db = openIndex(dbPath);
      try {
        pageCount = countRows(db, "pages");
        topicCount = countRows(db, "topics");
      } finally {
        // Always release the handle, even when a count query fails.
        db.close();
      }
    } catch {
      // All-or-nothing: discard BOTH counts on any failure. The original
      // reset only pageCount and relied on assignment order to keep
      // topicCount null; resetting both makes the contract explicit and
      // robust to future reordering.
      pageCount = null;
      topicCount = null;
    }
  }
  if (pageCount !== null) {
    checks.push({
      status: "info",
      key: "wiki.pages",
      message: `pages: ${pageCount}`
    });
  }
  if (topicCount !== null) {
    checks.push({
      status: "info",
      key: "wiki.topics",
      message: `topics: ${topicCount}`
    });
  }
  return checks;
}
107
// Count the rows in `table`. The table name is interpolated into the SQL,
// which is safe here because callers pass only compile-time constants
// ("pages" / "topics").
function countRows(db, table) {
  const statement = db.prepare(`SELECT COUNT(*) AS n FROM ${table}`);
  const result = statement.get();
  if (result === undefined || result === null) {
    return 0;
  }
  return result.n ?? 0;
}
111
// Describe how fresh the index database is, based on its mtime.
// Falls back to a neutral "present" report if the file cannot be stat'ed
// (e.g. it raced with a rebuild or deletion).
function describeIndexFreshness(dbPath) {
  if (!existsSync(dbPath)) {
    return {
      status: "info",
      key: "wiki.index",
      message: "index: not built yet (run any query command)"
    };
  }
  try {
    const ageMs = Date.now() - statSync(dbPath).mtimeMs;
    return {
      status: "info",
      key: "wiki.index",
      message: `index: rebuilt ${formatDuration(ageMs)} ago`
    };
  } catch {
    return {
      status: "info",
      key: "wiki.index",
      message: "index: present"
    };
  }
}
135
// Report when the most recent capture log was written.
// Capture logs live directly in almanacDir and are named
// ".capture-*.log" or ".capture-*.jsonl". `nowFn` is an injectable clock
// (used by tests); it defaults to `new Date()`.
function describeLastCapture(almanacDir, nowFn) {
  // Shared result for the two "nothing captured yet" exits below.
  const neverCaptured = {
    status: "info",
    key: "wiki.capture",
    message: "last capture: never"
  };
  if (!existsSync(almanacDir)) {
    return neverCaptured;
  }
  let names;
  try {
    names = readdirSync(almanacDir);
  } catch {
    return {
      status: "info",
      key: "wiki.capture",
      message: "last capture: unknown"
    };
  }
  // Single pass: track the newest capture file instead of building,
  // stat'ing, and sorting a full list.
  let latest = null;
  for (const name of names) {
    if (!name.startsWith(".capture-")) continue;
    if (!name.endsWith(".log") && !name.endsWith(".jsonl")) continue;
    let mtime;
    try {
      mtime = statSync(path.join(almanacDir, name)).mtimeMs;
    } catch {
      continue; // file vanished between readdir and stat
    }
    if (latest === null || mtime > latest.mtime) {
      latest = { name, mtime };
    }
  }
  if (latest === null) {
    return neverCaptured;
  }
  const nowMs = (nowFn?.() ?? new Date()).getTime();
  const ageMs = nowMs - latest.mtime;
  return {
    status: "info",
    key: "wiki.capture",
    message: `last capture: ${formatDuration(ageMs)} ago (${latest.name})`
  };
}
182
// Run `almanac health` (or an injected stand-in via options.runHealthFn)
// and summarize its problem count. Failures to run health at all are
// reported as "info", not "problem" — the probe itself is best-effort.
async function describeHealth(repoRoot, options) {
  const healthFn = options.runHealthFn ?? runHealth;
  try {
    const result = await healthFn({ cwd: repoRoot, json: true });
    const problemCount = countHealthProblems(result.stdout);
    if (problemCount > 0) {
      const noun = problemCount === 1 ? "problem" : "problems";
      return {
        status: "problem",
        key: "wiki.health",
        message: `almanac health reports ${problemCount} ${noun}`,
        fix: "run: almanac health"
      };
    }
    return {
      status: "ok",
      key: "wiki.health",
      message: "almanac health reports 0 problems"
    };
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    return {
      status: "info",
      key: "wiki.health",
      message: `could not run almanac health: ${reason}`
    };
  }
}
212
// Array-valued sections of the health report that represent real problems.
const HEALTH_PROBLEM_KEYS = [
  "orphans",
  "stale",
  "dead_refs",
  "broken_links",
  "broken_xwiki",
  "empty_topics",
  "empty_pages",
  "slug_collisions"
];
// Sum the lengths of all problem sections in `almanac health --json`
// output. Unparseable or malformed output counts as zero problems
// (deliberate best-effort: doctor must not crash on bad health output).
function countHealthProblems(jsonStdout) {
  try {
    const report = JSON.parse(jsonStdout);
    return HEALTH_PROBLEM_KEYS.reduce((total, key) => {
      const section = report[key];
      return Array.isArray(section) ? total + section.length : total;
    }, 0);
  } catch {
    return 0;
  }
}
235
+ export {
236
+ gatherWikiChecks
237
+ };
238
+ //# sourceMappingURL=wiki-EHZ7LG7R.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/commands/doctor-checks/wiki.ts"],"sourcesContent":["import { existsSync, readdirSync, statSync } from \"node:fs\";\nimport path from \"node:path\";\n\nimport type Database from \"better-sqlite3\";\n\nimport { ensureFreshIndex } from \"../../indexer/index.js\";\nimport { openIndex } from \"../../indexer/schema.js\";\nimport { findNearestAlmanacDir } from \"../../paths.js\";\nimport { findEntry } from \"../../registry/index.js\";\nimport { runHealth, type HealthReport } from \"../health.js\";\nimport { formatDuration } from \"./duration.js\";\nimport type { Check, DoctorOptions } from \"./types.js\";\n\nexport async function gatherWikiChecks(options: DoctorOptions): Promise<Check[]> {\n const checks: Check[] = [];\n const repoRoot = findNearestAlmanacDir(options.cwd);\n\n if (repoRoot === null) {\n checks.push({\n status: \"info\",\n key: \"wiki.none\",\n message: \"No wiki in current directory\",\n fix: \"run: almanac bootstrap (to create one in this repo)\",\n });\n return checks;\n }\n\n checks.push({\n status: \"info\",\n key: \"wiki.repo\",\n message: `repo: ${repoRoot}`,\n });\n\n try {\n await ensureFreshIndex({ repoRoot });\n } catch {\n // non-fatal: counts below and the health probe report any real issue.\n }\n\n checks.push(await describeRegistry(repoRoot));\n\n const almanacDir = path.join(repoRoot, \".almanac\");\n const dbPath = path.join(almanacDir, \"index.db\");\n checks.push(...describeCounts(dbPath));\n checks.push(describeIndexFreshness(dbPath));\n checks.push(describeLastCapture(almanacDir, options.now));\n checks.push(await describeHealth(repoRoot, options));\n\n return checks;\n}\n\nasync function describeRegistry(repoRoot: string): Promise<Check> {\n try {\n const entry = await findEntry({ path: repoRoot });\n if (entry !== null) {\n return {\n status: \"ok\",\n key: \"wiki.registered\",\n message: `registered as '${entry.name}'`,\n };\n }\n return {\n status: \"info\",\n key: \"wiki.registered\",\n message: 
\"not yet registered (will register on first command)\",\n };\n } catch (err: unknown) {\n const msg = err instanceof Error ? err.message : String(err);\n return {\n status: \"problem\",\n key: \"wiki.registered\",\n message: `could not read registry: ${msg}`,\n fix: \"inspect ~/.almanac/registry.json; remove or fix the malformed entry\",\n };\n }\n}\n\nfunction describeCounts(dbPath: string): Check[] {\n const checks: Check[] = [];\n let pageCount: number | null = null;\n let topicCount: number | null = null;\n\n if (existsSync(dbPath)) {\n try {\n const db = openIndex(dbPath);\n try {\n pageCount = countRows(db, \"pages\");\n topicCount = countRows(db, \"topics\");\n } finally {\n db.close();\n }\n } catch {\n pageCount = null;\n }\n }\n\n if (pageCount !== null) {\n checks.push({\n status: \"info\",\n key: \"wiki.pages\",\n message: `pages: ${pageCount}`,\n });\n }\n if (topicCount !== null) {\n checks.push({\n status: \"info\",\n key: \"wiki.topics\",\n message: `topics: ${topicCount}`,\n });\n }\n\n return checks;\n}\n\nfunction countRows(db: Database.Database, table: string): number {\n const row = db\n .prepare<[], { n: number }>(`SELECT COUNT(*) AS n FROM ${table}`)\n .get();\n return row?.n ?? 
0;\n}\n\nfunction describeIndexFreshness(dbPath: string): Check {\n if (!existsSync(dbPath)) {\n return {\n status: \"info\",\n key: \"wiki.index\",\n message: \"index: not built yet (run any query command)\",\n };\n }\n try {\n const dbMtime = statSync(dbPath).mtimeMs;\n const age = Date.now() - dbMtime;\n return {\n status: \"info\",\n key: \"wiki.index\",\n message: `index: rebuilt ${formatDuration(age)} ago`,\n };\n } catch {\n return {\n status: \"info\",\n key: \"wiki.index\",\n message: \"index: present\",\n };\n }\n}\n\nfunction describeLastCapture(\n almanacDir: string,\n nowFn?: () => Date,\n): Check {\n if (!existsSync(almanacDir)) {\n return {\n status: \"info\",\n key: \"wiki.capture\",\n message: \"last capture: never\",\n };\n }\n let entries: string[];\n try {\n entries = readdirSync(almanacDir);\n } catch {\n return {\n status: \"info\",\n key: \"wiki.capture\",\n message: \"last capture: unknown\",\n };\n }\n const captures = entries\n .filter(\n (e) =>\n e.startsWith(\".capture-\") &&\n (e.endsWith(\".log\") || e.endsWith(\".jsonl\")),\n )\n .map((e) => {\n try {\n return {\n name: e,\n mtime: statSync(path.join(almanacDir, e)).mtimeMs,\n };\n } catch {\n return null;\n }\n })\n .filter((e): e is { name: string; mtime: number } => e !== null);\n if (captures.length === 0) {\n return {\n status: \"info\",\n key: \"wiki.capture\",\n message: \"last capture: never\",\n };\n }\n captures.sort((a, b) => b.mtime - a.mtime);\n const latest = captures[0]!;\n const now = (nowFn?.() ?? new Date()).getTime();\n const age = now - latest.mtime;\n return {\n status: \"info\",\n key: \"wiki.capture\",\n message: `last capture: ${formatDuration(age)} ago (${latest.name})`,\n };\n}\n\nasync function describeHealth(\n repoRoot: string,\n options: DoctorOptions,\n): Promise<Check> {\n const healthFn = options.runHealthFn ?? 
runHealth;\n try {\n const healthRes = await healthFn({\n cwd: repoRoot,\n json: true,\n });\n const problems = countHealthProblems(healthRes.stdout);\n if (problems === 0) {\n return {\n status: \"ok\",\n key: \"wiki.health\",\n message: \"almanac health reports 0 problems\",\n };\n }\n return {\n status: \"problem\",\n key: \"wiki.health\",\n message: `almanac health reports ${problems} problem${problems === 1 ? \"\" : \"s\"}`,\n fix: \"run: almanac health\",\n };\n } catch (err: unknown) {\n const msg = err instanceof Error ? err.message : String(err);\n return {\n status: \"info\",\n key: \"wiki.health\",\n message: `could not run almanac health: ${msg}`,\n };\n }\n}\n\nconst HEALTH_PROBLEM_KEYS: (keyof HealthReport)[] = [\n \"orphans\",\n \"stale\",\n \"dead_refs\",\n \"broken_links\",\n \"broken_xwiki\",\n \"empty_topics\",\n \"empty_pages\",\n \"slug_collisions\",\n];\n\nfunction countHealthProblems(jsonStdout: string): number {\n try {\n const report = JSON.parse(jsonStdout) as Partial<HealthReport>;\n let total = 0;\n for (const key of HEALTH_PROBLEM_KEYS) {\n const arr = report[key];\n if (Array.isArray(arr)) total += arr.length;\n }\n return total;\n } catch {\n return 0;\n 
}\n}\n"],"mappings":";;;;;;;;;;;;;;;;AAAA,SAAS,YAAY,aAAa,gBAAgB;AAClD,OAAO,UAAU;AAYjB,eAAsB,iBAAiB,SAA0C;AAC/E,QAAM,SAAkB,CAAC;AACzB,QAAM,WAAW,sBAAsB,QAAQ,GAAG;AAElD,MAAI,aAAa,MAAM;AACrB,WAAO,KAAK;AAAA,MACV,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,SAAS;AAAA,MACT,KAAK;AAAA,IACP,CAAC;AACD,WAAO;AAAA,EACT;AAEA,SAAO,KAAK;AAAA,IACV,QAAQ;AAAA,IACR,KAAK;AAAA,IACL,SAAS,SAAS,QAAQ;AAAA,EAC5B,CAAC;AAED,MAAI;AACF,UAAM,iBAAiB,EAAE,SAAS,CAAC;AAAA,EACrC,QAAQ;AAAA,EAER;AAEA,SAAO,KAAK,MAAM,iBAAiB,QAAQ,CAAC;AAE5C,QAAM,aAAa,KAAK,KAAK,UAAU,UAAU;AACjD,QAAM,SAAS,KAAK,KAAK,YAAY,UAAU;AAC/C,SAAO,KAAK,GAAG,eAAe,MAAM,CAAC;AACrC,SAAO,KAAK,uBAAuB,MAAM,CAAC;AAC1C,SAAO,KAAK,oBAAoB,YAAY,QAAQ,GAAG,CAAC;AACxD,SAAO,KAAK,MAAM,eAAe,UAAU,OAAO,CAAC;AAEnD,SAAO;AACT;AAEA,eAAe,iBAAiB,UAAkC;AAChE,MAAI;AACF,UAAM,QAAQ,MAAM,UAAU,EAAE,MAAM,SAAS,CAAC;AAChD,QAAI,UAAU,MAAM;AAClB,aAAO;AAAA,QACL,QAAQ;AAAA,QACR,KAAK;AAAA,QACL,SAAS,kBAAkB,MAAM,IAAI;AAAA,MACvC;AAAA,IACF;AACA,WAAO;AAAA,MACL,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,SAAS;AAAA,IACX;AAAA,EACF,SAAS,KAAc;AACrB,UAAM,MAAM,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC3D,WAAO;AAAA,MACL,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,SAAS,4BAA4B,GAAG;AAAA,MACxC,KAAK;AAAA,IACP;AAAA,EACF;AACF;AAEA,SAAS,eAAe,QAAyB;AAC/C,QAAM,SAAkB,CAAC;AACzB,MAAI,YAA2B;AAC/B,MAAI,aAA4B;AAEhC,MAAI,WAAW,MAAM,GAAG;AACtB,QAAI;AACF,YAAM,KAAK,UAAU,MAAM;AAC3B,UAAI;AACF,oBAAY,UAAU,IAAI,OAAO;AACjC,qBAAa,UAAU,IAAI,QAAQ;AAAA,MACrC,UAAE;AACA,WAAG,MAAM;AAAA,MACX;AAAA,IACF,QAAQ;AACN,kBAAY;AAAA,IACd;AAAA,EACF;AAEA,MAAI,cAAc,MAAM;AACtB,WAAO,KAAK;AAAA,MACV,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,SAAS,UAAU,SAAS;AAAA,IAC9B,CAAC;AAAA,EACH;AACA,MAAI,eAAe,MAAM;AACvB,WAAO,KAAK;AAAA,MACV,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,SAAS,WAAW,UAAU;AAAA,IAChC,CAAC;AAAA,EACH;AAEA,SAAO;AACT;AAEA,SAAS,UAAU,IAAuB,OAAuB;AAC/D,QAAM,MAAM,GACT,QAA2B,6BAA6B,KAAK,EAAE,EAC/D,IAAI;AACP,SAAO,KAAK,KAAK;AACnB;AAEA,SAAS,uBAAuB,QAAuB;AACrD,MAAI,CAAC,WAAW,MAAM,GAAG;AACvB,WAAO;AAAA,MACL,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,SAAS;AAAA,IACX;AAAA,EACF;AACA,MAAI;AACF,UAAM,UAAU,SAAS,MAAM,EAAE;AACjC,UAAM,MAAM,KAAK,IAAI
,IAAI;AACzB,WAAO;AAAA,MACL,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,SAAS,kBAAkB,eAAe,GAAG,CAAC;AAAA,IAChD;AAAA,EACF,QAAQ;AACN,WAAO;AAAA,MACL,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,SAAS;AAAA,IACX;AAAA,EACF;AACF;AAEA,SAAS,oBACP,YACA,OACO;AACP,MAAI,CAAC,WAAW,UAAU,GAAG;AAC3B,WAAO;AAAA,MACL,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,SAAS;AAAA,IACX;AAAA,EACF;AACA,MAAI;AACJ,MAAI;AACF,cAAU,YAAY,UAAU;AAAA,EAClC,QAAQ;AACN,WAAO;AAAA,MACL,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,SAAS;AAAA,IACX;AAAA,EACF;AACA,QAAM,WAAW,QACd;AAAA,IACC,CAAC,MACC,EAAE,WAAW,WAAW,MACvB,EAAE,SAAS,MAAM,KAAK,EAAE,SAAS,QAAQ;AAAA,EAC9C,EACC,IAAI,CAAC,MAAM;AACV,QAAI;AACF,aAAO;AAAA,QACL,MAAM;AAAA,QACN,OAAO,SAAS,KAAK,KAAK,YAAY,CAAC,CAAC,EAAE;AAAA,MAC5C;AAAA,IACF,QAAQ;AACN,aAAO;AAAA,IACT;AAAA,EACF,CAAC,EACA,OAAO,CAAC,MAA4C,MAAM,IAAI;AACjE,MAAI,SAAS,WAAW,GAAG;AACzB,WAAO;AAAA,MACL,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,SAAS;AAAA,IACX;AAAA,EACF;AACA,WAAS,KAAK,CAAC,GAAG,MAAM,EAAE,QAAQ,EAAE,KAAK;AACzC,QAAM,SAAS,SAAS,CAAC;AACzB,QAAM,OAAO,QAAQ,KAAK,oBAAI,KAAK,GAAG,QAAQ;AAC9C,QAAM,MAAM,MAAM,OAAO;AACzB,SAAO;AAAA,IACL,QAAQ;AAAA,IACR,KAAK;AAAA,IACL,SAAS,iBAAiB,eAAe,GAAG,CAAC,SAAS,OAAO,IAAI;AAAA,EACnE;AACF;AAEA,eAAe,eACb,UACA,SACgB;AAChB,QAAM,WAAW,QAAQ,eAAe;AACxC,MAAI;AACF,UAAM,YAAY,MAAM,SAAS;AAAA,MAC/B,KAAK;AAAA,MACL,MAAM;AAAA,IACR,CAAC;AACD,UAAM,WAAW,oBAAoB,UAAU,MAAM;AACrD,QAAI,aAAa,GAAG;AAClB,aAAO;AAAA,QACL,QAAQ;AAAA,QACR,KAAK;AAAA,QACL,SAAS;AAAA,MACX;AAAA,IACF;AACA,WAAO;AAAA,MACL,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,SAAS,0BAA0B,QAAQ,WAAW,aAAa,IAAI,KAAK,GAAG;AAAA,MAC/E,KAAK;AAAA,IACP;AAAA,EACF,SAAS,KAAc;AACrB,UAAM,MAAM,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG;AAC3D,WAAO;AAAA,MACL,QAAQ;AAAA,MACR,KAAK;AAAA,MACL,SAAS,iCAAiC,GAAG;AAAA,IAC/C;AAAA,EACF;AACF;AAEA,IAAM,sBAA8C;AAAA,EAClD;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAEA,SAAS,oBAAoB,YAA4B;AACvD,MAAI;AACF,UAAM,SAAS,KAAK,MAAM,UAAU;AACpC,QAAI,QAAQ;AACZ,eAAW,OAAO,qBAAqB;AACrC,YAAM,MAAM,OAAO,GAAG;AACtB,UAAI,MAAM,QAAQ,GAAG,EAAG,UAAS,IAAI;AAAA,IACvC;AACA,WAAO;AAAA,EACT,QAAQ;AACN,WAAO;AAAA,EACT;AACF;","names"
:[]}
@@ -0,0 +1,152 @@
1
+ # Processing Claude Code Sessions
2
+
3
+ ## Format overview
4
+
5
+ Claude Code stores sessions as JSONL files (one JSON object per line) at:
6
+ ```
7
+ ~/.claude/projects/<project-hash>/<session-uuid>.jsonl
8
+ ```
9
+
10
+ The project hash is a path with slashes replaced by dashes (e.g., `-Users-rohan-Desktop-Projects-myrepo`). Each session file contains the full conversation history including tool calls, tool results, thinking blocks, and metadata.
11
+
12
+ Typical session sizes: 150KB (short Q&A) to 8MB+ (multi-hour coding session). Line counts range from ~75 to ~1,750.
13
+
14
+ ## Record types
15
+
16
+ Each line is a JSON object with a `type` field at the top level:
17
+
18
+ | Type | Frequency | What it contains |
19
+ |------|-----------|-----------------|
20
+ | `assistant` | ~40% of records, ~15% of bytes | Model responses: text, tool_use, thinking blocks. Each content item is in `message.content[]` |
21
+ | `user` | ~35% of records, ~55% of bytes | Human messages OR tool results. Check `message.content[].type` to distinguish |
22
+ | `attachment` | ~5% of records, ~7% of bytes | System context injected by the harness: deferred tool lists, skill listings, memory, task reminders, edited file snippets |
23
+ | `file-history-snapshot` | ~5% of records, <1% of bytes | Checkpoint markers for undo/redo. Always tiny (~250 bytes) |
24
+ | `permission-mode` | ~3% of records, <1% of bytes | Records when permission mode changes (e.g., `bypassPermissions`) |
25
+ | `last-prompt` | ~3% of records, <1% of bytes | Marks turn boundaries. ~120 bytes each |
26
+ | `system` | ~2% of records, <1% of bytes | System messages injected mid-conversation. Often empty content |
27
+ | `ai-title` | rare | Auto-generated session title |
28
+ | `queue-operation` | rare | Queued follow-up commands |
29
+
30
+ ## What to extract (signal)
31
+
32
+ ### 1. Human messages (highest signal density)
33
+ - **Where:** Records with `type: "user"` where `message.content[]` contains items with `type: "text"`
34
+ - **Also check:** `userType` field -- `"external"` means the actual human typed this
35
+ - **What:** Intent, requirements, feedback, decisions, bug reports, design direction
36
+ - **Example pattern:** `{"type": "user", "message": {"content": [{"type": "text", "text": "What problems did you run into?"}]}}`
37
+
38
+ ### 2. Assistant text responses
39
+ - **Where:** Records with `type: "assistant"`, then `message.content[]` items with `type: "text"`
40
+ - **What:** Explanations, decisions, summaries, architecture analysis, bug diagnoses
41
+ - **Typical size:** 100-3000 chars per text block
42
+ - **Example pattern:** The assistant explains a root cause, summarizes what was built, or describes a design decision
43
+
44
+ ### 3. Subagent results (high-value signal hidden in noise)
45
+ - **Where:** Records with `type: "user"` that have a top-level `toolUseResult` field with `agentType` set
46
+ - **What:** Complete results from subagents (review, critic, pair, etc.). The `content` field contains the full subagent output, often multi-thousand-character analysis
47
+ - **Key fields:** `toolUseResult.agentType` (e.g., "general-purpose", "review", "critic", "pair"), `toolUseResult.content[].text`, `toolUseResult.prompt` (what the subagent was asked to do)
48
+ - **Why it matters:** These are often the densest signal in a session -- complete audit reports, code reviews, architecture analyses
49
+
50
+ ### 4. Session metadata
51
+ - **Where:** First few records of the file
52
+ - **Key fields on user records:** `cwd`, `gitBranch`, `version`, `timestamp`, `sessionId`, `entrypoint` (cli vs other)
53
+ - **Attachment records** with `type: "nested_memory"` contain the project's memory/context
54
+
55
+ ## What to skip (noise)
56
+
57
+ ### 1. Tool results (~24% of total bytes) -- SKIP
58
+ - **Where:** `user` records where `message.content[]` has `type: "tool_result"`
59
+ - **Why skip:** These are file contents, grep results, build output, test output. The actual files are in the repo; the output is transient
60
+ - **The biggest offenders:** Read tool results can be 150KB+ (entire file contents dumped inline)
61
+
62
+ ### 2. Tool calls (~8% of total bytes) -- SKIP or SUMMARIZE
63
+ - **Where:** `assistant` records where `message.content[]` has `type: "tool_use"`
64
+ - **Contains:** `name` (Bash, Read, Grep, Edit, Write, Glob) and `input` (command, file path, pattern)
65
+ - **Why skip:** The sequence of tool calls is operational, not knowledge. Exception: summarize the *pattern* of tool usage ("read 15 files in src/auth/")
66
+
67
+ ### 3. Wrapper overhead (~38% of total bytes in large sessions) -- SKIP
68
+ - **Where:** Top-level fields on `user` records: `parentUuid`, `sourceToolAssistantUUID`, `toolUseResult` (when not a subagent), `slug`, `requestId`, `isMeta`, `isSidechain`
69
+ - **Why skip:** The `toolUseResult` field on user records DUPLICATES the tool result content that already appears in `message.content[]`. This is the single biggest source of bloat. In one 8MB session, `toolUseResult` alone was 2.7MB (33%)
70
+ - **EXCEPTION:** When `toolUseResult.agentType` is set, this is a subagent result and IS signal
71
+
72
+ ### 4. Empty thinking blocks (~6% in some sessions) -- SKIP
73
+ - **Where:** `assistant` records, `message.content[]` with `type: "thinking"` but `thinking: ""`
74
+ - **Why:** Claude Code often records thinking blocks with empty content (the actual thinking happened but was not persisted). These are pure waste
75
+
76
+ ### 5. Attachments (~7%) -- SKIP
77
+ - **Where:** `type: "attachment"` records
78
+ - **Contains:** `deferred_tools_delta` (tool availability lists), `skill_listing` (repeated skill menus), `task_reminder` (repeated TODO lists), `mcp_instructions_delta` (MCP setup)
79
+ - **Why skip:** Harness infrastructure, not knowledge. Repeated across turns
80
+
81
+ ### 6. Metadata records (<1%) -- SKIP
82
+ - `file-history-snapshot`, `permission-mode`, `last-prompt`, `queue-operation`, `ai-title`
83
+
84
+ ### 7. Base64 image data (~1-7% in some sessions) -- SKIP
85
+ - **Where:** `user` records with `message.content[]` containing `type: "image"` and `source.type: "base64"`
86
+ - **Why skip:** Screenshots pasted by the user. Can be 100KB+ of base64 per image. Not extractable as knowledge
87
+
88
+ ## What to summarize
89
+
90
+ These patterns should be compressed rather than fully extracted or fully skipped:
91
+
92
+ | Pattern | Summarize as |
93
+ |---------|-------------|
94
+ | 10+ consecutive tool_use/tool_result pairs reading files | "Read N files in {directory pattern}" |
95
+ | grep/glob sequences searching for a pattern | "Searched for {pattern} across {scope}" |
96
+ | Edit tool calls modifying files | "Modified {file}: {description from the Edit input}" |
97
+ | Bash commands running tests | "Ran tests: {pass/fail summary from result}" |
98
+ | Bash commands running builds | "Built {target}: {success/failure}" |
99
+
100
+ ## Extraction approach
101
+
102
+ 1. **Parse the JSONL file** line by line. Each line is one JSON object.
103
+ 2. **First pass -- extract metadata:** From the first `user` record, grab `cwd`, `gitBranch`, `version`, `timestamp`, `sessionId`.
104
+ 3. **For each record, check `type`:**
105
+ - `type: "user"` with `message.content[].type == "text"` -> **extract as human message**
106
+ - `type: "user"` with `toolUseResult.agentType` set -> **extract subagent result** from `toolUseResult.content[].text`
107
+ - `type: "user"` with `message.content[].type == "tool_result"` -> **skip** (or summarize tool call patterns)
108
+ - `type: "assistant"` with `message.content[].type == "text"` -> **extract as assistant reasoning**
109
+ - `type: "assistant"` with `message.content[].type == "thinking"` and non-empty `thinking` field -> **extract as internal reasoning**
110
+ - `type: "assistant"` with `message.content[].type == "tool_use"` -> **skip or summarize**
111
+ - `type: "attachment"`, `type: "system"`, metadata types -> **skip**
112
+ 4. **Second pass -- deduplicate:** The `toolUseResult` field on user records often duplicates content from `message.content[]`. Always prefer `message.content[]` and only use `toolUseResult` for subagent data.
113
+ 5. **Third pass -- compress tool sequences:** Collapse consecutive tool_use + tool_result pairs into summaries.
114
+
115
+ ## Example: signal extraction from a real session
116
+
117
+ **Human intent (from user record):**
118
+ > "Tell me about our ref token?"
119
+ Signal: User wants to understand the ref token system.
120
+
121
+ **Assistant reasoning (from assistant text):**
122
+ > "Ref Token: Opaque backend-signed token that binds a locally-edited .md file to a specific page in the DB. Backend signs on download, verifies on publish. No signing secret ever touches the client."
123
+ Signal: Architectural explanation of the ref token system.
124
+
125
+ **Assistant problem diagnosis (from assistant text):**
126
+ > "Root cause: the codealmanac CLI expects topics.yaml in the new list format but .almanac/topics.yaml is still in the old dict format"
127
+ Signal: Bug identification with root cause.
128
+
129
+ **Subagent audit (from toolUseResult with agentType):**
130
+ > agentType: "general-purpose", prompt: "You are auditing the hosted editor..."
131
+ > content: "Audit Report: Hosted Editor / Quill Co-Editing (Pre-Phase-5) ... 3 HIGH gaps found ... Fix malformed edit-result surfacing ... Fix Quill session isolation ..."
132
+ Signal: Complete code audit with findings, priorities, and recommendations.
133
+
134
+ **Noise skipped:** 522 tool results totaling 2MB of file contents, 466KB of empty thinking blocks, 408KB of metadata records, 2.7MB of duplicated toolUseResult data.
135
+
136
+ ## Gotchas
137
+
138
+ 1. **toolUseResult duplication is massive.** In one 8MB session, `toolUseResult` accounted for 33% of the file. It duplicates `message.content[]` tool results AND sometimes contains subagent results. Always check `agentType` before discarding.
139
+
140
+ 2. **Thinking blocks are always empty in recent versions.** The `thinking` field in content items of type `"thinking"` is consistently empty string in observed sessions (95 empty out of 95 in one session). The thinking content is not persisted. Do not expect signal here despite the promising field name.
141
+
142
+ 3. **assistant records contain message-level metadata.** The `message` object has `usage` (token counts), `model` (model name), `stop_reason` (why generation stopped). These can be useful for understanding session dynamics but are not knowledge signal.
143
+
144
+ 4. **user records serve double duty.** A `user` record with `userType: "external"` and text content is a real human message. A `user` record with tool_result content is just the harness returning tool output. Always check `message.content[].type`.
145
+
146
+ 5. **The `isSidechain` field** indicates branched conversations (user went back and tried a different approach). Sidechain records may represent abandoned approaches -- still potentially valuable as "what was tried and rejected."
147
+
148
+ 6. **Base64 images can be huge.** A single screenshot paste can add 100KB+ of base64 data to a user record. These look like small records until you measure them.
149
+
150
+ 7. **Attachment records repeat.** `skill_listing` and `task_reminder` attachments are re-injected at many turn boundaries. The same content appears 10-15 times in a long session.
151
+
152
+ 8. **Subagent data structure:** The `toolUseResult` for subagents includes `toolStats` (readCount, searchCount, bashCount, editFileCount, linesAdded, linesRemoved) and `usage` (input_tokens, output_tokens, cache stats). These are useful metadata about the subagent's work.
@@ -0,0 +1,214 @@
1
+ # Processing Codex Sessions
2
+
3
+ ## Format overview
4
+
5
+ Codex stores sessions as JSONL files at:
6
+ ```
7
+ ~/.codex/sessions/<year>/<month>/<day>/rollout-<timestamp>-<thread-uuid>.jsonl
8
+ ```
9
+
10
+ A SQLite database at `~/.codex/state_5.sqlite` provides session metadata (title, cwd, model, tokens_used, git info) in the `threads` table.
11
+
12
+ Multiple rollout files for the same timestamp indicate subagent threads spawned by the parent session. The SQLite `source` column reveals this: `"vscode"` for top-level sessions, a JSON blob with `subagent.thread_spawn.parent_thread_id` for child threads.
13
+
14
+ Typical session sizes: 500KB (short task) to 12MB+ (multi-turn debugging session). Line counts range from ~175 to ~2,800.
15
+
16
+ ## Record types
17
+
18
+ Each line is a JSON object with a `type` field:
19
+
20
+ | Type | % of records | % of bytes | What it contains |
21
+ |------|-------------|------------|-----------------|
22
+ | `response_item` | ~55% | ~36% | Model outputs: function calls, function call outputs, messages, reasoning |
23
+ | `event_msg` | ~43% | ~29% | Harness events: command execution results, token counts, agent messages, task lifecycle |
24
+ | `turn_context` | ~2% | ~5% | Per-turn context: model, cwd, instructions, settings. Repeated every turn |
25
+ | `session_meta` | 1 per file | ~1-3% | Session metadata: id, cwd, model, CLI version, base instructions, skills |
26
+ | `compacted` | rare (0-3) | 10-30% when present | Compressed conversation history from context window compaction |
27
+
28
+ ### response_item subtypes (in `payload.type`)
29
+
30
+ | Subtype | What it contains |
31
+ |---------|-----------------|
32
+ | `function_call` | Tool invocations: `name` (always `exec_command`), `arguments` (JSON with cmd, workdir, yield_time_ms) |
33
+ | `function_call_output` | Tool results: `output` (command stdout/stderr as string) |
34
+ | `message` | Conversation messages. Check `payload.role`: `developer` (system prompts), `user` (human + context), `assistant` (model output) |
35
+ | `reasoning` | Model reasoning. Contains `encrypted_content` (unreadable) and `summary` (always empty in observed data) |
36
+ | `custom_tool_call` | File edit operations via `apply_patch`. Contains unified diff in `input` |
37
+ | `custom_tool_call_output` | Patch application results: success/failure + modified file list |
38
+ | `web_search_call` | Web search invocations (rare) |
39
+
40
+ ### event_msg subtypes (in `payload.type`)
41
+
42
+ | Subtype | What it contains |
43
+ |---------|-----------------|
44
+ | `user_message` | Human input: `message` (text), `images` (base64 data URIs), `text_elements` |
45
+ | `agent_message` | Model commentary shown to user: `message`, `phase` (always "commentary") |
46
+ | `exec_command_end` | Command execution results (DUPLICATES `function_call_output`): stdout, stderr, aggregated_output, exit_code, duration, command |
47
+ | `token_count` | Token usage and rate limit info per turn |
48
+ | `task_started` | Turn lifecycle: turn_id, model_context_window, collaboration_mode |
49
+ | `task_complete` | Turn completion: `last_agent_message` (the final text shown to user) |
50
+ | `patch_apply_end` | File edit results: stdout, changes list, success boolean |
51
+ | `context_compacted` | Marker that context window was compacted |
52
+ | `turn_aborted` | Turn was cancelled |
53
+
54
+ ## What to extract (signal)
55
+
56
+ ### 1. Human messages (highest signal density)
57
+ - **Where:** `event_msg` records with `payload.type == "user_message"`
58
+ - **Field:** `payload.message`
59
+ - **Also in:** `response_item` with `payload.type == "message"` and `payload.role == "user"`, `payload.content[].type == "input_text"`. The event_msg version is cleaner
60
+ - **Watch for:** The `response_item` version also contains system/developer context injected alongside the real user message. Extract only `input_text` items where the text is NOT wrapped in XML tags like `<environment_context>`, `<permissions instructions>`, `<app-context>`, `<skills_instructions>`, `<collaboration_mode>`
61
+
62
+ ### 2. Agent messages (model's user-facing commentary)
63
+ - **Where:** `event_msg` records with `payload.type == "agent_message"`
64
+ - **Fields:** `payload.message`, `payload.phase`
65
+ - **What:** Short status updates and reasoning the model shares with the user. These are the "thinking out loud" moments
66
+ - **Example:** "I'm inspecting the repo for category vs topic naming drift and I'll trace it through code, routes, API shapes, and copy so the discrepancies are concrete rather than guessed."
67
+
68
+ ### 3. Task completion summaries
69
+ - **Where:** `event_msg` records with `payload.type == "task_complete"`
70
+ - **Field:** `payload.last_agent_message`
71
+ - **What:** The final, complete response for each turn. Often the densest signal -- the model's synthesized answer after all tool use. Can be multi-thousand characters of analysis
72
+
73
+ ### 4. Assistant output text
74
+ - **Where:** `response_item` with `payload.type == "message"`, `payload.role == "assistant"`, content items with `type: "output_text"`
75
+ - **What:** Model's text responses interspersed with tool calls. Shorter than task_complete but captures incremental reasoning
76
+
77
+ ### 5. File edits (apply_patch)
78
+ - **Where:** `response_item` with `payload.type == "custom_tool_call"` and `payload.name == "apply_patch"`
79
+ - **Field:** `payload.input` contains a unified diff
80
+ - **What:** Every code change the model made. Extract the file path and a summary of the change, not the full diff (the repo has the final state)
81
+
82
+ ### 6. Session metadata
83
+ - **Where:** `session_meta` record (first line of file)
84
+ - **Key fields:** `payload.id`, `payload.cwd`, `payload.model_provider`, `payload.cli_version`, `payload.source`, `payload.model` (in turn_context)
85
+ - **SQLite enrichment:** Query `threads` table for `title`, `tokens_used`, `git_branch`, `first_user_message`, `source` (reveals if this is a subagent)
86
+
87
+ ## What to skip (noise)
88
+
89
+ ### 1. function_call_output records (~17% of bytes) -- SKIP
90
+ - **Where:** `response_item` with `payload.type == "function_call_output"`
91
+ - **Why:** Raw command output (file contents, grep results, build output). Already in the repo or transient
92
+
93
+ ### 2. exec_command_end records (~15% of bytes) -- SKIP
94
+ - **Where:** `event_msg` with `payload.type == "exec_command_end"`
95
+ - **Why:** DUPLICATES `function_call_output` with the same `call_id`. Contains stdout, stderr, aggregated_output redundantly. In one 12MB session, 399 of these consumed 1.9MB
96
+ - **Note:** 100% overlap with function_call_output on shared call_ids
97
+
98
+ ### 3. Reasoning records (~6% of bytes) -- SKIP
99
+ - **Where:** `response_item` with `payload.type == "reasoning"`
100
+ - **Why:** Contains `encrypted_content` (base64 blob, unreadable) and `summary` (consistently empty array in all observed sessions). No extractable signal
101
+ - **Do not confuse with:** `agent_message` records, which ARE readable model reasoning
102
+
103
+ ### 4. turn_context records (~5-25% of bytes) -- SKIP
104
+ - **Where:** `type: "turn_context"`
105
+ - **Why:** Repeated every turn. Contains model name, cwd, instructions, sandbox policy, collaboration mode. Same content each time with minor variations
106
+
107
+ ### 5. session_meta base_instructions (~3% of bytes) -- SKIP
108
+ - **Where:** `session_meta` record, `payload.base_instructions.text`
109
+ - **Why:** Codex's built-in system prompt. Same across all sessions. ~2000 chars of personality and behavior instructions
110
+
111
+ ### 6. Developer instructions in message records -- SKIP
112
+ - **Where:** `response_item` messages with `payload.role == "developer"`
113
+ - **Contains:** `<permissions instructions>`, `<app-context>`, `<collaboration_mode>`, `<apps_instructions>`, `<skills_instructions>` XML blocks
114
+ - **Why:** Harness configuration, not user knowledge. Can be 10KB+ per occurrence
115
+
116
+ ### 7. token_count records (~2%) -- SKIP
117
+ - Rate limit and token usage telemetry
118
+
119
+ ### 8. function_call records (~1.5%) -- SKIP or SUMMARIZE
120
+ - **Where:** `response_item` with `payload.type == "function_call"`
121
+ - **Contains:** `name` (always `exec_command`), `arguments` (cmd, workdir)
122
+ - **Why:** Operational commands. Summarize the pattern, not individual calls
123
+
124
+ ### 9. Base64 images in user_message (~4-7% when present) -- SKIP
125
+ - **Where:** `event_msg` with `payload.type == "user_message"`, `payload.images[]`
126
+ - **Format:** data URIs (`data:image/png;base64,...`), 350KB-550KB each
127
+ - **Also in:** `response_item` message content with `type: "input_image"` and `image_url`
128
+ - **Why:** Screenshots. Not extractable as text knowledge. A single image can be 550KB
129
+
130
+ ### 10. Compacted records (10-30% when present) -- EXTRACT SELECTIVELY
131
+ - **Where:** `type: "compacted"` records
132
+ - **Contains:** `payload.replacement_history[]` -- a compressed version of earlier conversation
133
+ - **Treatment:** These contain summarized versions of earlier turns after context compaction. The `replacement_history` items have `role` and `content[]` with `input_text`/`output_text`. Extract output_text items (model summaries) but skip input_text (already captured from the original records earlier in the file)
134
+
135
+ ## What to summarize
136
+
137
+ | Pattern | Summarize as |
138
+ |---------|-------------|
139
+ | N consecutive exec_command function_call/output pairs | "Ran N commands exploring {pattern}" |
140
+ | grep/find/sed sequences reading files | "Searched for {pattern} in {directory}" |
141
+ | apply_patch calls | "Modified {file}: {one-line description from diff}" |
142
+ | Multiple agent_message records saying similar things | Keep only the last one before task_complete |
143
+
144
+ ## Extraction approach
145
+
146
+ 1. **Check SQLite first** for session metadata: `SELECT id, title, cwd, model, tokens_used, git_branch, source, first_user_message FROM threads WHERE id = '<thread-id>'`. The `source` field tells you if this is a subagent session.
147
+
148
+ 2. **Parse the JSONL file** line by line.
149
+
150
+ 3. **Extract session_meta** (first record): grab `payload.id`, `payload.cwd`, `payload.source`, `payload.model_provider`.
151
+
152
+ 4. **For each record, route by type and subtype:**
153
+ - `event_msg` + `user_message` -> **extract** `payload.message` as human input. Note `payload.images` count but skip the base64 data
154
+ - `event_msg` + `agent_message` -> **extract** `payload.message` as model reasoning
155
+ - `event_msg` + `task_complete` -> **extract** `payload.last_agent_message` as turn summary
156
+ - `event_msg` + `exec_command_end` -> **skip** (duplicated in response_item)
157
+ - `event_msg` + `token_count` / `task_started` / `context_compacted` -> **skip**
158
+ - `response_item` + `message` + `role: "assistant"` -> **extract** output_text content
159
+ - `response_item` + `message` + `role: "developer"` or `role: "user"` with XML-tagged content -> **skip**
160
+ - `response_item` + `message` + `role: "user"` with plain text -> **extract** (but deduplicate against event_msg user_message)
161
+ - `response_item` + `function_call_output` -> **skip**
162
+ - `response_item` + `function_call` -> **summarize** command pattern
163
+ - `response_item` + `reasoning` -> **skip** (encrypted, empty summary)
164
+ - `response_item` + `custom_tool_call` -> **summarize** file path and change description
165
+ - `response_item` + `custom_tool_call_output` -> **skip** (just success/fail)
166
+ - `turn_context` -> **skip**
167
+ - `compacted` -> **extract** output_text from `payload.replacement_history[]`
168
+
169
+ 5. **Deduplicate across record types:** User messages appear in both `event_msg.user_message` AND `response_item.message` (role: user). Command output appears in both `response_item.function_call_output` AND `event_msg.exec_command_end`. Always prefer the event_msg version for user messages (it is cleaner), and skip the duplicate command output entirely.
170
+
171
+ 6. **Link subagent sessions:** Check the SQLite `source` column. If it contains `subagent.thread_spawn`, this session's knowledge should be attributed to the parent thread. The `parent_thread_id` field links them.
172
+
173
+ ## Example: signal extraction from a real session
174
+
175
+ **Human intent (from event_msg.user_message):**
176
+ > "Look at our codebase, we have categories and topics, we want to unify everywhere to be called topics. Find all discrepancies."
177
+ Signal: User wants a category-to-topic naming audit.
178
+
179
+ **Agent reasoning (from event_msg.agent_message):**
180
+ > "I've isolated one concrete runtime mismatch already: the page-topic footer still talks about 'categories' in UI copy while the rest of the product model is 'topics.' I'm now separating first-party mismatches from external-source fields and old design docs so the final list is usable."
181
+ Signal: Agent's approach to categorizing the findings.
182
+
183
+ **Task completion (from event_msg.task_complete):**
184
+ > "Root Cause: This is not primarily a CSS-specificity problem. The break happens because the suggestion extension's renderHTML() emits bare `<ins>`/`<del>` tags without the CSS class..."
185
+ Signal: Complete diagnosis with root cause and fix path.
186
+
187
+ **File edit (from custom_tool_call):**
188
+ > name: apply_patch, file: SuggestionChangesExtension.ts
189
+ > Change: Added class attribute to renderHTML() output so CSS selectors match
190
+ Signal: What was actually changed and why.
191
+
192
+ **Noise skipped:** 442 function_call_output records (2.2MB), 399 duplicate exec_command_end records (1.9MB), 266 encrypted reasoning records (787KB), 67 repeated turn_context records (675KB), 368 token_count records (267KB).
193
+
194
+ ## Gotchas
195
+
196
+ 1. **exec_command_end duplicates function_call_output.** They share the same `call_id` and contain the same command output in different field names. 100% overlap observed. Skip exec_command_end entirely.
197
+
198
+ 2. **Reasoning is unreadable.** Despite having `summary` and `content` fields, reasoning records contain only `encrypted_content` (opaque base64) and consistently empty `summary: []`. There is zero extractable signal from reasoning records.
199
+
200
+ 3. **User messages appear twice.** Once in `event_msg.user_message` (clean, just the text + images) and again in `response_item.message` with `role: "user"` (mixed with system context). Use the event_msg version.
201
+
202
+ 4. **Developer messages are system prompts, not human.** `response_item.message` with `role: "developer"` contains harness instructions (permissions, app context, collaboration mode, skills). These are NOT human messages. They are wrapped in XML tags like `<permissions instructions>`, `<app-context>`, etc.
203
+
204
+ 5. **Images are massive.** User messages with screenshots contain base64 data URIs of 350-550KB each. A single image can make a 75-char message record balloon to 550KB. Check `payload.images` length but do not extract the base64 data.
205
+
206
+ 6. **Compacted records contain earlier conversation.** When the context window fills up, Codex compacts history into `compacted` records. The `replacement_history` array contains summarized earlier turns. If you are processing the file start-to-finish, you will see the original records first and then the compacted summary later -- be careful not to double-count.
207
+
208
+ 7. **Subagent sessions are separate files.** A parent session spawns subagent threads that are written to their own rollout files in the same date directory. The SQLite `source` column JSON identifies child threads. To get the complete picture of a multi-agent session, you must read all linked rollout files.
209
+
210
+ 8. **Codex uses `exec_command` for everything.** Unlike Claude Code which has specialized tools (Read, Grep, Edit, Bash), Codex wraps all operations in `exec_command` with shell commands (`sed`, `rg`, `cat`, etc.) or `apply_patch` for file edits. This means function_call records are less informative about intent -- you need to parse the `cmd` field to understand what was done.
211
+
212
+ 9. **task_complete contains the cleanest signal.** The `last_agent_message` in task_complete records is the final synthesized response after all tool use. If you can only extract one thing per turn, extract this.
213
+
214
+ 10. **The model field is in turn_context, not session_meta.** Session_meta has `model_provider` ("openai") but the actual model name (e.g., "gpt-5.4") is in `turn_context.model`.