agentboss 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,162 +1,173 @@
1
- /**
2
- * Unified session-analysis prompt — one LLM call that returns BOTH the
3
- * v2.1 capability scores (H1/H2/E1/O1) AND the per-session collaboration
4
- * advice, as a single strict-JSON object:
5
- *
6
- * { "scores": { H1:{clarity,converge,drift cells}, H2:{...}, E1:{...}, O1:{...} },
7
- * "advice": { summary, categories:{cost,accuracy,context,skills,workflow}, rationale } }
8
- *
9
- * Replaces the two separate calls (judge-prompts.buildSessionJudgePrompt +
10
- * advice-prompt.buildAdvicePrompt).
11
- *
12
- * CRITICAL — score laundering guard: the model produces the scores AND the
13
- * advice in the same context, but the advice section MUST NOT reference any
14
- * score / level / dimension key. Earlier (separate-call) experience showed
15
- * the model otherwise echoes the numbers back as fake "evidence". The
16
- * `advice` rules below repeat this prohibition; the two JSON sections are
17
- * independent.
18
- *
19
- * @author Felix
20
- */
21
-
22
- 'use strict';
23
-
24
- const { truncateContext } = require('./advice-prompt');
25
-
26
- /** First line marker so the resulting CLI session is filtered by the ETL
27
- * (registered in server/etl/judge-filter.js). */
28
- const ANALYSIS_SENTINEL = '[ABOSS-ANALYZE]';
29
-
30
- /** Bump when the JSON output contract changes.
31
- * v2: sub-indicator cells return a granular 0–100 `score` (was `level`). */
32
- const ANALYSIS_PROMPT_VERSION = 2;
33
-
34
- const CATEGORIES = ['cost', 'accuracy', 'context', 'skills', 'workflow'];
35
-
36
- // ---------------------------------------------------------------------------
37
- // Context formatting (mirrors advice-prompt; kept local so this module is
38
- // self-contained and changes here can't break advice generation).
39
- // ---------------------------------------------------------------------------
40
-
41
- function fmtNum(n) {
42
- if (n == null || Number.isNaN(n)) return '–';
43
- if (typeof n !== 'number') return String(n);
44
- if (Number.isInteger(n)) return n.toLocaleString('en-US');
45
- return n.toFixed(3);
46
- }
47
-
48
- function fmtToolTable(tools) {
49
- if (!Array.isArray(tools) || tools.length === 0) return '(无工具调用)';
50
- return tools.slice(0, 20).map((t) =>
51
- ` ${(t.name || '?').padEnd(20)} count=${String(t.count ?? 0).padStart(4)} ` +
52
- `err=${String(t.errorCount ?? 0).padStart(3)} ` +
53
- `args="${(t.argsPreview || '').replace(/\s+/g, ' ').slice(0, 120)}"`
54
- ).join('\n');
55
- }
56
-
57
- function fmtMessages(messages) {
58
- if (!Array.isArray(messages) || messages.length === 0) return '(无消息)';
59
- return messages
60
- .filter((m) => m.text != null && m.text !== '')
61
- .map((m) => `[${(m.role || '?').toUpperCase().padEnd(9)}] ${m.text}`)
62
- .join('\n---\n');
63
- }
64
-
65
- // ---------------------------------------------------------------------------
66
- // Rubric (scores) — difficulty-conditioned L1–L4 anchors.
67
- // ---------------------------------------------------------------------------
68
-
69
- const RUBRIC = `
70
- H1 立意(把模糊需求收敛成可执行的精确问题的功力)
71
- clarity 初始指令清晰度 · converge 收敛效率 · drift 方向稳定性
72
- L4 开局即给出目标+约束+验收,几乎无需追问;L3 信息基本充分,少量澄清;
73
- L2 需多轮补全;L1 一句话甩任务、反复改方向。
74
- H2 判断(不盲从、敢质疑、敢推翻 vs 橡皮图章式照单全收)
75
- challenge 合理质疑 · override 该推翻时推翻 · accept_rate 采纳前是否有判断
76
- L4 在该质疑处质疑、在合理处高效采纳,质疑有依据;L3 大体审视偶有盲从;
77
- L2 多数直接采纳少量质疑;L1 几乎全程"好的/继续"式盲从。
78
- E1 知识(AI 对该技术栈的掌握)
79
- domain_errors 领域错误(越少越好)· staleness 过时引用(越少越好)· best_practice 最佳实践
80
- L4 无错误、无过时、全程最佳实践;L3 偶有小瑕疵;L2 多处问题;L1 频繁错误/过时。
81
- O1 产出(结果是否真的可用)
82
- first_take 一次采纳 · code_style 代码规范 · completeness 边界/异常/测试完备
83
- L4 几乎一次过且规范完备;L3 小修即可;L2 需明显返工;L1 大量返工或缺失完备性。
84
- `;
85
-
86
- // ---------------------------------------------------------------------------
87
- // Prompt
88
- // ---------------------------------------------------------------------------
89
-
90
- /**
91
- * Build the unified analysis prompt.
92
- *
93
- * @param {object} ctx advice-style context (assembleContext output) plus a
94
- * numeric `difficulty` (1-4). Shape:
95
- * { session:{model,difficulty,durationMinutes,cost,tokens,errorCount,
96
- * toolCallCount,messageCount,userCount,assistantCount,reverted},
97
- * toolBreakdown:[...], messages:[{role,text}], truncated, omittedMessages }
98
- * @returns {string}
99
- */
100
- function buildSessionAnalysisPrompt(ctx) {
101
- const s = ctx.session || {};
102
- const t = s.tokens || {};
103
- const difficulty = s.difficulty ?? ctx.difficulty ?? '?';
104
- const truncatedNote =
105
- ctx.truncated === 'hard' ? '(注意:会话很长,已强力截断。)'
106
- : ctx.truncated ? `(注意:中段已省略 ${ctx.omittedMessages} 条消息。)`
107
- : '';
108
-
109
- return `${ANALYSIS_SENTINEL}(内部标记,忽略本行)
110
- 你是一名严格、客观的"人机协作"审计员兼协作教练。下面是一段开发者与 AI 编程助手的会话原文与基础统计。
111
- 请一次性完成两件事,返回**单个严格 JSON 对象**,不要任何额外文字 / markdown:
112
- (A) scores —— rubric 给五维能力打分;(B) advice —— 给出可执行的协作改进建议。
113
-
114
- ================ (A) 评分 rubric(难度越高合格线越宽松,本会话难度 ${difficulty}/4) ================
115
- ${RUBRIC}
116
- scores 每个子指标返回三元组:
117
- - score: 0–100 的数值,**可含一位小数**(如 78.5);证据不足或对话太短无法判断 → null(不要硬给低分)。
118
- 档位↔分数区间:L4 = 85–100 · L3 = 65–84 · L2 = 40–64 · L1 = 0–39;
119
- 请在区间内按实际好坏细分到具体分值,**不要只给 25/55/80/95 这种档位中点或整十的死板分**。
120
- - confidence: 0.0–1.0。
121
- - evidence: 引用对话中的具体表现(哪条消息、什么内容),说明为何是这个分值。没有证据写"未发现相关证据"。
122
- 评分客观性:① 不因输出更长/更礼貌而加分;② 证据不足必须 null;③ 难度低不等于做得差。
123
-
124
- ================ (B) advice —— 只评"如何使用 AI",不评业务对错 ================
125
- 评估开发者的协作方式(提问清晰度、上下文准备、工具/模型使用、流程节奏、是否该用 skill/subagent、成本)。
126
- 不要评对话里讨论的具体技术/代码是否正确(那属于代码评审)。
127
- **关键禁令**:advice 部分禁止引用、推测或编造任何"分数/等级/Lx/子分/H1/H2/E1/O1"字眼;
128
- 每条 evidence 必须是对话事实(如"第 3 条用户消息只说'改一下',未给文件路径"),不得换算成分数。
129
- 5 个类别:cost 省钱 · accuracy 协作层提准确(暴露假设/要求自检/加验证,非业务对错)· context 上下文准备 ·
130
- skills 推荐 opencode skill/subagent · workflow 流程节奏。
131
- AdviceItem: { severity: high|medium|low, title:≤20字, why:1句, action:下次怎么做(协作动作), evidence:对话事实,
132
- actionable:bool, executor: opencode|claude|manual, cwd_hint: "project_root" }
133
- 硬规则:5 个类别键必须存在(无内容给空数组);AdviceItem 总数 ≤ 6,按 severity 由高到低;
134
- executor='manual' actionable 必须 false。
135
-
136
- ================ 会话基础(只作事实参考) ================
137
- 模型 ${s.model || '未知'} · 难度 ${difficulty}/4 · 时长 ${fmtNum(s.durationMinutes)} 分钟 · 已回退 ${s.reverted ? '是' : '否'}
138
- 消息 ${fmtNum(s.messageCount)}(用户 ${fmtNum(s.userCount)} / 助手 ${fmtNum(s.assistantCount)})· 工具 ${fmtNum(s.toolCallCount)} 次 错误 ${fmtNum(s.errorCount)}
139
- Token in ${fmtNum(t.input)} / out ${fmtNum(t.output)} / reasoning ${fmtNum(t.reasoning)} / cacheR ${fmtNum(t.cacheRead)} / cacheW ${fmtNum(t.cacheWrite)} · 成本 $${typeof s.cost === 'number' ? s.cost.toFixed(4) : '–'}
140
-
141
- 工具使用 Top 20:
142
- ${fmtToolTable(ctx.toolBreakdown)}
143
-
144
- 消息全文 ${truncatedNote}:
145
- ${fmtMessages(ctx.messages)}
146
-
147
- ================ 输出 JSON(仅此对象) ================
148
- {"scores":{
149
- "H1":{"clarity":{"score":78.5,"confidence":0.8,"evidence":"…"},"converge":{"score":90,"confidence":0.9,"evidence":"…"},"drift":{"score":null,"confidence":0,"evidence":"无法判断"}},
150
- "H2":{"challenge":{…},"override":{…},"accept_rate":{…}},
151
- "E1":{"domain_errors":{…},"staleness":{…},"best_practice":{…}},
152
- "O1":{"first_take":{…},"code_style":{…},"completeness":{…}}},
153
- "advice":{"summary":"≤60字只谈协作","categories":{"cost":[],"accuracy":[],"context":[],"skills":[],"workflow":[]},"rationale":"≤80字只谈协作"}}`;
154
- }
155
-
156
- module.exports = {
157
- ANALYSIS_SENTINEL,
158
- ANALYSIS_PROMPT_VERSION,
159
- CATEGORIES,
160
- buildSessionAnalysisPrompt,
161
- truncateContext,
162
- };
1
+ /**
2
+ * Unified session-analysis prompt — one LLM call that returns BOTH the
3
+ * v2.1 capability scores (H1/H2/H3/ENV/O1) AND the per-session collaboration
4
+ * advice, as a single strict-JSON object:
5
+ *
6
+ * { "scores": { H1:{clarity,converge,drift cells}, H2:{...}, H3:{...}, ENV:{...}, O1:{...} },
7
+ * "advice": { summary, categories:{cost,accuracy,context,skills,workflow}, rationale } }
8
+ *
9
+ * Replaces the two separate calls (judge-prompts.buildSessionJudgePrompt +
10
+ * advice-prompt.buildAdvicePrompt).
11
+ *
12
+ * CRITICAL — score laundering guard: the model produces the scores AND the
13
+ * advice in the same context, but the advice section MUST NOT reference any
14
+ * score / level / dimension key. Earlier (separate-call) experience showed
15
+ * the model otherwise echoes the numbers back as fake "evidence". The
16
+ * `advice` rules below repeat this prohibition; the two JSON sections are
17
+ * independent.
18
+ *
19
+ * @author Felix
20
+ */
21
+
22
+ 'use strict';
23
+
24
+ const { truncateContext } = require('./advice-prompt');
25
+
26
+ /** First line marker so the resulting CLI session is filtered by the ETL
27
+ * (registered in server/etl/judge-filter.js). */
28
+ const ANALYSIS_SENTINEL = '[ABOSS-ANALYZE]';
29
+
30
+ /** Bump when the JSON output contract changes.
31
+ * v2: sub-indicator cells return a granular 0–100 `score` (was `level`).
32
+ * v3: added H3 (system thinking) as a per-session dimension.
33
+ * v4: H3 expanded to 3 sub-indicators (abstraction / reuse / standard).
34
+ * v5: merged E1+E2 into ENV (knowledge / tooling / currency). */
35
+ const ANALYSIS_PROMPT_VERSION = 5;
36
+
37
+ const CATEGORIES = ['cost', 'accuracy', 'context', 'skills', 'workflow'];
38
+
39
+ // ---------------------------------------------------------------------------
40
+ // Context formatting (mirrors advice-prompt; kept local so this module is
41
+ // self-contained and changes here can't break advice generation).
42
+ // ---------------------------------------------------------------------------
43
+
44
+ function fmtNum(n) {
45
+ if (n == null || Number.isNaN(n)) return '–';
46
+ if (typeof n !== 'number') return String(n);
47
+ if (Number.isInteger(n)) return n.toLocaleString('en-US');
48
+ return n.toFixed(3);
49
+ }
50
+
51
+ function fmtToolTable(tools) {
52
+ if (!Array.isArray(tools) || tools.length === 0) return '(无工具调用)';
53
+ return tools.slice(0, 20).map((t) =>
54
+ ` ${(t.name || '?').padEnd(20)} count=${String(t.count ?? 0).padStart(4)} ` +
55
+ `err=${String(t.errorCount ?? 0).padStart(3)} ` +
56
+ `args="${(t.argsPreview || '').replace(/\s+/g, ' ').slice(0, 120)}"`
57
+ ).join('\n');
58
+ }
59
+
60
+ function fmtMessages(messages) {
61
+ if (!Array.isArray(messages) || messages.length === 0) return '(无消息)';
62
+ return messages
63
+ .filter((m) => m.text != null && m.text !== '')
64
+ .map((m) => `[${(m.role || '?').toUpperCase().padEnd(9)}] ${m.text}`)
65
+ .join('\n---\n');
66
+ }
67
+
68
+ // ---------------------------------------------------------------------------
69
+ // Rubric (scores) — difficulty-conditioned L1–L4 anchors.
70
+ // ---------------------------------------------------------------------------
71
+
72
+ const RUBRIC = `
73
+ H1 立意(把模糊需求收敛成可执行的精确问题的功力)
74
+ clarity 初始指令清晰度 · converge 收敛效率 · drift 方向稳定性
75
+ L4 开局即给出目标+约束+验收,几乎无需追问;L3 信息基本充分,少量澄清;
76
+ L2 需多轮补全;L1 一句话甩任务、反复改方向。
77
+ H2 判断(不盲从、敢质疑、敢推翻 vs 橡皮图章式照单全收)
78
+ challenge 合理质疑 · override 该推翻时推翻 · accept_rate 采纳前是否有判断
79
+ L4 在该质疑处质疑、在合理处高效采纳,质疑有依据;L3 大体审视偶有盲从;
80
+ L2 多数直接采纳少量质疑;L1 几乎全程"好的/继续"式盲从。
81
+ H3 系统思维(本次对话是否体现系统性 / 抽象思维,三个子项)
82
+ abstraction 抽象层级:是否从架构 / 模块 / 接口 / 抽象层面思考,而非纯就事论事
83
+ reuse 复用意识:是否复用既有设计 / 组件 / 模式,不重复造轮子
84
+ standard 规范约束:是否主动设定或遵循规范 / 约束 / 协议 / 命名一致性
85
+ 各子项 L4 充分体现;L3 部分体现;L2 偶有;L1 基本没有。
86
+ ENV 环境(AI 能力环境诊断:本次 AI 凭借的知识 / 工具 / 时效够不够用,3 个子项)
87
+ knowledge 知识覆盖:AI 对该技术栈的掌握,领域准确、概念无误、符合最佳实践(高分=掌握好)
88
+ tooling 工具运用:工具选择是否得当、调用链路高效不啰嗦、出错能自愈(高分=运用好)
89
+ currency 时效性:是否避免了过时 / 已废弃的 API、版本、做法(高分=新,越少过时越高)
90
+ 各子项 L4 优秀;L3 良好;L2 一般;L1 差。
91
+ O1 产出(结果是否真的可用)
92
+ first_take 一次采纳 · code_style 代码规范 · completeness 边界/异常/测试完备
93
+ L4 几乎一次过且规范完备;L3 小修即可;L2 需明显返工;L1 大量返工或缺失完备性。
94
+ `;
95
+
96
+ // ---------------------------------------------------------------------------
97
+ // Prompt
98
+ // ---------------------------------------------------------------------------
99
+
100
+ /**
101
+ * Build the unified analysis prompt.
102
+ *
103
+ * @param {object} ctx advice-style context (assembleContext output) plus a
104
+ * numeric `difficulty` (1-4). Shape:
105
+ * { session:{model,difficulty,durationMinutes,cost,tokens,errorCount,
106
+ * toolCallCount,messageCount,userCount,assistantCount,reverted},
107
+ * toolBreakdown:[...], messages:[{role,text}], truncated, omittedMessages }
108
+ * @returns {string}
109
+ */
110
+ function buildSessionAnalysisPrompt(ctx) {
111
+ const s = ctx.session || {};
112
+ const t = s.tokens || {};
113
+ const difficulty = s.difficulty ?? ctx.difficulty ?? '?';
114
+ const truncatedNote =
115
+ ctx.truncated === 'hard' ? '(注意:会话很长,已强力截断。)'
116
+ : ctx.truncated ? `(注意:中段已省略 ${ctx.omittedMessages} 条消息。)`
117
+ : '';
118
+
119
+ return `${ANALYSIS_SENTINEL}(内部标记,忽略本行)
120
+ 你是一名严格、客观的"人机协作"审计员兼协作教练。下面是一段开发者与 AI 编程助手的会话原文与基础统计。
121
+ 请一次性完成两件事,返回**单个严格 JSON 对象**,不要任何额外文字 / markdown:
122
+ (A) scores —— rubric 给五维能力打分;(B) advice —— 给出可执行的协作改进建议。
123
+
124
+ ================ (A) 评分 rubric(难度越高合格线越宽松,本会话难度 ${difficulty}/4) ================
125
+ ${RUBRIC}
126
+ scores 每个子指标返回三元组:
127
+ - score: 0–100 的数值,**可含一位小数**(如 78.5);证据不足或对话太短无法判断 → null(不要硬给低分)。
128
+ 档位↔分数区间:L4 = 85–100 · L3 = 65–84 · L2 = 40–64 · L1 = 0–39;
129
+ 请在区间内按实际好坏细分到具体分值,**不要只给 25/55/80/95 这种档位中点或整十的死板分**。
130
+ - confidence: 0.0–1.0。
131
+ - evidence: 引用对话中的具体表现(哪条消息、什么内容),说明为何是这个分值。没有证据写"未发现相关证据"。
132
+ 评分客观性:① 不因输出更长/更礼貌而加分;② 证据不足必须 null;③ 难度低不等于做得差。
133
+
134
+ ================ (B) advice —— 只评"如何使用 AI",不评业务对错 ================
135
+ 评估开发者的协作方式(提问清晰度、上下文准备、工具/模型使用、流程节奏、是否该用 skill/subagent、成本)。
136
+ 不要评对话里讨论的具体技术/代码是否正确(那属于代码评审)。
137
+ **关键禁令**:advice 部分禁止引用、推测或编造任何"分数/等级/Lx/子分/H1/H2/E1/O1"字眼;
138
+ 每条 evidence 必须是对话事实(如"第 3 条用户消息只说'改一下',未给文件路径"),不得换算成分数。
139
+ 5 个类别:cost 省钱 · accuracy 协作层提准确(暴露假设/要求自检/加验证,非业务对错)· context 上下文准备 ·
140
+ skills 推荐 opencode skill/subagent · workflow 流程节奏。
141
+ AdviceItem: { severity: high|medium|low, title:≤20字, why:1句, action:下次怎么做(协作动作), evidence:对话事实,
142
+ actionable:bool, executor: opencode|claude|manual, cwd_hint: "project_root" }
143
+ 硬规则:5 个类别键必须存在(无内容给空数组);AdviceItem 总数 ≤ 6,按 severity 由高到低;
144
+ executor='manual' 时 actionable 必须 false。
145
+
146
+ ================ 会话基础(只作事实参考) ================
147
+ 模型 ${s.model || '未知'} · 难度 ${difficulty}/4 · 时长 ${fmtNum(s.durationMinutes)} 分钟 · 已回退 ${s.reverted ? '是' : '否'}
148
+ 消息 ${fmtNum(s.messageCount)}(用户 ${fmtNum(s.userCount)} / 助手 ${fmtNum(s.assistantCount)})· 工具 ${fmtNum(s.toolCallCount)} 次 错误 ${fmtNum(s.errorCount)}
149
+ Token in ${fmtNum(t.input)} / out ${fmtNum(t.output)} / reasoning ${fmtNum(t.reasoning)} / cacheR ${fmtNum(t.cacheRead)} / cacheW ${fmtNum(t.cacheWrite)} · 成本 $${typeof s.cost === 'number' ? s.cost.toFixed(4) : '–'}
150
+
151
+ 工具使用 Top 20:
152
+ ${fmtToolTable(ctx.toolBreakdown)}
153
+
154
+ 消息全文 ${truncatedNote}:
155
+ ${fmtMessages(ctx.messages)}
156
+
157
+ ================ 输出 JSON(仅此对象) ================
158
+ {"scores":{
159
+ "H1":{"clarity":{"score":78.5,"confidence":0.8,"evidence":"…"},"converge":{"score":90,"confidence":0.9,"evidence":"…"},"drift":{"score":null,"confidence":0,"evidence":"无法判断"}},
160
+ "H2":{"challenge":{…},"override":{…},"accept_rate":{…}},
161
+ "H3":{"abstraction":{"score":70,"confidence":0.7,"evidence":"…"},"reuse":{…},"standard":{…}},
162
+ "ENV":{"knowledge":{"score":80,"confidence":0.8,"evidence":"…"},"tooling":{…},"currency":{…}},
163
+ "O1":{"first_take":{…},"code_style":{…},"completeness":{…}}},
164
+ "advice":{"summary":"≤60字只谈协作","categories":{"cost":[],"accuracy":[],"context":[],"skills":[],"workflow":[]},"rationale":"≤80字只谈协作"}}`;
165
+ }
166
+
167
+ module.exports = {
168
+ ANALYSIS_SENTINEL,
169
+ ANALYSIS_PROMPT_VERSION,
170
+ CATEGORIES,
171
+ buildSessionAnalysisPrompt,
172
+ truncateContext,
173
+ };
@@ -141,6 +141,7 @@ async function runJudge(opts = {}) {
141
141
  proc = spawn(cli.bin, cli.argv(prompt), {
142
142
  stdio: [useStdin ? 'pipe' : 'ignore', 'pipe', 'pipe'],
143
143
  shell: process.platform === 'win32',
144
+ windowsHide: true,
144
145
  });
145
146
  if (useStdin && proc.stdin) {
146
147
  proc.stdin.on('error', () => {}); // EPIPE if CLI exits early
@@ -173,14 +174,29 @@ async function runJudge(opts = {}) {
173
174
  }
174
175
  const parsed = extractJson(stdout);
175
176
  if (parsed === undefined) {
176
- return settle({ ok: false, reason: 'bad-json', raw: stdout.slice(0, 500) });
177
+ // Include a snippet of stderr too — when claude / opencode
178
+ // print a warning ("not logged in", "rate-limited", "use
179
+ // --print"), the stdout JSON parse fails but the actual cause
180
+ // lives in stderr.
181
+ return settle({
182
+ ok: false,
183
+ reason: 'bad-json',
184
+ raw: stdout.slice(0, 500),
185
+ error: stderr ? stderr.slice(0, 500) : undefined,
186
+ });
177
187
  }
178
188
  settle({ ok: true, data: parsed, raw: stdout, cli: cli.name });
179
189
  });
180
190
 
181
191
  const t = setTimeout(() => {
182
192
  try { proc.kill('SIGKILL'); } catch {}
183
- settle({ ok: false, reason: 'timeout' });
193
+ // Surface stderr on timeout — usually has "waiting for input" or
194
+ // a prompt that explains why the CLI is hanging.
195
+ settle({
196
+ ok: false,
197
+ reason: 'timeout',
198
+ error: stderr ? stderr.slice(0, 500) : undefined,
199
+ });
184
200
  }, timeoutMs);
185
201
  proc.on('exit', () => clearTimeout(t));
186
202
  });
@@ -85,7 +85,12 @@ async function judgeSession(db, session, messages, meta = {}) {
85
85
 
86
86
  const prompt = buildSessionJudgePrompt(messages, meta);
87
87
  const result = await withSlot(() => runJudge({ prompt, timeoutMs: 90_000 }));
88
- if (!result.ok || !result.data) return null;
88
+ if (!result.ok || !result.data) {
89
+ const reason = result.ok ? 'no-data' : (result.reason || 'unknown');
90
+ const detail = result.error ? ` — ${String(result.error).slice(0, 200)}` : '';
91
+ console.error('[judge]', session.id, 'LLM fell through:', reason + detail);
92
+ return null;
93
+ }
89
94
 
90
95
  return {
91
96
  ...result.data,
@@ -106,7 +106,16 @@ async function analyzeSessionLLM(db, session, opts = {}) {
106
106
  const prompt = buildSessionAnalysisPrompt(ctx);
107
107
 
108
108
  const result = await withSlot(() => runJudge({ prompt, timeoutMs: 90_000 }));
109
- if (!result.ok || !result.data || !result.data.scores) return null;
109
+ if (!result.ok || !result.data || !result.data.scores) {
110
+ // Surface why we fell back to rule-based judging. Silent nulls
111
+ // here made macOS `claude -p` failures (timeout / bad-json / exit
112
+ // non-zero) impossible to diagnose from the outside — the HTTP
113
+ // endpoint just returned 200 with empty v2 columns.
114
+ const reason = result.ok ? 'no-scores' : (result.reason || 'unknown');
115
+ const detail = result.error ? ` — ${String(result.error).slice(0, 200)}` : '';
116
+ console.error('[session-analyzer]', session.id, 'LLM fell through:', reason + detail);
117
+ return null;
118
+ }
110
119
 
111
120
  return {
112
121
  scores: result.data.scores,