agentboss 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/README.md +34 -0
  2. package/bin/aboss.js +288 -0
  3. package/client/dist/assets/index-C1wFD_Vo.css +1 -0
  4. package/client/dist/assets/index-DBj1Ujlx.js +137 -0
  5. package/client/dist/index.html +34 -0
  6. package/package.json +64 -0
  7. package/server/analysis/daily-aggregator.js +258 -0
  8. package/server/analysis/difficulty.js +129 -0
  9. package/server/analysis/dimensions/ai-knowledge.js +172 -0
  10. package/server/analysis/dimensions/ai-tools.js +161 -0
  11. package/server/analysis/dimensions/judgement.js +107 -0
  12. package/server/analysis/dimensions/llm-merge.js +57 -0
  13. package/server/analysis/dimensions/output-quality.js +167 -0
  14. package/server/analysis/dimensions/problem-definition.js +104 -0
  15. package/server/analysis/dimensions/system-thinking.js +225 -0
  16. package/server/analysis/evidence-builder.js +104 -0
  17. package/server/analysis/job.js +273 -0
  18. package/server/analysis/report-builder.js +581 -0
  19. package/server/analysis/scoring-v2.js +72 -0
  20. package/server/analysis/text-signals.js +179 -0
  21. package/server/analysis/thresholds-v2.js +358 -0
  22. package/server/api/advice.js +124 -0
  23. package/server/api/analysis.js +141 -0
  24. package/server/api/execution.js +330 -0
  25. package/server/api/metrics.js +277 -0
  26. package/server/api/overview.js +308 -0
  27. package/server/api/project.js +255 -0
  28. package/server/api/reports.js +125 -0
  29. package/server/api/sessions.js +118 -0
  30. package/server/api/settings.js +119 -0
  31. package/server/db/connection.js +175 -0
  32. package/server/db/queries.js +1051 -0
  33. package/server/db/schema.js +487 -0
  34. package/server/etl/active-time.js +150 -0
  35. package/server/etl/backfill-subagents.js +178 -0
  36. package/server/etl/claude-code.js +826 -0
  37. package/server/etl/detect.js +341 -0
  38. package/server/etl/judge-filter.js +117 -0
  39. package/server/etl/opencode.js +606 -0
  40. package/server/execution/job.js +662 -0
  41. package/server/execution/prompt.js +227 -0
  42. package/server/execution/runner.js +218 -0
  43. package/server/index.js +94 -0
  44. package/server/llm/advice-prompt.js +339 -0
  45. package/server/llm/advice.js +384 -0
  46. package/server/llm/analysis-prompt.js +162 -0
  47. package/server/llm/cli-runner.js +249 -0
  48. package/server/llm/judge-prompts.js +179 -0
  49. package/server/llm/judge.js +118 -0
  50. package/server/llm/project-advice-prompt.js +332 -0
  51. package/server/llm/project-advice.js +491 -0
  52. package/server/llm/session-analyzer.js +122 -0
  53. package/server/utils/project.js +80 -0
@@ -0,0 +1,249 @@
1
+ /**
2
+ * LLM judge runner — spawns a local AI CLI for evaluation tasks.
3
+ *
4
+ * Detection order (first one found wins):
5
+ * 1. `opencode run` (prompt fed on STDIN)
6
+ * 2. `claude -p` (prompt fed on STDIN)
7
+ *
8
+ * Resolves to `{ ok: true, data }` with the parsed JSON. Failures (CLI
9
+ * missing / timeout / non-JSON output) resolve to `{ ok: false, reason }`
10
+ * so callers can fall back to rule-based heuristics. Never throws.
11
+ *
12
+ * @author Felix
13
+ */
14
+
15
+ 'use strict';
16
+
17
+ const { spawn } = require('child_process');
18
+ const { JUDGE_SENTINEL } = require('./judge-prompts');
19
+
20
/**
 * Guarantee that an outbound judge prompt begins with JUDGE_SENTINEL.
 *
 * The prompt builders are expected to add the marker themselves; this is
 * the backstop so that a caller which forgets it still produces a tagged
 * prompt (the marker is what server/etl/judge-filter.js is documented to
 * look for when the resulting CLI session is re-imported).
 *
 * @param {*} prompt - Prompt text; non-string values are passed through.
 * @returns {*} The prompt, with a sentinel line prepended when it was a
 *   string that did not already start with the sentinel.
 */
function ensureSentinel(prompt) {
  const needsTag = typeof prompt === 'string' && !prompt.startsWith(JUDGE_SENTINEL);
  if (!needsTag) {
    return prompt;
  }
  const sentinelLine = `${JUDGE_SENTINEL}(内部标记,忽略本行)`;
  return [sentinelLine, prompt].join('\n');
}
36
+
37
+ // ---------------------------------------------------------------------------
38
+ // Detection
39
+ // ---------------------------------------------------------------------------
40
+
41
/**
 * Describe a CLI that takes its prompt on STDIN.
 *
 * @param {string} name - Used both as the display name and the binary name.
 * @param {string[]} args - Fixed command-line arguments.
 * @returns {{name:string, bin:string, argv:Function, stdinPrompt:boolean}}
 */
const stdinCandidate = (name, args) => ({
  name,
  bin: name,
  // A fresh array per call so a caller mutating it cannot affect later runs.
  argv: () => [...args],
  stdinPrompt: true,
});

/**
 * CLI candidates, in detection priority order.
 *
 * `argv` produces the command-line arguments. With `stdinPrompt: true`
 * the prompt is written to the child's STDIN rather than placed in argv;
 * per the original author this matters on Windows, where the command
 * line is capped at roughly 8 KB while judge prompts exceed 10 KB.
 */
const CANDIDATES = [
  // opencode: `run` with no positional argument reads the prompt from stdin
  stdinCandidate('opencode', ['run']),
  // claude: `-p` with no inline prompt reads the prompt from stdin
  stdinCandidate('claude', ['-p']),
];
55
+
56
let _cachedCli = undefined; // undefined = not probed yet; null = none found; object = winner
let _pendingDetection = null; // probe currently in flight, shared by concurrent callers

/**
 * Detect which CLI is available by probing `bin --version` for each
 * candidate in order; the first one that spawns successfully wins.
 *
 * The result (including "none found") is remembered for the lifetime of
 * the process. Callers that arrive while a probe is still running share
 * that probe instead of spawning their own set of `--version` processes.
 *
 * @returns {Promise<{name:string, bin:string, argv:Function, stdinPrompt?:boolean}|null>}
 */
async function detectAvailableCli() {
  if (_cachedCli !== undefined) return _cachedCli;
  if (_pendingDetection) return _pendingDetection;

  _pendingDetection = (async () => {
    try {
      let found = null;
      for (const candidate of CANDIDATES) {
        if (await canSpawn(candidate.bin)) {
          found = candidate;
          break;
        }
      }
      _cachedCli = found;
      return found;
    } finally {
      // Always release the slot so a later call can probe again if the
      // cache has been reset in the meantime.
      _pendingDetection = null;
    }
  })();

  return _pendingDetection;
}
75
+
76
/**
 * Drop the remembered detection result by putting the cache back into its
 * "not probed yet" state. Mainly for tests and settings reloads.
 */
function _resetCache() {
  _cachedCli = undefined;
}
78
+
79
/**
 * Probe whether `bin --version` can be spawned and exits with code 0.
 *
 * On Windows the probe goes through a shell (`shell: true`), presumably
 * so that `.cmd` / `.bat` shims on PATH resolve — TODO confirm. The probe
 * is abandoned after 5 seconds: the child is killed and the result is
 * `false`.
 *
 * The timeout is cancelled as soon as the probe settles, so a finished
 * probe neither keeps the event loop alive for the remaining seconds nor
 * signals a process that has already exited.
 *
 * @param {string} bin - Executable name or path.
 * @returns {Promise<boolean>} Never rejects.
 */
function canSpawn(bin) {
  return new Promise((resolve) => {
    let timer = null;
    let settled = false;
    const settle = (ok) => {
      if (settled) return;
      settled = true;
      if (timer !== null) clearTimeout(timer);
      resolve(ok);
    };

    try {
      const proc = spawn(bin, ['--version'], {
        stdio: 'ignore',
        shell: process.platform === 'win32',
      });
      proc.on('error', () => settle(false));
      proc.on('exit', (code) => settle(code === 0));
      // hard timeout
      timer = setTimeout(() => {
        try { proc.kill('SIGKILL'); } catch { /* process already gone */ }
        settle(false);
      }, 5000);
    } catch {
      // spawn() itself can throw synchronously on invalid arguments
      settle(false);
    }
  });
}
102
+
103
+ // ---------------------------------------------------------------------------
104
+ // Runner
105
+ // ---------------------------------------------------------------------------
106
+
107
/**
 * Spawn the detected CLI with the prompt, capture stdout, and try to parse
 * it as JSON. The caller's prompt should *demand* JSON output.
 *
 * Options:
 *   prompt     (required, non-empty string)
 *   timeoutMs  (default 30_000)
 *   maxBytes   (default 256 KB) — cap on captured stdout, measured in
 *              bytes; the child is killed once the cap is exceeded
 *
 * Resolves (never rejects):
 *   { ok: true,  data: any, raw: string, cli: 'opencode'|'claude' }
 *   { ok: false, reason: 'no-prompt' | 'no-cli' | 'timeout' | 'exit-non-zero'
 *                        | 'bad-json' | 'spawn-error', raw?: string, error?: string }
 *
 * @param {Object} opts
 * @returns {Promise<Object>}
 */
async function runJudge(opts = {}) {
  const { prompt: rawPrompt, timeoutMs = 30_000, maxBytes = 256 * 1024 } = opts;
  if (!rawPrompt || typeof rawPrompt !== 'string') {
    return { ok: false, reason: 'no-prompt' };
  }
  // Stamp the sentinel onto every outbound prompt so the ETL can later
  // recognise and discard the session this CLI call will create.
  const prompt = ensureSentinel(rawPrompt);

  const cli = await detectAvailableCli();
  if (!cli) return { ok: false, reason: 'no-cli' };

  return new Promise((resolve) => {
    let timer = null;
    let settled = false;
    const settle = (result) => {
      if (settled) return;
      settled = true;
      // Cancel the timeout on every path (including spawn errors), not
      // only on a clean exit, so no timer outlives the call.
      if (timer !== null) clearTimeout(timer);
      resolve(result);
    };

    let proc;
    try {
      const useStdin = cli.stdinPrompt === true;
      proc = spawn(cli.bin, cli.argv(prompt), {
        stdio: [useStdin ? 'pipe' : 'ignore', 'pipe', 'pipe'],
        shell: process.platform === 'win32',
      });
      if (useStdin && proc.stdin) {
        proc.stdin.on('error', () => {}); // EPIPE if CLI exits early
        proc.stdin.end(prompt, 'utf8');
      }
    } catch (err) {
      return settle({ ok: false, reason: 'spawn-error', error: err.message });
    }

    // stdout is collected as raw bytes and decoded once at the end: a
    // multi-byte UTF-8 character split across two chunks would otherwise
    // be decoded as replacement characters.
    const stdoutChunks = [];
    let stdoutBytes = 0;
    let truncated = false;
    let stderr = '';

    proc.stdout.on('data', (chunk) => {
      if (truncated) return;
      const room = maxBytes - stdoutBytes;
      if (chunk.length > room) {
        if (room > 0) stdoutChunks.push(chunk.subarray(0, room));
        stdoutBytes = maxBytes;
        truncated = true;
        try { proc.kill('SIGKILL'); } catch { /* process already gone */ }
        return;
      }
      stdoutChunks.push(chunk);
      stdoutBytes += chunk.length;
    });

    // Only the first 500 characters of stderr are ever reported, so stop
    // accumulating beyond that; the listener stays attached to keep the
    // pipe drained.
    proc.stderr.setEncoding('utf8');
    proc.stderr.on('data', (chunk) => {
      if (stderr.length < 500) stderr += chunk;
    });

    proc.on('error', (err) => settle({ ok: false, reason: 'spawn-error', error: err.message }));

    // 'close' (not 'exit') fires after the stdio streams have ended, so
    // all of the child's output has been received by the time we parse.
    proc.on('close', (code) => {
      const stdout = Buffer.concat(stdoutChunks).toString('utf8');
      if (code !== 0 && !truncated) {
        return settle({ ok: false, reason: 'exit-non-zero', raw: stdout, error: stderr.slice(0, 500) });
      }
      const parsed = extractJson(stdout);
      if (parsed === undefined) {
        return settle({ ok: false, reason: 'bad-json', raw: stdout.slice(0, 500) });
      }
      settle({ ok: true, data: parsed, raw: stdout, cli: cli.name });
    });

    timer = setTimeout(() => {
      try { proc.kill('SIGKILL'); } catch { /* process already gone */ }
      settle({ ok: false, reason: 'timeout' });
    }, timeoutMs);
  });
}
188
+
189
/**
 * Try to find a JSON value in raw stdout.
 *
 * Strategy:
 *   1. Parse the whole (trimmed) text — the common case of pure JSON.
 *   2. Otherwise take the span from the first `{` to the last `}` and the
 *      span from the first `[` to the last `]`, and try them in order of
 *      where they start. Trying both means a bracketed log prefix such as
 *      `[info] …` in front of a JSON object no longer defeats extraction.
 *
 * @param {*} raw - Captured output; anything but a non-empty string
 *   yields `undefined`.
 * @returns {*} The parsed value (which may legitimately be `null`), or
 *   `undefined` when nothing parseable was found. Never throws.
 */
function extractJson(raw) {
  if (typeof raw !== 'string' || raw === '') return undefined;

  const text = raw.trim();
  try { return JSON.parse(text); } catch { /* not pure JSON — scan below */ }

  const spans = [['{', '}'], ['[', ']']]
    .map(([open, close]) => ({ start: text.indexOf(open), end: text.lastIndexOf(close) }))
    .filter(({ start, end }) => start >= 0 && end > start)
    .sort((a, b) => a.start - b.start);

  for (const { start, end } of spans) {
    try { return JSON.parse(text.slice(start, end + 1)); } catch { /* try the next span */ }
  }
  return undefined;
}
216
+
217
+ // ---------------------------------------------------------------------------
218
+ // Concurrency guard
219
+ // ---------------------------------------------------------------------------
220
+
221
let _inFlight = 0;
const _waiters = [];
const MAX_CONCURRENT = 2;

/**
 * Run `fn` under a MAX_CONCURRENT-wide semaphore so we don't fork-bomb
 * the CLI. Tasks beyond the limit wait in FIFO order.
 *
 * @param {Function} fn - Sync or async task taking no arguments.
 * @returns {Promise<*>} Settles with whatever `fn` produces: resolves with
 *   its result, or rejects with the error it threw / rejected with. The
 *   slot is released in both cases, so a failing task can neither hang
 *   its caller nor starve queued tasks.
 */
function withSlot(fn) {
  return new Promise((resolve, reject) => {
    const start = async () => {
      _inFlight++;
      try {
        resolve(await fn());
      } catch (err) {
        reject(err);
      } finally {
        _inFlight--;
        const next = _waiters.shift();
        if (next) next();
      }
    };
    if (_inFlight < MAX_CONCURRENT) start();
    else _waiters.push(start);
  });
}
241
+
242
+ module.exports = {
243
+ detectAvailableCli,
244
+ runJudge,
245
+ withSlot,
246
+ // exported for tests
247
+ _resetCache,
248
+ extractJson,
249
+ };
@@ -0,0 +1,179 @@
1
+ /**
2
+ * Prompt templates for the LLM-judge runs used by E1 and O1.
3
+ *
4
+ * Each function returns a STRING prompt that demands strict JSON output.
5
+ * The runner enforces a 30s timeout and ~256 KB cap; we keep the input
6
+ * compact (only last N messages, truncated) so the CLI doesn't choke.
7
+ *
8
+ * @author Felix
9
+ */
10
+
11
+ 'use strict';
12
+
13
+ /**
14
+ * Bump whenever the prompt output contract changes — cached judge
15
+ * results stamped with an older version are ignored and re-judged.
16
+ * v2: added per-field `details` scoring evidence.
17
+ * v4: dropped H1.reframe sub-indicator.
18
+ */
19
+ const PROMPT_VERSION = 4;
20
+
21
+ /**
22
+ * First line of every judge prompt. The judge CLIs (opencode / claude)
23
+ * log each call as a session in their own data stores; the ETL uses this
24
+ * marker to recognise and skip those sessions so they don't get imported
25
+ * back as the user's own work (which would create a feedback loop).
26
+ */
27
+ const JUDGE_SENTINEL = '[ABOSS-JUDGE]';
28
+
29
const MAX_MESSAGES = 30;
const MAX_LEN_PER_MSG = 600;

/**
 * Build a short, role-tagged transcript fragment.
 *
 * Messages without text are dropped first and only then are the last
 * MAX_MESSAGES kept, so text-less entries cannot crowd real content out
 * of the window. Each message is rendered as `[ROLE     ] text`, with the
 * text cut at MAX_LEN_PER_MSG characters and marked as truncated.
 *
 * @param {Array<{role?:string, text?:string}>} messages
 * @returns {string} Entries joined by a `---` separator line; an empty
 *   string when `messages` is not an array or holds no text.
 */
function buildTranscript(messages) {
  if (!Array.isArray(messages)) return '';
  return messages
    .filter((m) => m && m.text)
    .slice(-MAX_MESSAGES)
    .map((m) => {
      const role = String(m.role || '?').toUpperCase().padEnd(9);
      const text = String(m.text);
      const shown = text.length > MAX_LEN_PER_MSG
        ? `${text.slice(0, MAX_LEN_PER_MSG)}…[truncated]`
        : text;
      return `[${role}] ${shown}`;
    })
    .join('\n---\n');
}
46
+
47
/**
 * E1 — AI Knowledge Coverage judge prompt.
 *
 * Produces a sentinel-tagged prompt asking the judge to score
 * `domain_errors`, `staleness` and `best_practice` in [0, 1] (or null when
 * undecidable), with per-field `details` and a one-line `rationale`, as a
 * strict JSON object.
 *
 * @param {Array<{role:string,text:string}>} messages - Conversation to judge.
 * @param {{model?:string, project?:string, difficulty?:number}} meta -
 *   Session context; missing or falsy fields are rendered as "unknown".
 * @returns {string}
 */
function buildE1Prompt(messages, meta = {}) {
  const transcript = buildTranscript(messages);
  const model = meta.model || '未知';
  const project = meta.project || '未知';
  const difficulty = meta.difficulty || '未知';

  return `${JUDGE_SENTINEL}(内部标记,忽略本行)
你是一名严格的 AI 协作审计员。下面是一段开发者与 AI 编程助手的对话片段(按时间序)。
请评估这位 AI 助手在该会话中的"知识覆盖"表现,返回一个严格 JSON 对象,不要任何额外文字、解释或 markdown 代码块。

字段定义(值域均为 0.0 - 1.0 之间的浮点数):
- domain_errors: 助手输出中存在领域错误(错误 API、概念混淆、虚构函数)的比例。越接近 0 越好。
- staleness: 助手使用已废弃 / 已移除技术的次数,归一化(0=没有,1=非常严重)。越接近 0 越好。
- best_practice: 助手输出符合当前最佳实践的比例。越接近 1 越好。

同时必须在 details 中为每个字段给出打分依据:引用对话中的具体表现(哪条消息、什么内容),
说明为什么打这个数值而不是更高或更低(例如"全程未发现虚构 API,仅第 12 条消息混淆了 X 与 Y,故 0.05 而非 0")。
没有证据就明确写"未发现相关证据"。禁止空泛措辞。

如果对话太短或无法判断,对应数值字段返回 null,details 中说明无法判断的原因。

会话上下文:
- 模型:${model}
- 项目:${project}
- 任务难度:${difficulty} (1=琐碎 4=重型)

对话片段:
${transcript}

只输出 JSON,形如:
{"domain_errors": 0.1, "staleness": 0.0, "best_practice": 0.85,
"details": {"domain_errors": "打分依据,引用具体消息", "staleness": "…", "best_practice": "…"},
"rationale": "一句话总评"}`;
}
83
+
84
/**
 * O1 — Output Quality judge prompt.
 *
 * Produces a sentinel-tagged prompt asking the judge to score
 * `first_take`, `code_style` and `completeness` in [0, 1] (or null when
 * undecidable), with per-field `details` and a one-line `rationale`, as a
 * strict JSON object.
 *
 * @param {Array<{role:string,text:string}>} messages - Conversation to judge.
 * @param {{model?:string, difficulty?:number}} meta - Session context;
 *   missing or falsy fields are rendered as "unknown".
 * @returns {string}
 */
function buildO1Prompt(messages, meta = {}) {
  const transcript = buildTranscript(messages);
  const model = meta.model || '未知';
  const difficulty = meta.difficulty || '未知';

  return `${JUDGE_SENTINEL}(内部标记,忽略本行)
你是一名严格的代码与产出审计员。下面是一段开发者与 AI 编程助手的对话片段。
请评估这位 AI 在该会话中的"输出质量",返回严格 JSON 对象(无任何额外文字 / markdown)。

字段(0.0 - 1.0 浮点):
- first_take: 助手输出可被一次采纳(无需修改)的比例。
- code_style: 助手产出的代码规范度(命名、格式、注释、可读性)。
- completeness: 助手是否考虑边界条件、错误处理、测试等完备性。

同时必须在 details 中为每个字段给出打分依据:引用对话中的具体表现,
说明为什么是这个数值而不是更高或更低(例如"8 次产出中 2 次被用户要求返工,故 first_take=0.75 而非 0.9")。
没有证据就明确写"未发现相关证据"。禁止空泛措辞。

无法判断的字段返回 null,details 中说明原因。

会话上下文:模型 ${model},难度 ${difficulty}/4。

对话片段:
${transcript}

只输出 JSON:
{"first_take": 0.75, "code_style": 0.8, "completeness": 0.6,
"details": {"first_take": "打分依据,引用具体消息", "code_style": "…", "completeness": "…"},
"rationale": "一句话总评"}`;
}
114
+
115
+ /**
116
+ * Per-dimension L1–L4 anchors. Kept compact: one line per level so the
117
+ * full rubric stays well under the CLI prompt budget.
118
+ */
119
+ const RUBRIC = `
120
+ H1 立意(把模糊需求收敛成可执行的精确问题的功力)
121
+ clarity 初始指令清晰度 · converge 收敛效率 · drift 方向稳定性
122
+ L4 开局即给出目标+约束+验收,几乎无需追问;L3 信息基本充分,少量澄清;
123
+ L2 需多轮补全;L1 一句话甩任务、反复改方向。
124
+ H2 判断(不盲从、敢质疑、敢推翻 vs 橡皮图章式照单全收)
125
+ challenge 合理质疑 · override 该推翻时推翻 · accept_rate 采纳前是否有判断
126
+ L4 在该质疑处质疑、在合理处高效采纳,质疑有依据;L3 大体审视偶有盲从;
127
+ L2 多数直接采纳少量质疑;L1 几乎全程"好的/继续"式盲从。
128
+ E1 知识(AI 对该技术栈的掌握)
129
+ domain_errors 领域错误(越少越好)· staleness 过时引用(越少越好)· best_practice 最佳实践
130
+ L4 无错误、无过时、全程最佳实践;L3 偶有小瑕疵;L2 多处问题;L1 频繁错误/过时。
131
+ O1 产出(结果是否真的可用)
132
+ first_take 一次采纳 · code_style 代码规范 · completeness 边界/异常/测试完备
133
+ L4 几乎一次过且规范完备;L3 小修即可;L2 需明显返工;L1 大量返工或缺失完备性。
134
+ `;
135
+
136
/**
 * Build the consolidated session-judge prompt.
 *
 * The prompt embeds the RUBRIC and the transcript and demands one strict
 * JSON object that scores every sub-indicator of H1 / H2 / E1 / O1 with a
 * `level` (1-4 or null), a `confidence` (0-1) and `evidence` citing
 * specific messages, plus a one-line `rationale`.
 *
 * @param {Array<{role:string,text:string}>} messages
 * @param {{difficulty?:number, model?:string, project?:string}} meta -
 *   Session context; missing or falsy fields are rendered as "unknown".
 * @returns {string}
 */
function buildSessionJudgePrompt(messages, meta = {}) {
  const transcript = buildTranscript(messages);
  const model = meta.model || '未知';
  const project = meta.project || '未知';
  const difficulty = meta.difficulty || '未知';

  return `${JUDGE_SENTINEL}(内部标记,忽略本行)
你是一名严格、客观的"人机协作"审计员。下面是一段开发者与 AI 编程助手的对话片段(按时间序)。
请按下面的 rubric,对四个维度的每个子指标给出 L1–L4 等级,返回**单个严格 JSON 对象**,
不要任何额外文字、解释或 markdown 代码块。

评分 rubric(难度越高合格线越宽松,本会话难度见下文):
${RUBRIC}

每个子指标返回三元组:
- level: 整数 1–4;证据不足或对话太短无法判断时返回 null(不要硬给低分)。
- confidence: 0.0–1.0,你对该 level 的把握。
- evidence: 打分依据,必须引用对话中的具体表现(哪条消息、什么内容),说明为何是这个等级
而非更高/更低。没有证据就写"未发现相关证据"。禁止空泛措辞。

客观性要求(必须遵守):
1. 不因 AI 输出更长、更礼貌、更啰嗦而加分,只看实质内容。
2. 证据不足或会话过短的子指标必须返回 level=null,而不是给低分。
3. 任务难度低不代表做得差——难度已单独归一,不要二次惩罚简单任务。

会话上下文:模型 ${model},项目 ${project},任务难度 ${difficulty}/4(1=琐碎 4=重型)。

对话片段:
${transcript}

只输出 JSON,形如:
{"H1":{"clarity":{"level":3,"confidence":0.8,"evidence":"…"},"converge":{"level":4,"confidence":0.9,"evidence":"…"},"drift":{"level":null,"confidence":0,"evidence":"无法判断,原因…"}},
"H2":{"challenge":{…},"override":{…},"accept_rate":{…}},
"E1":{"domain_errors":{…},"staleness":{…},"best_practice":{…}},
"O1":{"first_take":{…},"code_style":{…},"completeness":{…}},
"rationale":"一句话总评"}`;
}
178
+
179
+ module.exports = { buildE1Prompt, buildO1Prompt, buildSessionJudgePrompt, PROMPT_VERSION, JUDGE_SENTINEL };
@@ -0,0 +1,118 @@
1
+ /**
2
+ * High-level LLM judge — bridges dimension scorers (E1, O1) to the
3
+ * cli-runner. Handles:
4
+ * • opt-in via user_settings.enable_llm_judge
5
+ * • per-session cache via session_analysis.llm_judge_v2
6
+ * • concurrency throttle (cli-runner.withSlot)
7
+ * • fall-back signalling so dimension scorers can branch
8
+ *
9
+ * @author Felix
10
+ */
11
+
12
+ 'use strict';
13
+
14
+ const { runJudge, detectAvailableCli, withSlot } = require('./cli-runner');
15
+ const { buildSessionJudgePrompt, PROMPT_VERSION } = require('./judge-prompts');
16
+ const { queryOne } = require('../db/queries');
17
+
18
+ // ---------------------------------------------------------------------------
19
+ // Settings cache
20
+ // ---------------------------------------------------------------------------
21
+
22
let _settingsCache = null;
let _settingsCacheAt = 0;
const SETTINGS_TTL_MS = 10_000;

/**
 * Read the judge-related user settings, re-querying at most once per
 * SETTINGS_TTL_MS.
 *
 * `enable_llm_judge` is on only when the stored value stringifies to `1`
 * or (case-insensitively) `true`; a missing row leaves it off.
 *
 * @param {Object} db - Handle exposing `exec(sql)`; the first result set
 *   is read through its `values` rows of `[key, value]`.
 * @returns {{enable_llm_judge: boolean}} The cached object is returned
 *   as-is while it is still fresh.
 */
function getSettings(db) {
  const now = Date.now();
  const age = now - _settingsCacheAt;
  if (_settingsCache && age < SETTINGS_TTL_MS) {
    return _settingsCache;
  }

  const resultSets = db.exec(
    "SELECT key, value FROM user_settings WHERE key IN ('enable_llm_judge')"
  );
  const settings = { enable_llm_judge: false };
  const first = resultSets[0];
  if (first) {
    for (const [key, value] of first.values) {
      if (key !== 'enable_llm_judge') continue;
      const isOne = String(value) === '1';
      settings.enable_llm_judge = isOne || String(value).toLowerCase() === 'true';
    }
  }

  _settingsCache = settings;
  _settingsCacheAt = now;
  return settings;
}
44
+
45
/**
 * Public: discard the cached settings object (e.g. after
 * PUT /api/settings) by nulling the cache.
 */
function invalidateSettingsCache() {
  _settingsCache = null;
}
49
+
50
+ // ---------------------------------------------------------------------------
51
+ // Per-session cache
52
+ // ---------------------------------------------------------------------------
53
+
54
/**
 * Fetch and decode the `llm_judge_v2` JSON stored for a session.
 *
 * @param {Object} db - Database handle passed through to `queryOne`.
 * @param {*} sessionId - Bound to the `session_id = ?` placeholder.
 * @returns {*} The parsed value, or null when there is no row, the column
 *   is empty, or its content is not valid JSON.
 */
function loadCache(db, sessionId) {
  const sql = 'SELECT llm_judge_v2 FROM session_analysis WHERE session_id = ?';
  const row = queryOne(db, sql, [sessionId]);
  const stored = row ? row.llm_judge_v2 : null;
  if (!stored) {
    return null;
  }

  let parsed;
  try {
    parsed = JSON.parse(stored);
  } catch {
    parsed = null;
  }
  return parsed;
}
65
+
66
+ // ---------------------------------------------------------------------------
67
+ // Public judge functions
68
+ // ---------------------------------------------------------------------------
69
+
70
/**
 * Consolidated judge — one LLM call scoring H1/H2/E1/O1 for a session.
 *
 * Flow: bail out when the judge is disabled; reuse the stored
 * `llm_judge_v2` payload when it was produced by the current
 * PROMPT_VERSION for the same message count; otherwise run the judge CLI
 * (throttled through `withSlot`, 90 s timeout).
 *
 * NOTE(review): this function only reads the cache; it does not write the
 * new payload back. Presumably the caller persists the returned object
 * into `llm_judge_v2` — confirm.
 *
 * @param {Object} db - Database handle used for settings and cache lookup.
 * @param {{id:*}} session - Session whose `id` keys the cache.
 * @param {Array<{role:string,text:string}>} messages
 * @param {{difficulty?:number, model?:string, project?:string}} meta
 * @returns {Promise<Object|null>} The judge payload stamped with
 *   `v` / `msgCount` / `cli` / `cachedAt`, or null when the judge is
 *   disabled, no CLI is available, the call fails, or the CLI answered
 *   with JSON that is not an object (array, string, number, …).
 */
async function judgeSession(db, session, messages, meta = {}) {
  const settings = getSettings(db);
  if (!settings.enable_llm_judge) return null;

  const msgCount = messages.length;
  const cache = loadCache(db, session.id);
  if (cache && cache.v === PROMPT_VERSION && cache.msgCount === msgCount) return cache;

  const cli = await detectAvailableCli();
  if (!cli) return null;

  const prompt = buildSessionJudgePrompt(messages, meta);
  const result = await withSlot(() => runJudge({ prompt, timeoutMs: 90_000 }));
  if (!result || !result.ok) return null;

  // The prompt demands a single JSON object. Spreading an array or a
  // string would fabricate index-keyed fields, so treat anything else as
  // a failed judgement.
  const payload = result.data;
  if (payload === null || typeof payload !== 'object' || Array.isArray(payload)) return null;

  return {
    ...payload,
    v: PROMPT_VERSION,
    msgCount,
    cli: result.cli,
    cachedAt: new Date().toISOString(),
  };
}
98
+
99
/**
 * One-shot pre-flight reporting whether a judge CLI was detected; the
 * original author notes it is used by the Settings page.
 *
 * @returns {Promise<{available:boolean, name:(string|null)}>}
 */
async function diagnose() {
  const detected = await detectAvailableCli();
  if (!detected) {
    return { available: false, name: null };
  }
  return { available: true, name: detected.name };
}
110
+
111
+ module.exports = {
112
+ judgeSession,
113
+ diagnose,
114
+ invalidateSettingsCache,
115
+ // re-export so callers don't need cli-runner directly
116
+ detectAvailableCli,
117
+ PROMPT_VERSION,
118
+ };