agentboss 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +34 -0
- package/bin/aboss.js +288 -0
- package/client/dist/assets/index-C1wFD_Vo.css +1 -0
- package/client/dist/assets/index-DBj1Ujlx.js +137 -0
- package/client/dist/index.html +34 -0
- package/package.json +64 -0
- package/server/analysis/daily-aggregator.js +258 -0
- package/server/analysis/difficulty.js +129 -0
- package/server/analysis/dimensions/ai-knowledge.js +172 -0
- package/server/analysis/dimensions/ai-tools.js +161 -0
- package/server/analysis/dimensions/judgement.js +107 -0
- package/server/analysis/dimensions/llm-merge.js +57 -0
- package/server/analysis/dimensions/output-quality.js +167 -0
- package/server/analysis/dimensions/problem-definition.js +104 -0
- package/server/analysis/dimensions/system-thinking.js +225 -0
- package/server/analysis/evidence-builder.js +104 -0
- package/server/analysis/job.js +273 -0
- package/server/analysis/report-builder.js +581 -0
- package/server/analysis/scoring-v2.js +72 -0
- package/server/analysis/text-signals.js +179 -0
- package/server/analysis/thresholds-v2.js +358 -0
- package/server/api/advice.js +124 -0
- package/server/api/analysis.js +141 -0
- package/server/api/execution.js +330 -0
- package/server/api/metrics.js +277 -0
- package/server/api/overview.js +308 -0
- package/server/api/project.js +255 -0
- package/server/api/reports.js +125 -0
- package/server/api/sessions.js +118 -0
- package/server/api/settings.js +119 -0
- package/server/db/connection.js +175 -0
- package/server/db/queries.js +1051 -0
- package/server/db/schema.js +487 -0
- package/server/etl/active-time.js +150 -0
- package/server/etl/backfill-subagents.js +178 -0
- package/server/etl/claude-code.js +826 -0
- package/server/etl/detect.js +341 -0
- package/server/etl/judge-filter.js +117 -0
- package/server/etl/opencode.js +606 -0
- package/server/execution/job.js +662 -0
- package/server/execution/prompt.js +227 -0
- package/server/execution/runner.js +218 -0
- package/server/index.js +94 -0
- package/server/llm/advice-prompt.js +339 -0
- package/server/llm/advice.js +384 -0
- package/server/llm/analysis-prompt.js +162 -0
- package/server/llm/cli-runner.js +249 -0
- package/server/llm/judge-prompts.js +179 -0
- package/server/llm/judge.js +118 -0
- package/server/llm/project-advice-prompt.js +332 -0
- package/server/llm/project-advice.js +491 -0
- package/server/llm/session-analyzer.js +122 -0
- package/server/utils/project.js +80 -0
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM judge runner — spawns a local AI CLI for evaluation tasks.
|
|
3
|
+
*
|
|
4
|
+
* Detection order (first one found wins):
|
|
5
|
+
* 1. `opencode run` (prompt piped on stdin)
|
|
6
|
+
* 2. `claude -p` (prompt piped on stdin)
|
|
7
|
+
*
|
|
8
|
+
* Returns parsed JSON. Failures (CLI missing / timeout / non-JSON
|
|
9
|
+
* output) resolve to `{ ok: false, reason }` so callers can fall back to rule-based
|
|
10
|
+
* heuristics. Never throws.
|
|
11
|
+
*
|
|
12
|
+
* @author Felix
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
'use strict';
|
|
16
|
+
|
|
17
|
+
const { spawn } = require('child_process');
|
|
18
|
+
const { JUDGE_SENTINEL } = require('./judge-prompts');
|
|
19
|
+
|
|
20
|
+
/**
 * Make sure an outbound prompt starts with JUDGE_SENTINEL.
 *
 * Prompt builders are expected to add the sentinel themselves; this is the
 * final safety net so that a caller which forgot it still produces a tagged
 * session that the ETL (see server/etl/judge-filter.js) can recognise.
 *
 * @param {*} prompt - Prompt text. Non-string values are returned untouched.
 * @returns {*} The prompt, prefixed with a sentinel line when it was a string
 *   that did not already start with the sentinel.
 */
function ensureSentinel(prompt) {
  const needsTag = typeof prompt === 'string' && !prompt.startsWith(JUDGE_SENTINEL);
  if (!needsTag) {
    return prompt;
  }
  // The sentinel line carries a note telling the model to ignore it.
  const sentinelLine = `${JUDGE_SENTINEL}(内部标记,忽略本行)`;
  return [sentinelLine, prompt].join('\n');
}
|
|
36
|
+
|
|
37
|
+
// ---------------------------------------------------------------------------
|
|
38
|
+
// Detection
|
|
39
|
+
// ---------------------------------------------------------------------------
|
|
40
|
+
|
|
41
|
+
/**
 * Describe a CLI that receives its prompt on STDIN.
 *
 * @param {string} name - Used both as the display name and the binary name.
 * @param {string[]} args - Fixed command-line arguments.
 * @returns {{name:string, bin:string, argv:Function, stdinPrompt:boolean}}
 */
const stdinCandidate = (name, args) => ({
  name,
  bin: name,
  // A fresh array per call so callers can never mutate the shared template.
  argv: () => [...args],
  stdinPrompt: true,
});

/**
 * Judge CLI candidates, listed in detection priority order.
 *
 * `argv` builds the command-line arguments. With `stdinPrompt: true` the
 * prompt is written to the child's STDIN rather than inlined into argv; the
 * original author notes this matters on Windows, where the command line is
 * capped at roughly 8 KB while judge prompts exceed 10 KB.
 */
const CANDIDATES = [
  // `opencode run` with no positional argument reads the prompt from stdin.
  stdinCandidate('opencode', ['run']),
  // `claude -p` without an inline prompt reads it from stdin.
  stdinCandidate('claude', ['-p']),
];
|
|
55
|
+
|
|
56
|
+
// undefined = not probed yet; null = probed, nothing found; object = detected candidate
let _cachedCli = undefined;
// Promise of the probe currently running, shared by concurrent callers.
let _detectPromise = null;
// Bumped by _resetCache so a probe started before a reset cannot repopulate the cache.
let _detectGeneration = 0;

/**
 * Detect which CLI is available by trying `bin --version` for each entry of
 * CANDIDATES in order; the first one that succeeds wins.
 *
 * The result is cached until `_resetCache()` is called. Callers that arrive
 * while the first probe is still running share that probe instead of each
 * spawning their own `--version` processes.
 *
 * @returns {Promise<{name:string, bin:string, argv:Function, stdinPrompt:boolean}|null>}
 *   The detected candidate, or null when none could be spawned.
 */
async function detectAvailableCli() {
  if (_cachedCli !== undefined) return _cachedCli;
  if (_detectPromise) return _detectPromise;

  const generation = _detectGeneration;
  _detectPromise = (async () => {
    let found = null;
    for (const candidate of CANDIDATES) {
      if (await canSpawn(candidate.bin)) {
        found = candidate;
        break;
      }
    }
    // Only publish the result if no reset happened while we were probing.
    if (generation === _detectGeneration) {
      _cachedCli = found;
      _detectPromise = null;
    }
    return found;
  })();
  return _detectPromise;
}

/** Reset the detection cache. Mostly useful in tests / settings reload. */
function _resetCache() {
  _cachedCli = undefined;
  _detectPromise = null;
  _detectGeneration++;
}
|
|
78
|
+
|
|
79
|
+
/**
 * Probe whether `bin` can be executed by running `bin --version`.
 *
 * Resolves true only when the process exits with code 0. Resolves false on
 * spawn errors, non-zero exit, or when the probe has not finished within
 * 5 seconds (the child is then killed). Never rejects.
 *
 * On Windows the command is run through a shell (`shell: true`) —
 * presumably so that `.cmd` shims on PATH resolve; confirm before changing.
 *
 * @param {string} bin - Executable name or path.
 * @returns {Promise<boolean>}
 */
function canSpawn(bin) {
  return new Promise((resolve) => {
    let resolved = false;
    let timer = null;
    const settle = (value) => {
      if (resolved) return;
      resolved = true;
      // Clear the hard timeout so it does not keep the event loop alive
      // (and does not try to kill an already-finished child) after we settle.
      if (timer !== null) clearTimeout(timer);
      resolve(value);
    };

    try {
      const proc = spawn(bin, ['--version'], {
        stdio: 'ignore',
        shell: process.platform === 'win32',
      });
      proc.on('error', () => settle(false));
      proc.on('exit', (code) => settle(code === 0));
      // Hard timeout: a CLI that hangs on --version counts as unavailable.
      timer = setTimeout(() => {
        try {
          proc.kill('SIGKILL');
        } catch {
          // Best effort: the child may already be gone.
        }
        settle(false);
      }, 5000);
    } catch {
      // spawn() can throw synchronously on invalid arguments.
      settle(false);
    }
  });
}
|
|
102
|
+
|
|
103
|
+
// ---------------------------------------------------------------------------
|
|
104
|
+
// Runner
|
|
105
|
+
// ---------------------------------------------------------------------------
|
|
106
|
+
|
|
107
|
+
/**
 * Spawn the detected CLI with the prompt, capture stdout, and try to parse
 * it as JSON. The caller's prompt should *demand* JSON output.
 *
 * Options:
 *   prompt     (required, non-empty string)
 *   timeoutMs  (default 30_000)
 *   maxBytes   (default 256 KB) — guard against runaway output. Compared
 *              against the decoded string length, so it is an approximate cap.
 *
 * Resolves (never rejects):
 *   { ok: true,  data: any, raw: string, cli: 'opencode'|'claude' }
 *   { ok: false, reason: 'no-prompt' | 'no-cli' | 'timeout' | 'exit-non-zero'
 *                        | 'bad-json' | 'spawn-error', raw?: string, error?: string }
 *
 * @param {Object} opts
 * @returns {Promise<Object>}
 */
async function runJudge(opts = {}) {
  // `opts || {}` keeps an explicit null from throwing during destructuring.
  const { prompt: rawPrompt, timeoutMs = 30_000, maxBytes = 256 * 1024 } = opts || {};
  if (!rawPrompt || typeof rawPrompt !== 'string') {
    return { ok: false, reason: 'no-prompt' };
  }
  // Stamp the sentinel onto every outbound prompt so the ETL can later
  // recognise and discard the session this CLI call will create.
  const prompt = ensureSentinel(rawPrompt);

  const cli = await detectAvailableCli();
  if (!cli) return { ok: false, reason: 'no-cli' };

  return new Promise((resolve) => {
    let resolved = false;
    let timeoutTimer = null;
    let drainTimer = null;
    const settle = (value) => {
      if (resolved) return;
      resolved = true;
      // Release both timers on every exit path (including 'error'), so a
      // settled call never keeps the event loop alive.
      clearTimeout(timeoutTimer);
      clearTimeout(drainTimer);
      resolve(value);
    };

    let proc;
    try {
      const useStdin = cli.stdinPrompt === true;
      proc = spawn(cli.bin, cli.argv(prompt), {
        stdio: [useStdin ? 'pipe' : 'ignore', 'pipe', 'pipe'],
        shell: process.platform === 'win32',
      });
      if (useStdin && proc.stdin) {
        proc.stdin.on('error', () => {}); // EPIPE if CLI exits early
        proc.stdin.end(prompt, 'utf8');
      }
    } catch (err) {
      return settle({ ok: false, reason: 'spawn-error', error: err.message });
    }

    let stdout = '';
    let stderr = '';
    let truncated = false;

    // Let the streams decode UTF-8 themselves: decoding each chunk on its own
    // corrupts multi-byte characters that straddle a chunk boundary.
    proc.stdout.setEncoding('utf8');
    proc.stderr.setEncoding('utf8');

    proc.stdout.on('data', (chunk) => {
      if (truncated) return;
      stdout += chunk;
      if (stdout.length > maxBytes) {
        stdout = stdout.slice(0, maxBytes);
        truncated = true;
        try {
          proc.kill('SIGKILL');
        } catch {
          // Best effort: the child may already be gone.
        }
      }
    });
    proc.stderr.on('data', (chunk) => {
      // Only the first 500 characters are ever reported; don't buffer more.
      if (stderr.length < 500) stderr += chunk;
    });

    // Turn the finished process into a result object.
    const finish = (code) => {
      if (code !== 0 && !truncated) {
        return settle({ ok: false, reason: 'exit-non-zero', raw: stdout, error: stderr.slice(0, 500) });
      }
      const parsed = extractJson(stdout);
      if (parsed === undefined) {
        return settle({ ok: false, reason: 'bad-json', raw: stdout.slice(0, 500) });
      }
      settle({ ok: true, data: parsed, raw: stdout, cli: cli.name });
    };

    proc.on('error', (err) => settle({ ok: false, reason: 'spawn-error', error: err.message }));

    // 'close' fires once stdio has been fully drained, so stdout is complete.
    proc.on('close', (code) => finish(code));

    proc.on('exit', (code) => {
      // The process is gone, so the overall timeout no longer applies.
      clearTimeout(timeoutTimer);
      if (resolved) return;
      // stdio may still be open at 'exit' (e.g. held by a grandchild); give
      // 'close' a short grace period, then finish with what we have.
      drainTimer = setTimeout(() => finish(code), 1000);
    });

    timeoutTimer = setTimeout(() => {
      try {
        proc.kill('SIGKILL');
      } catch {
        // Best effort: the child may already be gone.
      }
      settle({ ok: false, reason: 'timeout' });
    }, timeoutMs);
  });
}
|
|
188
|
+
|
|
189
|
+
/** Upper bound on fallback start positions tried, to keep worst-case cost linear-ish. */
const MAX_JSON_CANDIDATES = 64;

/**
 * Starting at an opening brace/bracket, find the index of the closer that
 * brings nesting depth back to zero. Braces inside double-quoted strings
 * (with backslash escapes) are ignored.
 *
 * @param {string} text
 * @param {number} start - Index of a `{` or `[`.
 * @returns {number} Index of the balancing closer, or -1 if none.
 */
function findBalancedJsonEnd(text, start) {
  let depth = 0;
  let inString = false;
  for (let i = start; i < text.length; i++) {
    const ch = text[i];
    if (inString) {
      if (ch === '\\') i++; // skip the escaped character
      else if (ch === '"') inString = false;
      continue;
    }
    if (ch === '"') {
      inString = true;
    } else if (ch === '{' || ch === '[') {
      depth++;
    } else if (ch === '}' || ch === ']') {
      depth--;
      if (depth === 0) return i;
    }
  }
  return -1;
}

/**
 * Try to find a JSON value in raw CLI stdout.
 *
 * Strategy, in order:
 *   1. parse the whole (trimmed) text;
 *   2. parse from the first `{`/`[` through the last closer of the same kind
 *      (tolerates leading/trailing log lines and markdown fences);
 *   3. scan successive `{`/`[` positions and parse the balanced span starting
 *      there (tolerates bracketed log prefixes such as `[info]` and trailing
 *      text that itself contains braces).
 *
 * @param {*} raw - CLI output; anything that is not a non-empty string yields undefined.
 * @returns {*} The parsed value, or undefined when no JSON could be extracted.
 *   Never throws.
 */
function extractJson(raw) {
  if (typeof raw !== 'string' || raw === '') return undefined;

  // Common case: stdout is pure JSON.
  const text = raw.trim();
  try {
    return JSON.parse(text);
  } catch {
    // Not pure JSON; fall through to extraction.
  }

  const nextOpener = (from) => {
    const brace = text.indexOf('{', from);
    const bracket = text.indexOf('[', from);
    if (brace < 0) return bracket;
    if (bracket < 0) return brace;
    return Math.min(brace, bracket);
  };

  const first = nextOpener(0);
  if (first < 0) return undefined;

  // First opener through the last closer of the same kind.
  const closer = text[first] === '{' ? '}' : ']';
  const last = text.lastIndexOf(closer);
  if (last > first) {
    try {
      return JSON.parse(text.slice(first, last + 1));
    } catch {
      // Surrounding noise contains brackets; try the balanced scan.
    }
  }

  // Balanced scan from each opener until one span parses.
  let attempts = 0;
  for (let start = first; start >= 0 && attempts < MAX_JSON_CANDIDATES; start = nextOpener(start + 1)) {
    attempts++;
    const end = findBalancedJsonEnd(text, start);
    if (end < 0) continue;
    try {
      return JSON.parse(text.slice(start, end + 1));
    } catch {
      // Balanced but not JSON (e.g. "[info]"); move on to the next opener.
    }
  }
  return undefined;
}
|
|
216
|
+
|
|
217
|
+
// ---------------------------------------------------------------------------
|
|
218
|
+
// Concurrency guard
|
|
219
|
+
// ---------------------------------------------------------------------------
|
|
220
|
+
|
|
221
|
+
// Number of tasks currently running under withSlot.
let _inFlight = 0;
// Start functions of tasks waiting for a free slot (FIFO).
const _waiters = [];
const MAX_CONCURRENT = 2;

/**
 * Run `fn` under a MAX_CONCURRENT-wide semaphore so we don't fork-bomb the CLI.
 *
 * Tasks beyond the limit wait in FIFO order. The slot is released whether
 * `fn` fulfils, rejects, or throws synchronously.
 *
 * @param {Function} fn - Task to run; may return a value or a promise.
 * @returns {Promise<*>} Settles with the outcome of `fn` (fulfilled value or
 *   rejection reason).
 */
function withSlot(fn) {
  return new Promise((resolve, reject) => {
    const start = async () => {
      _inFlight++;
      try {
        resolve(await fn());
      } catch (err) {
        // Propagate the failure; otherwise the caller would wait forever and
        // the error would surface only as an unhandled rejection.
        reject(err);
      } finally {
        _inFlight--;
        const next = _waiters.shift();
        if (next) next();
      }
    };
    if (_inFlight < MAX_CONCURRENT) start();
    else _waiters.push(start);
  });
}
|
|
241
|
+
|
|
242
|
+
module.exports = {
|
|
243
|
+
detectAvailableCli,
|
|
244
|
+
runJudge,
|
|
245
|
+
withSlot,
|
|
246
|
+
// exported for tests
|
|
247
|
+
_resetCache,
|
|
248
|
+
extractJson,
|
|
249
|
+
};
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Prompt templates for the LLM-judge runs used by E1 and O1.
|
|
3
|
+
*
|
|
4
|
+
* Each function returns a STRING prompt that demands strict JSON output.
|
|
5
|
+
* The runner enforces a 30s timeout and ~256 KB cap; we keep the input
|
|
6
|
+
* compact (only last N messages, truncated) so the CLI doesn't choke.
|
|
7
|
+
*
|
|
8
|
+
* @author Felix
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
'use strict';
|
|
12
|
+
|
|
13
|
+
/**
|
|
14
|
+
* Bump whenever the prompt output contract changes — cached judge
|
|
15
|
+
* results stamped with an older version are ignored and re-judged.
|
|
16
|
+
* v2: added per-field `details` scoring evidence.
|
|
17
|
+
* v4: dropped H1.reframe sub-indicator.
|
|
18
|
+
*/
|
|
19
|
+
const PROMPT_VERSION = 4;
|
|
20
|
+
|
|
21
|
+
/**
|
|
22
|
+
* First line of every judge prompt. The judge CLIs (opencode / claude)
|
|
23
|
+
* log each call as a session in their own data stores; the ETL uses this
|
|
24
|
+
* marker to recognise and skip those sessions so they don't get imported
|
|
25
|
+
* back as the user's own work (which would create a feedback loop).
|
|
26
|
+
*/
|
|
27
|
+
const JUDGE_SENTINEL = '[ABOSS-JUDGE]';
|
|
28
|
+
|
|
29
|
+
// Only the most recent MAX_MESSAGES messages are considered.
const MAX_MESSAGES = 30;
// Each message body is cut to this many UTF-16 code units.
const MAX_LEN_PER_MSG = 600;

/**
 * Cut `text` to at most `limit` code units and mark it as truncated.
 * Never cuts between the two halves of a surrogate pair, which would leave a
 * lone surrogate in the prompt.
 */
function truncateForTranscript(text, limit) {
  if (text.length <= limit) return text;
  let cut = limit;
  const lastKept = text.charCodeAt(cut - 1);
  if (lastKept >= 0xd800 && lastKept <= 0xdbff) cut -= 1; // high surrogate: drop it too
  return `${text.slice(0, cut)}…[truncated]`;
}

/**
 * Build a short, role-tagged transcript fragment.
 *
 * Takes the last MAX_MESSAGES messages, drops entries without text, renders
 * each as `[ROLE     ] text` (role upper-cased and padded to 9 columns, `?`
 * when missing), and joins them with `---` separator lines.
 *
 * @param {Array<{role?:string, text?:string}>} messages
 * @returns {string} The transcript; an empty string when `messages` is not
 *   an array or contains no text.
 */
function buildTranscript(messages) {
  if (!Array.isArray(messages)) return '';
  return messages
    .slice(-MAX_MESSAGES)
    .filter((m) => m && m.text)
    .map((m) => {
      const role = String(m.role || '?').toUpperCase().padEnd(9);
      const text = typeof m.text === 'string' ? m.text : String(m.text);
      return `[${role}] ${truncateForTranscript(text, MAX_LEN_PER_MSG)}`;
    })
    .join('\n---\n');
}
|
|
46
|
+
|
|
47
|
+
/**
 * E1 — AI Knowledge Coverage judge prompt.
 *
 * Produces a sentinel-tagged prompt asking the judge for three scores
 * (`domain_errors`, `staleness`, `best_practice`) plus per-field `details`
 * and a one-line `rationale`, returned as one strict JSON object.
 *
 * @param {Array<{role:string,text:string}>} messages - Conversation to judge.
 * @param {{model?:string, project?:string, difficulty?:number}} meta -
 *   Session context; missing fields are rendered as '未知'.
 * @returns {string} The prompt text.
 */
function buildE1Prompt(messages, meta = {}) {
  const transcript = buildTranscript(messages);
  const model = meta.model || '未知';
  const project = meta.project || '未知';
  const difficulty = meta.difficulty || '未知';

  return `${JUDGE_SENTINEL}(内部标记,忽略本行)
你是一名严格的 AI 协作审计员。下面是一段开发者与 AI 编程助手的对话片段(按时间序)。
请评估这位 AI 助手在该会话中的"知识覆盖"表现,返回一个严格 JSON 对象,不要任何额外文字、解释或 markdown 代码块。

字段定义(值域均为 0.0 - 1.0 之间的浮点数):
- domain_errors: 助手输出中存在领域错误(错误 API、概念混淆、虚构函数)的比例。越接近 0 越好。
- staleness: 助手使用已废弃 / 已移除技术的次数,归一化(0=没有,1=非常严重)。越接近 0 越好。
- best_practice: 助手输出符合当前最佳实践的比例。越接近 1 越好。

同时必须在 details 中为每个字段给出打分依据:引用对话中的具体表现(哪条消息、什么内容),
说明为什么打这个数值而不是更高或更低(例如"全程未发现虚构 API,仅第 12 条消息混淆了 X 与 Y,故 0.05 而非 0")。
没有证据就明确写"未发现相关证据"。禁止空泛措辞。

如果对话太短或无法判断,对应数值字段返回 null,details 中说明无法判断的原因。

会话上下文:
- 模型:${model}
- 项目:${project}
- 任务难度:${difficulty} (1=琐碎 4=重型)

对话片段:
${transcript}

只输出 JSON,形如:
{"domain_errors": 0.1, "staleness": 0.0, "best_practice": 0.85,
"details": {"domain_errors": "打分依据,引用具体消息", "staleness": "…", "best_practice": "…"},
"rationale": "一句话总评"}`;
}
|
|
83
|
+
|
|
84
|
+
/**
 * O1 — Output Quality judge prompt.
 *
 * Produces a sentinel-tagged prompt asking the judge for three scores
 * (`first_take`, `code_style`, `completeness`) plus per-field `details`
 * and a one-line `rationale`, returned as one strict JSON object.
 *
 * @param {Array<{role:string,text:string}>} messages - Conversation to judge.
 * @param {{model?:string, difficulty?:number}} meta - Session context;
 *   missing fields are rendered as '未知'.
 * @returns {string} The prompt text.
 */
function buildO1Prompt(messages, meta = {}) {
  const transcript = buildTranscript(messages);
  const model = meta.model || '未知';
  const difficulty = meta.difficulty || '未知';

  return `${JUDGE_SENTINEL}(内部标记,忽略本行)
你是一名严格的代码与产出审计员。下面是一段开发者与 AI 编程助手的对话片段。
请评估这位 AI 在该会话中的"输出质量",返回严格 JSON 对象(无任何额外文字 / markdown)。

字段(0.0 - 1.0 浮点):
- first_take: 助手输出可被一次采纳(无需修改)的比例。
- code_style: 助手产出的代码规范度(命名、格式、注释、可读性)。
- completeness: 助手是否考虑边界条件、错误处理、测试等完备性。

同时必须在 details 中为每个字段给出打分依据:引用对话中的具体表现,
说明为什么是这个数值而不是更高或更低(例如"8 次产出中 2 次被用户要求返工,故 first_take=0.75 而非 0.9")。
没有证据就明确写"未发现相关证据"。禁止空泛措辞。

无法判断的字段返回 null,details 中说明原因。

会话上下文:模型 ${model},难度 ${difficulty}/4。

对话片段:
${transcript}

只输出 JSON:
{"first_take": 0.75, "code_style": 0.8, "completeness": 0.6,
"details": {"first_take": "打分依据,引用具体消息", "code_style": "…", "completeness": "…"},
"rationale": "一句话总评"}`;
}
|
|
114
|
+
|
|
115
|
+
/**
 * Scoring rubric embedded in the session-judge prompt: for each of the four
 * dimensions (H1, H2, E1, O1) a title line, its sub-indicators, and compact
 * L4–L1 anchors. The string starts and ends with a newline.
 */
const RUBRIC = [
  '',
  // H1 — problem definition
  'H1 立意(把模糊需求收敛成可执行的精确问题的功力)',
  'clarity 初始指令清晰度 · converge 收敛效率 · drift 方向稳定性',
  'L4 开局即给出目标+约束+验收,几乎无需追问;L3 信息基本充分,少量澄清;',
  'L2 需多轮补全;L1 一句话甩任务、反复改方向。',
  // H2 — judgement
  'H2 判断(不盲从、敢质疑、敢推翻 vs 橡皮图章式照单全收)',
  'challenge 合理质疑 · override 该推翻时推翻 · accept_rate 采纳前是否有判断',
  'L4 在该质疑处质疑、在合理处高效采纳,质疑有依据;L3 大体审视偶有盲从;',
  'L2 多数直接采纳少量质疑;L1 几乎全程"好的/继续"式盲从。',
  // E1 — AI knowledge
  'E1 知识(AI 对该技术栈的掌握)',
  'domain_errors 领域错误(越少越好)· staleness 过时引用(越少越好)· best_practice 最佳实践',
  'L4 无错误、无过时、全程最佳实践;L3 偶有小瑕疵;L2 多处问题;L1 频繁错误/过时。',
  // O1 — output quality
  'O1 产出(结果是否真的可用)',
  'first_take 一次采纳 · code_style 代码规范 · completeness 边界/异常/测试完备',
  'L4 几乎一次过且规范完备;L3 小修即可;L2 需明显返工;L1 大量返工或缺失完备性。',
  '',
].join('\n');
|
|
135
|
+
|
|
136
|
+
/**
 * Build the consolidated session-judge prompt.
 *
 * The prompt embeds RUBRIC and the transcript and demands a single strict
 * JSON object that scores every sub-indicator of H1/H2/E1/O1 with a level
 * (1-4 or null), a confidence (0-1) and evidence citing specific messages.
 *
 * @param {Array<{role:string,text:string}>} messages
 * @param {{difficulty?:number, model?:string, project?:string}} meta -
 *   Missing fields are rendered as '未知'.
 * @returns {string}
 */
function buildSessionJudgePrompt(messages, meta = {}) {
  const transcript = buildTranscript(messages);
  const model = meta.model || '未知';
  const project = meta.project || '未知';
  const difficulty = meta.difficulty || '未知';

  return `${JUDGE_SENTINEL}(内部标记,忽略本行)
你是一名严格、客观的"人机协作"审计员。下面是一段开发者与 AI 编程助手的对话片段(按时间序)。
请按下面的 rubric,对四个维度的每个子指标给出 L1–L4 等级,返回**单个严格 JSON 对象**,
不要任何额外文字、解释或 markdown 代码块。

评分 rubric(难度越高合格线越宽松,本会话难度见下文):
${RUBRIC}

每个子指标返回三元组:
- level: 整数 1–4;证据不足或对话太短无法判断时返回 null(不要硬给低分)。
- confidence: 0.0–1.0,你对该 level 的把握。
- evidence: 打分依据,必须引用对话中的具体表现(哪条消息、什么内容),说明为何是这个等级
而非更高/更低。没有证据就写"未发现相关证据"。禁止空泛措辞。

客观性要求(必须遵守):
1. 不因 AI 输出更长、更礼貌、更啰嗦而加分,只看实质内容。
2. 证据不足或会话过短的子指标必须返回 level=null,而不是给低分。
3. 任务难度低不代表做得差——难度已单独归一,不要二次惩罚简单任务。

会话上下文:模型 ${model},项目 ${project},任务难度 ${difficulty}/4(1=琐碎 4=重型)。

对话片段:
${transcript}

只输出 JSON,形如:
{"H1":{"clarity":{"level":3,"confidence":0.8,"evidence":"…"},"converge":{"level":4,"confidence":0.9,"evidence":"…"},"drift":{"level":null,"confidence":0,"evidence":"无法判断,原因…"}},
"H2":{"challenge":{…},"override":{…},"accept_rate":{…}},
"E1":{"domain_errors":{…},"staleness":{…},"best_practice":{…}},
"O1":{"first_take":{…},"code_style":{…},"completeness":{…}},
"rationale":"一句话总评"}`;
}
|
|
178
|
+
|
|
179
|
+
module.exports = { buildE1Prompt, buildO1Prompt, buildSessionJudgePrompt, PROMPT_VERSION, JUDGE_SENTINEL };
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* High-level LLM judge — bridges dimension scorers (E1, O1) to the
|
|
3
|
+
* cli-runner. Handles:
|
|
4
|
+
* • opt-in via user_settings.enable_llm_judge
|
|
5
|
+
* • per-session cache via session_analysis.llm_judge_v2
|
|
6
|
+
* • concurrency throttle (cli-runner.withSlot)
|
|
7
|
+
* • fall-back signalling so dimension scorers can branch
|
|
8
|
+
*
|
|
9
|
+
* @author Felix
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
'use strict';
|
|
13
|
+
|
|
14
|
+
const { runJudge, detectAvailableCli, withSlot } = require('./cli-runner');
|
|
15
|
+
const { buildSessionJudgePrompt, PROMPT_VERSION } = require('./judge-prompts');
|
|
16
|
+
const { queryOne } = require('../db/queries');
|
|
17
|
+
|
|
18
|
+
// ---------------------------------------------------------------------------
|
|
19
|
+
// Settings cache
|
|
20
|
+
// ---------------------------------------------------------------------------
|
|
21
|
+
|
|
22
|
+
// Last settings object read from the database, or null when invalidated.
let _settingsCache = null;
// Date.now() timestamp at which _settingsCache was filled.
let _settingsCacheAt = 0;
const SETTINGS_TTL_MS = 10_000;

/**
 * Read the judge-related user settings, served from an in-memory cache for
 * up to SETTINGS_TTL_MS.
 *
 * `enable_llm_judge` is true only when the stored value stringifies to '1'
 * or (case-insensitively) 'true'; it defaults to false when the row is absent.
 *
 * @param {Object} db - Database handle exposing `exec(sql)`; the code reads
 *   `result[0].values` as an array of `[key, value]` rows.
 * @returns {{enable_llm_judge:boolean}}
 */
function getSettings(db) {
  const now = Date.now();
  const age = now - _settingsCacheAt;
  if (_settingsCache && age < SETTINGS_TTL_MS) {
    return _settingsCache;
  }

  const rows = db.exec(
    "SELECT key, value FROM user_settings WHERE key IN ('enable_llm_judge')"
  );
  const settings = { enable_llm_judge: false };
  const first = rows[0];
  const pairs = first ? first.values : [];
  for (const [key, value] of pairs) {
    if (key !== 'enable_llm_judge') continue;
    const text = String(value);
    settings.enable_llm_judge = text === '1' || text.toLowerCase() === 'true';
  }

  _settingsCache = settings;
  _settingsCacheAt = now;
  return settings;
}

/** Public: force a settings reload (e.g. after PUT /api/settings). */
function invalidateSettingsCache() {
  _settingsCache = null;
}
|
|
49
|
+
|
|
50
|
+
// ---------------------------------------------------------------------------
|
|
51
|
+
// Per-session cache
|
|
52
|
+
// ---------------------------------------------------------------------------
|
|
53
|
+
|
|
54
|
+
/**
 * Load the cached judge payload for a session.
 *
 * @param {Object} db - Database handle passed through to `queryOne`.
 * @param {*} sessionId - Value bound to `session_id` in the lookup.
 * @returns {*} The parsed `llm_judge_v2` JSON, or null when there is no row,
 *   the column is empty, or its content is not valid JSON.
 */
function loadCache(db, sessionId) {
  const sql = 'SELECT llm_judge_v2 FROM session_analysis WHERE session_id = ?';
  const row = queryOne(db, sql, [sessionId]);
  const stored = row ? row.llm_judge_v2 : null;
  if (!stored) {
    return null;
  }
  try {
    return JSON.parse(stored);
  } catch {
    // A corrupt cache entry is treated as a cache miss.
    return null;
  }
}
|
|
65
|
+
|
|
66
|
+
// ---------------------------------------------------------------------------
|
|
67
|
+
// Public judge functions
|
|
68
|
+
// ---------------------------------------------------------------------------
|
|
69
|
+
|
|
70
|
+
/**
 * Consolidated judge — one LLM call scoring H1/H2/E1/O1 for a session.
 *
 * Returns the stored `llm_judge_v2` payload when it matches the current
 * PROMPT_VERSION and message count; otherwise runs the judge CLI under the
 * shared concurrency slot with a 90 s timeout.
 *
 * This function only *reads* the cache; it does not write the fresh payload
 * back — presumably the caller persists it (TODO confirm).
 *
 * @param {Object} db - Database handle used for settings and the cache lookup.
 * @param {{id:*}} session - Session whose `id` keys the cache.
 * @param {Array<{role:string,text:string}>} messages - Session messages.
 * @param {{difficulty?:number, model?:string, project?:string}} meta
 * @returns {Promise<Object|null>} The judge payload stamped with `v`,
 *   `msgCount`, `cli` and `cachedAt`; null when the judge is disabled, no CLI
 *   is available, the call fails, or the CLI did not return a JSON object.
 */
async function judgeSession(db, session, messages, meta = {}) {
  const settings = getSettings(db);
  if (!settings.enable_llm_judge) return null;

  const msgCount = messages.length;
  const cache = loadCache(db, session.id);
  if (cache && cache.v === PROMPT_VERSION && cache.msgCount === msgCount) return cache;

  const cli = await detectAvailableCli();
  if (!cli) return null;

  const prompt = buildSessionJudgePrompt(messages, meta);
  const result = await withSlot(() => runJudge({ prompt, timeoutMs: 90_000 }));
  if (!result.ok) return null;

  // The runner accepts any JSON value. Only a plain object is a usable
  // payload: spreading an array or string would produce index keys that then
  // get stamped as a valid result.
  const data = result.data;
  if (data === null || typeof data !== 'object' || Array.isArray(data)) return null;

  return {
    ...data,
    v: PROMPT_VERSION,
    msgCount,
    cli: result.cli,
    cachedAt: new Date().toISOString(),
  };
}
|
|
98
|
+
|
|
99
|
+
/**
 * One-shot pre-flight reporting whether a judge CLI was detected. Used by
 * the Settings page.
 *
 * @returns {Promise<{available:boolean, name:(string|null)}>} `name` is the
 *   detected CLI's name, or null when none is available.
 */
async function diagnose() {
  const detected = await detectAvailableCli();
  if (!detected) {
    return { available: false, name: null };
  }
  return { available: true, name: detected.name };
}
|
|
110
|
+
|
|
111
|
+
module.exports = {
|
|
112
|
+
judgeSession,
|
|
113
|
+
diagnose,
|
|
114
|
+
invalidateSettingsCache,
|
|
115
|
+
// re-export so callers don't need cli-runner directly
|
|
116
|
+
detectAvailableCli,
|
|
117
|
+
PROMPT_VERSION,
|
|
118
|
+
};
|