agentboss 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +34 -0
- package/bin/aboss.js +288 -0
- package/client/dist/assets/index-C1wFD_Vo.css +1 -0
- package/client/dist/assets/index-DBj1Ujlx.js +137 -0
- package/client/dist/index.html +34 -0
- package/package.json +64 -0
- package/server/analysis/daily-aggregator.js +258 -0
- package/server/analysis/difficulty.js +129 -0
- package/server/analysis/dimensions/ai-knowledge.js +172 -0
- package/server/analysis/dimensions/ai-tools.js +161 -0
- package/server/analysis/dimensions/judgement.js +107 -0
- package/server/analysis/dimensions/llm-merge.js +57 -0
- package/server/analysis/dimensions/output-quality.js +167 -0
- package/server/analysis/dimensions/problem-definition.js +104 -0
- package/server/analysis/dimensions/system-thinking.js +225 -0
- package/server/analysis/evidence-builder.js +104 -0
- package/server/analysis/job.js +273 -0
- package/server/analysis/report-builder.js +581 -0
- package/server/analysis/scoring-v2.js +72 -0
- package/server/analysis/text-signals.js +179 -0
- package/server/analysis/thresholds-v2.js +358 -0
- package/server/api/advice.js +124 -0
- package/server/api/analysis.js +141 -0
- package/server/api/execution.js +330 -0
- package/server/api/metrics.js +277 -0
- package/server/api/overview.js +308 -0
- package/server/api/project.js +255 -0
- package/server/api/reports.js +125 -0
- package/server/api/sessions.js +118 -0
- package/server/api/settings.js +119 -0
- package/server/db/connection.js +175 -0
- package/server/db/queries.js +1051 -0
- package/server/db/schema.js +487 -0
- package/server/etl/active-time.js +150 -0
- package/server/etl/backfill-subagents.js +178 -0
- package/server/etl/claude-code.js +826 -0
- package/server/etl/detect.js +341 -0
- package/server/etl/judge-filter.js +117 -0
- package/server/etl/opencode.js +606 -0
- package/server/execution/job.js +662 -0
- package/server/execution/prompt.js +227 -0
- package/server/execution/runner.js +218 -0
- package/server/index.js +94 -0
- package/server/llm/advice-prompt.js +339 -0
- package/server/llm/advice.js +384 -0
- package/server/llm/analysis-prompt.js +162 -0
- package/server/llm/cli-runner.js +249 -0
- package/server/llm/judge-prompts.js +179 -0
- package/server/llm/judge.js +118 -0
- package/server/llm/project-advice-prompt.js +332 -0
- package/server/llm/project-advice.js +491 -0
- package/server/llm/session-analyzer.js +122 -0
- package/server/utils/project.js +80 -0
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* E2 — AI Tool Coverage.
|
|
3
|
+
*
|
|
4
|
+
* Captures how well the AI environment picks, chains and recovers from
|
|
5
|
+
* tools.
|
|
6
|
+
* • tool_pick — fraction of tool calls NOT followed by a tool switch
|
|
7
|
+
* or user override
|
|
8
|
+
* • chain_eff — calls-per-user-intent ratio against a baseline of 1.0
|
|
9
|
+
* (lower is better; baselines are crude heuristics)
|
|
10
|
+
* • self_heal — after a tool error, fraction where the next same-tool
|
|
11
|
+
* call succeeds
|
|
12
|
+
*
|
|
13
|
+
* See spec §4.5.
|
|
14
|
+
*
|
|
15
|
+
* @author Felix
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
'use strict';
|
|
19
|
+
|
|
20
|
+
const { queryAll } = require('../../db/queries');
|
|
21
|
+
const { explainIndicator, rollupDimension, scoreToLevel, E2 } = require('../thresholds-v2');
|
|
22
|
+
const { makeEvidence } = require('../evidence-builder');
|
|
23
|
+
|
|
24
|
+
// Baseline calls-per-intent by difficulty. Hand-tuned: trivial tasks
// shouldn't need more than 2 tools, heavy ones legitimately use 8+.
const BASELINE_CALLS_PER_INTENT = { 1: 1.5, 2: 3, 3: 6, 4: 10 };

// A switch to a different tool within this window counts against the pick.
const TOOL_SWITCH_WINDOW_MS = 30_000;

/**
 * Count the tool calls that look like "good picks".
 *
 * A call is good when it did not error and is either the last call, is
 * followed by the same tool, or is followed by a different tool only after
 * more than TOOL_SWITCH_WINDOW_MS.
 *
 * @param {Array<{tool_name: *, status: *, timestamp: *}>} calls ordered by timestamp
 * @returns {number} number of good picks
 */
function countGoodPicks(calls) {
  let good = 0;
  calls.forEach((cur, i) => {
    // Error calls are never "good picks".
    if (cur.status === 'error') return;
    const next = calls[i + 1];
    if (!next) {
      good++;
      return;
    }
    // An unparseable timestamp yields NaN, which fails the `>` test below.
    const gapMs = new Date(next.timestamp) - new Date(cur.timestamp);
    if (next.tool_name === cur.tool_name || gapMs > TOOL_SWITCH_WINDOW_MS) good++;
  });
  return good;
}

/**
 * Tally post-error recovery in a single backward pass.
 *
 * For every error call, the nearest LATER call with the same tool_name decides
 * the outcome: it makes the error "eligible", and "healed" when that later call
 * did not error. Walking backwards while remembering the status of the most
 * recently seen call per tool gives exactly that nearest later call.
 *
 * @param {Array<{tool_name: *, status: *}>} calls ordered by timestamp
 * @returns {{errorCount: number, healEligible: number, healed: number}}
 */
function countSelfHeals(calls) {
  const nextStatusByTool = new Map();
  let errorCount = 0;
  let healEligible = 0;
  let healed = 0;
  for (let i = calls.length - 1; i >= 0; i--) {
    const { tool_name: toolName, status } = calls[i];
    if (status === 'error') {
      errorCount++;
      if (nextStatusByTool.has(toolName)) {
        healEligible++;
        if (nextStatusByTool.get(toolName) !== 'error') healed++;
      }
    }
    nextStatusByTool.set(toolName, status);
  }
  return { errorCount, healEligible, healed };
}

/**
 * E2 — AI Tool Coverage for one session.
 *
 * Reads the session's tool calls and user messages, derives three raw
 * indicators (tool_pick, chain_eff, self_heal), and passes them through
 * explainIndicator / makeEvidence / rollupDimension / scoreToLevel.
 *
 * @param {*} db database handle forwarded to queryAll
 * @param {{id: *}} session session row; only `id` is read here
 * @param {number} [difficulty=2] key into BASELINE_CALLS_PER_INTENT (falls back to 3 calls/intent)
 * @returns {{subScores: object, subLevels: object, subEvidence: object, raw: object, score: *, level: *}}
 */
function analyze(db, session, difficulty = 2) {
  const calls = queryAll(
    db,
    `SELECT tool_name, status, error_message, timestamp
       FROM unified_tool_call
      WHERE session_id = ?
      ORDER BY timestamp ASC`,
    [session.id]
  );

  const userMsgs = queryAll(
    db,
    `SELECT timestamp
       FROM unified_message
      WHERE session_id = ? AND role = 'user'
      ORDER BY timestamp ASC`,
    [session.id]
  );

  // ---- tool_pick: fraction of calls not immediately switched away from ----
  const goodPicks = countGoodPicks(calls);
  const tool_pick = calls.length > 0 ? goodPicks / calls.length : null;

  // ---- chain_eff: calls per user intent, relative to the difficulty baseline ----
  // Every user message is treated as one intent.
  const baseline = BASELINE_CALLS_PER_INTENT[difficulty] ?? 3;
  const callsPerIntent = calls.length > 0 && userMsgs.length > 0
    ? calls.length / userMsgs.length
    : null;
  const chain_eff = callsPerIntent != null ? callsPerIntent / baseline : null; // 1.0 = matches baseline

  // ---- self_heal: post-error recovery ----
  const { errorCount, healEligible, healed } = countSelfHeals(calls);
  const self_heal = healEligible > 0 ? healed / healEligible : null;

  const tpE = explainIndicator(E2.tool_pick, tool_pick, difficulty);
  const ceE = explainIndicator(E2.chain_eff, chain_eff, difficulty);
  const shE = explainIndicator(E2.self_heal, self_heal, difficulty);

  const subScores = {
    tool_pick: tpE.score,
    chain_eff: ceE.score,
    self_heal: shE.score,
  };
  const subLevels = {
    tool_pick: tpE.level,
    chain_eff: ceE.level,
    self_heal: shE.level,
  };

  // Distinguish "no errors at all" from "errors that were never retried":
  // both leave healEligible at 0, but only the first means nothing went wrong.
  let selfHealWhat;
  if (healEligible > 0) {
    selfHealWhat = `每次工具错误后,下一次同名工具调用成功的比例:${healed} / ${healEligible}。比例越高代表 AI 越能自我恢复。`;
  } else if (errorCount > 0) {
    selfHealWhat = `本会话有 ${errorCount} 次工具报错,但之后没有再次调用同名工具,无法评估自愈。`;
  } else {
    selfHealWhat = '本会话无工具报错,无需自愈。';
  }

  const subEvidence = {
    tool_pick: makeEvidence({
      key: 'tool_pick', label: '工具选择准确率',
      what: calls.length > 0
        ? `没有立即被切换或报错的工具调用占比:${calls.length} 次工具调用中有 ${goodPicks} 次属于"选对了"。占比越高越好。`
        : '本会话无工具调用。',
      expl: tpE,
      unit: '%',
      difficulty,
    }),
    chain_eff: makeEvidence({
      key: 'chain_eff', label: '工具链编排效率',
      what: callsPerIntent != null
        ? `平均每个用户意图所用工具数 / 该难度的基线:${calls.length} 次工具 / ${userMsgs.length} 条用户消息 = ${callsPerIntent.toFixed(2)} 次/意图;难度 ${difficulty} 基线 ${baseline} 次/意图。比值越接近 1 越好。`
        : '工具调用数或用户消息数为 0,无法评估。',
      expl: ceE,
      unit: 'x',
      difficulty,
    }),
    self_heal: makeEvidence({
      key: 'self_heal', label: '错误自愈率',
      what: selfHealWhat,
      expl: shE,
      unit: '%',
      difficulty,
    }),
  };

  const score = rollupDimension('E2', subScores);
  const level = scoreToLevel(score);

  return {
    subScores,
    subLevels,
    subEvidence,
    raw: { tool_pick, chain_eff, self_heal, callCount: calls.length, intents: userMsgs.length, healEligible, healed, baseline, callsPerIntent },
    score,
    level,
  };
}
|
|
160
|
+
|
|
161
|
+
module.exports = { analyze };
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* H2 — Judgement & Decision.
|
|
3
|
+
*
|
|
4
|
+
* Captures whether the human pushes back when the AI is wrong instead
|
|
5
|
+
* of rubber-stamping output.
|
|
6
|
+
* • challenge — challenge / questioning rate
|
|
7
|
+
* • override — override-the-AI rate (band metric)
|
|
8
|
+
* • accept_rate — passive-acceptance rate (ideal band 60-85%)
|
|
9
|
+
*
|
|
10
|
+
* See spec §4.2.
|
|
11
|
+
*
|
|
12
|
+
* @author Felix
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
'use strict';
|
|
16
|
+
|
|
17
|
+
const {
|
|
18
|
+
fetchMessages,
|
|
19
|
+
userMessages,
|
|
20
|
+
matchesAny,
|
|
21
|
+
CHALLENGE_PATTERNS,
|
|
22
|
+
OVERRIDE_PATTERNS,
|
|
23
|
+
ACCEPT_PATTERNS,
|
|
24
|
+
} = require('../text-signals');
|
|
25
|
+
const { explainIndicator, rollupDimension, scoreToLevel, H2 } = require('../thresholds-v2');
|
|
26
|
+
const { makeEvidence } = require('../evidence-builder');
|
|
27
|
+
const { mergeIndicator, dimensionSource } = require('./llm-merge');
|
|
28
|
+
|
|
29
|
+
/**
 * H2 — Judgement & Decision for one session.
 *
 * Derives three rule-based rates from the user's messages (challenge,
 * override, accept_rate), lets mergeIndicator combine each with the
 * matching entry of `llmCell` when one is supplied, and rolls the result up
 * into a dimension score.
 *
 * @param {*} db database handle forwarded to fetchMessages
 * @param {{id: *}} session session row; only `id` is read here
 * @param {number} [difficulty=2] forwarded to explainIndicator / makeEvidence
 * @param {?object} [llmCell=null] per-indicator LLM cells keyed by indicator name
 * @returns {{subScores: object, subLevels: object, subEvidence: object, raw: object, score: *, level: *, judgeSource: *}}
 */
function analyze(db, session, difficulty = 2, llmCell = null) {
  const transcript = fetchMessages(db, session.id);
  const users = userMessages(transcript);
  const haveText = users.some((u) => u.text && u.text.length > 0);

  let challenge = null;
  let override = null;
  let accept = null;
  let challengeHits = 0;
  let overrideHits = 0;
  let overrideEligible = 0;
  let acceptHits = 0;

  if (users.length > 0 && haveText) {
    // Challenge rate: share of user messages that hit a challenge pattern.
    for (const u of users) {
      if (matchesAny(u.text, CHALLENGE_PATTERNS)) challengeHits += 1;
    }
    challenge = challengeHits / users.length;

    // Override rate: only user messages that directly follow an assistant
    // reply are eligible; a hit is one that matches an override pattern.
    transcript.forEach((msg, idx) => {
      if (msg.role !== 'user') return;
      const previous = transcript[idx - 1];
      if (!previous || previous.role !== 'assistant') return;
      overrideEligible += 1;
      if (msg.text && matchesAny(msg.text, OVERRIDE_PATTERNS)) overrideHits += 1;
    });
    if (overrideEligible > 0) override = overrideHits / overrideEligible;

    // Accept rate: short (<= 30 chars after trimming) pure-affirmation messages.
    for (const u of users) {
      const trimmed = (u.text || '').trim();
      const isShort = trimmed.length > 0 && trimmed.length <= 30;
      if (isShort && matchesAny(trimmed, ACCEPT_PATTERNS)) acceptHits += 1;
    }
    accept = acceptHits / users.length;
  }

  const KEYS = ['challenge', 'override', 'accept_rate'];
  const measured = { challenge, override, accept_rate: accept };

  // Rule baseline for each indicator.
  const expl = {};
  for (const key of KEYS) {
    expl[key] = explainIndicator(H2[key], measured[key], difficulty);
  }

  // Merge the LLM cell (if any) over the rule baseline.
  const cell = llmCell || {};
  const merged = {};
  for (const key of KEYS) {
    merged[key] = mergeIndicator(cell[key], expl[key].score, expl[key].level);
  }

  const subScores = {};
  const subLevels = {};
  for (const key of KEYS) {
    subScores[key] = merged[key].score;
    subLevels[key] = merged[key].level;
  }

  // Evidence text: the LLM's rationale when present, otherwise the rule summary.
  const noText = '无文本,无法识别。';
  const ruleText = {
    challenge: haveText ? `规则版:质疑关键词命中 ${challengeHits}/${users.length}。` : noText,
    override: haveText ? `规则版:推翻关键词命中 ${overrideHits}/${overrideEligible}。` : noText,
    accept_rate: haveText ? `规则版(兜底):纯肯定短消息占比 ${acceptHits}/${users.length}。LLM 开启后改为判断"采纳前是否有判断"。` : noText,
  };
  const labels = { challenge: '合理质疑率', override: '推翻率', accept_rate: '顺从/采纳判断' };

  const subEvidence = {};
  for (const key of KEYS) {
    subEvidence[key] = makeEvidence({
      key,
      label: labels[key],
      what: merged[key].evidence || ruleText[key],
      expl: expl[key],
      unit: '%',
      difficulty,
    });
  }

  const score = rollupDimension('H2', subScores);
  const level = scoreToLevel(score);
  const judgeSource = dimensionSource(KEYS.map((key) => merged[key].source));

  return {
    subScores,
    subLevels,
    subEvidence,
    raw: { challenge, override, accept_rate: accept, difficulty, haveText, challengeHits, overrideHits, overrideEligible, acceptHits },
    score,
    level,
    judgeSource,
  };
}
|
|
106
|
+
|
|
107
|
+
module.exports = { analyze };
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
// server/analysis/dimensions/llm-merge.js
|
|
2
|
+
'use strict';
|
|
3
|
+
|
|
4
|
+
const { LEVEL_SCORE, scoreToLevel } = require('../thresholds-v2');
|
|
5
|
+
|
|
6
|
+
/** Minimum LLM self-reported confidence to trust over the rule fallback. */
const CONF_THRESHOLD = 0.5;

/**
 * Merge one LLM-judged indicator cell with the rule-derived score.
 *
 * The LLM result is used only when `cell.confidence` is a number that is at
 * least CONF_THRESHOLD. It may carry either a granular 0–100 `score`
 * (preferred; rounded to one decimal, level derived via scoreToLevel) or,
 * for back-compat with older cached payloads, a discrete integer `level`
 * (1–4) that is mapped to a score through LEVEL_SCORE. In every other case
 * the rule values are returned with `source: 'rules'`.
 *
 * `evidence` comes from model output, so it is normalised: anything that is
 * not a string with visible characters becomes null. Callers pick their own
 * rule-based explanation with `evidence || fallback`, and a blank string
 * must not suppress that fallback.
 *
 * @param {{score:?number, level:?number, confidence:?number, evidence:?string}|null|undefined} cell
 * @param {number|null} ruleScore centred score from explainIndicator (fallback)
 * @param {number|null} ruleLevel
 * @returns {{score:number|null, level:number|null, source:'llm'|'rules', evidence:string|null}}
 */
function mergeIndicator(cell, ruleScore, ruleLevel) {
  const trusted = Boolean(cell)
    && typeof cell.confidence === 'number'
    && cell.confidence >= CONF_THRESHOLD; // NaN fails this comparison

  if (trusted) {
    const evidence = typeof cell.evidence === 'string' && cell.evidence.trim() !== ''
      ? cell.evidence
      : null;

    // Preferred: granular 0–100 score (kept to one decimal).
    if (typeof cell.score === 'number' && Number.isFinite(cell.score)
        && cell.score >= 0 && cell.score <= 100) {
      const score = Math.round(cell.score * 10) / 10;
      return { score, level: scoreToLevel(score), source: 'llm', evidence };
    }

    // Back-compat: discrete level → centred score.
    if (Number.isInteger(cell.level) && cell.level >= 1 && cell.level <= 4) {
      return { score: LEVEL_SCORE[cell.level], level: cell.level, source: 'llm', evidence };
    }
  }

  return {
    score: ruleScore ?? null,
    level: ruleLevel ?? null,
    source: 'rules',
    evidence: null,
  };
}
|
|
42
|
+
|
|
43
|
+
/**
 * Aggregate per-indicator sources into a dimension-level source label.
 *
 * Only the labels 'llm' and 'rules' are counted. Null entries, unknown
 * labels and a missing/non-array argument contribute nothing, so they
 * can never be reported as 'rules'.
 *
 * @param {Array<'llm'|'rules'|null>} sources
 * @returns {'llm'|'rules'|'mixed'|null} 'mixed' when both labels occur, the
 *   single label when only one occurs, null when neither occurs
 */
function dimensionSource(sources) {
  if (!Array.isArray(sources)) return null;
  const hasLlm = sources.includes('llm');
  const hasRules = sources.includes('rules');
  if (hasLlm && hasRules) return 'mixed';
  if (hasLlm) return 'llm';
  return hasRules ? 'rules' : null;
}
|
|
56
|
+
|
|
57
|
+
module.exports = { mergeIndicator, dimensionSource, CONF_THRESHOLD };
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* O1 — AI Output Quality.
|
|
3
|
+
*
|
|
4
|
+
* Sub-indicators:
|
|
5
|
+
* • first_take — fraction of assistant turns NOT followed by an
|
|
6
|
+
* immediate user correction
|
|
7
|
+
* • code_style — LLM-judged (rule fallback returns 0.7 neutral)
|
|
8
|
+
* • completeness — assistant text mentions edge/error/test concepts
|
|
9
|
+
* (LLM judge is more accurate; rules give a proxy)
|
|
10
|
+
*
|
|
11
|
+
* See spec §4.6.
|
|
12
|
+
*
|
|
13
|
+
* @author Felix
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
'use strict';
|
|
17
|
+
|
|
18
|
+
const {
|
|
19
|
+
fetchMessages,
|
|
20
|
+
matchesAny,
|
|
21
|
+
} = require('../text-signals');
|
|
22
|
+
const { explainIndicator, rollupDimension, scoreToLevel, O1 } = require('../thresholds-v2');
|
|
23
|
+
const { mergeIndicator, dimensionSource } = require('./llm-merge');
|
|
24
|
+
const { makeEvidence } = require('../evidence-builder');
|
|
25
|
+
|
|
26
|
+
// User phrases that signal the preceding assistant turn needs rework.
const CORRECTION_PATTERNS = [
  /改一下/, /不对/, /(再|重新)(写|来|改|做)/,
  // "不错" (pretty good) and "没错" (that's right) are praise/agreement, not
  // corrections, so a bare "错" only counts when not preceded by 不 / 没.
  /((?<![不没])错|有问题|有bug)/, /漏了/,
  // NOTE(review): /换/ is very broad — it also matches 转换 / 替换 / 换行.
  // Left as-is; confirm whether a narrower form (e.g. 换成 / 换个) is wanted.
  /换/,
  /fix that/i, /that'?s wrong/i, /redo/i, /try again/i, /not quite/i,
];

// Assistant phrases used as a proxy for "the answer considered edge cases,
// error handling or tests".
// NOTE(review): /test/i is a substring match and also hits words such as
// "latest"; left unchanged because it equally covers "pytest" / "unittest".
const COMPLETENESS_PATTERNS = [
  /边界/, /异常/, /错误处理/, /测试/, /单元测试/, /回退/, /兜底/,
  /edge case/i, /error handling/i, /test/i, /fallback/i, /exception/i,
];
|
|
35
|
+
|
|
36
|
+
/**
 * Rule-only O1 (AI Output Quality) analysis for one session.
 *
 * Indicators:
 *  - first_take   — 1 minus the share of assistant turns whose very next
 *                   message is a user message matching CORRECTION_PATTERNS
 *  - code_style   — fixed neutral 0.7 whenever any message has text
 *  - completeness — share of assistant text messages matching
 *                   COMPLETENESS_PATTERNS
 *
 * @param {*} db database handle forwarded to fetchMessages
 * @param {{id: *}} session session row; only `id` is read here
 * @param {number} [difficulty=2] forwarded to explainIndicator / evidence
 * @returns {object} sub-scores, sub-levels, evidence, raw counters, score,
 *   level, plus `judgeSource: 'rules'` and `llmJudge: null`
 */
function analyzeRules(db, session, difficulty = 2) {
  const messages = fetchMessages(db, session.id);
  const haveText = messages.some((msg) => msg.text && msg.text.length > 0);

  // ---- first_take ----
  let assistantTurns = 0;
  let correctionsAfter = 0;
  messages.forEach((msg, idx) => {
    if (msg.role !== 'assistant') return;
    assistantTurns += 1;
    const reply = messages[idx + 1];
    const isUserText = Boolean(reply) && reply.role === 'user' && Boolean(reply.text);
    if (isUserText && matchesAny(reply.text, CORRECTION_PATTERNS)) correctionsAfter += 1;
  });
  const first_take = assistantTurns > 0 ? 1 - correctionsAfter / assistantTurns : null;

  // ---- code_style: rules cannot judge style, so report a neutral value ----
  const code_style = haveText ? 0.7 : null;

  // ---- completeness ----
  const textualReplies = haveText
    ? messages.filter((msg) => msg.role === 'assistant' && msg.text)
    : [];
  const assistantMsgsWithText = textualReplies.length;
  let completenessHits = 0;
  let completeness = null;
  if (assistantMsgsWithText > 0) {
    completenessHits = textualReplies
      .filter((msg) => matchesAny(msg.text, COMPLETENESS_PATTERNS))
      .length;
    completeness = completenessHits / assistantMsgsWithText;
  }

  const ftE = explainIndicator(O1.first_take, first_take, difficulty);
  const csE = explainIndicator(O1.code_style, code_style, difficulty);
  const cpE = explainIndicator(O1.completeness, completeness, difficulty);

  const subScores = { first_take: ftE.score, code_style: csE.score, completeness: cpE.score };
  const subLevels = { first_take: ftE.level, code_style: csE.level, completeness: cpE.level };

  const evidenceContext = {
    mode: 'rules',
    haveText,
    assistantTurns,
    correctionsAfter,
    assistantMsgsWithText,
    completenessHits,
  };
  const subEvidence = buildSubEvidence({ ft: ftE, cs: csE, cp: cpE }, evidenceContext, difficulty);

  const score = rollupDimension('O1', subScores);
  const level = scoreToLevel(score);

  return {
    subScores,
    subLevels,
    subEvidence,
    raw: { first_take, code_style, completeness, assistantTurns, correctionsAfter, completenessHits, assistantMsgsWithText },
    score,
    level,
    judgeSource: 'rules',
    llmJudge: null,
  };
}
|
|
108
|
+
|
|
109
|
+
/**
 * Public entry — synchronous. `llmCell` is llmJudge.O1 (or null).
 *
 * Without an LLM cell the rule result is returned untouched. With one, each
 * indicator is passed through mergeIndicator, and the rule evidence entry is
 * reused with its `what`, `level` and `score` replaced by the merged values.
 *
 * `raw` keeps the flat rule metrics (first_take, code_style, ...) at the top
 * level on both paths, so consumers can read the same keys whether or not an
 * LLM cell was supplied. `llmCell` and `ruleRaw` are added alongside.
 *
 * @param {*} db database handle forwarded to analyzeRules
 * @param {{id: *}} session session row forwarded to analyzeRules
 * @param {number} [difficulty=2]
 * @param {?object} [llmCell=null] per-indicator LLM cells keyed by indicator name
 * @returns {object} same shape as analyzeRules' result
 */
function analyze(db, session, difficulty = 2, llmCell = null) {
  const ruleResult = analyzeRules(db, session, difficulty);
  if (!llmCell) return ruleResult;

  const KEYS = ['first_take', 'code_style', 'completeness'];

  const merged = {};
  for (const key of KEYS) {
    merged[key] = mergeIndicator(llmCell[key], ruleResult.subScores[key], ruleResult.subLevels[key]);
  }

  const subScores = {};
  const subLevels = {};
  const subEvidence = {};
  for (const key of KEYS) {
    const ruleEvidence = ruleResult.subEvidence[key];
    subScores[key] = merged[key].score;
    subLevels[key] = merged[key].level;
    // Prefer the LLM's rationale; otherwise keep the rule explanation.
    subEvidence[key] = {
      ...ruleEvidence,
      what: merged[key].evidence || ruleEvidence.what,
      level: merged[key].level,
      score: merged[key].score,
    };
  }

  const score = rollupDimension('O1', subScores);
  const level = scoreToLevel(score);
  const judgeSource = dimensionSource(KEYS.map((key) => merged[key].source));

  return {
    subScores,
    subLevels,
    subEvidence,
    raw: { ...ruleResult.raw, llmCell, ruleRaw: ruleResult.raw },
    score,
    level,
    judgeSource,
    llmJudge: null,
  };
}
|
|
136
|
+
|
|
137
|
+
/**
 * Build the rule-mode evidence entries for the three O1 indicators.
 *
 * @param {{ft: *, cs: *, cp: *}} expls explainIndicator results for
 *   first_take, code_style and completeness
 * @param {object} ctx counters gathered by the rule analysis; reads haveText,
 *   assistantTurns, correctionsAfter, assistantMsgsWithText, completenessHits
 * @param {number} difficulty forwarded to makeEvidence
 * @returns {{first_take: *, code_style: *, completeness: *}} makeEvidence results
 */
function buildSubEvidence(expls, ctx, difficulty) {
  const firstTakeText = ctx.assistantTurns > 0
    ? `规则版:1 - (AI 回合后用户立即纠错的比例) = 1 - ${ctx.correctionsAfter} / ${ctx.assistantTurns} 个 AI 回合。`
    : '规则版:无 AI 回合可评估。';

  const codeStyleText = ctx.haveText
    ? '规则版无法评估代码规范,默认给中性 0.7 分。开启 LLM judge 可获得真实评估。'
    : '消息文本缺失,无法评估。';

  const completenessText = ctx.assistantMsgsWithText > 0
    ? `规则版:AI 消息中含完备性关键词("边界"、"异常"、"测试"等)的比例:${ctx.completenessHits} / ${ctx.assistantMsgsWithText}。`
    : '规则版:无 AI 文本,无法评估。';

  const first_take = makeEvidence({
    key: 'first_take',
    label: '一次采纳率',
    what: firstTakeText,
    expl: expls.ft,
    unit: '%',
    difficulty,
  });
  const code_style = makeEvidence({
    key: 'code_style',
    label: '代码规范性',
    what: codeStyleText,
    expl: expls.cs,
    unit: '%',
    difficulty,
  });
  const completeness = makeEvidence({
    key: 'completeness',
    label: '方案完备性',
    what: completenessText,
    expl: expls.cp,
    unit: '%',
    difficulty,
  });

  return { first_take, code_style, completeness };
}
|
|
166
|
+
|
|
167
|
+
module.exports = { analyze, analyzeRules };
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* H1 — Problem Definition.
|
|
3
|
+
*
|
|
4
|
+
* Captures the human ability to turn a vague need into a precise,
|
|
5
|
+
* AI-executable problem. Three sub-indicators:
|
|
6
|
+
* • clarity — AI proactive-question count in the first 30%
|
|
7
|
+
* • converge — number of user-message rounds to convergence
|
|
8
|
+
* • drift — direction-change events
|
|
9
|
+
*
|
|
10
|
+
* See docs/superpowers/specs/2026-06-13-capability-model-v2.md §4.1.
|
|
11
|
+
*
|
|
12
|
+
* @author Felix
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
'use strict';
|
|
16
|
+
|
|
17
|
+
const { queryAll } = require('../../db/queries');
|
|
18
|
+
const {
|
|
19
|
+
fetchMessages,
|
|
20
|
+
userMessages,
|
|
21
|
+
matchesAny,
|
|
22
|
+
DRIFT_PATTERNS,
|
|
23
|
+
} = require('../text-signals');
|
|
24
|
+
const { explainIndicator, rollupDimension, scoreToLevel, H1 } = require('../thresholds-v2');
|
|
25
|
+
const { makeEvidence } = require('../evidence-builder');
|
|
26
|
+
const { mergeIndicator, dimensionSource } = require('./llm-merge');
|
|
27
|
+
|
|
28
|
+
/**
 * H1 — Problem Definition for one session.
 *
 * Rule indicators:
 *  - clarity  — number of `question` tool calls within the first 30% of the
 *               session's tool calls (at least one call is inspected)
 *  - converge — number of user messages
 *  - drift    — number of user messages matching DRIFT_PATTERNS, or null
 *               when no user message carries text
 *
 * Each is combined with the matching `llmCell` entry by mergeIndicator.
 *
 * @param {*} db database handle forwarded to fetchMessages / queryAll
 * @param {{id: *}} session session row; only `id` is read here
 * @param {number} [difficulty=2] forwarded to explainIndicator / makeEvidence
 * @param {?object} [llmCell=null] per-indicator LLM cells keyed by indicator name
 * @returns {{subScores: object, subLevels: object, subEvidence: object, raw: object, score: *, level: *, judgeSource: *}}
 */
function analyze(db, session, difficulty = 2, llmCell = null) {
  const transcript = fetchMessages(db, session.id);
  const users = userMessages(transcript);

  // ---- clarity ----
  const toolCalls = queryAll(
    db,
    `SELECT tool_name, timestamp
       FROM unified_tool_call
      WHERE session_id = ?
      ORDER BY timestamp ASC`,
    [session.id]
  );
  const clarityCutoff = toolCalls.length > 0
    ? Math.max(1, Math.floor(toolCalls.length * 0.3))
    : 0;
  let clarity = 0;
  for (const call of toolCalls.slice(0, clarityCutoff)) {
    if (call.tool_name === 'question') clarity += 1;
  }

  // ---- converge ----
  const converge = users.length;

  // ---- drift ----
  const haveText = users.some((u) => u.text && u.text.length > 0);
  const drift = haveText
    ? users.filter((u) => matchesAny(u.text, DRIFT_PATTERNS)).length
    : null;

  const KEYS = ['clarity', 'converge', 'drift'];
  const measured = { clarity, converge, drift };

  // Rule baseline per indicator.
  const expl = {};
  for (const key of KEYS) {
    expl[key] = explainIndicator(H1[key], measured[key], difficulty);
  }

  // Merge the LLM cell (if any) over the rule baseline.
  const cell = llmCell || {};
  const merged = {};
  for (const key of KEYS) {
    merged[key] = mergeIndicator(cell[key], expl[key].score, expl[key].level);
  }

  const subScores = {};
  const subLevels = {};
  for (const key of KEYS) {
    subScores[key] = merged[key].score;
    subLevels[key] = merged[key].level;
  }

  // Evidence: the LLM's rationale when present, otherwise the rule summary.
  const presentation = {
    clarity: {
      label: '初始指令清晰度',
      unit: '次',
      ruleText: `规则版:前 30% 工具调用中 question 次数 ${clarity}(共 ${toolCalls.length} 次调用)。`,
    },
    converge: {
      label: '任务收敛轮次',
      unit: '轮',
      ruleText: `规则版:用户消息 ${converge} 条。`,
    },
    drift: {
      label: '方向变更次数',
      unit: '次',
      ruleText: haveText ? `规则版:方向变更关键词命中 ${drift} 条。` : '用户消息无文本,无法识别。',
    },
  };

  const subEvidence = {};
  for (const key of KEYS) {
    const { label, unit, ruleText } = presentation[key];
    subEvidence[key] = makeEvidence({
      key,
      label,
      what: merged[key].evidence || ruleText,
      expl: expl[key],
      unit,
      difficulty,
    });
  }

  const score = rollupDimension('H1', subScores);
  const level = scoreToLevel(score);
  const judgeSource = dimensionSource(KEYS.map((key) => merged[key].source));

  return {
    subScores,
    subLevels,
    subEvidence,
    raw: { clarity, converge, drift, difficulty, haveText, toolCallCount: toolCalls.length, clarityCutoff },
    score,
    level,
    judgeSource,
  };
}
|
|
103
|
+
|
|
104
|
+
module.exports = { analyze };
|