agentboss 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/client/dist/assets/{index-CsVml4AS.js → index-DxoLOxZ8.js} +53 -49
- package/client/dist/index.html +1 -1
- package/package.json +1 -1
- package/server/analysis/dimensions/judgement.js +111 -107
- package/server/analysis/dimensions/llm-merge.js +59 -57
- package/server/analysis/dimensions/output-quality.js +167 -167
- package/server/analysis/dimensions/problem-definition.js +109 -104
- package/server/analysis/job.js +37 -6
- package/server/analysis/scoring-v2.js +12 -8
- package/server/api/execution.js +94 -0
- package/server/db/schema.js +5 -2
- package/server/etl/opencode.js +5 -1
- package/server/execution/job.js +141 -2
- package/server/llm/advice-prompt.js +74 -11
- package/server/llm/advice.js +50 -1
- package/server/llm/mcp-classify.js +147 -0
- package/server/llm/project-advice-prompt.js +106 -6
- package/server/llm/project-advice.js +55 -2
package/client/dist/index.html
CHANGED
|
@@ -25,7 +25,7 @@
|
|
|
25
25
|
} catch (e) {}
|
|
26
26
|
})();
|
|
27
27
|
</script>
|
|
28
|
-
<script type="module" crossorigin src="/assets/index-CsVml4AS.js"></script>
|
|
28
|
+
<script type="module" crossorigin src="/assets/index-DxoLOxZ8.js"></script>
|
|
29
29
|
<link rel="stylesheet" crossorigin href="/assets/index-C1wFD_Vo.css">
|
|
30
30
|
</head>
|
|
31
31
|
<body>
|
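package/package.json
CHANGED
[The +1 -1 hunk for package/package.json is missing from this capture; the hunk below belongs to judgement.js.]
package/server/analysis/dimensions/judgement.js
CHANGED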
|
@@ -1,107 +1,111 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* H2 — Judgement & Decision.
|
|
3
|
-
*
|
|
4
|
-
* Captures whether the human pushes back when the AI is wrong instead
|
|
5
|
-
* of rubber-stamping output.
|
|
6
|
-
* • challenge — challenge / questioning rate
|
|
7
|
-
* • override — override-the-AI rate (band metric)
|
|
8
|
-
* • accept_rate — passive-acceptance rate (ideal band 60-85%)
|
|
9
|
-
*
|
|
10
|
-
* See spec §4.2.
|
|
11
|
-
*
|
|
12
|
-
* @author Felix
|
|
13
|
-
*/
|
|
14
|
-
|
|
15
|
-
'use strict';
|
|
16
|
-
|
|
17
|
-
const {
|
|
18
|
-
fetchMessages,
|
|
19
|
-
userMessages,
|
|
20
|
-
matchesAny,
|
|
21
|
-
CHALLENGE_PATTERNS,
|
|
22
|
-
OVERRIDE_PATTERNS,
|
|
23
|
-
ACCEPT_PATTERNS,
|
|
24
|
-
} = require('../text-signals');
|
|
25
|
-
const { explainIndicator, rollupDimension, scoreToLevel, H2 } = require('../thresholds-v2');
|
|
26
|
-
const { makeEvidence } = require('../evidence-builder');
|
|
27
|
-
const { mergeIndicator, dimensionSource } = require('./llm-merge');
|
|
28
|
-
|
|
29
|
-
function analyze(db, session, difficulty = 2, llmCell = null) {
|
|
30
|
-
const messages = fetchMessages(db, session.id);
|
|
31
|
-
const users = userMessages(messages);
|
|
32
|
-
const haveText = users.some((m) => m.text && m.text.length > 0);
|
|
33
|
-
|
|
34
|
-
let challenge = null;
|
|
35
|
-
let override = null;
|
|
36
|
-
let accept = null;
|
|
37
|
-
let challengeHits = 0;
|
|
38
|
-
let overrideHits = 0;
|
|
39
|
-
let overrideEligible = 0;
|
|
40
|
-
let acceptHits = 0;
|
|
41
|
-
|
|
42
|
-
if (haveText && users.length > 0) {
|
|
43
|
-
challengeHits = users.filter((m) => matchesAny(m.text, CHALLENGE_PATTERNS)).length;
|
|
44
|
-
challenge = challengeHits / users.length;
|
|
45
|
-
|
|
46
|
-
// Override rate: user messages directly after an assistant reply
|
|
47
|
-
// that explicitly override its choice.
|
|
48
|
-
for (let i = 0; i < messages.length; i++) {
|
|
49
|
-
const m = messages[i];
|
|
50
|
-
if (m.role !== 'user') continue;
|
|
51
|
-
const prev = i > 0 ? messages[i - 1] : null;
|
|
52
|
-
if (!prev || prev.role !== 'assistant') continue;
|
|
53
|
-
overrideEligible++;
|
|
54
|
-
if (m.text && matchesAny(m.text, OVERRIDE_PATTERNS)) overrideHits++;
|
|
55
|
-
}
|
|
56
|
-
override = overrideEligible > 0 ? overrideHits / overrideEligible : null;
|
|
57
|
-
|
|
58
|
-
// Accept rate: short pure-affirmation messages.
|
|
59
|
-
acceptHits = users.filter((m) => {
|
|
60
|
-
const t = (m.text || '').trim();
|
|
61
|
-
if (!t || t.length > 30) return false;
|
|
62
|
-
return matchesAny(t, ACCEPT_PATTERNS);
|
|
63
|
-
}).length;
|
|
64
|
-
accept = acceptHits / users.length;
|
|
65
|
-
}
|
|
66
|
-
|
|
67
|
-
const challengeE = explainIndicator(H2.challenge, challenge, difficulty);
|
|
68
|
-
const overrideE = explainIndicator(H2.override, override, difficulty);
|
|
69
|
-
const acceptE = explainIndicator(H2.accept_rate, accept, difficulty);
|
|
70
|
-
|
|
71
|
-
const rule = {
|
|
72
|
-
challenge: { score: challengeE.score, level: challengeE.level },
|
|
73
|
-
override: { score: overrideE.score, level: overrideE.level },
|
|
74
|
-
accept_rate: { score: acceptE.score, level: acceptE.level },
|
|
75
|
-
};
|
|
76
|
-
const cell = llmCell || {};
|
|
77
|
-
const m = {
|
|
78
|
-
challenge: mergeIndicator(cell.challenge, rule.challenge.score, rule.challenge.level),
|
|
79
|
-
override: mergeIndicator(cell.override, rule.override.score, rule.override.level),
|
|
80
|
-
accept_rate: mergeIndicator(cell.accept_rate, rule.accept_rate.score, rule.accept_rate.level),
|
|
81
|
-
};
|
|
82
|
-
|
|
83
|
-
const subScores = { challenge: m.challenge.score, override: m.override.score, accept_rate: m.accept_rate.score };
|
|
84
|
-
const subLevels = { challenge: m.challenge.level, override: m.override.level, accept_rate: m.accept_rate.level };
|
|
85
|
-
|
|
86
|
-
const subEvidence = {
|
|
87
|
-
challenge: makeEvidence({ key: 'challenge', label: '合理质疑率', what: m.challenge.evidence || (haveText ? `规则版:质疑关键词命中 ${challengeHits}/${users.length}。` : '无文本,无法识别。'), expl: challengeE, unit: '%', difficulty }),
|
|
88
|
-
override: makeEvidence({ key: 'override', label: '推翻率', what: m.override.evidence || (haveText ? `规则版:推翻关键词命中 ${overrideHits}/${overrideEligible}。` : '无文本,无法识别。'), expl: overrideE, unit: '%', difficulty }),
|
|
89
|
-
accept_rate: makeEvidence({ key: 'accept_rate', label: '顺从/采纳判断', what: m.accept_rate.evidence || (haveText ? `规则版(兜底):纯肯定短消息占比 ${acceptHits}/${users.length}。LLM 开启后改为判断"采纳前是否有判断"。` : '无文本,无法识别。'), expl: acceptE, unit: '%', difficulty }),
|
|
90
|
-
};
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
1
|
+
/**
|
|
2
|
+
* H2 — Judgement & Decision.
|
|
3
|
+
*
|
|
4
|
+
* Captures whether the human pushes back when the AI is wrong instead
|
|
5
|
+
* of rubber-stamping output.
|
|
6
|
+
* • challenge — challenge / questioning rate
|
|
7
|
+
* • override — override-the-AI rate (band metric)
|
|
8
|
+
* • accept_rate — passive-acceptance rate (ideal band 60-85%)
|
|
9
|
+
*
|
|
10
|
+
* See spec §4.2.
|
|
11
|
+
*
|
|
12
|
+
* @author Felix
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
'use strict';
|
|
16
|
+
|
|
17
|
+
const {
|
|
18
|
+
fetchMessages,
|
|
19
|
+
userMessages,
|
|
20
|
+
matchesAny,
|
|
21
|
+
CHALLENGE_PATTERNS,
|
|
22
|
+
OVERRIDE_PATTERNS,
|
|
23
|
+
ACCEPT_PATTERNS,
|
|
24
|
+
} = require('../text-signals');
|
|
25
|
+
const { explainIndicator, rollupDimension, scoreToLevel, H2 } = require('../thresholds-v2');
|
|
26
|
+
const { makeEvidence } = require('../evidence-builder');
|
|
27
|
+
const { mergeIndicator, dimensionSource } = require('./llm-merge');
|
|
28
|
+
|
|
29
|
+
function analyze(db, session, difficulty = 2, llmCell = null) {
|
|
30
|
+
const messages = fetchMessages(db, session.id);
|
|
31
|
+
const users = userMessages(messages);
|
|
32
|
+
const haveText = users.some((m) => m.text && m.text.length > 0);
|
|
33
|
+
|
|
34
|
+
let challenge = null;
|
|
35
|
+
let override = null;
|
|
36
|
+
let accept = null;
|
|
37
|
+
let challengeHits = 0;
|
|
38
|
+
let overrideHits = 0;
|
|
39
|
+
let overrideEligible = 0;
|
|
40
|
+
let acceptHits = 0;
|
|
41
|
+
|
|
42
|
+
if (haveText && users.length > 0) {
|
|
43
|
+
challengeHits = users.filter((m) => matchesAny(m.text, CHALLENGE_PATTERNS)).length;
|
|
44
|
+
challenge = challengeHits / users.length;
|
|
45
|
+
|
|
46
|
+
// Override rate: user messages directly after an assistant reply
|
|
47
|
+
// that explicitly override its choice.
|
|
48
|
+
for (let i = 0; i < messages.length; i++) {
|
|
49
|
+
const m = messages[i];
|
|
50
|
+
if (m.role !== 'user') continue;
|
|
51
|
+
const prev = i > 0 ? messages[i - 1] : null;
|
|
52
|
+
if (!prev || prev.role !== 'assistant') continue;
|
|
53
|
+
overrideEligible++;
|
|
54
|
+
if (m.text && matchesAny(m.text, OVERRIDE_PATTERNS)) overrideHits++;
|
|
55
|
+
}
|
|
56
|
+
override = overrideEligible > 0 ? overrideHits / overrideEligible : null;
|
|
57
|
+
|
|
58
|
+
// Accept rate: short pure-affirmation messages.
|
|
59
|
+
acceptHits = users.filter((m) => {
|
|
60
|
+
const t = (m.text || '').trim();
|
|
61
|
+
if (!t || t.length > 30) return false;
|
|
62
|
+
return matchesAny(t, ACCEPT_PATTERNS);
|
|
63
|
+
}).length;
|
|
64
|
+
accept = acceptHits / users.length;
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
const challengeE = explainIndicator(H2.challenge, challenge, difficulty);
|
|
68
|
+
const overrideE = explainIndicator(H2.override, override, difficulty);
|
|
69
|
+
const acceptE = explainIndicator(H2.accept_rate, accept, difficulty);
|
|
70
|
+
|
|
71
|
+
const rule = {
|
|
72
|
+
challenge: { score: challengeE.score, level: challengeE.level },
|
|
73
|
+
override: { score: overrideE.score, level: overrideE.level },
|
|
74
|
+
accept_rate: { score: acceptE.score, level: acceptE.level },
|
|
75
|
+
};
|
|
76
|
+
const cell = llmCell || {};
|
|
77
|
+
const m = {
|
|
78
|
+
challenge: mergeIndicator(cell.challenge, rule.challenge.score, rule.challenge.level),
|
|
79
|
+
override: mergeIndicator(cell.override, rule.override.score, rule.override.level),
|
|
80
|
+
accept_rate: mergeIndicator(cell.accept_rate, rule.accept_rate.score, rule.accept_rate.level),
|
|
81
|
+
};
|
|
82
|
+
|
|
83
|
+
const subScores = { challenge: m.challenge.score, override: m.override.score, accept_rate: m.accept_rate.score };
|
|
84
|
+
const subLevels = { challenge: m.challenge.level, override: m.override.level, accept_rate: m.accept_rate.level };
|
|
85
|
+
|
|
86
|
+
const subEvidence = {
|
|
87
|
+
challenge: makeEvidence({ key: 'challenge', label: '合理质疑率', what: m.challenge.evidence || (haveText ? `规则版:质疑关键词命中 ${challengeHits}/${users.length}。` : '无文本,无法识别。'), expl: challengeE, unit: '%', difficulty }),
|
|
88
|
+
override: makeEvidence({ key: 'override', label: '推翻率', what: m.override.evidence || (haveText ? `规则版:推翻关键词命中 ${overrideHits}/${overrideEligible}。` : '无文本,无法识别。'), expl: overrideE, unit: '%', difficulty }),
|
|
89
|
+
accept_rate: makeEvidence({ key: 'accept_rate', label: '顺从/采纳判断', what: m.accept_rate.evidence || (haveText ? `规则版(兜底):纯肯定短消息占比 ${acceptHits}/${users.length}。LLM 开启后改为判断"采纳前是否有判断"。` : '无文本,无法识别。'), expl: acceptE, unit: '%', difficulty }),
|
|
90
|
+
};
|
|
91
|
+
|
|
92
|
+
subEvidence.challenge.source = m.challenge.source;
|
|
93
|
+
subEvidence.override.source = m.override.source;
|
|
94
|
+
subEvidence.accept_rate.source = m.accept_rate.source;
|
|
95
|
+
|
|
96
|
+
const score = rollupDimension('H2', subScores);
|
|
97
|
+
const level = scoreToLevel(score);
|
|
98
|
+
const judgeSource = dimensionSource([m.challenge.source, m.override.source, m.accept_rate.source]);
|
|
99
|
+
|
|
100
|
+
return {
|
|
101
|
+
subScores,
|
|
102
|
+
subLevels,
|
|
103
|
+
subEvidence,
|
|
104
|
+
raw: { challenge, override, accept_rate: accept, difficulty, haveText, challengeHits, overrideHits, overrideEligible, acceptHits },
|
|
105
|
+
score,
|
|
106
|
+
level,
|
|
107
|
+
judgeSource,
|
|
108
|
+
};
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
module.exports = { analyze };
|
|
package/server/analysis/dimensions/llm-merge.js
CHANGED
|
@@ -1,57 +1,59 @@
|
|
|
1
|
-
// server/analysis/dimensions/llm-merge.js
|
|
2
|
-
'use strict';
|
|
3
|
-
|
|
4
|
-
const { LEVEL_SCORE, scoreToLevel } = require('../thresholds-v2');
|
|
5
|
-
|
|
6
|
-
/** Minimum LLM self-reported confidence to trust over the rule fallback. */
|
|
7
|
-
const CONF_THRESHOLD = 0.5;
|
|
8
|
-
|
|
9
|
-
/**
|
|
10
|
-
* Merge one LLM-judged indicator cell with the rule-derived score.
|
|
11
|
-
*
|
|
12
|
-
* The LLM may report either a granular 0–100 `score` (preferred — gives
|
|
13
|
-
* non-blocky sub-scores) or, for back-compat with older cached payloads, a
|
|
14
|
-
* discrete `level` (1–4). With a score we derive the level band via
|
|
15
|
-
* scoreToLevel; with only a level we fall back to the level's centred score.
|
|
16
|
-
*
|
|
17
|
-
* @param {{score:?number, level:?number, confidence:?number, evidence:?string}|null|undefined} cell
|
|
18
|
-
* @param {number|null} ruleScore centred score from explainIndicator (fallback)
|
|
19
|
-
* @param {number|null} ruleLevel
|
|
20
|
-
* @returns {{score:number|null, level:number|null, source:'llm'|'rules', evidence:string|null}}
|
|
21
|
-
*/
|
|
22
|
-
function mergeIndicator(cell, ruleScore, ruleLevel) {
|
|
23
|
-
if (cell && typeof cell.confidence === 'number' && cell.confidence >= CONF_THRESHOLD) {
|
|
24
|
-
// Preferred: granular 0–100 score (kept to one decimal).
|
|
25
|
-
if (typeof cell.score === 'number' && Number.isFinite(cell.score)
|
|
26
|
-
&& cell.score >= 0 && cell.score <= 100) {
|
|
27
|
-
const score = Math.round(cell.score * 10) / 10;
|
|
28
|
-
return { score, level: scoreToLevel(score), source: 'llm', evidence: cell.evidence || null };
|
|
29
|
-
}
|
|
30
|
-
// Back-compat: discrete level → centred score.
|
|
31
|
-
if (Number.isInteger(cell.level) && cell.level >= 1 && cell.level <= 4) {
|
|
32
|
-
return { score: LEVEL_SCORE[cell.level], level: cell.level, source: 'llm', evidence: cell.evidence || null };
|
|
33
|
-
}
|
|
34
|
-
}
|
|
35
|
-
return {
|
|
36
|
-
score: ruleScore ?? null,
|
|
37
|
-
level: ruleLevel ?? null,
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
*
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
const
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
1
|
+
// server/analysis/dimensions/llm-merge.js
|
|
2
|
+
'use strict';
|
|
3
|
+
|
|
4
|
+
const { LEVEL_SCORE, scoreToLevel } = require('../thresholds-v2');
|
|
5
|
+
|
|
6
|
+
/** Minimum LLM self-reported confidence to trust over the rule fallback. */
|
|
7
|
+
const CONF_THRESHOLD = 0.5;
|
|
8
|
+
|
|
9
|
+
/**
|
|
10
|
+
* Merge one LLM-judged indicator cell with the rule-derived score.
|
|
11
|
+
*
|
|
12
|
+
* The LLM may report either a granular 0–100 `score` (preferred — gives
|
|
13
|
+
* non-blocky sub-scores) or, for back-compat with older cached payloads, a
|
|
14
|
+
* discrete `level` (1–4). With a score we derive the level band via
|
|
15
|
+
* scoreToLevel; with only a level we fall back to the level's centred score.
|
|
16
|
+
*
|
|
17
|
+
* @param {{score:?number, level:?number, confidence:?number, evidence:?string}|null|undefined} cell
|
|
18
|
+
* @param {number|null} ruleScore centred score from explainIndicator (fallback)
|
|
19
|
+
* @param {number|null} ruleLevel
|
|
20
|
+
* @returns {{score:number|null, level:number|null, source:'llm'|'rules', evidence:string|null}}
|
|
21
|
+
*/
|
|
22
|
+
function mergeIndicator(cell, ruleScore, ruleLevel) {
|
|
23
|
+
if (cell && typeof cell.confidence === 'number' && cell.confidence >= CONF_THRESHOLD) {
|
|
24
|
+
// Preferred: granular 0–100 score (kept to one decimal).
|
|
25
|
+
if (typeof cell.score === 'number' && Number.isFinite(cell.score)
|
|
26
|
+
&& cell.score >= 0 && cell.score <= 100) {
|
|
27
|
+
const score = Math.round(cell.score * 10) / 10;
|
|
28
|
+
return { score, level: scoreToLevel(score), source: 'llm', evidence: cell.evidence || null };
|
|
29
|
+
}
|
|
30
|
+
// Back-compat: discrete level → centred score.
|
|
31
|
+
if (Number.isInteger(cell.level) && cell.level >= 1 && cell.level <= 4) {
|
|
32
|
+
return { score: LEVEL_SCORE[cell.level], level: cell.level, source: 'llm', evidence: cell.evidence || null };
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
return {
|
|
36
|
+
score: ruleScore ?? null,
|
|
37
|
+
level: ruleLevel ?? null,
|
|
38
|
+
// No rule value supplied → the indicator is simply "not assessed",
|
|
39
|
+
// not rule-scored. Only call it 'rules' when a rule score exists.
|
|
40
|
+
source: ruleScore != null ? 'rules' : null,
|
|
41
|
+
evidence: null,
|
|
42
|
+
};
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
/**
|
|
46
|
+
* Aggregate per-indicator sources into a dimension-level source label.
|
|
47
|
+
* @param {Array<'llm'|'rules'|null>} sources
|
|
48
|
+
* @returns {'llm'|'rules'|'mixed'|null}
|
|
49
|
+
*/
|
|
50
|
+
function dimensionSource(sources) {
|
|
51
|
+
const used = sources.filter(Boolean);
|
|
52
|
+
if (!used.length) return null;
|
|
53
|
+
const hasLlm = used.includes('llm');
|
|
54
|
+
const hasRules = used.includes('rules');
|
|
55
|
+
if (hasLlm && hasRules) return 'mixed';
|
|
56
|
+
return hasLlm ? 'llm' : 'rules';
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
module.exports = { mergeIndicator, dimensionSource, CONF_THRESHOLD };
|