@kodevibe/harness 0.11.0 → 0.11.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.ko.md +101 -4
- package/README.md +108 -5
- package/harness/core-rules.md +2 -0
- package/harness/project-brief.md +18 -0
- package/harness/skills/docs-bridge.md +161 -0
- package/harness/skills/setup.md +10 -0
- package/harness/skills/state-check.md +19 -0
- package/harness/skills/wrap-up.md +9 -0
- package/package.json +11 -2
- package/src/dependency-scan.js +194 -0
- package/src/guard.js +717 -0
- package/src/init.js +754 -8
- package/src/llm-bench.js +323 -0
- package/src/pack-check.js +47 -0
package/src/llm-bench.js
ADDED
|
@@ -0,0 +1,323 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const crypto = require('crypto');
|
|
4
|
+
|
|
5
|
+
/**
 * Default minimum scores a model must reach.
 *
 * `validateThresholds` compares each scored model against these values and
 * `scoreBench` lets `input.thresholds` override individual entries.
 */
const DEFAULT_THRESHOLDS = {
  // Floor for a model's weighted overall score.
  overall: 0.7,
  // Floors for the individual per-model metric averages.
  decompositionPrecision: 0.6,
  passAt1: 0.6,
  instructionCompliance: 0.8,
};
|
|
11
|
+
|
|
12
|
+
/**
 * Default relative weight of each metric in the overall score.
 *
 * `weightedOverall` divides by the sum of the weights, so only the ratios
 * between the entries matter.
 */
const DEFAULT_WEIGHTS = {
  decompositionPrecision: 0.4,
  passAt1: 0.4,
  instructionCompliance: 0.2,
};
|
|
17
|
+
|
|
18
|
+
/**
 * Default structural requirements checked by `validateRequirements`.
 *
 * The defaults are permissive: one model with one run is enough and none of
 * the optional evidence checks (seals, manifest, prompt hash) are enabled.
 */
const DEFAULT_REQUIREMENTS = {
  // Minimum number of models in the bench input.
  minModels: 1,
  // Minimum number of runs each model must provide.
  minRunsPerModel: 1,
  // Scenario ids every model must have a run for.
  requiredScenarios: [],
  // When false, model ids that look like examples/placeholders are rejected.
  allowExampleIds: true,
  // When true, every run needs capturedAt, promptHash and an output/transcript hash.
  requireRunSeals: false,
  // When true, runs are cross-checked against the scenario manifest.
  requireScenarioManifest: false,
  // When true (and the manifest check is on), run prompt hashes must match the manifest.
  requireScenarioPromptHash: false,
};
|
|
27
|
+
|
|
28
|
+
/**
 * Restricts a number to the closed interval [0, 1].
 *
 * @param {*} value - Candidate score.
 * @returns {number} 0 for anything that is not a finite number, otherwise the
 *   value limited to the range 0..1.
 */
function clamp01(value) {
  if (!Number.isFinite(value)) {
    return 0;
  }
  if (value <= 0) {
    return 0;
  }
  if (value >= 1) {
    return 1;
  }
  return value;
}
|
|
32
|
+
|
|
33
|
+
/**
 * Produces a comparison-friendly form of a piece of text: lower-cased, with
 * common markdown/punctuation characters turned into spaces and all
 * whitespace runs collapsed to a single space.
 *
 * @param {*} value - Text to normalize; falsy values become the empty string.
 * @returns {string} The normalized, trimmed text.
 */
function normalizeText(value) {
  const lowered = String(value || '').toLowerCase();
  const withoutPunctuation = lowered.replace(/[`*_()[\]{}:;,.!?]/g, ' ');
  const singleSpaced = withoutPunctuation.replace(/\s+/g, ' ');
  return singleSpaced.trim();
}
|
|
40
|
+
|
|
41
|
+
/**
 * Extracts the label used to compare a task.
 *
 * @param {*} task - Either a string or an object describing the task.
 * @returns {*} The string itself; for objects the first truthy value among
 *   `id`, `title`, `text` and `name`; otherwise the empty string.
 */
function taskText(task) {
  if (typeof task === 'string') {
    return task;
  }
  if (!task || typeof task !== 'object') {
    return '';
  }
  // Same precedence as the field list: the first truthy field wins.
  for (const field of ['id', 'title', 'text', 'name']) {
    const candidate = task[field];
    if (candidate) {
      return candidate;
    }
  }
  return '';
}
|
|
46
|
+
|
|
47
|
+
/**
 * Computes the SHA-256 digest of a value's string form.
 *
 * @param {*} value - Value to hash; falsy values are hashed as the empty string.
 * @returns {string} Lower-case hexadecimal digest.
 */
function sha256Hex(value) {
  const text = String(value || '');
  const hash = crypto.createHash('sha256');
  hash.update(text, 'utf8');
  return hash.digest('hex');
}
|
|
50
|
+
|
|
51
|
+
/**
 * Resolves the prompt hash of a scenario.
 *
 * @param {object} [scenario] - Scenario entry that may carry `promptHash`
 *   and/or `prompt`.
 * @returns {string} The declared `promptHash` as a string when present,
 *   otherwise the SHA-256 of a non-empty string `prompt`, otherwise ''.
 */
function scenarioPromptHash(scenario = {}) {
  const declaredHash = scenario.promptHash;
  if (declaredHash) {
    return String(declaredHash);
  }
  const promptText = scenario.prompt;
  const hasPrompt = typeof promptText === 'string' && promptText.length > 0;
  return hasPrompt ? sha256Hex(promptText) : '';
}
|
|
56
|
+
|
|
57
|
+
/**
 * Converts a scenario manifest into a uniform list of scenario specs.
 *
 * Accepts either an array of scenario entries or an object that carries the
 * array under `scenarios` or `requiredScenarios`. Entries without an `id`
 * (or `scenario`) are dropped. A `null` manifest, a non-array scenario list
 * and malformed (e.g. `null`) instruction-check entries are tolerated
 * instead of raising a TypeError.
 *
 * @param {Array|object|null} [manifest] - Raw manifest.
 * @returns {Array<{id: *, title: string, prompt: string, promptHash: string,
 *   expectedTasks: Array, requiredInstructionChecks: Array}>} Normalized specs.
 */
function normalizeScenarioManifest(manifest = []) {
  let scenarios;
  if (Array.isArray(manifest)) {
    scenarios = manifest;
  } else if (manifest == null) {
    // Default parameters only cover `undefined`; an explicit null is treated as empty.
    scenarios = [];
  } else {
    scenarios = manifest.scenarios || manifest.requiredScenarios || [];
  }
  if (!Array.isArray(scenarios)) {
    return [];
  }

  // An explicit `requiredInstructionChecks` list wins; otherwise the ids are
  // derived from `instructionChecks`, whose entries may be strings or objects.
  const requiredCheckIds = (scenario) => {
    if (Array.isArray(scenario.requiredInstructionChecks)) {
      return scenario.requiredInstructionChecks;
    }
    if (!Array.isArray(scenario.instructionChecks)) {
      return [];
    }
    return scenario.instructionChecks
      .map((check) => (typeof check === 'string' ? check : check && check.id))
      .filter(Boolean);
  };

  return scenarios
    .filter((scenario) => scenario && (scenario.id || scenario.scenario))
    .map((scenario) => ({
      id: scenario.id || scenario.scenario,
      title: scenario.title || scenario.name || '',
      prompt: scenario.prompt || '',
      promptHash: scenarioPromptHash(scenario),
      expectedTasks: Array.isArray(scenario.expectedTasks) ? scenario.expectedTasks : [],
      requiredInstructionChecks: requiredCheckIds(scenario),
    }));
}
|
|
77
|
+
|
|
78
|
+
/**
 * Builds a sealed run record: the prompt and output are replaced by their
 * SHA-256 hashes and a capture timestamp is attached.
 *
 * @param {object} [run]
 * @param {*} [run.scenario] - Scenario identifier, copied through unchanged.
 * @param {*} [run.prompt] - Prompt text; only its hash is kept.
 * @param {*} [run.output] - Model output; only its hash is kept.
 * @param {string} [run.capturedAt] - Timestamp; defaults to the current time
 *   in ISO-8601 form.
 * @param {Array} [run.expectedTasks]
 * @param {Array} [run.actualTasks]
 * @param {boolean} [run.passed]
 * @param {Array} [run.instructionChecks]
 * @returns {object} The sealed run record.
 */
function sealRun({
  scenario,
  prompt,
  output,
  capturedAt = new Date().toISOString(),
  expectedTasks = [],
  actualTasks = [],
  passed = false,
  instructionChecks = [],
} = {}) {
  const promptHash = sha256Hex(prompt);
  const outputHash = sha256Hex(output);
  const sealed = { scenario, capturedAt, promptHash, outputHash };
  return { ...sealed, expectedTasks, actualTasks, passed, instructionChecks };
}
|
|
99
|
+
|
|
100
|
+
/**
 * Scores how many expected tasks are covered by the tasks a run produced.
 *
 * Tasks are reduced to normalized text; an expected task counts as matched
 * when some actual task contains it or is contained in it. Task lists that
 * are not arrays (e.g. an explicit `null`) are treated as empty rather than
 * raising a TypeError, in line with how `scoreInstructionChecks` treats
 * non-array input.
 *
 * @param {Array} [expectedTasks] - Tasks that should have been produced.
 * @param {Array} [actualTasks] - Tasks that were produced.
 * @returns {number} Fraction of expected tasks matched, in [0, 1]; 1 when
 *   there is nothing to expect.
 */
function scoreDecomposition(expectedTasks = [], actualTasks = []) {
  const toComparable = (tasks) => (Array.isArray(tasks) ? tasks : [])
    .map(taskText)
    .map(normalizeText)
    .filter(Boolean);

  const expected = toComparable(expectedTasks);
  if (expected.length === 0) return 1;

  const actual = toComparable(actualTasks);
  // `includes` already covers exact equality, so no separate === test is needed.
  const isCovered = (exp) => actual.some((act) => act.includes(exp) || exp.includes(act));
  const matched = expected.filter(isCovered).length;
  return clamp01(matched / expected.length);
}
|
|
111
|
+
|
|
112
|
+
/**
 * Computes the share of instruction checks that passed.
 *
 * @param {Array} [checks] - Check records; only entries whose `passed` is
 *   exactly `true` count as passed.
 * @returns {number} Passed fraction in [0, 1]; 1 when `checks` is empty or
 *   not an array.
 */
function scoreInstructionChecks(checks = []) {
  const hasChecks = Array.isArray(checks) && checks.length > 0;
  if (!hasChecks) {
    return 1;
  }
  let passedCount = 0;
  for (const check of checks) {
    if (check && check.passed === true) {
      passedCount += 1;
    }
  }
  return clamp01(passedCount / checks.length);
}
|
|
117
|
+
|
|
118
|
+
/**
 * Combines metric scores into a single weighted average.
 *
 * @param {object} metrics - Metric values keyed by metric name; each value
 *   is clamped to [0, 1] before weighting.
 * @param {object} [weights] - Weight per metric name. Metrics are looked up
 *   by the keys of this object.
 * @returns {number} Weighted average in [0, 1]; a weight total of 0 is
 *   replaced by 1 to avoid dividing by zero.
 */
function weightedOverall(metrics, weights = DEFAULT_WEIGHTS) {
  let totalWeight = 0;
  let weightedScore = 0;
  for (const [key, weight] of Object.entries(weights)) {
    totalWeight = totalWeight + weight;
    weightedScore = weightedScore + clamp01(metrics[key]) * weight;
  }
  return clamp01(weightedScore / (totalWeight || 1));
}
|
|
125
|
+
|
|
126
|
+
/**
 * Scores a single benchmark run.
 *
 * @param {object} [run] - Run record. Tasks are read from
 *   `expectedTasks`/`requiredTasks` and `actualTasks`/`producedTasks`,
 *   checks from `instructionChecks`/`checks`.
 * @param {object} [weights] - Metric weights used for the overall score.
 * @returns {{scenario: *, metrics: object, overall: number}} The run's
 *   scenario label (`scenario`, then `id`, then '(unnamed)'), its three
 *   metrics and the weighted overall score.
 */
function scoreRun(run = {}, weights = DEFAULT_WEIGHTS) {
  const expectedTasks = run.expectedTasks || run.requiredTasks || [];
  const actualTasks = run.actualTasks || run.producedTasks || [];
  const checks = run.instructionChecks || run.checks || [];

  const metrics = {
    decompositionPrecision: scoreDecomposition(expectedTasks, actualTasks),
    // Only a strict boolean true counts as a first-attempt pass.
    passAt1: run.passed === true ? 1 : 0,
    instructionCompliance: scoreInstructionChecks(checks),
  };

  const scenario = run.scenario || run.id || '(unnamed)';
  const overall = weightedOverall(metrics, weights);
  return { scenario, metrics, overall };
}
|
|
138
|
+
|
|
139
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} values - Numbers to average.
 * @returns {number} The mean, or 0 for an empty list.
 */
function average(values) {
  const count = values.length;
  if (!count) {
    return 0;
  }
  let total = 0;
  values.forEach((value) => {
    total = total + value;
  });
  return total / count;
}
|
|
143
|
+
|
|
144
|
+
/**
 * Scores every run of a model and averages the per-run metrics.
 *
 * @param {object} [model] - Model record with optional `id`/`name`, `tier`
 *   and `runs`; a non-array `runs` is treated as no runs.
 * @param {object} [weights] - Metric weights used for the overall scores.
 * @returns {{id: *, tier: *, runs: Array, metrics: object, overall: number}}
 *   The scored runs, the mean of each metric across runs and the weighted
 *   overall score of those means.
 */
function scoreModel(model = {}, weights = DEFAULT_WEIGHTS) {
  const rawRuns = Array.isArray(model.runs) ? model.runs : [];
  const runs = rawRuns.map((run) => scoreRun(run, weights));

  const meanOf = (metricName) => average(runs.map((scored) => scored.metrics[metricName]));
  const metrics = {
    decompositionPrecision: meanOf('decompositionPrecision'),
    passAt1: meanOf('passAt1'),
    instructionCompliance: meanOf('instructionCompliance'),
  };

  const id = model.id || model.name || '(unnamed-model)';
  const tier = model.tier || 'unknown';
  const overall = weightedOverall(metrics, weights);
  return { id, tier, runs, metrics, overall };
}
|
|
159
|
+
|
|
160
|
+
/**
 * Lists every model score that falls below its configured minimum.
 *
 * A summary without a `models` array yields no threshold violations instead
 * of throwing; `validateRequirements` reports that situation separately
 * ("bench has no scored model results").
 *
 * @param {object} summary - Bench summary whose `models` entries carry
 *   `id`, `overall` and `metrics`.
 * @param {object} [thresholds] - Minimum `overall` score plus optional
 *   per-metric minimums; non-numeric per-metric entries are ignored.
 * @returns {string[]} One human-readable message per violation.
 */
function validateThresholds(summary, thresholds = DEFAULT_THRESHOLDS) {
  const violations = [];
  const models = summary && Array.isArray(summary.models) ? summary.models : [];
  for (const model of models) {
    if (model.overall < thresholds.overall) {
      violations.push(`${model.id}: overall ${model.overall.toFixed(2)} < ${thresholds.overall}`);
    }
    for (const key of ['decompositionPrecision', 'passAt1', 'instructionCompliance']) {
      const min = thresholds[key];
      if (typeof min === 'number' && model.metrics[key] < min) {
        violations.push(`${model.id}: ${key} ${model.metrics[key].toFixed(2)} < ${min}`);
      }
    }
  }
  return violations;
}
|
|
175
|
+
|
|
176
|
+
/**
 * Checks the structural requirements of a bench input and returns every
 * violation found.
 *
 * Checks performed, depending on `requirements`:
 *  - minimum number of models and of runs per model;
 *  - rejection of example-looking model ids when `allowExampleIds` is false;
 *  - presence of every required scenario per model;
 *  - run seals (`capturedAt`, `promptHash`, `outputHash`/`transcriptHash`)
 *    when `requireRunSeals` is set;
 *  - agreement of each run with the scenario manifest (expected tasks,
 *    required instruction checks and, optionally, the prompt hash) when
 *    `requireScenarioManifest` is set;
 *  - presence of at least one scored model in `summary`.
 *
 * @param {object} [input] - Raw bench input with a `models` array.
 * @param {object} [summary] - Scored summary; only `summary.models` is read.
 * @param {object} [requirements] - Requirement flags plus the optional
 *   manifest under `scenarios` or `scenarioManifest`.
 * @returns {string[]} One human-readable message per violation.
 */
function validateRequirements(input = {}, summary = {}, requirements = DEFAULT_REQUIREMENTS) {
  // The four words are matched anywhere in the id. The earlier pattern
  // `\bexample|sample|placeholder|dummy\b` bound `\b` only to the first and
  // last alternatives, so "sample"/"placeholder" were already substring
  // matches while "example"/"dummy" were anchored on one side only. Matching
  // all four uniformly never accepts an id the earlier pattern rejected.
  const exampleIdPattern = /example|sample|placeholder|dummy/i;

  const violations = [];
  const models = Array.isArray(input.models) ? input.models : [];
  const scenarioSpecs = normalizeScenarioManifest(requirements.scenarios || requirements.scenarioManifest || []);
  const scenarioSpecMap = new Map(scenarioSpecs.map((scenario) => [scenario.id, scenario]));

  if (models.length < requirements.minModels) {
    violations.push(`bench requires at least ${requirements.minModels} model(s), found ${models.length}`);
  }

  const requiredScenarios = requirements.requiredScenarios || [];
  if (requirements.requireScenarioManifest) {
    for (const scenario of requiredScenarios) {
      if (!scenarioSpecMap.has(scenario)) {
        violations.push(`bench scenario manifest missing required scenario "${scenario}"`);
      }
    }
  }

  for (const model of models) {
    const id = model.id || model.name || '(unnamed-model)';
    const runs = Array.isArray(model.runs) ? model.runs : [];
    if (runs.length < requirements.minRunsPerModel) {
      violations.push(`${id}: bench requires at least ${requirements.minRunsPerModel} run(s), found ${runs.length}`);
    }
    if (!requirements.allowExampleIds && exampleIdPattern.test(id)) {
      violations.push(`${id}: example/sample model id is not valid release evidence`);
    }
    const scenarioSet = new Set(runs.map((run) => run.scenario || run.id).filter(Boolean));
    for (const scenario of requiredScenarios) {
      if (!scenarioSet.has(scenario)) {
        violations.push(`${id}: missing required scenario "${scenario}"`);
      }
    }
    if (requirements.requireRunSeals) {
      for (const run of runs) {
        const scenario = run.scenario || run.id || '(unnamed-scenario)';
        if (!/^\d{4}-\d{2}-\d{2}T/.test(String(run.capturedAt || ''))) {
          violations.push(`${id}/${scenario}: missing capturedAt ISO timestamp`);
        }
        if (!/^[a-f0-9]{8,}$/i.test(String(run.promptHash || ''))) {
          violations.push(`${id}/${scenario}: missing promptHash`);
        }
        if (!/^[a-f0-9]{8,}$/i.test(String(run.outputHash || run.transcriptHash || ''))) {
          violations.push(`${id}/${scenario}: missing outputHash or transcriptHash`);
        }
      }
    }
    if (requirements.requireScenarioManifest) {
      for (const run of runs) {
        const scenario = run.scenario || run.id || '(unnamed-scenario)';
        const spec = scenarioSpecMap.get(scenario);
        if (!spec) {
          violations.push(`${id}/${scenario}: scenario is not listed in the bench manifest`);
          continue;
        }

        for (const expectedTask of spec.expectedTasks) {
          if (scoreDecomposition([expectedTask], run.expectedTasks || run.requiredTasks || []) < 1) {
            // taskText keeps object-form tasks readable instead of "[object Object]".
            violations.push(`${id}/${scenario}: expectedTasks missing manifest task "${taskText(expectedTask)}"`);
          }
        }

        // A non-array checks value is treated as "no checks recorded" so every
        // required check is reported as missing instead of crashing on .filter.
        const rawChecks = run.instructionChecks || run.checks || [];
        const checks = Array.isArray(rawChecks) ? rawChecks : [];
        const checkMap = new Map(checks
          .filter((check) => check && check.id)
          .map((check) => [check.id, check]));
        for (const checkId of spec.requiredInstructionChecks) {
          const check = checkMap.get(checkId);
          if (!check) {
            violations.push(`${id}/${scenario}: instructionChecks missing manifest check "${checkId}"`);
          } else if (check.passed !== true) {
            violations.push(`${id}/${scenario}: manifest check "${checkId}" did not pass`);
          }
        }

        if (requirements.requireScenarioPromptHash) {
          const runPromptHash = String(run.promptHash || '');
          if (!spec.promptHash) {
            violations.push(`${id}/${scenario}: manifest promptHash is missing`);
          } else if (!runPromptHash) {
            // With run seals enabled the missing hash was already reported above.
            if (!requirements.requireRunSeals) {
              violations.push(`${id}/${scenario}: missing promptHash for manifest comparison`);
            }
          } else if (runPromptHash !== spec.promptHash) {
            violations.push(`${id}/${scenario}: promptHash does not match manifest prompt`);
          }
        }
      }
    }
  }

  if (!summary.models || summary.models.length === 0) {
    violations.push('bench has no scored model results');
  }
  return violations;
}
|
|
273
|
+
|
|
274
|
+
/**
 * Scores a complete bench input and validates it against thresholds and
 * requirements.
 *
 * Weights and thresholds come from the defaults overridden by
 * `input.weights` / `input.thresholds`. Requirements come from the defaults,
 * then `input.requirements`, then `options.requirements` (last one wins).
 *
 * @param {object} [input] - Bench input with `models` and optional `suite`,
 *   `weights`, `thresholds` and `requirements`.
 * @param {object} [options] - Caller overrides; only `requirements` is read.
 * @returns {object} The summary (`suite`, `models`, `metrics`, `overall`,
 *   `thresholds`, `requirements`) plus `ok` and the list of `violations`.
 */
function scoreBench(input = {}, options = {}) {
  const weights = { ...DEFAULT_WEIGHTS, ...(input.weights || {}) };
  const thresholds = { ...DEFAULT_THRESHOLDS, ...(input.thresholds || {}) };
  const requirements = {
    ...DEFAULT_REQUIREMENTS,
    ...(input.requirements || {}),
    ...(options.requirements || {}),
  };

  const rawModels = Array.isArray(input.models) ? input.models : [];
  const models = rawModels.map((model) => scoreModel(model, weights));

  // Bench-level metrics are the plain mean of the per-model metrics.
  const meanAcrossModels = (metricName) => average(models.map((model) => model.metrics[metricName]));
  const metrics = {
    decompositionPrecision: meanAcrossModels('decompositionPrecision'),
    passAt1: meanAcrossModels('passAt1'),
    instructionCompliance: meanAcrossModels('instructionCompliance'),
  };

  const suite = input.suite || 'kode:harness-r10';
  const overall = weightedOverall(metrics, weights);
  const summary = { suite, models, metrics, overall, thresholds, requirements };

  const thresholdViolations = validateThresholds(summary, thresholds);
  const requirementViolations = validateRequirements(input, summary, requirements);
  const violations = [...thresholdViolations, ...requirementViolations];

  return { ...summary, ok: violations.length === 0, violations };
}
|
|
306
|
+
|
|
307
|
+
module.exports = {
|
|
308
|
+
DEFAULT_THRESHOLDS,
|
|
309
|
+
DEFAULT_WEIGHTS,
|
|
310
|
+
DEFAULT_REQUIREMENTS,
|
|
311
|
+
sha256Hex,
|
|
312
|
+
scenarioPromptHash,
|
|
313
|
+
normalizeScenarioManifest,
|
|
314
|
+
sealRun,
|
|
315
|
+
normalizeText,
|
|
316
|
+
scoreDecomposition,
|
|
317
|
+
scoreInstructionChecks,
|
|
318
|
+
scoreRun,
|
|
319
|
+
scoreModel,
|
|
320
|
+
scoreBench,
|
|
321
|
+
validateThresholds,
|
|
322
|
+
validateRequirements,
|
|
323
|
+
};
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const fs = require('fs');
|
|
4
|
+
const path = require('path');
|
|
5
|
+
const { runGuard } = require('./guard');
|
|
6
|
+
|
|
7
|
+
/**
 * Reads the pack description out of `npm pack` JSON.
 *
 * @param {string|object|Array} input - JSON text or an already parsed value;
 *   when the value is an array its first element is used.
 * @returns {object} The pack entry, guaranteed to have a `files` array.
 * @throws {SyntaxError} When `input` is a string that is not valid JSON.
 * @throws {Error} When the pack entry is missing or has no `files` array.
 */
function parsePackJson(input) {
  let parsed = input;
  if (typeof input === 'string') {
    parsed = JSON.parse(input);
  }
  const pack = Array.isArray(parsed) ? parsed[0] : parsed;
  const hasFileList = Boolean(pack) && Array.isArray(pack.files);
  if (!hasFileList) {
    throw new Error('npm pack JSON must include a files array');
  }
  return pack;
}
|
|
15
|
+
|
|
16
|
+
/**
 * Normalizes a file path taken from a pack listing: backslashes become
 * forward slashes, one leading `package/` prefix is removed, then one
 * leading `./` or `/` is removed.
 *
 * @param {*} filePath - Raw path; falsy values become the empty string.
 * @returns {string} The normalized relative path.
 */
function normalizePackPath(filePath) {
  const forwardSlashed = String(filePath || '').replace(/\\/g, '/');
  const withoutPackagePrefix = forwardSlashed.replace(/^package\//, '');
  return withoutPackagePrefix.replace(/^\.?\//, '');
}
|
|
22
|
+
|
|
23
|
+
/**
 * Lists the normalized paths of every file in `npm pack` JSON.
 *
 * @param {string|object|Array} input - Pack JSON accepted by `parsePackJson`.
 * @returns {string[]} Normalized paths, with empty results dropped.
 */
function packFilePaths(input) {
  const { files } = parsePackJson(input);
  return files.reduce((paths, file) => {
    const normalized = normalizePackPath(file.path);
    if (normalized) {
      paths.push(normalized);
    }
    return paths;
  }, []);
}
|
|
29
|
+
|
|
30
|
+
/**
 * Runs the guard over the files of a pack listing that exist on disk.
 *
 * @param {object} params
 * @param {string|object|Array} params.packJson - `npm pack` JSON.
 * @param {string} [params.cwd] - Directory the listed paths are resolved
 *   against; defaults to the current working directory.
 * @returns {object} `files` (the listed paths that are regular files under
 *   `cwd`) merged with whatever `runGuard` returns for them.
 */
function checkPublicPack({ packJson, cwd = process.cwd() }) {
  // Keep only listed paths that resolve to a regular file under cwd.
  const isExistingFile = (relativePath) => {
    const absolutePath = path.join(cwd, relativePath);
    if (!fs.existsSync(absolutePath)) {
      return false;
    }
    return fs.statSync(absolutePath).isFile();
  };

  const files = packFilePaths(packJson).filter(isExistingFile);
  const guardResult = runGuard({ files, cwd });
  return { files, ...guardResult };
}
|
|
41
|
+
|
|
42
|
+
module.exports = {
|
|
43
|
+
parsePackJson,
|
|
44
|
+
normalizePackPath,
|
|
45
|
+
packFilePaths,
|
|
46
|
+
checkPublicPack,
|
|
47
|
+
};
|