akm-cli 0.7.0-rc1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/src/cli.js +100 -16
- package/dist/src/commands/config-cli.js +42 -0
- package/dist/src/commands/history.js +78 -7
- package/dist/src/commands/registry-search.js +69 -6
- package/dist/src/commands/search.js +30 -3
- package/dist/src/commands/show.js +29 -0
- package/dist/src/commands/source-add.js +5 -1
- package/dist/src/commands/source-manage.js +7 -1
- package/dist/src/core/config.js +28 -0
- package/dist/src/indexer/db-search.js +1 -0
- package/dist/src/indexer/indexer.js +16 -2
- package/dist/src/indexer/matchers.js +1 -1
- package/dist/src/indexer/search-source.js +4 -2
- package/dist/src/integrations/agent/profiles.js +1 -1
- package/dist/src/integrations/agent/spawn.js +67 -16
- package/dist/src/integrations/github.js +9 -3
- package/dist/src/llm/embedders/remote.js +37 -3
- package/dist/src/output/cli-hints.js +15 -2
- package/dist/src/output/renderers.js +3 -1
- package/dist/src/output/shapes.js +8 -1
- package/dist/src/output/text.js +156 -3
- package/dist/src/registry/build-index.js +5 -4
- package/dist/src/registry/providers/static-index.js +3 -1
- package/dist/src/setup/setup.js +9 -0
- package/dist/src/wiki/wiki.js +54 -6
- package/dist/src/workflows/runs.js +37 -3
- package/dist/tests/architecture/agent-no-llm-sdk-guard.test.js +1 -1
- package/dist/tests/bench/attribution.test.js +24 -23
- package/dist/tests/bench/cleanup.js +31 -0
- package/dist/tests/bench/cli.js +366 -31
- package/dist/tests/bench/cli.test.js +282 -14
- package/dist/tests/bench/corpus.js +3 -0
- package/dist/tests/bench/corpus.test.js +10 -10
- package/dist/tests/bench/doctor.js +525 -0
- package/dist/tests/bench/driver.js +77 -22
- package/dist/tests/bench/driver.test.js +142 -1
- package/dist/tests/bench/environment.js +233 -0
- package/dist/tests/bench/environment.test.js +199 -0
- package/dist/tests/bench/evolve.js +67 -0
- package/dist/tests/bench/evolve.test.js +12 -4
- package/dist/tests/bench/failure-modes.test.js +52 -3
- package/dist/tests/bench/feedback-integrity.test.js +3 -2
- package/dist/tests/bench/leakage.test.js +105 -2
- package/dist/tests/bench/learning-curve.test.js +3 -2
- package/dist/tests/bench/metrics.js +102 -26
- package/dist/tests/bench/metrics.test.js +10 -4
- package/dist/tests/bench/opencode-config.js +194 -0
- package/dist/tests/bench/opencode-config.test.js +370 -0
- package/dist/tests/bench/report.js +73 -9
- package/dist/tests/bench/report.test.js +59 -10
- package/dist/tests/bench/run-config.js +355 -0
- package/dist/tests/bench/run-config.test.js +298 -0
- package/dist/tests/bench/run-curate-test.js +32 -0
- package/dist/tests/bench/run-failing-tasks.js +56 -0
- package/dist/tests/bench/run-full-bench.js +51 -0
- package/dist/tests/bench/run-items36-targeted.js +69 -0
- package/dist/tests/bench/run-nano-quick.js +42 -0
- package/dist/tests/bench/run-waveg-targeted.js +62 -0
- package/dist/tests/bench/runner.js +257 -94
- package/dist/tests/bench/tmp.js +90 -0
- package/dist/tests/bench/trajectory.js +2 -2
- package/dist/tests/bench/verifier.js +6 -1
- package/dist/tests/bench/workflow-spec.js +11 -24
- package/dist/tests/bench/workflow-spec.test.js +1 -1
- package/dist/tests/bench/workflow-trace.js +34 -0
- package/dist/tests/cli-errors.test.js +1 -0
- package/dist/tests/commands/history.test.js +195 -0
- package/dist/tests/config.test.js +25 -0
- package/dist/tests/e2e.test.js +23 -2
- package/dist/tests/fixtures/stashes/load.js +1 -1
- package/dist/tests/fixtures/stashes/load.test.js +11 -2
- package/dist/tests/indexer.test.js +12 -1
- package/dist/tests/output-baseline.test.js +2 -1
- package/dist/tests/output-shapes-unit.test.js +3 -1
- package/dist/tests/registry-build-index.test.js +17 -1
- package/dist/tests/registry-providers/static-index.test.js +34 -0
- package/dist/tests/registry-search.test.js +200 -0
- package/dist/tests/remember-frontmatter.test.js +11 -13
- package/dist/tests/source-qa-fixes.test.js +18 -0
- package/dist/tests/source-registry.test.js +3 -3
- package/dist/tests/source-source.test.js +61 -1
- package/dist/tests/workflow-qa-fixes.test.js +18 -0
- package/package.json +1 -1
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OBSOLETE: superseded by `bun run tests/bench/cli.ts tests/bench/configs/curate-test.json`.
|
|
3
|
+
* Kept for backward compatibility; will be removed in the standalone-bench-repo extraction.
|
|
4
|
+
*
|
|
5
|
+
* Test akm curate as first command on configure-scaling.
|
|
6
|
+
* Usage: bun run tests/bench/run-curate-test.ts
|
|
7
|
+
*/
|
|
8
|
+
import fs from "node:fs";
|
|
9
|
+
import path from "node:path";
|
|
10
|
+
import { loadTask } from "./corpus";
|
|
11
|
+
import { loadOpencodeProviders } from "./opencode-config";
|
|
12
|
+
import { runUtility } from "./runner";
|
|
13
|
+
// Deprecation notice: this script is superseded by the config-driven CLI run named in the message.
process.stderr.write("[obsolete] run-curate-test.ts → see tests/bench/configs/curate-test.json (`bun run tests/bench/cli.ts tests/bench/configs/curate-test.json`)\n");

// This run exercises exactly one task.
const tasks = [loadTask("inkwell/configure-scaling")];

// Provider config: use opencode-providers.local.json when it exists,
// otherwise fall back to opencode-providers.json in the same fixtures directory.
const benchFixturesDir = path.resolve(__dirname, "..", "fixtures", "bench");
const localProvidersFile = path.resolve(benchFixturesDir, "opencode-providers.local.json");
const defaultProvidersFile = path.resolve(benchFixturesDir, "opencode-providers.json");
let providersFile = defaultProvidersFile;
if (fs.existsSync(localProvidersFile)) {
    providersFile = localProvidersFile;
}
const providers = loadOpencodeProviders(providersFile);
const model = providers.defaultModel;

process.stderr.write(`Running configure-scaling × 5 seeds (curate as first cmd)\nModel: ${model}\n\n`);

// Run the "akm" arm only: 5 seeds, 3 runs in parallel.
const runOptions = {
    tasks,
    arms: ["akm"],
    model,
    seedsPerArm: 5,
    budgetTokens: 25000,
    budgetWallMs: 360000,
    parallel: 3,
    opencodeProviders: providers,
};
const report = await runUtility(runOptions);

// Full report goes to stdout as pretty-printed JSON; the human summary goes to stderr.
process.stdout.write(`${JSON.stringify(report, null, 2)}\n`);

// Summarise the first task in the report; a missing pass rate is shown as 0%.
const firstTask = report.tasks?.[0];
const passRate = firstTask?.akm?.passRate ?? 0;
const passPercent = (passRate * 100).toFixed(0);
process.stderr.write(`\nconfigure-scaling: ${passPercent}% (baseline 80%)\n`);
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OBSOLETE: superseded by `bun run tests/bench/cli.ts tests/bench/configs/failing-tasks.json`.
|
|
3
|
+
* Kept for backward compatibility; will be removed in the standalone-bench-repo extraction.
|
|
4
|
+
*
|
|
5
|
+
* Targeted retest of failing/partial tasks after stash improvements.
|
|
6
|
+
* Usage: bun run tests/bench/run-failing-tasks.ts
|
|
7
|
+
*/
|
|
8
|
+
import fs from "node:fs";
|
|
9
|
+
import path from "node:path";
|
|
10
|
+
import { loadTask } from "./corpus";
|
|
11
|
+
import { loadOpencodeProviders } from "./opencode-config";
|
|
12
|
+
import { runUtility } from "./runner";
|
|
13
|
+
// Deprecation notice: this script is superseded by the config-driven CLI run named in the message.
process.stderr.write("[obsolete] run-failing-tasks.ts → see tests/bench/configs/failing-tasks.json (`bun run tests/bench/cli.ts tests/bench/configs/failing-tasks.json`)\n");

// Tasks to retest, loaded in the listed order.
const tasks = [
    "drillbit/backup-policy",
    "drillbit/canary-enable",
    "inkwell/add-healthcheck",
    "inkwell/configure-scaling",
    "opencode/select-correct-skill",
].map((taskId) => loadTask(taskId));

// Provider config: use opencode-providers.local.json when it exists,
// otherwise fall back to opencode-providers.json in the same fixtures directory.
const benchFixturesDir = path.resolve(__dirname, "..", "fixtures", "bench");
const localProvidersFile = path.resolve(benchFixturesDir, "opencode-providers.local.json");
const defaultProvidersFile = path.resolve(benchFixturesDir, "opencode-providers.json");
let providersFile = defaultProvidersFile;
if (fs.existsSync(localProvidersFile)) {
    providersFile = localProvidersFile;
}
const providers = loadOpencodeProviders(providersFile);
const model = providers.defaultModel;

process.stderr.write(`Running ${tasks.length} tasks × 5 seeds (akm only)\nModel: ${model}\n\n`);

// Run the "akm" arm only: 5 seeds, 3 runs in parallel.
const runOptions = {
    tasks,
    arms: ["akm"],
    model,
    seedsPerArm: 5,
    budgetTokens: 25000,
    budgetWallMs: 360000,
    parallel: 3,
    opencodeProviders: providers,
};
const report = await runUtility(runOptions);

// Full report goes to stdout as pretty-printed JSON; the human summary goes to stderr.
process.stdout.write(`${JSON.stringify(report, null, 2)}\n`);

const overall = report.aggregateAkm;
process.stderr.write(`\n=== RESULTS vs BASELINE ===\n`);

// Qwen 9B baseline pass rates, keyed by task id.
const baselineByTask = {
    "drillbit/backup-policy": 1.0,
    "drillbit/canary-enable": 1.0,
    "inkwell/add-healthcheck": 0.8,
    "inkwell/configure-scaling": 0.8,
    "opencode/select-correct-skill": 1.0,
};

// One line per task: id, pass percentage, 5-cell bar, and the change versus baseline.
for (const task of report.tasks ?? []) {
    const passRate = task.akm?.passRate ?? 0;
    // Tasks without a baseline entry are compared against 0.
    const baseline = baselineByTask[task.id] ?? 0;
    const diff = passRate - baseline;

    const filledCells = Math.round(passRate * 5);
    const bar = "█".repeat(filledCells) + "░".repeat(5 - filledCells);

    // An exactly unchanged rate prints no delta suffix at all.
    let deltaStr = "";
    if (diff !== 0) {
        let arrow = "=";
        if (diff > 0) {
            arrow = "↑";
        } else if (diff < 0) {
            arrow = "↓";
        }
        deltaStr = ` (${arrow}${Math.abs(diff * 100).toFixed(0)}pp)`;
    }

    const label = task.id.padEnd(48);
    const percent = (passRate * 100).toFixed(0).padStart(3);
    process.stderr.write(`${label} ${percent}% ${bar}${deltaStr}\n`);
}

const overallPercent = ((overall?.passRate ?? 0) * 100).toFixed(1);
process.stderr.write(`\nOverall: ${overallPercent}%\n`);
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OBSOLETE: superseded by `bun run tests/bench/cli.ts tests/bench/configs/full.json`.
|
|
3
|
+
* Kept for backward compatibility; will be removed in the standalone-bench-repo extraction.
|
|
4
|
+
*
|
|
5
|
+
* Full benchmark run — all tasks, 5 seeds, akm arm only.
|
|
6
|
+
* Usage: bun run tests/bench/run-full-bench.ts
|
|
7
|
+
*/
|
|
8
|
+
import fs from "node:fs";
|
|
9
|
+
import path from "node:path";
|
|
10
|
+
import { listTasks } from "./corpus";
|
|
11
|
+
import { loadOpencodeProviders } from "./opencode-config";
|
|
12
|
+
import { runUtility } from "./runner";
|
|
13
|
+
// Deprecation notice: this script is superseded by the config-driven CLI run named in the message.
process.stderr.write("[obsolete] run-full-bench.ts → see tests/bench/configs/full.json (`bun run tests/bench/cli.ts tests/bench/configs/full.json`)\n");

// Every task the corpus module lists.
const tasks = listTasks();

// Provider config: use opencode-providers.local.json when it exists,
// otherwise fall back to opencode-providers.json in the same fixtures directory.
const benchFixturesDir = path.resolve(__dirname, "..", "fixtures", "bench");
const localProvidersFile = path.resolve(benchFixturesDir, "opencode-providers.local.json");
const defaultProvidersFile = path.resolve(benchFixturesDir, "opencode-providers.json");
let providersFile = defaultProvidersFile;
if (fs.existsSync(localProvidersFile)) {
    providersFile = localProvidersFile;
}
const providers = loadOpencodeProviders(providersFile);
const model = providers.defaultModel;

process.stderr.write(`Running ${tasks.length} tasks × 5 seeds (akm only)\nModel: ${model}\n\n`);

// Run the "akm" arm only: 5 seeds, 3 runs in parallel.
const runOptions = {
    tasks,
    arms: ["akm"],
    model,
    seedsPerArm: 5,
    budgetTokens: 25000,
    budgetWallMs: 360000,
    parallel: 3,
    opencodeProviders: providers,
};
const report = await runUtility(runOptions);

// Full report goes to stdout as pretty-printed JSON; the human summary goes to stderr.
process.stdout.write(`${JSON.stringify(report, null, 2)}\n`);

// Known baseline pass rates; tasks not listed here are printed without a delta.
const baselineByTask = {
    "drillbit/backup-policy": 1.0,
    "drillbit/canary-enable": 1.0,
    "inkwell/add-healthcheck": 0.8,
    "inkwell/configure-scaling": 0.8,
    "opencode/select-correct-skill": 1.0,
};

process.stderr.write(`\n=== RESULTS vs BASELINE ===\n`);

// One line per task: id, pass percentage, 5-cell bar, and the change versus baseline.
for (const task of report.tasks ?? []) {
    const passRate = task.akm?.passRate ?? 0;
    const baseline = baselineByTask[task.id] ?? null;

    const filledCells = Math.round(passRate * 5);
    const bar = "█".repeat(filledCells) + "░".repeat(5 - filledCells);

    let deltaStr = "";
    if (baseline !== null) {
        const diff = passRate - baseline;
        if (diff === 0) {
            deltaStr = " (=)";
        } else {
            let arrow = "=";
            if (diff > 0) {
                arrow = "↑";
            } else if (diff < 0) {
                arrow = "↓";
            }
            deltaStr = ` (${arrow}${Math.abs(diff * 100).toFixed(0)}pp)`;
        }
    }

    const label = task.id.padEnd(52);
    const percent = (passRate * 100).toFixed(0).padStart(3);
    process.stderr.write(`${label} ${percent}% ${bar}${deltaStr}\n`);
}

const overallPercent = ((report.aggregateAkm?.passRate ?? 0) * 100).toFixed(1);
process.stderr.write(`\nOverall: ${overallPercent}%\n`);
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Items 3-6 targeted bench — tasks most directly affected by fixture stash
|
|
3
|
+
* content additions and skill frontmatter strip (commit 92196c7).
|
|
4
|
+
* Usage: bun run tests/bench/run-items36-targeted.ts
|
|
5
|
+
*/
|
|
6
|
+
import fs from "node:fs";
|
|
7
|
+
import path from "node:path";
|
|
8
|
+
import { loadTask } from "./corpus";
|
|
9
|
+
import { loadOpencodeProviders } from "./opencode-config";
|
|
10
|
+
import { runUtility } from "./runner";
|
|
11
|
+
/**
 * Format the change of a pass rate versus its baseline as a report suffix.
 *
 * The baselines in this script include rounded thirds (0.33 / 0.67) while the
 * run uses 3 seeds, so an unchanged result (1/3 or 2/3) differs from its
 * baseline by about 0.003. Comparing the raw difference against 0 would print
 * "(↑0pp)" / "(↓0pp)" for such a task. The decision is therefore made on the
 * whole-percentage-point magnitude that is actually displayed.
 *
 * @param {number} rate - Observed pass rate.
 * @param {number | null} base - Baseline pass rate, or null when the task has none.
 * @returns {string} "" without a baseline, " (=)" when the change rounds to 0pp,
 *   otherwise " (↑Npp)" or " (↓Npp)".
 */
function formatDelta(rate, base) {
    if (base === null) {
        return "";
    }
    const d = rate - base;
    // Round half up on the magnitude, matching what toFixed(0) displayed before.
    const magnitude = Math.round(Math.abs(d * 100));
    if (magnitude === 0) {
        return " (=)";
    }
    const arrow = d > 0 ? "↑" : d < 0 ? "↓" : "=";
    return ` (${arrow}${magnitude}pp)`;
}

const TARGET_TASKS = [
    // item 4: env_file section moved to top of compose-conventions.md
    "docker-homelab/env-from-file",
    // item 3: az-storage-lifecycle knowledge added to az-cli stash
    "workflow-compliance/repeated-fail-storage-lifecycle-a",
    // item 5: memory assets (compound-tag-filter, null-value-trap)
    "az-cli/query-by-tag",
    // item 5: memory asset (healthcheck-test-cmd)
    "inkwell/add-healthcheck-train",
    // item 6: skill frontmatter strip — previously low-scoring tasks
    "docker-homelab/restart-policy",
    "docker-homelab/redis-healthcheck",
    "docker-homelab/named-volume",
    "az-cli/storage-account-create",
    "inkwell/configure-scaling",
];
const tasks = TARGET_TASKS.map((id) => loadTask(id));

// Provider config: use opencode-providers.local.json when it exists,
// otherwise fall back to opencode-providers.json in the same fixtures directory.
const LOCAL = path.resolve(__dirname, "..", "fixtures", "bench", "opencode-providers.local.json");
const DEFAULT = path.resolve(__dirname, "..", "fixtures", "bench", "opencode-providers.json");
const providers = loadOpencodeProviders(fs.existsSync(LOCAL) ? LOCAL : DEFAULT);

process.stderr.write(`Items 3-6 targeted bench: ${tasks.length} tasks × 3 seeds\nModel: ${providers.defaultModel}\n\n`);

// Run the "akm" arm only: 3 seeds, 3 runs in parallel.
const report = await runUtility({
    tasks,
    arms: ["akm"],
    model: providers.defaultModel,
    seedsPerArm: 3,
    budgetTokens: 25000,
    budgetWallMs: 360000,
    parallel: 3,
    opencodeProviders: providers,
});

// Full report goes to stdout as pretty-printed JSON; the human summary goes to stderr.
process.stdout.write(`${JSON.stringify(report, null, 2)}\n`);

// Wave G baselines from 2026-05-03 targeted run
const BASELINE = {
    "docker-homelab/env-from-file": 0.0,
    "workflow-compliance/repeated-fail-storage-lifecycle-a": 0.0,
    "az-cli/query-by-tag": 0.4,
    "inkwell/add-healthcheck-train": 0.67,
    "docker-homelab/restart-policy": 0.33,
    "docker-homelab/redis-healthcheck": 0.33,
    "docker-homelab/named-volume": 0.33,
    "az-cli/storage-account-create": 1.0,
    "inkwell/configure-scaling": 0.6,
};

process.stderr.write(`\n=== RESULTS vs Wave G BASELINE ===\n`);

// One line per task: id, pass percentage, 5-cell bar, and the change versus baseline.
for (const t of report.tasks ?? []) {
    const rate = t.akm?.passRate ?? 0;
    const base = BASELINE[t.id] ?? null;
    const filled = Math.round(rate * 5);
    const bar = "█".repeat(filled) + "░".repeat(5 - filled);
    const deltaStr = formatDelta(rate, base);
    process.stderr.write(`${t.id.padEnd(52)} ${(rate * 100).toFixed(0).padStart(3)}% ${bar}${deltaStr}\n`);
}

process.stderr.write(`\nOverall: ${((report.aggregateAkm?.passRate ?? 0) * 100).toFixed(1)}%\n`);
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OBSOLETE: superseded by `bun run tests/bench/cli.ts tests/bench/configs/nano-quick.json`.
|
|
3
|
+
* Kept for backward compatibility; will be removed in the standalone-bench-repo extraction.
|
|
4
|
+
*
|
|
5
|
+
* Quick 5-task × 2-seed run for Nemotron Nano evaluation.
|
|
6
|
+
* Usage: bun run tests/bench/run-nano-quick.ts
|
|
7
|
+
*/
|
|
8
|
+
import fs from "node:fs";
|
|
9
|
+
import path from "node:path";
|
|
10
|
+
import { loadTask } from "./corpus";
|
|
11
|
+
import { loadOpencodeProviders } from "./opencode-config";
|
|
12
|
+
import { runUtility } from "./runner";
|
|
13
|
+
// Deprecation notice: this script is superseded by the config-driven CLI run named in the message.
process.stderr.write("[obsolete] run-nano-quick.ts → see tests/bench/configs/nano-quick.json (`bun run tests/bench/cli.ts tests/bench/configs/nano-quick.json`)\n");

// Tasks for the quick run, loaded in the listed order.
const tasks = [
    "drillbit/backup-policy",
    "drillbit/canary-enable",
    "inkwell/add-healthcheck",
    "inkwell/configure-scaling",
    "opencode/select-correct-skill",
].map((taskId) => loadTask(taskId));

// Provider config: use opencode-providers.local.json when it exists,
// otherwise fall back to opencode-providers.json in the same fixtures directory.
const benchFixturesDir = path.resolve(__dirname, "..", "fixtures", "bench");
const localProvidersFile = path.resolve(benchFixturesDir, "opencode-providers.local.json");
const defaultProvidersFile = path.resolve(benchFixturesDir, "opencode-providers.json");
let providersFile = defaultProvidersFile;
if (fs.existsSync(localProvidersFile)) {
    providersFile = localProvidersFile;
}
const providers = loadOpencodeProviders(providersFile);
const model = providers.defaultModel;

process.stderr.write(`Running ${tasks.length} tasks × 2 seeds\nModel: ${model}\n\n`);

// Run the "akm" arm only: 2 seeds, 2 runs in parallel.
const runOptions = {
    tasks,
    arms: ["akm"],
    model,
    seedsPerArm: 2,
    budgetTokens: 25000,
    budgetWallMs: 360000,
    parallel: 2,
    opencodeProviders: providers,
};
const report = await runUtility(runOptions);

// Full report goes to stdout as pretty-printed JSON; the human summary goes to stderr.
process.stdout.write(`${JSON.stringify(report, null, 2)}\n`);

// One line per task: id, pass percentage, and a 5-cell bar.
for (const task of report.tasks ?? []) {
    const passRate = task.akm?.passRate ?? 0;
    const filledCells = Math.round(passRate * 5);
    const bar = "█".repeat(filledCells) + "░".repeat(5 - filledCells);
    const label = task.id.padEnd(48);
    const percent = (passRate * 100).toFixed(0).padStart(3);
    process.stderr.write(`${label} ${percent}% ${bar}\n`);
}

const overallPercent = ((report.aggregateAkm?.passRate ?? 0) * 100).toFixed(1);
process.stderr.write(`\nOverall: ${overallPercent}%\n`);
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Wave G targeted bench — 9 previously-failing tasks, 3 seeds.
|
|
3
|
+
* Usage: bun run tests/bench/run-waveg-targeted.ts
|
|
4
|
+
*/
|
|
5
|
+
import fs from "node:fs";
|
|
6
|
+
import path from "node:path";
|
|
7
|
+
import { loadTask } from "./corpus";
|
|
8
|
+
import { loadOpencodeProviders } from "./opencode-config";
|
|
9
|
+
import { runUtility } from "./runner";
|
|
10
|
+
// The nine targeted tasks, loaded in the listed order.
const tasks = [
    "inkwell/configure-scaling",
    "inkwell/add-healthcheck-train",
    "inkwell/full-config",
    "az-cli/storage-account-create",
    "docker-homelab/bridge-network",
    "docker-homelab/compose-version-upgrade",
    "docker-homelab/env-from-file",
    "workflow-compliance/feedback-trap-az-tag-list",
    "workflow-compliance/repeated-fail-storage-lifecycle-a",
].map((taskId) => loadTask(taskId));

// Provider config: use opencode-providers.local.json when it exists,
// otherwise fall back to opencode-providers.json in the same fixtures directory.
const benchFixturesDir = path.resolve(__dirname, "..", "fixtures", "bench");
const localProvidersFile = path.resolve(benchFixturesDir, "opencode-providers.local.json");
const defaultProvidersFile = path.resolve(benchFixturesDir, "opencode-providers.json");
let providersFile = defaultProvidersFile;
if (fs.existsSync(localProvidersFile)) {
    providersFile = localProvidersFile;
}
const providers = loadOpencodeProviders(providersFile);
const model = providers.defaultModel;

process.stderr.write(`Wave G targeted bench: ${tasks.length} tasks × 3 seeds\nModel: ${model}\n\n`);

// Run the "akm" arm only: 3 seeds, 3 runs in parallel.
const runOptions = {
    tasks,
    arms: ["akm"],
    model,
    seedsPerArm: 3,
    budgetTokens: 25000,
    budgetWallMs: 360000,
    parallel: 3,
    opencodeProviders: providers,
};
const report = await runUtility(runOptions);

// Full report goes to stdout as pretty-printed JSON; the human summary goes to stderr.
process.stdout.write(`${JSON.stringify(report, null, 2)}\n`);

// Baseline pass rates keyed by task id; tasks not listed are printed without a delta.
const baselineByTask = {
    "inkwell/configure-scaling": 0.6,
    "inkwell/add-healthcheck-train": 0.4,
    "inkwell/full-config": 0.0,
    "az-cli/storage-account-create": 0.4,
    "docker-homelab/bridge-network": 0.2,
    "docker-homelab/compose-version-upgrade": 0.4,
    "docker-homelab/env-from-file": 0.0,
    "workflow-compliance/feedback-trap-az-tag-list": 0.2,
    "workflow-compliance/repeated-fail-storage-lifecycle-a": 0.0,
};

process.stderr.write(`\n=== RESULTS vs 2026-05-03 BASELINE ===\n`);

// One line per task: id, pass percentage, 5-cell bar, and the change versus baseline.
for (const task of report.tasks ?? []) {
    const passRate = task.akm?.passRate ?? 0;
    const baseline = baselineByTask[task.id] ?? null;

    const filledCells = Math.round(passRate * 5);
    const bar = "█".repeat(filledCells) + "░".repeat(5 - filledCells);

    let deltaStr = "";
    if (baseline !== null) {
        const diff = passRate - baseline;
        if (diff === 0) {
            deltaStr = " (=)";
        } else {
            let arrow = "=";
            if (diff > 0) {
                arrow = "↑";
            } else if (diff < 0) {
                arrow = "↓";
            }
            deltaStr = ` (${arrow}${Math.abs(diff * 100).toFixed(0)}pp)`;
        }
    }

    const label = task.id.padEnd(52);
    const percent = (passRate * 100).toFixed(0).padStart(3);
    process.stderr.write(`${label} ${percent}% ${bar}${deltaStr}\n`);
}

const overallPercent = ((report.aggregateAkm?.passRate ?? 0) * 100).toFixed(1);
process.stderr.write(`\nOverall: ${overallPercent}%\n`);
|