sneakoscope 4.0.12 → 4.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin/sks.js +1 -1
- package/dist/core/commands/glm-command.js +11 -5
- package/dist/core/fsx.js +1 -1
- package/dist/core/providers/glm/bench/glm-bench-comparison.js +48 -0
- package/dist/core/providers/glm/bench/glm-bench-fixture.js +65 -0
- package/dist/core/providers/glm/bench/glm-bench-model-lock-proof.js +24 -0
- package/dist/core/providers/glm/bench/glm-bench-report.js +75 -0
- package/dist/core/providers/glm/bench/glm-benchmark-runner.js +219 -0
- package/dist/core/providers/glm/bench/glm-benchmark-types.js +2 -0
- package/dist/core/providers/glm/bench/glm-direct-bench-runner.js +73 -0
- package/dist/core/providers/glm/naruto/glm-naruto-bench.js +2 -181
- package/dist/core/providers/glm/naruto/glm-naruto-command.js +14 -3
- package/dist/core/version.js +1 -1
- package/package.json +1 -1
- package/dist/core/providers/glm/glm-bench.js +0 -127
package/dist/bin/sks.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { flag, positionalArgs } from '../../cli/args.js';
|
|
2
|
-
import {
|
|
2
|
+
import { runGlmBenchmark } from '../providers/glm/bench/glm-benchmark-runner.js';
|
|
3
3
|
import { printJson } from '../../cli/output.js';
|
|
4
4
|
import { runGlmDirectSpeedRun } from '../providers/glm/glm-direct-run.js';
|
|
5
5
|
import { runGlmReadinessAndExit } from '../providers/glm/glm-readiness.js';
|
|
@@ -11,15 +11,21 @@ export async function glmCommand(args = []) {
|
|
|
11
11
|
return glmNarutoCommand(narutoArgs);
|
|
12
12
|
}
|
|
13
13
|
if (flag(args, '--bench') && !flag(args, '--naruto')) {
|
|
14
|
-
const result = await
|
|
14
|
+
const result = await runGlmBenchmark(process.cwd(), args);
|
|
15
15
|
if (result.status === 'blocked')
|
|
16
16
|
process.exitCode = 1;
|
|
17
17
|
if (flag(args, '--json'))
|
|
18
18
|
printJson(result);
|
|
19
19
|
else if (result.status === 'blocked')
|
|
20
|
-
console.error(`GLM
|
|
21
|
-
else
|
|
22
|
-
console.log(`GLM
|
|
20
|
+
console.error(`GLM benchmark blocked: ${result.warnings.join(', ')}`);
|
|
21
|
+
else if (result.status === 'dry_run')
|
|
22
|
+
console.log(`GLM benchmark: dry-run (use --live for real measurement)`);
|
|
23
|
+
else {
|
|
24
|
+
const direct = result.cases.find((c) => c.implementation_path === 'direct-glm');
|
|
25
|
+
if (direct)
|
|
26
|
+
console.log(` Direct GLM: ${direct.wall_clock_ms}ms`);
|
|
27
|
+
console.log(` Recommendation: ${result.comparison.recommendation}`);
|
|
28
|
+
}
|
|
23
29
|
return result;
|
|
24
30
|
}
|
|
25
31
|
const task = extractGlmTask(args);
|
package/dist/core/fsx.js
CHANGED
|
@@ -5,7 +5,7 @@ import os from 'node:os';
|
|
|
5
5
|
import crypto from 'node:crypto';
|
|
6
6
|
import { spawn } from 'node:child_process';
|
|
7
7
|
import { fileURLToPath } from 'node:url';
|
|
8
|
-
export const PACKAGE_VERSION = '4.0.12';
|
|
8
|
+
export const PACKAGE_VERSION = '4.0.13';
|
|
9
9
|
export const DEFAULT_PROCESS_TAIL_BYTES = 256 * 1024;
|
|
10
10
|
export const DEFAULT_PROCESS_TIMEOUT_MS = 30 * 60 * 1000;
|
|
11
11
|
export function nowIso() {
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/**
 * Compare the direct-GLM benchmark case with the fastest eligible GLM Naruto case
 * and derive a path recommendation.
 *
 * A direct case counts as successful when `patch_generated` or `patch_gate_passed`
 * is exactly `true`. A Naruto case is eligible when its `gate_pass_rate` is above
 * zero or its `merge_success` is exactly `true`. Cases whose `wall_clock_ms` is not
 * a finite number are treated as unmeasured and ignored.
 *
 * @param {Array<object>} cases Benchmark case records; each is read for
 *   `implementation_path`, `wall_clock_ms`, `runner_id` and the success fields
 *   named above. A non-array value is treated as an empty list.
 * @returns {{direct_wall_clock_ms: (number|null), best_naruto_wall_clock_ms: (number|null),
 *   best_naruto_runner_id: (string|null), naruto_speedup_vs_direct: (number|null),
 *   recommendation: string, reason: string}} `recommendation` is one of
 *   `'direct-glm'`, `'glm-naruto'` or `'inconclusive'`.
 */
export function computeGlmBenchmarkComparison(cases) {
    // Naruto has to beat direct GLM by at least this factor before it is recommended.
    const minNarutoSpeedup = 1.2;
    const caseList = Array.isArray(cases) ? cases : [];
    const isMeasured = (c) => Number.isFinite(c.wall_clock_ms);

    const directCase = caseList.find((c) => c.implementation_path === 'direct-glm');
    const directSucceeded = Boolean(directCase && (directCase.patch_generated === true || directCase.patch_gate_passed === true));
    const directWallClockMs = directSucceeded && isMeasured(directCase) ? directCase.wall_clock_ms : null;

    let bestNaruto = null;
    for (const candidate of caseList) {
        if (candidate.implementation_path !== 'glm-naruto' || !isMeasured(candidate)) {
            continue;
        }
        // `null > 0` and `undefined > 0` are both false, so missing rates are never eligible.
        const eligible = candidate.gate_pass_rate > 0 || candidate.merge_success === true;
        if (!eligible) {
            continue;
        }
        // Strict comparison keeps the earliest case when two cases tie.
        if (bestNaruto === null || candidate.wall_clock_ms < bestNaruto.wall_clock_ms) {
            bestNaruto = candidate;
        }
    }
    const bestNarutoWallClockMs = bestNaruto ? bestNaruto.wall_clock_ms : null;
    const bestNarutoRunnerId = bestNaruto ? bestNaruto.runner_id : null;

    // The ratio is only meaningful when both sides were measured and the divisor is positive.
    const comparable = directWallClockMs !== null && bestNarutoWallClockMs !== null && bestNarutoWallClockMs > 0;
    const speedup = comparable ? Number((directWallClockMs / bestNarutoWallClockMs).toFixed(3)) : null;

    let recommendation = 'inconclusive';
    let reason = 'Insufficient measured data to recommend a path.';
    if (directWallClockMs !== null && bestNarutoWallClockMs === null) {
        recommendation = 'direct-glm';
        reason = 'Direct GLM succeeded and no Naruto case produced gate-passed or merged results.';
    }
    else if (directWallClockMs === null && bestNarutoWallClockMs !== null) {
        recommendation = 'glm-naruto';
        reason = `GLM Naruto (${bestNarutoRunnerId}) produced results while direct GLM did not complete.`;
    }
    else if (speedup !== null) {
        if (speedup >= minNarutoSpeedup) {
            recommendation = 'glm-naruto';
            reason = `GLM Naruto (${bestNarutoRunnerId}) was ${speedup.toFixed(2)}x faster than direct GLM for this task.`;
        }
        else if (directWallClockMs < bestNarutoWallClockMs) {
            recommendation = 'direct-glm';
            reason = `Direct GLM was faster for this tiny single-file task (speedup ratio ${speedup.toFixed(2)}).`;
        }
        else {
            // Naruto was not slower, just not fast enough to justify it; do not claim direct GLM "was faster".
            recommendation = 'direct-glm';
            reason = `GLM Naruto (${bestNarutoRunnerId}) was ${speedup.toFixed(2)}x as fast as direct GLM, below the ${minNarutoSpeedup.toFixed(2)}x needed to recommend it; direct GLM is preferred for this tiny single-file task.`;
        }
    }
    return {
        direct_wall_clock_ms: directWallClockMs,
        best_naruto_wall_clock_ms: bestNarutoWallClockMs,
        best_naruto_runner_id: bestNarutoRunnerId,
        naruto_speedup_vs_direct: speedup,
        recommendation,
        reason
    };
}
|
|
48
|
+
//# sourceMappingURL=glm-bench-comparison.js.map
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import os from 'node:os';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import fsp from 'node:fs/promises';
|
|
4
|
+
import { spawn } from 'node:child_process';
|
|
5
|
+
// Task prompt exposed on the fixture as `task`; it asks for the edit that turns the initial content into the expected content.
export const BENCH_FIXTURE_TASK = 'Change src/bench-target.ts so value is 2. Return the smallest patch only.';
|
|
6
|
+
// Path of the single fixture file, relative to the fixture repository root.
export const BENCH_FIXTURE_TARGET_FILE = 'src/bench-target.ts';
|
|
7
|
+
// Content written to the target file and committed as the fixture's starting state.
export const BENCH_FIXTURE_INITIAL = 'export const value = 1;\n';
|
|
8
|
+
// Content the task asks for; exposed on the fixture as `expected_content`.
export const BENCH_FIXTURE_EXPECTED = 'export const value = 2;\n';
|
|
9
|
+
/**
 * Create a throw-away git repository that contains the single benchmark target
 * file, committed in its initial state.
 *
 * If any step after the temp directory is created fails (for example git is not
 * installed), the directory is removed again before the error is rethrown.
 *
 * @param {string} [baseDir] Parent directory for the fixture; falls back to the
 *   OS temp directory when falsy.
 * @returns {Promise<{schema: string, fixture_dir: string, task: string,
 *   target_file: string, initial_content: string, expected_content: string}>}
 * @throws {Error} When the directory, file or git steps fail.
 */
export async function createGlmBenchFixture(baseDir) {
    const fixtureDir = await fsp.mkdtemp(path.join(baseDir || os.tmpdir(), 'sks-glm-bench-fixture-'));
    try {
        const targetPath = path.join(fixtureDir, BENCH_FIXTURE_TARGET_FILE);
        await fsp.mkdir(path.dirname(targetPath), { recursive: true });
        await fsp.writeFile(targetPath, BENCH_FIXTURE_INITIAL, 'utf8');
        await gitInit(fixtureDir);
        await gitAdd(fixtureDir, '.');
        await gitCommit(fixtureDir, 'bench fixture initial');
    }
    catch (error) {
        // Do not leave a half-built repository behind in the temp directory.
        await fsp.rm(fixtureDir, { recursive: true, force: true }).catch(() => undefined);
        throw error;
    }
    return {
        schema: 'sks.glm-bench-fixture.v1',
        fixture_dir: fixtureDir,
        task: BENCH_FIXTURE_TASK,
        target_file: BENCH_FIXTURE_TARGET_FILE,
        initial_content: BENCH_FIXTURE_INITIAL,
        expected_content: BENCH_FIXTURE_EXPECTED
    };
}
|
|
25
|
+
/**
 * Clone a fixture repository into a fresh temp directory.
 *
 * @param {{fixture_dir: string}} source Fixture to clone; its other fields are
 *   copied onto the returned object unchanged.
 * @param {string} label Tag embedded in the temp directory name. Characters other
 *   than letters, digits, `.`, `_` and `-` are replaced with `_` so the label
 *   cannot introduce path separators.
 * @returns {Promise<object>} A copy of `source` whose `fixture_dir` is the clone.
 * @throws {Error} When the clone fails; the temp directory is removed first.
 */
export async function cloneFixture(source, label) {
    const safeLabel = String(label).replace(/[^A-Za-z0-9._-]/g, '_');
    const cloneDir = await fsp.mkdtemp(path.join(os.tmpdir(), `sks-glm-bench-${safeLabel}-`));
    try {
        await gitClone(source.fixture_dir, cloneDir);
    }
    catch (error) {
        // Do not leak the empty target directory when the clone fails.
        await fsp.rm(cloneDir, { recursive: true, force: true }).catch(() => undefined);
        throw error;
    }
    return { ...source, fixture_dir: cloneDir };
}
|
|
30
|
+
/**
 * Restore a fixture repository to its committed state: hard-reset to HEAD, then
 * remove untracked and ignored files (`git clean -fdx`).
 *
 * @param {{fixture_dir: string}} fixture Fixture whose repository is reset.
 * @returns {Promise<void>}
 */
export async function resetFixture(fixture) {
    const restoreSteps = [
        ['reset', '--hard', 'HEAD'],
        ['clean', '-fdx']
    ];
    // The steps run one after the other; the clean must follow the reset.
    for (const gitArgs of restoreSteps) {
        await runGit(gitArgs, fixture.fixture_dir);
    }
}
|
|
34
|
+
/**
 * Best-effort removal of a fixture directory. Failures of the removal itself are
 * ignored, so a missing directory is not an error.
 *
 * @param {{fixture_dir: string}} fixture Fixture whose directory is deleted.
 * @returns {Promise<void>}
 */
export async function cleanupFixture(fixture) {
    const { fixture_dir: target } = fixture;
    const removal = fsp.rm(target, { recursive: true, force: true });
    await removal.catch(() => undefined);
}
|
|
37
|
+
/**
 * Initialise a git repository in `dir` and give it repository-local settings so
 * the fixture commit does not depend on the user's global git configuration.
 *
 * @param {string} dir Directory to initialise.
 * @returns {Promise<void>}
 */
async function gitInit(dir) {
    await runGit(['init', '-q'], dir);
    const localSettings = [
        ['user.name', 'sks-bench'],
        ['user.email', 'bench@sks.local'],
        // A global commit.gpgsign=true would make the fixture commit try to sign,
        // which can fail or prompt when no key or terminal is available.
        ['commit.gpgsign', 'false']
    ];
    for (const [key, value] of localSettings) {
        await runGit(['config', key, value], dir);
    }
}
|
|
42
|
+
/**
 * Stage `file` in the repository at `dir` by running `git add`.
 *
 * @param {string} dir Repository working directory.
 * @param {string} file Path or pathspec handed to `git add` unchanged.
 * @returns {Promise<void>}
 */
async function gitAdd(dir, file) {
    const stageArgs = ['add', file];
    await runGit(stageArgs, dir);
}
|
|
45
|
+
/**
 * Commit the staged changes in the repository at `dir`, quietly.
 *
 * @param {string} dir Repository working directory.
 * @param {string} message Commit message.
 * @returns {Promise<void>}
 */
async function gitCommit(dir, message) {
    const commitArgs = ['commit', '-q', '-m', message];
    await runGit(commitArgs, dir);
}
|
|
48
|
+
/**
 * Quietly clone the repository at `source` into `dest`.
 *
 * @param {string} source Repository to clone from.
 * @param {string} dest Target directory; it is also used as the working directory
 *   of the git process.
 * @returns {Promise<void>}
 */
async function gitClone(source, dest) {
    const cloneArgs = ['clone', '-q', source, dest];
    await runGit(cloneArgs, dest);
}
|
|
51
|
+
/**
 * Run `git` with the given arguments and settle when the process closes.
 *
 * @param {string[]} args Arguments passed to git.
 * @param {string} cwd Working directory for the git process.
 * @returns {Promise<void>} Resolves when git exits with code 0.
 * @throws {Error} Rejects when git cannot be spawned, exits non-zero, or is
 *   terminated by a signal; the message carries git's trimmed stderr.
 */
function runGit(args, cwd) {
    return new Promise((resolve, reject) => {
        // stdout is discarded rather than piped: nothing reads it here, and an
        // unread pipe can fill up and stall the child process.
        const child = spawn('git', [...args], { cwd, stdio: ['ignore', 'ignore', 'pipe'] });
        let stderr = '';
        child.stderr.on('data', (chunk) => { stderr += String(chunk); });
        child.on('error', reject);
        child.on('close', (code, signal) => {
            if (code === 0) {
                resolve();
                return;
            }
            // `code` is null when the process was ended by a signal.
            const outcome = code === null ? `was terminated by ${signal}` : `exited ${code}`;
            reject(new Error(`git ${args.join(' ')} ${outcome}: ${stderr.trim()}`));
        });
    });
}
|
|
65
|
+
//# sourceMappingURL=glm-bench-fixture.js.map
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import { GLM_52_OPENROUTER_MODEL } from '../glm-52-settings.js';
|
|
2
|
+
/**
 * Check every benchmark case against the locked GLM model and the no-GPT-fallback
 * rule, and summarise the outcome.
 *
 * A case is a mismatch when its `model` differs from GLM_52_OPENROUTER_MODEL or
 * when its `gpt_fallback_allowed` is anything other than exactly `false`.
 * The `gpt_fallback_allowed`, `fallback_arrays_found` and `openai_key_used`
 * fields of the returned proof are fixed values, not derived from `cases`.
 *
 * @param {Array<{runner_id: string, model: string, gpt_fallback_allowed: boolean}>} cases
 * @returns {object} Proof record; `passed` is true when no mismatch was found.
 */
export function buildGlmBenchModelLockProof(cases) {
    const checkedCases = cases.map((entry) => entry.runner_id);
    const mismatches = cases.flatMap((entry) => {
        const problems = [];
        if (entry.model !== GLM_52_OPENROUTER_MODEL) {
            problems.push(`${entry.runner_id}: model is ${entry.model}, expected ${GLM_52_OPENROUTER_MODEL}`);
        }
        if (entry.gpt_fallback_allowed !== false) {
            problems.push(`${entry.runner_id}: gpt_fallback_allowed is not false`);
        }
        return problems;
    });
    const passed = mismatches.length === 0;
    return {
        schema: 'sks.glm-bench-model-lock-proof.v1',
        checked_cases: checkedCases,
        model: GLM_52_OPENROUTER_MODEL,
        gpt_fallback_allowed: false,
        fallback_arrays_found: 0,
        openai_key_used: false,
        mismatches,
        passed
    };
}
|
|
24
|
+
//# sourceMappingURL=glm-bench-model-lock-proof.js.map
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
import { GLM_52_OPENROUTER_MODEL } from '../glm-52-settings.js';
|
|
2
|
+
import { writeTextAtomic, nowIso } from '../../../fsx.js';
|
|
3
|
+
import path from 'node:path';
|
|
4
|
+
/**
 * Render a benchmark result as markdown and write it to `bench-report.md` inside
 * `benchDir`.
 *
 * The "Best Naruto" line follows `result.comparison.best_naruto_runner_id`, so the
 * report names the same case the comparison used. When the comparison names no
 * best runner, the fastest Naruto case is still listed but labelled as not having
 * a gate-passed or merged result.
 *
 * @param {string} benchDir Directory that receives the report.
 * @param {object} result Benchmark result; reads `generated_at`, `status`,
 *   `fixture`, `cases`, `comparison`, `model_lock_proof` and `no_mutation_proof`.
 * @returns {Promise<string>} Path of the written report.
 */
export async function writeGlmBenchReport(benchDir, result) {
    const reportPath = path.join(benchDir, 'bench-report.md');
    const lines = [];
    lines.push('# GLM Benchmark Report — True Direct vs Naruto', '');
    lines.push(`Generated: ${result.generated_at}`);
    lines.push(`Model: ${GLM_52_OPENROUTER_MODEL}`);
    lines.push(`GPT fallback allowed: false`);
    lines.push(`Status: ${result.status}`);
    lines.push('');
    if (result.fixture) {
        lines.push('## Fixture', '');
        lines.push(`- Task: ${result.fixture.task}`);
        lines.push(`- Target: ${result.fixture.target_file}`);
        lines.push(`- Temp repo: ${result.fixture.fixture_dir}`);
        lines.push('');
    }
    lines.push('## Cases', '');
    lines.push('| Case | Kind | Workers | Wall ms | TTFT p50 | Total p50 | Candidates | Gate pass | Verifier | Merge | Patch gen | Patch gate | Metric |');
    lines.push('| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | --- | --- | --- |');
    for (const c of result.cases) {
        lines.push(formatCaseRow(c));
    }
    lines.push('');
    const direct = result.cases.find((c) => c.implementation_path === 'direct-glm');
    const narutoCases = result.cases.filter((c) => c.implementation_path === 'glm-naruto');
    const bestRunnerId = result.comparison.best_naruto_runner_id ?? null;
    const narutoBest = bestRunnerId === null
        ? undefined
        : narutoCases.find((c) => c.runner_id === bestRunnerId);
    // Copy before sorting so the list taken from `result.cases` keeps its order.
    const narutoFastest = [...narutoCases].sort((a, b) => a.wall_clock_ms - b.wall_clock_ms)[0];
    lines.push('## Comparison', '');
    if (direct) {
        lines.push(`- Direct GLM: ${direct.wall_clock_ms}ms`);
    }
    if (narutoBest) {
        lines.push(`- Best Naruto: ${narutoBest.name} at ${narutoBest.wall_clock_ms}ms`);
    }
    else if (narutoFastest) {
        // Being fastest is not the same as being best: this case did not qualify for the comparison.
        lines.push(`- Fastest Naruto (no gate-passed or merged result): ${narutoFastest.name} at ${narutoFastest.wall_clock_ms}ms`);
    }
    lines.push(`- Recommendation: ${result.comparison.recommendation}`);
    lines.push(`- Reason: ${result.comparison.reason}`);
    lines.push('');
    lines.push('## Limitations', '');
    lines.push('- This benchmark uses a tiny single-file task; tiny tasks may favor direct GLM.');
    lines.push('- Multi-file parallelizable tasks may favor GLM Naruto.');
    lines.push('- Missing usage metrics are reported as `unavailable` or `n/a`, never as fake zero.');
    lines.push('- Direct GLM candidate/verifier/merge metrics are `not_applicable`.');
    lines.push('');
    if (result.model_lock_proof) {
        lines.push('## Model Lock Proof', '');
        lines.push(`- Passed: ${result.model_lock_proof.passed}`);
        lines.push(`- Mismatches: ${result.model_lock_proof.mismatches.length}`);
        lines.push('');
    }
    if (result.no_mutation_proof) {
        lines.push('## No Mutation Proof', '');
        lines.push(`- Passed: ${result.no_mutation_proof.passed}`);
        lines.push(`- User CWD unchanged: ${result.no_mutation_proof.user_cwd_unchanged}`);
        lines.push('');
    }
    lines.push(`_Report generated at ${nowIso()}_`, '');
    await writeTextAtomic(reportPath, lines.join('\n'));
    return reportPath;
}
|
|
63
|
+
/**
 * Format one benchmark case as a row of the report's markdown table.
 *
 * Missing values (`null` or `undefined`) are rendered as `unavailable` for the
 * latency columns and the metric-status column, and as `n/a` for the candidate,
 * rate, merge and patch columns. Rates are printed with two decimals.
 *
 * @param {object} c Benchmark case record.
 * @returns {string} A single markdown table row.
 */
function formatCaseRow(c) {
    const textOr = (value, missing) => (value == null ? missing : String(value));
    const rateOr = (value, missing) => (typeof value === 'number' && Number.isFinite(value) ? value.toFixed(2) : missing);
    const cells = [
        c.name,
        c.kind,
        c.workers,
        c.wall_clock_ms,
        textOr(c.p50_ttft_ms, 'unavailable'),
        textOr(c.p50_total_ms, 'unavailable'),
        textOr(c.candidate_count, 'n/a'),
        rateOr(c.gate_pass_rate, 'n/a'),
        rateOr(c.verifier_pass_rate, 'n/a'),
        textOr(c.merge_success, 'n/a'),
        textOr(c.patch_generated, 'n/a'),
        textOr(c.patch_gate_passed, 'n/a'),
        // A case without a metric_status block has no measured latency to report.
        textOr(c.metric_status?.latency, 'unavailable')
    ];
    return `| ${cells.map((cell) => String(cell)).join(' | ')} |`;
}
|
|
75
|
+
//# sourceMappingURL=glm-bench-report.js.map
|
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
import os from 'node:os';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import fsp from 'node:fs/promises';
|
|
4
|
+
import { spawn } from 'node:child_process';
|
|
5
|
+
import { nowIso, writeJsonAtomic } from '../../../fsx.js';
|
|
6
|
+
import { GLM_52_OPENROUTER_MODEL } from '../glm-52-settings.js';
|
|
7
|
+
import { resolveOpenRouterApiKey } from '../../openrouter/openrouter-secret-store.js';
|
|
8
|
+
import { runGlmNarutoMission } from '../naruto/glm-naruto-orchestrator.js';
|
|
9
|
+
import { summarizeGlmNarutoWorkerMetrics } from '../naruto/glm-naruto-metrics.js';
|
|
10
|
+
import { runGlmDirectSpeedRun } from '../glm-direct-run.js';
|
|
11
|
+
import { createGlmBenchFixture, cloneFixture, resetFixture, cleanupFixture } from './glm-bench-fixture.js';
|
|
12
|
+
import { runGlmDirectBenchCase } from './glm-direct-bench-runner.js';
|
|
13
|
+
import { computeGlmBenchmarkComparison } from './glm-bench-comparison.js';
|
|
14
|
+
import { buildGlmBenchModelLockProof } from './glm-bench-model-lock-proof.js';
|
|
15
|
+
import { writeGlmBenchReport } from './glm-bench-report.js';
|
|
16
|
+
// Worker counts benchmarked for GLM Naruto: runGlmBenchmark runs one case per entry and passes it as maxWorkers.
const NARUTO_WORKER_COUNTS = [1, 4, 8, 12];
|
|
17
|
+
/**
 * Run the GLM benchmark: one direct-GLM case plus one GLM Naruto case per entry
 * of NARUTO_WORKER_COUNTS, each against its own clone of a throw-away fixture
 * repository.
 *
 * Mode is decided from `args`:
 *  - `--execute` without `--live` returns a blocked result.
 *  - without `--live` a dry-run result is returned and nothing is executed.
 *  - with `--live` an OpenRouter key is required; all cases run and
 *    `bench-result.json`, `model-lock-proof.json` and the markdown report are
 *    written under `<root>/.sneakoscope/glm-bench/<benchId>/`.
 *
 * Fixture directories are removed even when a case throws.
 *
 * @param {string} root Directory under which the `.sneakoscope` output is written.
 * @param {string[]} [args] Command-line flags.
 * @param {{runDirect?: Function, runNaruto?: Function}} [deps] Optional
 *   replacements for the direct and Naruto runners.
 * @returns {Promise<object>} A `sks.glm-benchmark-result.v1` record.
 */
export async function runGlmBenchmark(root, args = [], deps = {}) {
    const live = args.includes('--live');
    const execute = args.includes('--execute');
    const applyTemp = args.includes('--apply-temp');
    const started = Date.now();
    if (execute && !live) {
        return blockedResult(root, ['execute_requires_live_flag']);
    }
    if (!live) {
        return dryRunResult(root, started);
    }
    const key = await resolveOpenRouterApiKey({ env: process.env });
    if (!key.key) {
        return blockedResult(root, ['live_bench_requires_openrouter_key']);
    }
    const userCwd = process.cwd();
    // The benchmark writes its own artifacts below <root>/.sneakoscope; those
    // entries are left out of the before/after comparison so the proof reflects
    // changes to the user's files only.
    // NOTE(review): assumes git prints status paths relative to the working
    // directory (git's default) — confirm if status.relativePaths is overridden.
    const artifactPrefix = path.relative(userCwd, path.join(root, '.sneakoscope')).split(path.sep).join('/');
    const userCwdBefore = withoutBenchArtifacts(await captureGitStatus(userCwd), artifactPrefix);
    const benchId = `bench-${nowIso().replace(/[:.]/g, '-')}`;
    const benchDir = path.join(root, '.sneakoscope', 'glm-bench', benchId);
    await fsp.mkdir(benchDir, { recursive: true });
    const cases = [];
    // Fixtures that still exist on disk; whatever is left here is removed in `finally`.
    const openFixtures = new Set();
    const track = (fixture) => {
        openFixtures.add(fixture);
        return fixture;
    };
    const release = async (fixture) => {
        openFixtures.delete(fixture);
        await cleanupFixture(fixture);
    };
    let sharedFixture;
    try {
        sharedFixture = track(await createGlmBenchFixture());
        // Direct GLM case — does NOT call runGlmNarutoMission
        const directFixture = track(await cloneFixture(sharedFixture, 'direct'));
        const directCaseDir = path.join(benchDir, 'cases', 'direct-glm-speed');
        const directCase = await runGlmDirectBenchCase({
            root,
            fixture: directFixture,
            apiKey: key.key,
            noApply: true,
            timeoutMs: 120_000,
            sessionId: `sks-bench-direct-${benchId}`,
            caseDir: directCaseDir
        }, deps.runDirect ? { runDirect: deps.runDirect } : {});
        cases.push(directCase);
        await release(directFixture);
        // Naruto cases — each calls runGlmNarutoMission with different worker counts
        for (const workers of NARUTO_WORKER_COUNTS) {
            const narutoFixture = track(await cloneFixture(sharedFixture, `naruto-${workers}`));
            const caseDir = path.join(benchDir, 'cases', `glm-naruto-${workers}`);
            await fsp.mkdir(caseDir, { recursive: true });
            const caseStarted = Date.now();
            const runNaruto = deps.runNaruto ?? runGlmNarutoMission;
            const narutoResult = await runNaruto({
                cwd: narutoFixture.fixture_dir,
                task: sharedFixture.task,
                args: ['--bench', '--live', '--no-apply'],
                missionId: `glm-bench-naruto-${workers}-${benchId}`,
                maxWorkers: workers,
                noApply: true
            });
            const traces = await readWorkerTraces(narutoResult.artifact_dir);
            const metrics = summarizeGlmNarutoWorkerMetrics(traces);
            // Wall clock covers the mission plus reading and summarising its traces.
            const wallClockMs = Date.now() - caseStarted;
            const narutoCase = {
                schema: 'sks.glm-benchmark-case.v1',
                name: `GLM Naruto ${workers} worker${workers === 1 ? '' : 's'}`,
                kind: 'glm-naruto',
                runner_id: `glm-naruto-${workers}`,
                implementation_path: 'glm-naruto',
                workers,
                model: GLM_52_OPENROUTER_MODEL,
                gpt_fallback_allowed: false,
                no_apply: true,
                mutation_performed: false,
                wall_clock_ms: wallClockMs,
                p50_ttft_ms: metrics.p50_ttft_ms,
                p90_ttft_ms: metrics.p90_ttft_ms,
                p50_total_ms: metrics.p50_total_ms,
                p90_total_ms: metrics.p90_total_ms,
                candidate_count: narutoResult.patch_candidates,
                // No candidates means there is no rate to report, not a rate of zero.
                gate_pass_rate: narutoResult.patch_candidates ? narutoResult.gate_passed_candidates / narutoResult.patch_candidates : null,
                verifier_pass_rate: metrics.verifier_pass_rate > 0 ? metrics.verifier_pass_rate : (traces.length > 0 ? 0 : null),
                merge_success: narutoResult.mergeable_candidates > 0,
                patch_generated: narutoResult.patch_candidates > 0,
                patch_gate_passed: narutoResult.gate_passed_candidates > 0,
                cached_tokens_sum: metrics.cached_tokens_sum,
                cache_write_tokens_sum: metrics.cache_write_tokens_sum,
                reasoning_tokens_sum: metrics.reasoning_tokens_sum,
                metric_status: {
                    latency: metrics.p50_total_ms === null && metrics.p50_ttft_ms === null ? 'unavailable' : 'measured',
                    usage: metrics.cached_tokens_sum === null && metrics.reasoning_tokens_sum === null ? 'unavailable' : 'measured',
                    candidate: 'measured',
                    verifier: 'measured',
                    merge: 'measured'
                },
                artifacts: {
                    case_dir: caseDir,
                    trace_path: null,
                    mission_artifact_dir: narutoResult.artifact_dir || null
                },
                blockers: narutoResult.blockers,
                warnings: narutoResult.warnings
            };
            await writeJsonAtomic(path.join(caseDir, 'case-result.json'), narutoCase);
            cases.push(narutoCase);
            await release(narutoFixture);
        }
        await release(sharedFixture);
    }
    finally {
        // Only reached with entries when a step above threw part-way through.
        for (const fixture of [...openFixtures]) {
            await cleanupFixture(fixture);
        }
    }
    const comparison = computeGlmBenchmarkComparison(cases);
    const modelLockProof = buildGlmBenchModelLockProof(cases);
    const userCwdAfter = withoutBenchArtifacts(await captureGitStatus(userCwd), artifactPrefix);
    const userCwdUnchanged = userCwdBefore === userCwdAfter;
    const casesReportNoMutation = cases.every((c) => c.mutation_performed === false);
    const noMutationProof = {
        schema: 'sks.glm-bench-no-mutation-proof.v1',
        // Report what was observed; a proof that is always true proves nothing.
        user_cwd_unchanged: userCwdUnchanged,
        // NOTE(review): this only mirrors the absence of --apply-temp; confirm the intended meaning.
        fixture_mutated_only_under_apply_temp: !applyTemp,
        cases_report_no_mutation: casesReportNoMutation,
        passed: userCwdUnchanged && casesReportNoMutation
    };
    const result = {
        schema: 'sks.glm-benchmark-result.v1',
        version: '4.0.13',
        generated_at: nowIso(),
        status: 'live',
        model: GLM_52_OPENROUTER_MODEL,
        gpt_fallback_allowed: false,
        fixture: {
            schema: 'sks.glm-bench-fixture.v1',
            fixture_dir: '(cleaned up)',
            task: sharedFixture.task,
            target_file: sharedFixture.target_file,
            initial_content: sharedFixture.initial_content,
            expected_content: sharedFixture.expected_content
        },
        cases,
        comparison,
        model_lock_proof: modelLockProof,
        no_mutation_proof: noMutationProof,
        warnings: ['live_bench_no_apply_temp_repo']
    };
    await writeJsonAtomic(path.join(benchDir, 'bench-result.json'), result);
    await writeJsonAtomic(path.join(benchDir, 'model-lock-proof.json'), modelLockProof);
    await writeGlmBenchReport(benchDir, result);
    return result;
}
/**
 * Drop the `git status --short` entries that lie at or below `artifactPrefix`.
 *
 * @param {string} status Output of `git status --short`.
 * @param {string} artifactPrefix Forward-slash path of the benchmark output
 *   directory as it appears in the status output; empty disables filtering.
 * @returns {string} The status text without the benchmark's own entries.
 */
function withoutBenchArtifacts(status, artifactPrefix) {
    if (!artifactPrefix) {
        return status;
    }
    return status
        .split(/\r?\n/)
        .filter((line) => {
            // Strip the one- or two-character status code and an opening quote, leaving the path.
            const entryPath = line.trim().replace(/^\S{1,2}\s+/, '').replace(/^"/, '');
            return entryPath !== artifactPrefix && !entryPath.startsWith(`${artifactPrefix}/`);
        })
        .join('\n');
}
|
|
154
|
+
/**
 * Build a benchmark result that carries no measured cases. Shared by the dry-run
 * and blocked results so both keep the same shape and field order.
 *
 * @param {string} status Result status.
 * @param {string} reason Text stored as `comparison.reason`.
 * @param {string[]} warnings Warning codes stored on the result.
 * @returns {object} A `sks.glm-benchmark-result.v1` record with empty `cases`.
 */
function emptyBenchmarkResult(status, reason, warnings) {
    return {
        schema: 'sks.glm-benchmark-result.v1',
        version: '4.0.13',
        generated_at: nowIso(),
        status,
        model: GLM_52_OPENROUTER_MODEL,
        gpt_fallback_allowed: false,
        fixture: null,
        cases: [],
        comparison: {
            direct_wall_clock_ms: null,
            best_naruto_wall_clock_ms: null,
            best_naruto_runner_id: null,
            naruto_speedup_vs_direct: null,
            recommendation: 'inconclusive',
            reason
        },
        model_lock_proof: null,
        no_mutation_proof: null,
        warnings
    };
}
/**
 * Result returned when the benchmark is invoked without `--live`.
 *
 * @param {string} root Unused; kept for the existing call signature.
 * @param {number} startedMs Unused; kept for the existing call signature.
 * @returns {object} Result with status `dry_run`.
 */
function dryRunResult(root, startedMs) {
    return emptyBenchmarkResult('dry_run', 'Dry run — no live API calls made.', ['dry_run_no_live_api_calls']);
}
/**
 * Result returned when the benchmark cannot run.
 *
 * @param {string} root Unused; kept for the existing call signature.
 * @param {string[]} warnings Codes naming why the benchmark was blocked.
 * @returns {object} Result with status `blocked`.
 */
function blockedResult(root, warnings) {
    return emptyBenchmarkResult('blocked', 'Benchmark blocked.', warnings);
}
|
|
200
|
+
/**
 * Read `worker-traces.json` from a mission artifact directory.
 *
 * Reading is best-effort: a missing directory argument, a missing or unreadable
 * file, invalid JSON, or a payload that is not an array all yield an empty list.
 *
 * @param {string} [artifactDir] Mission artifact directory.
 * @returns {Promise<Array>} The parsed traces, or `[]` when none are available.
 */
async function readWorkerTraces(artifactDir) {
    if (!artifactDir) {
        return [];
    }
    try {
        const parsed = JSON.parse(await fsp.readFile(path.join(artifactDir, 'worker-traces.json'), 'utf8'));
        // Callers treat the result as a list (they read `.length`), so anything else counts as "no traces".
        return Array.isArray(parsed) ? parsed : [];
    }
    catch {
        return [];
    }
}
|
|
210
|
+
/**
 * Capture the trimmed output of `git status --short` for `cwd`.
 *
 * The promise never rejects: when git cannot be spawned it resolves with an
 * empty string, and git's stderr is ignored.
 *
 * @param {string} cwd Directory to inspect.
 * @returns {Promise<string>} Trimmed stdout of the status command.
 */
async function captureGitStatus(cwd) {
    const statusArgs = ['status', '--short'];
    const spawnOptions = { cwd, stdio: ['ignore', 'pipe', 'ignore'] };
    return new Promise((settle) => {
        const proc = spawn('git', statusArgs, spawnOptions);
        const pieces = [];
        proc.stdout.on('data', (chunk) => { pieces.push(String(chunk)); });
        proc.on('error', () => settle(''));
        proc.on('close', () => settle(pieces.join('').trim()));
    });
}
|
|
219
|
+
//# sourceMappingURL=glm-benchmark-runner.js.map
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import path from 'node:path';
|
|
2
|
+
import fsp from 'node:fs/promises';
|
|
3
|
+
import { writeJsonAtomic } from '../../../fsx.js';
|
|
4
|
+
import { GLM_52_OPENROUTER_MODEL } from '../glm-52-settings.js';
|
|
5
|
+
import { runGlmDirectSpeedRun } from '../glm-direct-run.js';
|
|
6
|
+
/**
 * Run the direct-GLM benchmark case against a fixture and record its outcome.
 *
 * The direct runner is called with the fixture directory and task in dry-run
 * mode. A trace (`trace.json`) and the case record (`case-result.json`) are
 * written into `input.caseDir`. Only `input.fixture` and `input.caseDir` are
 * read; other fields on `input` are not used here.
 *
 * @param {{fixture: {fixture_dir: string, task: string}, caseDir: string}} input
 * @param {{runDirect?: Function}} [deps] Optional replacement for the direct runner.
 * @returns {Promise<object>} A `sks.glm-benchmark-case.v1` record.
 */
export async function runGlmDirectBenchCase(input, deps = {}) {
    const invokeDirect = deps.runDirect ?? runGlmDirectSpeedRun;
    const { caseDir } = input;
    await fsp.mkdir(caseDir, { recursive: true });

    const t0 = Date.now();
    const directResult = await invokeDirect({
        cwd: input.fixture.fixture_dir,
        task: input.fixture.task,
        args: ['--bench', '--live', '--dry-run'],
        dryRun: true
    });
    const wallClockMs = Date.now() - t0;

    // Truthy -> true, exactly false -> false, anything else -> null (unknown).
    const triState = (flag) => {
        if (flag) {
            return true;
        }
        return flag === false ? false : null;
    };
    // NOTE(review): a 'blocked' status is counted as "patch generated" — presumably the
    // patch existed but was stopped by a gate; confirm against runGlmDirectSpeedRun.
    const patchGenerated = directResult.ok || directResult.status === 'blocked';
    const patchGatePassed = directResult.ok;

    const runnerId = 'direct-glm-speed';
    const implementationPath = 'direct-glm';
    const tracePath = path.join(caseDir, 'trace.json');
    const trace = {
        schema: 'sks.glm-direct-bench-trace.v1',
        runner_id: runnerId,
        implementation_path: implementationPath,
        wall_clock_ms: wallClockMs,
        direct_result: directResult,
        called_naruto: false,
        model: GLM_52_OPENROUTER_MODEL
    };
    await writeJsonAtomic(tracePath, trace);

    const metricStatus = {
        // A wall clock of 0 ms is reported as unavailable rather than measured.
        latency: wallClockMs > 0 ? 'measured' : 'unavailable',
        usage: 'unavailable',
        candidate: 'not_applicable',
        verifier: 'not_applicable',
        merge: 'not_applicable'
    };
    const artifacts = {
        case_dir: caseDir,
        trace_path: tracePath,
        mission_artifact_dir: null
    };
    const result = {
        schema: 'sks.glm-benchmark-case.v1',
        name: 'Direct GLM speed path',
        kind: 'direct-glm',
        runner_id: runnerId,
        implementation_path: implementationPath,
        workers: 1,
        model: GLM_52_OPENROUTER_MODEL,
        gpt_fallback_allowed: false,
        no_apply: true,
        mutation_performed: false,
        wall_clock_ms: wallClockMs,
        // Per-request latency, candidate, verifier, merge and token figures are not collected on this path.
        p50_ttft_ms: null,
        p90_ttft_ms: null,
        p50_total_ms: null,
        p90_total_ms: null,
        candidate_count: null,
        gate_pass_rate: null,
        verifier_pass_rate: null,
        merge_success: null,
        patch_generated: triState(patchGenerated),
        patch_gate_passed: triState(patchGatePassed),
        cached_tokens_sum: null,
        cache_write_tokens_sum: null,
        reasoning_tokens_sum: null,
        metric_status: metricStatus,
        artifacts,
        blockers: directResult.blockers,
        warnings: directResult.warnings
    };
    await writeJsonAtomic(path.join(caseDir, 'case-result.json'), result);
    return result;
}
|
|
73
|
+
//# sourceMappingURL=glm-direct-bench-runner.js.map
|
|
@@ -1,186 +1,7 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import path from 'node:path';
|
|
3
|
-
import fsp from 'node:fs/promises';
|
|
4
|
-
import os from 'node:os';
|
|
5
|
-
import { GLM_52_OPENROUTER_MODEL } from '../glm-52-settings.js';
|
|
1
|
+
import { runGlmBenchmark } from '../bench/glm-benchmark-runner.js';
|
|
6
2
|
import { runGlmDirectSpeedRun } from '../glm-direct-run.js';
|
|
7
|
-
import { resolveOpenRouterApiKey } from '../../openrouter/openrouter-secret-store.js';
|
|
8
3
|
import { runGlmNarutoMission } from './glm-naruto-orchestrator.js';
|
|
9
|
-
import { summarizeGlmNarutoWorkerMetrics } from './glm-naruto-metrics.js';
|
|
10
4
|
export async function runGlmNarutoBench(root, args = [], deps = {}) {
|
|
11
|
-
|
|
12
|
-
const execute = args.includes('--execute');
|
|
13
|
-
const started = Date.now();
|
|
14
|
-
const runDirect = deps.runDirect ?? runGlmDirectSpeedRun;
|
|
15
|
-
const runNaruto = deps.runNaruto ?? runGlmNarutoMission;
|
|
16
|
-
if (execute && !live) {
|
|
17
|
-
return blocked(root, ['execute_requires_live_flag']);
|
|
18
|
-
}
|
|
19
|
-
if (!live) {
|
|
20
|
-
return {
|
|
21
|
-
schema: 'sks.glm-naruto-bench.v1',
|
|
22
|
-
version: '4.0.12',
|
|
23
|
-
generated_at: nowIso(),
|
|
24
|
-
status: 'dry_run',
|
|
25
|
-
model: GLM_52_OPENROUTER_MODEL,
|
|
26
|
-
gpt_fallback_allowed: false,
|
|
27
|
-
summary: {
|
|
28
|
-
simulated_workers: 12,
|
|
29
|
-
simulated_waves: 3,
|
|
30
|
-
simulated_patch_candidates: 24,
|
|
31
|
-
simulated_gate_passed: 18,
|
|
32
|
-
simulated_mergeable: 12,
|
|
33
|
-
wall_clock_ms: Date.now() - started
|
|
34
|
-
},
|
|
35
|
-
warnings: ['dry_run_no_live_api_calls']
|
|
36
|
-
};
|
|
37
|
-
}
|
|
38
|
-
const key = await resolveOpenRouterApiKey({ env: process.env });
|
|
39
|
-
if (!key.key)
|
|
40
|
-
return blocked(root, ['live_bench_requires_openrouter_key']);
|
|
41
|
-
const fixture = await fsp.mkdtemp(path.join(os.tmpdir(), 'sks-glm-naruto-live-bench-'));
|
|
42
|
-
await fsp.mkdir(path.join(fixture, 'src'), { recursive: true });
|
|
43
|
-
await fsp.writeFile(path.join(fixture, 'src', 'bench-target.ts'), 'export const value = 1;\n', 'utf8');
|
|
44
|
-
const cases = [];
|
|
45
|
-
const directStarted = Date.now();
|
|
46
|
-
const direct = await runDirect({
|
|
47
|
-
cwd: fixture,
|
|
48
|
-
task: 'Change src/bench-target.ts so value is 2. Return the smallest patch only.',
|
|
49
|
-
args: ['--bench', '--live', '--dry-run'],
|
|
50
|
-
dryRun: true
|
|
51
|
-
});
|
|
52
|
-
cases.push(directBenchCase(direct, Date.now() - directStarted));
|
|
53
|
-
for (const workers of [1, 4, 8, 12]) {
|
|
54
|
-
const caseStarted = Date.now();
|
|
55
|
-
const result = await runNaruto({
|
|
56
|
-
cwd: fixture,
|
|
57
|
-
task: 'Change src/bench-target.ts so value is 2. Return the smallest patch only.',
|
|
58
|
-
args: ['--bench', '--live', '--no-apply'],
|
|
59
|
-
missionId: `glm-naruto-live-bench-${workers}-${Date.now()}`,
|
|
60
|
-
maxWorkers: workers,
|
|
61
|
-
noApply: true
|
|
62
|
-
});
|
|
63
|
-
const traces = await readWorkerTraces(result.artifact_dir);
|
|
64
|
-
const metrics = summarizeGlmNarutoWorkerMetrics(traces);
|
|
65
|
-
cases.push({
|
|
66
|
-
name: `GLM Naruto ${workers} worker${workers === 1 ? '' : 's'}`,
|
|
67
|
-
kind: 'glm-naruto',
|
|
68
|
-
workers,
|
|
69
|
-
wall_clock_ms: Date.now() - caseStarted,
|
|
70
|
-
p50_ttft_ms: metrics.p50_ttft_ms,
|
|
71
|
-
p90_ttft_ms: metrics.p90_ttft_ms,
|
|
72
|
-
p50_total_ms: metrics.p50_total_ms,
|
|
73
|
-
p90_total_ms: metrics.p90_total_ms,
|
|
74
|
-
candidate_count: result.patch_candidates,
|
|
75
|
-
gate_pass_rate: result.patch_candidates ? result.gate_passed_candidates / result.patch_candidates : null,
|
|
76
|
-
verifier_pass_rate: metrics.verifier_pass_rate,
|
|
77
|
-
merge_success: result.mergeable_candidates > 0,
|
|
78
|
-
cached_tokens_sum: metrics.cached_tokens_sum,
|
|
79
|
-
cache_write_tokens_sum: metrics.cache_write_tokens_sum,
|
|
80
|
-
reasoning_tokens_sum: metrics.reasoning_tokens_sum,
|
|
81
|
-
metric_status: metrics.p50_total_ms === null && metrics.p50_ttft_ms === null ? 'unavailable' : 'measured',
|
|
82
|
-
workers_completed: metrics.workers_completed,
|
|
83
|
-
workers_failed: metrics.workers_failed
|
|
84
|
-
});
|
|
85
|
-
}
|
|
86
|
-
await writeBenchReport(root, cases).catch(() => undefined);
|
|
87
|
-
return {
|
|
88
|
-
schema: 'sks.glm-naruto-bench.v1',
|
|
89
|
-
version: '4.0.12',
|
|
90
|
-
generated_at: nowIso(),
|
|
91
|
-
status: 'live',
|
|
92
|
-
model: GLM_52_OPENROUTER_MODEL,
|
|
93
|
-
gpt_fallback_allowed: false,
|
|
94
|
-
cases,
|
|
95
|
-
summary: {
|
|
96
|
-
simulated_workers: Math.max(...cases.map((row) => row.workers)),
|
|
97
|
-
simulated_waves: cases.length,
|
|
98
|
-
simulated_patch_candidates: cases.reduce((sum, row) => sum + row.candidate_count, 0),
|
|
99
|
-
simulated_gate_passed: cases.reduce((sum, row) => sum + Math.round(row.candidate_count * (row.gate_pass_rate ?? 0)), 0),
|
|
100
|
-
simulated_mergeable: cases.filter((row) => row.merge_success).length,
|
|
101
|
-
wall_clock_ms: Date.now() - started
|
|
102
|
-
},
|
|
103
|
-
warnings: ['live_bench_no_apply_temp_repo']
|
|
104
|
-
};
|
|
105
|
-
}
|
|
106
|
-
async function writeBenchReport(root, cases) {
|
|
107
|
-
const rows = cases.map((row) => [
|
|
108
|
-
row.name,
|
|
109
|
-
row.kind,
|
|
110
|
-
String(row.workers),
|
|
111
|
-
String(row.wall_clock_ms),
|
|
112
|
-
String(row.p50_ttft_ms ?? 'unavailable'),
|
|
113
|
-
String(row.p90_ttft_ms ?? 'unavailable'),
|
|
114
|
-
String(row.p50_total_ms ?? 'unavailable'),
|
|
115
|
-
String(row.p90_total_ms ?? 'unavailable'),
|
|
116
|
-
String(row.gate_pass_rate ?? 'n/a'),
|
|
117
|
-
String(row.verifier_pass_rate ?? 'n/a'),
|
|
118
|
-
String(row.metric_status)
|
|
119
|
-
]);
|
|
120
|
-
const fastest = [...cases].sort((a, b) => a.wall_clock_ms - b.wall_clock_ms)[0] ?? null;
|
|
121
|
-
const md = [
|
|
122
|
-
'# GLM Naruto Bench Report',
|
|
123
|
-
'',
|
|
124
|
-
`Generated: ${nowIso()}`,
|
|
125
|
-
`Model: ${GLM_52_OPENROUTER_MODEL}`,
|
|
126
|
-
'',
|
|
127
|
-
'| Case | Kind | Workers | Wall ms | TTFT p50 | TTFT p90 | Total p50 | Total p90 | Gate pass | Verifier pass | Metric status |',
|
|
128
|
-
'| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- |',
|
|
129
|
-
...rows.map((row) => `| ${row.join(' | ')} |`),
|
|
130
|
-
'',
|
|
131
|
-
`Fastest wall-clock case: ${fastest ? fastest.name : 'unavailable'}`,
|
|
132
|
-
'Missing usage metrics are reported as `unavailable` or `n/a`, never as fake zero.',
|
|
133
|
-
''
|
|
134
|
-
].join('\n');
|
|
135
|
-
await writeTextAtomic(path.join(root, '.sneakoscope', 'glm-naruto', 'bench-report.md'), md);
|
|
136
|
-
}
|
|
137
|
-
function blocked(root, warnings) {
|
|
138
|
-
return {
|
|
139
|
-
schema: 'sks.glm-naruto-bench.v1',
|
|
140
|
-
version: '4.0.12',
|
|
141
|
-
generated_at: nowIso(),
|
|
142
|
-
status: 'blocked',
|
|
143
|
-
model: GLM_52_OPENROUTER_MODEL,
|
|
144
|
-
gpt_fallback_allowed: false,
|
|
145
|
-
summary: {
|
|
146
|
-
simulated_workers: 0,
|
|
147
|
-
simulated_waves: 0,
|
|
148
|
-
simulated_patch_candidates: 0,
|
|
149
|
-
simulated_gate_passed: 0,
|
|
150
|
-
simulated_mergeable: 0,
|
|
151
|
-
wall_clock_ms: 0
|
|
152
|
-
},
|
|
153
|
-
warnings
|
|
154
|
-
};
|
|
155
|
-
}
|
|
156
|
-
function directBenchCase(result, wallClockMs) {
|
|
157
|
-
return {
|
|
158
|
-
name: 'direct GLM speed path',
|
|
159
|
-
kind: 'direct-glm',
|
|
160
|
-
workers: 1,
|
|
161
|
-
wall_clock_ms: wallClockMs,
|
|
162
|
-
p50_ttft_ms: null,
|
|
163
|
-
p90_ttft_ms: null,
|
|
164
|
-
p50_total_ms: null,
|
|
165
|
-
p90_total_ms: null,
|
|
166
|
-
candidate_count: result.ok ? 1 : 0,
|
|
167
|
-
gate_pass_rate: result.ok ? 1 : null,
|
|
168
|
-
verifier_pass_rate: null,
|
|
169
|
-
merge_success: result.ok,
|
|
170
|
-
cached_tokens_sum: null,
|
|
171
|
-
cache_write_tokens_sum: null,
|
|
172
|
-
reasoning_tokens_sum: null,
|
|
173
|
-
metric_status: 'unavailable'
|
|
174
|
-
};
|
|
175
|
-
}
|
|
176
|
-
async function readWorkerTraces(artifactDir) {
|
|
177
|
-
if (!artifactDir)
|
|
178
|
-
return [];
|
|
179
|
-
try {
|
|
180
|
-
return JSON.parse(await fsp.readFile(path.join(artifactDir, 'worker-traces.json'), 'utf8'));
|
|
181
|
-
}
|
|
182
|
-
catch {
|
|
183
|
-
return [];
|
|
184
|
-
}
|
|
5
|
+
return runGlmBenchmark(root, args, deps);
|
|
185
6
|
}
|
|
186
7
|
//# sourceMappingURL=glm-naruto-bench.js.map
|
package/dist/core/providers/glm/naruto/glm-naruto-command.js
CHANGED
|
@@ -4,13 +4,24 @@ import { runGlmNarutoMission } from './glm-naruto-orchestrator.js';
|
|
|
4
4
|
import { runGlmNarutoBench } from './glm-naruto-bench.js';
|
|
5
5
|
export async function glmNarutoCommand(args = []) {
|
|
6
6
|
if (flag(args, '--bench')) {
|
|
7
|
+
// --compare is an alias; the benchmark always compares direct vs Naruto.
|
|
7
8
|
const result = await runGlmNarutoBench(process.cwd(), args);
|
|
8
9
|
if (flag(args, '--json'))
|
|
9
10
|
printJson(result);
|
|
10
11
|
else if (result.status === 'blocked')
|
|
11
|
-
console.error(`GLM
|
|
12
|
-
else
|
|
13
|
-
console.log(`GLM
|
|
12
|
+
console.error(`GLM benchmark blocked: ${result.warnings.join(', ')}`);
|
|
13
|
+
else if (result.status === 'dry_run')
|
|
14
|
+
console.log(`GLM benchmark: dry-run (use --live for real measurement)`);
|
|
15
|
+
else {
|
|
16
|
+
const direct = result.cases.find((c) => c.implementation_path === 'direct-glm');
|
|
17
|
+
const best = result.comparison.best_naruto_runner_id;
|
|
18
|
+
console.log(`GLM benchmark: ${result.status} (${result.cases.length} cases)`);
|
|
19
|
+
if (direct)
|
|
20
|
+
console.log(` Direct GLM: ${direct.wall_clock_ms}ms`);
|
|
21
|
+
if (result.comparison.best_naruto_wall_clock_ms !== null)
|
|
22
|
+
console.log(` Best Naruto: ${best} at ${result.comparison.best_naruto_wall_clock_ms}ms`);
|
|
23
|
+
console.log(` Recommendation: ${result.comparison.recommendation}`);
|
|
24
|
+
}
|
|
14
25
|
return result;
|
|
15
26
|
}
|
|
16
27
|
const positional = positionalArgs(args).map(String);
|
package/dist/core/version.js
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
export const PACKAGE_VERSION = '4.0.12';
|
|
1
|
+
export const PACKAGE_VERSION = '4.0.13';
|
|
2
2
|
//# sourceMappingURL=version.js.map
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "sneakoscope",
|
|
3
3
|
"displayName": "ㅅㅋㅅ",
|
|
4
|
-
"version": "4.0.12",
|
|
4
|
+
"version": "4.0.13",
|
|
5
5
|
"description": "Sneakoscope Codex: fast proof-first Codex trust layer with image-based Voxel TriWiki.",
|
|
6
6
|
"type": "module",
|
|
7
7
|
"homepage": "https://github.com/mandarange/Sneakoscope-Codex#readme",
|
package/dist/core/providers/glm/glm-bench.js
|
@@ -1,127 +0,0 @@
|
|
|
1
|
-
import path from 'node:path';
|
|
2
|
-
import { nowIso, writeJsonAtomic } from '../../fsx.js';
|
|
3
|
-
import { profileFromConst } from './glm-profile-resolver.js';
|
|
4
|
-
import { createEmptyGlmLatencyTrace, writeGlmLatencyTrace } from './glm-latency-trace.js';
|
|
5
|
-
const SYNTHETIC_CASES = Object.freeze([
|
|
6
|
-
benchCase('small doc edit', 'doc_edit', 420, 980),
|
|
7
|
-
benchCase('small TS function edit', 'small_edit', 460, 1100),
|
|
8
|
-
benchCase('failing test fix from small error', 'test_fix', 520, 1220),
|
|
9
|
-
benchCase('simple config edit', 'config_edit', 390, 930)
|
|
10
|
-
]);
|
|
11
|
-
export async function runGlmBench(root, args = []) {
|
|
12
|
-
const live = args.includes('--live');
|
|
13
|
-
const execute = args.includes('--execute');
|
|
14
|
-
if (execute && !live) {
|
|
15
|
-
const blocked = {
|
|
16
|
-
schema: 'sks.glm-bench-result.v1',
|
|
17
|
-
version: '4.0.9',
|
|
18
|
-
generated_at: nowIso(),
|
|
19
|
-
status: 'blocked',
|
|
20
|
-
dry_run: true,
|
|
21
|
-
cases: [],
|
|
22
|
-
summary: {
|
|
23
|
-
speed_p50_total_ms: 0,
|
|
24
|
-
speed_p90_total_ms: 0,
|
|
25
|
-
speed_p50_ttft_ms: null
|
|
26
|
-
},
|
|
27
|
-
warnings: ['execute_requested_but_live_openrouter_bench_not_implemented']
|
|
28
|
-
};
|
|
29
|
-
await writeJsonAtomic(path.join(root, '.sneakoscope', 'glm', 'bench-blocked.json'), blocked);
|
|
30
|
-
return blocked;
|
|
31
|
-
}
|
|
32
|
-
if (live) {
|
|
33
|
-
const blocked = {
|
|
34
|
-
schema: 'sks.glm-bench-result.v1',
|
|
35
|
-
version: '4.0.9',
|
|
36
|
-
generated_at: nowIso(),
|
|
37
|
-
status: 'blocked',
|
|
38
|
-
dry_run: false,
|
|
39
|
-
cases: [],
|
|
40
|
-
summary: {
|
|
41
|
-
speed_p50_total_ms: 0,
|
|
42
|
-
speed_p90_total_ms: 0,
|
|
43
|
-
speed_p50_ttft_ms: null
|
|
44
|
-
},
|
|
45
|
-
warnings: ['live_openrouter_bench_requires_explicit_network_runner_not_enabled_in_this_build']
|
|
46
|
-
};
|
|
47
|
-
await writeJsonAtomic(path.join(root, '.sneakoscope', 'glm', 'bench-live-blocked.json'), blocked);
|
|
48
|
-
return blocked;
|
|
49
|
-
}
|
|
50
|
-
if (execute) {
|
|
51
|
-
const blocked = {
|
|
52
|
-
schema: 'sks.glm-bench-result.v1',
|
|
53
|
-
version: '4.0.9',
|
|
54
|
-
generated_at: nowIso(),
|
|
55
|
-
status: 'blocked',
|
|
56
|
-
dry_run: true,
|
|
57
|
-
cases: [],
|
|
58
|
-
summary: {
|
|
59
|
-
speed_p50_total_ms: 0,
|
|
60
|
-
speed_p90_total_ms: 0,
|
|
61
|
-
speed_p50_ttft_ms: null
|
|
62
|
-
},
|
|
63
|
-
warnings: ['execute_requested_without_live_flag_uses_no_network_dry_run_policy']
|
|
64
|
-
};
|
|
65
|
-
await writeJsonAtomic(path.join(root, '.sneakoscope', 'glm', 'bench-blocked.json'), blocked);
|
|
66
|
-
return blocked;
|
|
67
|
-
}
|
|
68
|
-
const speedTotals = SYNTHETIC_CASES.map((row) => row.speed.total_ms);
|
|
69
|
-
const deepTotals = SYNTHETIC_CASES.map((row) => row.deep.total_ms);
|
|
70
|
-
const result = {
|
|
71
|
-
schema: 'sks.glm-bench-result.v1',
|
|
72
|
-
version: '4.0.9',
|
|
73
|
-
generated_at: nowIso(),
|
|
74
|
-
status: 'dry_run',
|
|
75
|
-
dry_run: true,
|
|
76
|
-
cases: SYNTHETIC_CASES,
|
|
77
|
-
summary: {
|
|
78
|
-
speed_p50_total_ms: percentile(speedTotals, 50),
|
|
79
|
-
speed_p90_total_ms: percentile(speedTotals, 90),
|
|
80
|
-
speed_p50_ttft_ms: null,
|
|
81
|
-
deep_p50_total_ms: percentile(deepTotals, 50),
|
|
82
|
-
speed_vs_deep_ratio: Number((percentile(speedTotals, 50) / percentile(deepTotals, 50)).toFixed(3))
|
|
83
|
-
},
|
|
84
|
-
warnings: ['synthetic_dry_run_no_network_no_gpt_key_required']
|
|
85
|
-
};
|
|
86
|
-
await writeJsonAtomic(path.join(root, '.sneakoscope', 'glm', 'bench-result.json'), result);
|
|
87
|
-
await writeGlmLatencyTrace(root, {
|
|
88
|
-
...createEmptyGlmLatencyTrace('speed'),
|
|
89
|
-
total_ms: result.summary.speed_p50_total_ms,
|
|
90
|
-
context_estimated_tokens: 16_000,
|
|
91
|
-
request_encode_ms: 1,
|
|
92
|
-
encoded_request_cache_hit: true
|
|
93
|
-
});
|
|
94
|
-
return result;
|
|
95
|
-
}
|
|
96
|
-
function benchCase(name, taskKind, speedMs, deepMs) {
|
|
97
|
-
return {
|
|
98
|
-
name,
|
|
99
|
-
task_kind: taskKind,
|
|
100
|
-
speed: {
|
|
101
|
-
mode: 'speed',
|
|
102
|
-
synthetic: true,
|
|
103
|
-
llm_calls: 1,
|
|
104
|
-
max_tokens: profileFromConst('speed').max_tokens,
|
|
105
|
-
context_target_tokens: 16_000,
|
|
106
|
-
total_ms: speedMs,
|
|
107
|
-
ttft_ms: null
|
|
108
|
-
},
|
|
109
|
-
deep: {
|
|
110
|
-
mode: 'deep',
|
|
111
|
-
synthetic: true,
|
|
112
|
-
llm_calls: 1,
|
|
113
|
-
max_tokens: profileFromConst('deep').max_tokens,
|
|
114
|
-
context_target_tokens: 64_000,
|
|
115
|
-
total_ms: deepMs,
|
|
116
|
-
ttft_ms: null
|
|
117
|
-
}
|
|
118
|
-
};
|
|
119
|
-
}
|
|
120
|
-
function percentile(values, p) {
|
|
121
|
-
const sorted = [...values].sort((a, b) => a - b);
|
|
122
|
-
if (!sorted.length)
|
|
123
|
-
return 0;
|
|
124
|
-
const index = Math.min(sorted.length - 1, Math.max(0, Math.ceil((p / 100) * sorted.length) - 1));
|
|
125
|
-
return sorted[index] || 0;
|
|
126
|
-
}
|
|
127
|
-
//# sourceMappingURL=glm-bench.js.map
|