sneakoscope 4.0.11 → 4.0.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. package/README.md +7 -7
  2. package/crates/sks-core/Cargo.lock +1 -1
  3. package/crates/sks-core/Cargo.toml +1 -1
  4. package/crates/sks-core/src/main.rs +1 -1
  5. package/dist/bin/sks.js +1 -1
  6. package/dist/core/commands/glm-command.js +11 -5
  7. package/dist/core/fsx.js +1 -1
  8. package/dist/core/providers/glm/bench/glm-bench-comparison.js +48 -0
  9. package/dist/core/providers/glm/bench/glm-bench-fixture.js +65 -0
  10. package/dist/core/providers/glm/bench/glm-bench-model-lock-proof.js +24 -0
  11. package/dist/core/providers/glm/bench/glm-bench-report.js +75 -0
  12. package/dist/core/providers/glm/bench/glm-benchmark-runner.js +219 -0
  13. package/dist/core/providers/glm/bench/glm-benchmark-types.js +2 -0
  14. package/dist/core/providers/glm/bench/glm-direct-bench-runner.js +73 -0
  15. package/dist/core/providers/glm/naruto/glm-naruto-apply-transaction.js +54 -8
  16. package/dist/core/providers/glm/naruto/glm-naruto-bench.js +4 -118
  17. package/dist/core/providers/glm/naruto/glm-naruto-command.js +20 -3
  18. package/dist/core/providers/glm/naruto/glm-naruto-final-seal.js +75 -0
  19. package/dist/core/providers/glm/naruto/glm-naruto-orchestrator.js +41 -5
  20. package/dist/core/providers/glm/naruto/glm-naruto-targeted-checks.js +76 -0
  21. package/dist/core/providers/glm/naruto/glm-naruto-trace.js +50 -0
  22. package/dist/core/providers/glm/naruto/glm-naruto-worker-pool.js +61 -19
  23. package/dist/core/providers/glm/naruto/glm-naruto-worker-runtime.js +26 -4
  24. package/dist/core/providers/glm/naruto/glm-naruto-worker-scheduler.js +178 -0
  25. package/dist/core/providers/glm/naruto/glm-naruto-worktree-worker.js +34 -4
  26. package/dist/core/version.js +1 -1
  27. package/package.json +1 -1
  28. package/dist/core/providers/glm/glm-bench.js +0 -127
package/README.md CHANGED
@@ -35,15 +35,15 @@ Set up this agent project with Sneakoscope Codex. Use [[mandarange/Sneakoscope-C
35
35
 
36
36
  ## 🚀 Current Release
37
37
 
38
- SKS **4.0.11** makes GLM Naruto operationally measurable: `--worktree` is honest, live bench records real worker/verifier/cache metrics, candidates are scored before merge planning, final apply writes rollback-aware transaction evidence, and early terminal paths write canonical stop-gates. Ordinary direct GLM remains available for single-path edits, while GLM Naruto is the measured parallel runtime.
38
+ SKS **4.0.12** seals GLM Naruto's production runtime path: worktree workers apply extracted unified diffs only, patch workers launch through a bounded adaptive scheduler, live bench compares true direct GLM against Naruto worker counts, final apply runs dirty-tree and targeted-check guards, and stop-gates reference a final seal artifact.
39
39
 
40
- What changed in 4.0.11:
40
+ What changed in 4.0.12:
41
41
 
42
- - **Honest worktree isolation.** `sks --mad --glm --naruto --worktree "<task>"` uses per-worker git worktrees when available or blocks unless `--allow-patch-envelope-fallback` is explicit.
43
- - **Measured live bench.** `--bench --live --no-apply` still runs verifier/scoring and reports TTFT, total latency, verifier pass rate, cache tokens, reasoning tokens, and worker completion/failure counts.
44
- - **Scoreboard-driven merge planning.** `candidate-scoreboard.json` captures gate, verifier, risk, confidence, path, conflict, latency, cache, diversity, and secret-safety components.
45
- - **Rollback-aware apply.** Final mutation is single-threaded and records `apply-transaction.json`, selected combined patch, diff hashes, and rollback evidence.
46
- - **Terminal evidence and artifact safety.** Missing-key, invalid-graph, budget, and no-candidate terminal paths write canonical stop-gates, and GLM Naruto artifacts use key-aware secret audit/redaction.
42
+ - **Extracted worktree patches.** `--worktree` parses `<sks_patch_candidate>` and records candidate/extracted patch hashes before any worker worktree apply.
43
+ - **Adaptive scheduler.** Patch workers use a finite launch queue with provider-health backpressure and retry-once handling for retryable 429/5xx/idle-timeout failures.
44
+ - **True direct-vs-Naruto bench.** `--bench --live --no-apply` compares direct GLM, Naruto 1, 4, 8, and 12 worker cases without fake zero metrics.
45
+ - **Transaction guards.** Final apply blocks dirty touched paths unless `--allow-dirty-apply` is explicit, runs targeted checks, and rolls back on validation failure by default.
46
+ - **Seal artifacts.** GLM Naruto writes `final-seal.json`, stop-gate final-seal evidence, `merge-rationale.md`, and `bench-report.md` for auditability.
47
47
 
48
48
  What changed in 4.0.8:
49
49
 
@@ -76,7 +76,7 @@ dependencies = [
76
76
 
77
77
  [[package]]
78
78
  name = "sks-core"
79
- version = "4.0.11"
79
+ version = "4.0.12"
80
80
  dependencies = [
81
81
  "serde_json",
82
82
  ]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "sks-core"
3
- version = "4.0.11"
3
+ version = "4.0.12"
4
4
  edition = "2021"
5
5
 
6
6
  [dependencies]
@@ -4,7 +4,7 @@ use std::io::{self, Read, Seek, SeekFrom};
4
4
  fn main() {
5
5
  let mut args = std::env::args().skip(1);
6
6
  match args.next().as_deref() {
7
- Some("--version") => println!("sks-rs 4.0.9"),
7
+ Some("--version") => println!("sks-rs 4.0.12"),
8
8
  Some("compact-info") => {
9
9
  let mut input = String::new();
10
10
  let _ = io::stdin().read_to_string(&mut input);
package/dist/bin/sks.js CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env node
2
- const FAST_PACKAGE_VERSION = '4.0.11';
2
+ const FAST_PACKAGE_VERSION = '4.0.13';
3
3
  const args = process.argv.slice(2);
4
4
  try {
5
5
  if (args[0] === '--agent' && args[1] === 'worker') {
@@ -1,5 +1,5 @@
1
1
  import { flag, positionalArgs } from '../../cli/args.js';
2
- import { runGlmBench } from '../providers/glm/glm-bench.js';
2
+ import { runGlmBenchmark } from '../providers/glm/bench/glm-benchmark-runner.js';
3
3
  import { printJson } from '../../cli/output.js';
4
4
  import { runGlmDirectSpeedRun } from '../providers/glm/glm-direct-run.js';
5
5
  import { runGlmReadinessAndExit } from '../providers/glm/glm-readiness.js';
@@ -11,15 +11,21 @@ export async function glmCommand(args = []) {
11
11
  return glmNarutoCommand(narutoArgs);
12
12
  }
13
13
  if (flag(args, '--bench') && !flag(args, '--naruto')) {
14
- const result = await runGlmBench(process.cwd(), args);
14
+ const result = await runGlmBenchmark(process.cwd(), args);
15
15
  if (result.status === 'blocked')
16
16
  process.exitCode = 1;
17
17
  if (flag(args, '--json'))
18
18
  printJson(result);
19
19
  else if (result.status === 'blocked')
20
- console.error(`GLM bench blocked: ${result.warnings.join(', ')}`);
21
- else
22
- console.log(`GLM bench: dry-run p50=${result.summary.speed_p50_total_ms}ms ratio=${result.summary.speed_vs_deep_ratio}`);
20
+ console.error(`GLM benchmark blocked: ${result.warnings.join(', ')}`);
21
+ else if (result.status === 'dry_run')
22
+ console.log(`GLM benchmark: dry-run (use --live for real measurement)`);
23
+ else {
24
+ const direct = result.cases.find((c) => c.implementation_path === 'direct-glm');
25
+ if (direct)
26
+ console.log(` Direct GLM: ${direct.wall_clock_ms}ms`);
27
+ console.log(` Recommendation: ${result.comparison.recommendation}`);
28
+ }
23
29
  return result;
24
30
  }
25
31
  const task = extractGlmTask(args);
package/dist/core/fsx.js CHANGED
@@ -5,7 +5,7 @@ import os from 'node:os';
5
5
  import crypto from 'node:crypto';
6
6
  import { spawn } from 'node:child_process';
7
7
  import { fileURLToPath } from 'node:url';
8
- export const PACKAGE_VERSION = '4.0.11';
8
+ export const PACKAGE_VERSION = '4.0.13';
9
9
  export const DEFAULT_PROCESS_TAIL_BYTES = 256 * 1024;
10
10
  export const DEFAULT_PROCESS_TIMEOUT_MS = 30 * 60 * 1000;
11
11
  export function nowIso() {
@@ -0,0 +1,48 @@
1
/**
 * Compare the direct GLM case against the Naruto cases and derive a recommendation.
 *
 * A direct case counts as successful when `patch_generated` or `patch_gate_passed`
 * is `true`. A Naruto case is eligible when its `gate_pass_rate` is above zero or
 * `merge_success` is `true`; the eligible case with the lowest `wall_clock_ms` wins
 * (the first one wins ties).
 *
 * @param {Array<object>} cases Benchmark case records.
 * @param {{ minSpeedup?: number }} [options] `minSpeedup` is the direct/Naruto
 *   wall-clock ratio Naruto must reach to be recommended (defaults to 1.2).
 * @returns {{
 *   direct_wall_clock_ms: number|null,
 *   best_naruto_wall_clock_ms: number|null,
 *   best_naruto_runner_id: string|null,
 *   naruto_speedup_vs_direct: number|null,
 *   recommendation: 'direct-glm'|'glm-naruto'|'inconclusive',
 *   reason: string
 * }}
 */
export function computeGlmBenchmarkComparison(cases, options = {}) {
    const minSpeedup = Number.isFinite(options.minSpeedup) && options.minSpeedup > 0 ? options.minSpeedup : 1.2;
    const caseList = Array.isArray(cases) ? cases : [];
    const directCase = caseList.find((c) => c.implementation_path === 'direct-glm');
    const narutoCases = caseList.filter((c) => c.implementation_path === 'glm-naruto');
    const directSucceeded = Boolean(directCase && (directCase.patch_generated === true || directCase.patch_gate_passed === true));
    const directWallClockMs = directCase && directSucceeded ? directCase.wall_clock_ms : null;
    const eligibleNaruto = narutoCases.filter((c) => (c.gate_pass_rate !== null && c.gate_pass_rate > 0) || c.merge_success === true);
    let bestNaruto = null;
    for (const naruto of eligibleNaruto) {
        if (!bestNaruto || naruto.wall_clock_ms < bestNaruto.wall_clock_ms) {
            bestNaruto = naruto;
        }
    }
    const bestNarutoWallClockMs = bestNaruto ? bestNaruto.wall_clock_ms : null;
    const bestNarutoRunnerId = bestNaruto ? bestNaruto.runner_id : null;
    // The ratio is only defined when both sides were measured and the divisor is positive.
    let speedup = null;
    if (directWallClockMs !== null && bestNarutoWallClockMs !== null && bestNarutoWallClockMs > 0) {
        speedup = Number((directWallClockMs / bestNarutoWallClockMs).toFixed(3));
    }
    let recommendation = 'inconclusive';
    let reason = 'Insufficient measured data to recommend a path.';
    if (directWallClockMs !== null && bestNarutoWallClockMs === null) {
        recommendation = 'direct-glm';
        reason = 'Direct GLM succeeded and no Naruto case produced gate-passed or merged results.';
    }
    else if (directWallClockMs !== null && bestNarutoWallClockMs !== null && speedup !== null) {
        if (speedup >= minSpeedup) {
            recommendation = 'glm-naruto';
            reason = `GLM Naruto (${bestNarutoRunnerId}) was ${speedup.toFixed(2)}x faster than direct GLM for this task.`;
        }
        else if (speedup < 1) {
            recommendation = 'direct-glm';
            reason = `Direct GLM was faster for this tiny single-file task (speedup ratio ${speedup.toFixed(2)}).`;
        }
        else {
            // Naruto matched or slightly beat direct GLM, so do not claim direct GLM was faster.
            recommendation = 'direct-glm';
            reason = `GLM Naruto (${bestNarutoRunnerId}) was not meaningfully faster than direct GLM (speedup ratio ${speedup.toFixed(2)}, below the ${minSpeedup}x threshold), so direct GLM is recommended.`;
        }
    }
    else if (directWallClockMs === null && bestNarutoWallClockMs !== null) {
        recommendation = 'glm-naruto';
        reason = `GLM Naruto (${bestNarutoRunnerId}) produced results while direct GLM did not complete.`;
    }
    return {
        direct_wall_clock_ms: directWallClockMs,
        best_naruto_wall_clock_ms: bestNarutoWallClockMs,
        best_naruto_runner_id: bestNarutoRunnerId,
        naruto_speedup_vs_direct: speedup,
        recommendation,
        reason
    };
}
48
+ //# sourceMappingURL=glm-bench-comparison.js.map
@@ -0,0 +1,65 @@
1
+ import os from 'node:os';
2
+ import path from 'node:path';
3
+ import fsp from 'node:fs/promises';
4
+ import { spawn } from 'node:child_process';
5
+ export const BENCH_FIXTURE_TASK = 'Change src/bench-target.ts so value is 2. Return the smallest patch only.';
6
+ export const BENCH_FIXTURE_TARGET_FILE = 'src/bench-target.ts';
7
+ export const BENCH_FIXTURE_INITIAL = 'export const value = 1;\n';
8
+ export const BENCH_FIXTURE_EXPECTED = 'export const value = 2;\n';
9
/**
 * Create a throwaway git repository containing the single benchmark target file.
 *
 * The repository is created under `baseDir` (or the OS temp directory when
 * `baseDir` is falsy), seeded with `BENCH_FIXTURE_INITIAL`, and committed once.
 * If any step after the directory is created fails, the directory is removed
 * again before the error is rethrown so failed runs do not leave temp repos behind.
 *
 * @param {string} [baseDir] Parent directory for the fixture.
 * @returns {Promise<object>} Fixture descriptor (`sks.glm-bench-fixture.v1`).
 */
export async function createGlmBenchFixture(baseDir) {
    const fixtureDir = await fsp.mkdtemp(path.join(baseDir || os.tmpdir(), 'sks-glm-bench-fixture-'));
    try {
        await fsp.mkdir(path.join(fixtureDir, 'src'), { recursive: true });
        await fsp.writeFile(path.join(fixtureDir, BENCH_FIXTURE_TARGET_FILE), BENCH_FIXTURE_INITIAL, 'utf8');
        await gitInit(fixtureDir);
        await gitAdd(fixtureDir, '.');
        await gitCommit(fixtureDir, 'bench fixture initial');
    }
    catch (error) {
        // Best-effort cleanup; the original failure is what the caller needs to see.
        await fsp.rm(fixtureDir, { recursive: true, force: true }).catch(() => undefined);
        throw error;
    }
    return {
        schema: 'sks.glm-bench-fixture.v1',
        fixture_dir: fixtureDir,
        task: BENCH_FIXTURE_TASK,
        target_file: BENCH_FIXTURE_TARGET_FILE,
        initial_content: BENCH_FIXTURE_INITIAL,
        expected_content: BENCH_FIXTURE_EXPECTED
    };
}
25
/**
 * Clone an existing fixture repository into a fresh temp directory.
 *
 * The temp directory is removed again if the clone fails, so a failed clone
 * does not leave an empty directory behind.
 *
 * @param {object} source Fixture descriptor whose `fixture_dir` is cloned.
 * @param {string} label Short tag embedded in the temp directory name.
 * @returns {Promise<object>} Copy of `source` with `fixture_dir` pointing at the clone.
 */
export async function cloneFixture(source, label) {
    const cloneDir = await fsp.mkdtemp(path.join(os.tmpdir(), `sks-glm-bench-${label}-`));
    try {
        await gitClone(source.fixture_dir, cloneDir);
    }
    catch (error) {
        await fsp.rm(cloneDir, { recursive: true, force: true }).catch(() => undefined);
        throw error;
    }
    return { ...source, fixture_dir: cloneDir };
}
30
/**
 * Restore a fixture repository to its committed state.
 *
 * Runs `git reset --hard HEAD` followed by `git clean -fdx` in the fixture directory.
 *
 * @param {object} fixture Fixture descriptor with a `fixture_dir`.
 * @returns {Promise<void>}
 */
export async function resetFixture(fixture) {
    const restoreSteps = [
        ['reset', '--hard', 'HEAD'],
        ['clean', '-fdx']
    ];
    for (const step of restoreSteps) {
        await runGit(step, fixture.fixture_dir);
    }
}
34
/**
 * Delete a fixture directory, ignoring any removal failure.
 *
 * @param {object} fixture Fixture descriptor with a `fixture_dir`.
 * @returns {Promise<void>}
 */
export async function cleanupFixture(fixture) {
    const target = fixture.fixture_dir;
    try {
        await fsp.rm(target, { recursive: true, force: true });
    }
    catch {
        // Removal is best-effort; a leftover temp directory is not an error.
    }
}
37
/**
 * Initialise a git repository in `dir` and set a local commit identity.
 *
 * @param {string} dir Directory to initialise.
 * @returns {Promise<void>}
 */
async function gitInit(dir) {
    const setupCommands = [
        ['init', '-q'],
        ['config', 'user.name', 'sks-bench'],
        ['config', 'user.email', 'bench@sks.local']
    ];
    for (const command of setupCommands) {
        await runGit(command, dir);
    }
}
42
/**
 * Stage `file` in the repository at `dir` (`git add <file>`).
 *
 * @param {string} dir Repository directory.
 * @param {string} file Pathspec to stage.
 * @returns {Promise<void>}
 */
async function gitAdd(dir, file) {
    const addArgs = ['add', file];
    await runGit(addArgs, dir);
}
45
/**
 * Commit the staged changes in `dir` quietly with the given message.
 *
 * @param {string} dir Repository directory.
 * @param {string} message Commit message.
 * @returns {Promise<void>}
 */
async function gitCommit(dir, message) {
    const commitArgs = ['commit', '-q', '-m', message];
    await runGit(commitArgs, dir);
}
48
/**
 * Clone the repository at `source` into `dest`.
 *
 * The git process runs with `dest` as its working directory, so `dest` must
 * already exist when this is called.
 *
 * @param {string} source Repository to clone from.
 * @param {string} dest Destination directory.
 * @returns {Promise<void>}
 */
async function gitClone(source, dest) {
    const cloneArgs = ['clone', '-q', source, dest];
    await runGit(cloneArgs, dest);
}
51
/**
 * Run `git` with the given arguments and resolve when it exits with code 0.
 *
 * stdout is discarded rather than piped: nothing reads it, and an unread pipe
 * can stall the child once its buffer fills (for example a verbose
 * `git clean -fdx`). stderr is captured and included in the rejection message.
 *
 * @param {string[]} args Arguments passed to `git`.
 * @param {string} cwd Working directory for the child process.
 * @returns {Promise<void>} Rejects with an `Error` on spawn failure, non-zero
 *   exit, or termination by signal.
 */
function runGit(args, cwd) {
    return new Promise((resolve, reject) => {
        const child = spawn('git', [...args], { cwd, stdio: ['ignore', 'ignore', 'pipe'] });
        let stderr = '';
        child.stderr.on('data', (chunk) => { stderr += String(chunk); });
        child.on('error', reject);
        child.on('close', (code, signal) => {
            if (code === 0) {
                resolve();
                return;
            }
            // `code` is null when the child was killed by a signal.
            const outcome = code === null ? `was terminated by ${signal}` : `exited ${code}`;
            reject(new Error(`git ${args.join(' ')} ${outcome}: ${stderr.trim()}`));
        });
    });
}
+ }
65
+ //# sourceMappingURL=glm-bench-fixture.js.map
@@ -0,0 +1,24 @@
1
+ import { GLM_52_OPENROUTER_MODEL } from '../glm-52-settings.js';
2
/**
 * Build the model-lock proof record for a set of benchmark cases.
 *
 * Each case is checked for two things: its `model` must equal
 * `GLM_52_OPENROUTER_MODEL`, and its `gpt_fallback_allowed` must be exactly
 * `false`. Every violation becomes one entry in `mismatches`; the proof passes
 * when there are none. The top-level `gpt_fallback_allowed`,
 * `fallback_arrays_found` and `openai_key_used` fields are fixed values, not
 * derived from the cases.
 *
 * @param {Array<object>} cases Benchmark case records.
 * @returns {object} Proof record (`sks.glm-bench-model-lock-proof.v1`).
 */
export function buildGlmBenchModelLockProof(cases) {
    const expectedModel = GLM_52_OPENROUTER_MODEL;
    const runnerIds = cases.map((entry) => entry.runner_id);
    const mismatches = cases.flatMap((entry) => {
        const problems = [];
        if (entry.model !== expectedModel) {
            problems.push(`${entry.runner_id}: model is ${entry.model}, expected ${expectedModel}`);
        }
        if (entry.gpt_fallback_allowed !== false) {
            problems.push(`${entry.runner_id}: gpt_fallback_allowed is not false`);
        }
        return problems;
    });
    const passed = mismatches.length === 0;
    return {
        schema: 'sks.glm-bench-model-lock-proof.v1',
        checked_cases: runnerIds,
        model: expectedModel,
        gpt_fallback_allowed: false,
        fallback_arrays_found: 0,
        openai_key_used: false,
        mismatches,
        passed
    };
}
24
+ //# sourceMappingURL=glm-bench-model-lock-proof.js.map
@@ -0,0 +1,75 @@
1
+ import { GLM_52_OPENROUTER_MODEL } from '../glm-52-settings.js';
2
+ import { writeTextAtomic, nowIso } from '../../../fsx.js';
3
+ import path from 'node:path';
4
/**
 * Render the benchmark result as Markdown and write it to `bench-report.md`.
 *
 * The "Best Naruto" line follows `result.comparison.best_naruto_runner_id`, so
 * the report names the same case the recommendation was based on. It no longer
 * picks the fastest Naruto case regardless of whether it produced usable results.
 *
 * @param {string} benchDir Directory that receives `bench-report.md`.
 * @param {object} result Benchmark result (`sks.glm-benchmark-result.v1`).
 * @returns {Promise<string>} Path of the written report.
 */
export async function writeGlmBenchReport(benchDir, result) {
    const reportPath = path.join(benchDir, 'bench-report.md');
    const lines = [];
    lines.push('# GLM Benchmark Report — True Direct vs Naruto', '');
    lines.push(`Generated: ${result.generated_at}`);
    lines.push(`Model: ${GLM_52_OPENROUTER_MODEL}`);
    lines.push(`GPT fallback allowed: false`);
    lines.push(`Status: ${result.status}`);
    lines.push('');
    if (result.fixture) {
        lines.push('## Fixture', '');
        lines.push(`- Task: ${result.fixture.task}`);
        lines.push(`- Target: ${result.fixture.target_file}`);
        lines.push(`- Temp repo: ${result.fixture.fixture_dir}`);
        lines.push('');
    }
    lines.push('## Cases', '');
    lines.push('| Case | Kind | Workers | Wall ms | TTFT p50 | Total p50 | Candidates | Gate pass | Verifier | Merge | Patch gen | Patch gate | Metric |');
    lines.push('| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | --- | --- | --- |');
    for (const c of result.cases) {
        lines.push(formatCaseRow(c));
    }
    lines.push('');
    const direct = result.cases.find((c) => c.implementation_path === 'direct-glm');
    const narutoCases = result.cases.filter((c) => c.implementation_path === 'glm-naruto');
    const bestRunnerId = result.comparison.best_naruto_runner_id ?? null;
    const narutoBest = bestRunnerId === null
        ? undefined
        : narutoCases.find((c) => c.runner_id === bestRunnerId);
    lines.push('## Comparison', '');
    if (direct) {
        lines.push(`- Direct GLM: ${direct.wall_clock_ms}ms`);
    }
    if (narutoBest) {
        lines.push(`- Best Naruto: ${narutoBest.name} at ${narutoBest.wall_clock_ms}ms`);
    }
    else if (narutoCases.length > 0) {
        // Naruto ran, but the comparison selected none of its cases.
        lines.push('- Best Naruto: n/a (no Naruto case was selected by the comparison)');
    }
    lines.push(`- Recommendation: ${result.comparison.recommendation}`);
    lines.push(`- Reason: ${result.comparison.reason}`);
    lines.push('');
    lines.push('## Limitations', '');
    lines.push('- This benchmark uses a tiny single-file task; tiny tasks may favor direct GLM.');
    lines.push('- Multi-file parallelizable tasks may favor GLM Naruto.');
    lines.push('- Missing usage metrics are reported as `unavailable` or `n/a`, never as fake zero.');
    lines.push('- Direct GLM candidate/verifier/merge metrics are `not_applicable`.');
    lines.push('');
    if (result.model_lock_proof) {
        lines.push('## Model Lock Proof', '');
        lines.push(`- Passed: ${result.model_lock_proof.passed}`);
        lines.push(`- Mismatches: ${result.model_lock_proof.mismatches.length}`);
        lines.push('');
    }
    if (result.no_mutation_proof) {
        lines.push('## No Mutation Proof', '');
        lines.push(`- Passed: ${result.no_mutation_proof.passed}`);
        lines.push(`- User CWD unchanged: ${result.no_mutation_proof.user_cwd_unchanged}`);
        lines.push('');
    }
    lines.push(`_Report generated at ${nowIso()}_`, '');
    await writeTextAtomic(reportPath, lines.join('\n'));
    return reportPath;
}
63
/**
 * Format one benchmark case as a Markdown table row for the "Cases" table.
 *
 * Missing values (`null` or `undefined`) are rendered as `unavailable` for
 * latency columns and `n/a` for the other optional columns, so a sparse case
 * record neither prints "undefined" nor throws.
 *
 * @param {object} c Benchmark case record.
 * @returns {string} A single `| ... |` table row.
 */
function formatCaseRow(c) {
    const textOr = (value, fallback) => (value === null || value === undefined ? fallback : String(value));
    const rateOr = (value, fallback) => (typeof value === 'number' && Number.isFinite(value) ? value.toFixed(2) : fallback);
    const ttft = textOr(c.p50_ttft_ms, 'unavailable');
    const total = textOr(c.p50_total_ms, 'unavailable');
    const candidates = textOr(c.candidate_count, 'n/a');
    const gate = rateOr(c.gate_pass_rate, 'n/a');
    const verifier = rateOr(c.verifier_pass_rate, 'n/a');
    const merge = textOr(c.merge_success, 'n/a');
    const patchGen = textOr(c.patch_generated, 'n/a');
    const patchGate = textOr(c.patch_gate_passed, 'n/a');
    const metricLatency = c.metric_status?.latency ?? 'unavailable';
    return `| ${c.name} | ${c.kind} | ${c.workers} | ${c.wall_clock_ms} | ${ttft} | ${total} | ${candidates} | ${gate} | ${verifier} | ${merge} | ${patchGen} | ${patchGate} | ${metricLatency} |`;
}
75
+ //# sourceMappingURL=glm-bench-report.js.map
@@ -0,0 +1,219 @@
1
+ import os from 'node:os';
2
+ import path from 'node:path';
3
+ import fsp from 'node:fs/promises';
4
+ import { spawn } from 'node:child_process';
5
+ import { nowIso, writeJsonAtomic } from '../../../fsx.js';
6
+ import { GLM_52_OPENROUTER_MODEL } from '../glm-52-settings.js';
7
+ import { resolveOpenRouterApiKey } from '../../openrouter/openrouter-secret-store.js';
8
+ import { runGlmNarutoMission } from '../naruto/glm-naruto-orchestrator.js';
9
+ import { summarizeGlmNarutoWorkerMetrics } from '../naruto/glm-naruto-metrics.js';
10
+ import { runGlmDirectSpeedRun } from '../glm-direct-run.js';
11
+ import { createGlmBenchFixture, cloneFixture, resetFixture, cleanupFixture } from './glm-bench-fixture.js';
12
+ import { runGlmDirectBenchCase } from './glm-direct-bench-runner.js';
13
+ import { computeGlmBenchmarkComparison } from './glm-bench-comparison.js';
14
+ import { buildGlmBenchModelLockProof } from './glm-bench-model-lock-proof.js';
15
+ import { writeGlmBenchReport } from './glm-bench-report.js';
16
+ const NARUTO_WORKER_COUNTS = [1, 4, 8, 12];
17
/**
 * Run the direct-GLM vs GLM-Naruto benchmark and write its artifacts.
 *
 * Without `--live` this returns a dry-run result; `--execute` without `--live`
 * and a missing OpenRouter key both return a blocked result. In live mode it
 * creates a temp fixture repository, runs one direct case and one Naruto case
 * per entry in `NARUTO_WORKER_COUNTS` (each on its own clone), then writes
 * `bench-result.json`, `model-lock-proof.json` and the Markdown report under
 * `<root>/.sneakoscope/glm-bench/<benchId>`.
 *
 * Fixture directories are removed in `finally` blocks, so a failing case does
 * not leave temp repositories behind. The no-mutation proof reports the actual
 * before/after `git status` comparison and the actual per-case mutation flags.
 *
 * @param {string} root Project root that receives the benchmark artifacts.
 * @param {string[]} [args] CLI arguments (`--live`, `--execute`, `--apply-temp`).
 * @param {{ runDirect?: Function, runNaruto?: Function }} [deps] Optional runner overrides.
 * @returns {Promise<object>} Benchmark result (`sks.glm-benchmark-result.v1`).
 */
export async function runGlmBenchmark(root, args = [], deps = {}) {
    const live = args.includes('--live');
    const execute = args.includes('--execute');
    const applyTemp = args.includes('--apply-temp');
    const started = Date.now();
    if (execute && !live) {
        return blockedResult(root, ['execute_requires_live_flag']);
    }
    if (!live) {
        return dryRunResult(root, started);
    }
    const key = await resolveOpenRouterApiKey({ env: process.env });
    if (!key.key) {
        return blockedResult(root, ['live_bench_requires_openrouter_key']);
    }
    const userCwd = process.cwd();
    const userCwdBefore = await captureGitStatus(userCwd);
    const benchId = `bench-${nowIso().replace(/[:.]/g, '-')}`;
    const benchDir = path.join(root, '.sneakoscope', 'glm-bench', benchId);
    await fsp.mkdir(benchDir, { recursive: true });
    const sharedFixture = await createGlmBenchFixture();
    const cases = [];
    try {
        // Direct GLM case — does NOT call runGlmNarutoMission
        const directFixture = await cloneFixture(sharedFixture, 'direct');
        try {
            const directCaseDir = path.join(benchDir, 'cases', 'direct-glm-speed');
            const directCase = await runGlmDirectBenchCase({
                root,
                fixture: directFixture,
                apiKey: key.key,
                noApply: true,
                timeoutMs: 120_000,
                sessionId: `sks-bench-direct-${benchId}`,
                caseDir: directCaseDir
            }, deps.runDirect ? { runDirect: deps.runDirect } : {});
            cases.push(directCase);
        }
        finally {
            await cleanupFixture(directFixture);
        }
        // Naruto cases — each calls runGlmNarutoMission with different worker counts
        const runNaruto = deps.runNaruto ?? runGlmNarutoMission;
        for (const workers of NARUTO_WORKER_COUNTS) {
            const narutoFixture = await cloneFixture(sharedFixture, `naruto-${workers}`);
            try {
                const caseDir = path.join(benchDir, 'cases', `glm-naruto-${workers}`);
                await fsp.mkdir(caseDir, { recursive: true });
                const caseStarted = Date.now();
                const narutoResult = await runNaruto({
                    cwd: narutoFixture.fixture_dir,
                    task: sharedFixture.task,
                    args: ['--bench', '--live', '--no-apply'],
                    missionId: `glm-bench-naruto-${workers}-${benchId}`,
                    maxWorkers: workers,
                    noApply: true
                });
                // Stop the clock before reading traces so only the mission itself is timed,
                // matching how the direct case is measured.
                const wallClockMs = Date.now() - caseStarted;
                const traces = await readWorkerTraces(narutoResult.artifact_dir);
                const metrics = summarizeGlmNarutoWorkerMetrics(traces);
                const narutoCase = {
                    schema: 'sks.glm-benchmark-case.v1',
                    name: `GLM Naruto ${workers} worker${workers === 1 ? '' : 's'}`,
                    kind: 'glm-naruto',
                    runner_id: `glm-naruto-${workers}`,
                    implementation_path: 'glm-naruto',
                    workers,
                    model: GLM_52_OPENROUTER_MODEL,
                    gpt_fallback_allowed: false,
                    no_apply: true,
                    mutation_performed: false,
                    wall_clock_ms: wallClockMs,
                    p50_ttft_ms: metrics.p50_ttft_ms,
                    p90_ttft_ms: metrics.p90_ttft_ms,
                    p50_total_ms: metrics.p50_total_ms,
                    p90_total_ms: metrics.p90_total_ms,
                    candidate_count: narutoResult.patch_candidates,
                    gate_pass_rate: narutoResult.patch_candidates ? narutoResult.gate_passed_candidates / narutoResult.patch_candidates : null,
                    // NOTE(review): a non-positive or missing verifier rate is reported as 0 whenever
                    // traces exist — confirm this cannot mask an unavailable metric.
                    verifier_pass_rate: metrics.verifier_pass_rate > 0 ? metrics.verifier_pass_rate : (traces.length > 0 ? 0 : null),
                    merge_success: narutoResult.mergeable_candidates > 0,
                    patch_generated: narutoResult.patch_candidates > 0,
                    patch_gate_passed: narutoResult.gate_passed_candidates > 0,
                    cached_tokens_sum: metrics.cached_tokens_sum,
                    cache_write_tokens_sum: metrics.cache_write_tokens_sum,
                    reasoning_tokens_sum: metrics.reasoning_tokens_sum,
                    metric_status: {
                        latency: metrics.p50_total_ms === null && metrics.p50_ttft_ms === null ? 'unavailable' : 'measured',
                        usage: metrics.cached_tokens_sum === null && metrics.reasoning_tokens_sum === null ? 'unavailable' : 'measured',
                        candidate: 'measured',
                        verifier: 'measured',
                        merge: 'measured'
                    },
                    artifacts: {
                        case_dir: caseDir,
                        trace_path: null,
                        mission_artifact_dir: narutoResult.artifact_dir || null
                    },
                    blockers: narutoResult.blockers,
                    warnings: narutoResult.warnings
                };
                await writeJsonAtomic(path.join(caseDir, 'case-result.json'), narutoCase);
                cases.push(narutoCase);
            }
            finally {
                await cleanupFixture(narutoFixture);
            }
        }
    }
    finally {
        await cleanupFixture(sharedFixture);
    }
    const comparison = computeGlmBenchmarkComparison(cases);
    const modelLockProof = buildGlmBenchModelLockProof(cases);
    const userCwdAfter = await captureGitStatus(userCwd);
    const userCwdUnchanged = userCwdBefore === userCwdAfter;
    const casesReportNoMutation = cases.every((c) => c.mutation_performed === false);
    const noMutationProof = {
        schema: 'sks.glm-bench-no-mutation-proof.v1',
        user_cwd_unchanged: userCwdUnchanged,
        fixture_mutated_only_under_apply_temp: !applyTemp,
        cases_report_no_mutation: casesReportNoMutation,
        passed: userCwdUnchanged && casesReportNoMutation
    };
    const result = {
        schema: 'sks.glm-benchmark-result.v1',
        version: '4.0.13',
        generated_at: nowIso(),
        status: 'live',
        model: GLM_52_OPENROUTER_MODEL,
        gpt_fallback_allowed: false,
        fixture: {
            schema: 'sks.glm-bench-fixture.v1',
            fixture_dir: '(cleaned up)',
            task: sharedFixture.task,
            target_file: sharedFixture.target_file,
            initial_content: sharedFixture.initial_content,
            expected_content: sharedFixture.expected_content
        },
        cases,
        comparison,
        model_lock_proof: modelLockProof,
        no_mutation_proof: noMutationProof,
        warnings: ['live_bench_no_apply_temp_repo']
    };
    await writeJsonAtomic(path.join(benchDir, 'bench-result.json'), result);
    await writeJsonAtomic(path.join(benchDir, 'model-lock-proof.json'), modelLockProof);
    await writeGlmBenchReport(benchDir, result);
    return result;
}
154
/**
 * Build the result returned when the benchmark is invoked without `--live`.
 *
 * No cases are run; the comparison is `inconclusive` and both proofs are `null`.
 * Neither parameter is read by this function.
 *
 * @param {string} root Project root (unused).
 * @param {number} startedMs Start timestamp in milliseconds (unused).
 * @returns {object} Benchmark result with `status: 'dry_run'`.
 */
function dryRunResult(root, startedMs) {
    const emptyComparison = {
        direct_wall_clock_ms: null,
        best_naruto_wall_clock_ms: null,
        best_naruto_runner_id: null,
        naruto_speedup_vs_direct: null,
        recommendation: 'inconclusive',
        reason: 'Dry run — no live API calls made.'
    };
    const payload = {
        schema: 'sks.glm-benchmark-result.v1',
        version: '4.0.13',
        generated_at: nowIso(),
        status: 'dry_run',
        model: GLM_52_OPENROUTER_MODEL,
        gpt_fallback_allowed: false,
        fixture: null,
        cases: [],
        comparison: emptyComparison,
        model_lock_proof: null,
        no_mutation_proof: null,
        warnings: ['dry_run_no_live_api_calls']
    };
    return payload;
}
177
/**
 * Build the result returned when the benchmark cannot run.
 *
 * No cases are run; the comparison is `inconclusive`, both proofs are `null`,
 * and the supplied warnings explain why the run was blocked.
 *
 * @param {string} root Project root (unused).
 * @param {string[]} warnings Blocking reasons, returned as-is in `warnings`.
 * @returns {object} Benchmark result with `status: 'blocked'`.
 */
function blockedResult(root, warnings) {
    const emptyComparison = {
        direct_wall_clock_ms: null,
        best_naruto_wall_clock_ms: null,
        best_naruto_runner_id: null,
        naruto_speedup_vs_direct: null,
        recommendation: 'inconclusive',
        reason: 'Benchmark blocked.'
    };
    const payload = {
        schema: 'sks.glm-benchmark-result.v1',
        version: '4.0.13',
        generated_at: nowIso(),
        status: 'blocked',
        model: GLM_52_OPENROUTER_MODEL,
        gpt_fallback_allowed: false,
        fixture: null,
        cases: [],
        comparison: emptyComparison,
        model_lock_proof: null,
        no_mutation_proof: null,
        warnings
    };
    return payload;
}
200
/**
 * Read `worker-traces.json` from a mission artifact directory.
 *
 * Returns an empty array when no directory is given, when the file is missing
 * or unreadable, when it is not valid JSON, or when the parsed value is not an
 * array. The result is therefore always an array.
 *
 * @param {string|null|undefined} artifactDir Mission artifact directory.
 * @returns {Promise<Array<object>>} Parsed worker traces, or `[]`.
 */
async function readWorkerTraces(artifactDir) {
    if (!artifactDir)
        return [];
    try {
        const parsed = JSON.parse(await fsp.readFile(path.join(artifactDir, 'worker-traces.json'), 'utf8'));
        return Array.isArray(parsed) ? parsed : [];
    }
    catch {
        return [];
    }
}
210
/**
 * Capture the trimmed output of `git status --short` for a directory.
 *
 * Never rejects: a spawn error resolves to an empty string, and the exit code
 * is not inspected, so a failing `git` yields whatever was printed on stdout.
 *
 * @param {string} cwd Directory to run `git status` in.
 * @returns {Promise<string>} Trimmed stdout of the command.
 */
async function captureGitStatus(cwd) {
    return new Promise((resolve) => {
        const pieces = [];
        const gitProcess = spawn('git', ['status', '--short'], { cwd, stdio: ['ignore', 'pipe', 'ignore'] });
        gitProcess.stdout.on('data', (chunk) => {
            pieces.push(String(chunk));
        });
        gitProcess.on('close', () => {
            resolve(pieces.join('').trim());
        });
        gitProcess.on('error', () => {
            resolve('');
        });
    });
}
219
+ //# sourceMappingURL=glm-benchmark-runner.js.map
@@ -0,0 +1,2 @@
1
+ export const GLM_BENCHMARK_VERSION = '4.0.13';
2
+ //# sourceMappingURL=glm-benchmark-types.js.map
@@ -0,0 +1,73 @@
1
+ import path from 'node:path';
2
+ import fsp from 'node:fs/promises';
3
+ import { writeJsonAtomic } from '../../../fsx.js';
4
+ import { GLM_52_OPENROUTER_MODEL } from '../glm-52-settings.js';
5
+ import { runGlmDirectSpeedRun } from '../glm-direct-run.js';
6
/**
 * Run the direct-GLM benchmark case and persist its trace and case record.
 *
 * Calls the direct runner (`deps.runDirect`, defaulting to
 * `runGlmDirectSpeedRun`) against the fixture, times the call, and writes
 * `trace.json` and `case-result.json` into `input.caseDir`. Candidate,
 * verifier, merge and usage metrics are reported as `null` for this path.
 *
 * NOTE(review): `input.apiKey`, `input.noApply`, `input.timeoutMs`,
 * `input.sessionId` and `input.root` are not read here — confirm the direct
 * runner obtains the key and enforces a timeout on its own.
 *
 * @param {object} input Case input; `fixture` and `caseDir` are used.
 * @param {{ runDirect?: Function }} [deps] Optional runner override.
 * @returns {Promise<object>} Case record (`sks.glm-benchmark-case.v1`).
 */
export async function runGlmDirectBenchCase(input, deps = {}) {
    const runDirect = deps.runDirect ?? runGlmDirectSpeedRun;
    const { caseDir, fixture } = input;
    await fsp.mkdir(caseDir, { recursive: true });
    const startedAt = Date.now();
    // NOTE(review): the runner is invoked with `--dry-run` / `dryRun: true` — verify this still
    // performs the live model call that the wall-clock measurement is meant to cover.
    const directResult = await runDirect({
        cwd: fixture.fixture_dir,
        task: fixture.task,
        args: ['--bench', '--live', '--dry-run'],
        dryRun: true
    });
    const elapsedMs = Date.now() - startedAt;
    const okFlag = directResult.ok;
    // A blocked run is also counted as having generated a patch.
    const patchGenerated = Boolean(okFlag) || directResult.status === 'blocked';
    // Tri-state: truthy -> true, exactly false -> false, anything else -> null.
    let patchGatePassed = null;
    if (okFlag) {
        patchGatePassed = true;
    }
    else if (okFlag === false) {
        patchGatePassed = false;
    }
    const tracePath = path.join(caseDir, 'trace.json');
    const trace = {
        schema: 'sks.glm-direct-bench-trace.v1',
        runner_id: 'direct-glm-speed',
        implementation_path: 'direct-glm',
        wall_clock_ms: elapsedMs,
        direct_result: directResult,
        called_naruto: false,
        model: GLM_52_OPENROUTER_MODEL
    };
    await writeJsonAtomic(tracePath, trace);
    const latencyStatus = elapsedMs > 0 ? 'measured' : 'unavailable';
    const result = {
        schema: 'sks.glm-benchmark-case.v1',
        name: 'Direct GLM speed path',
        kind: 'direct-glm',
        runner_id: 'direct-glm-speed',
        implementation_path: 'direct-glm',
        workers: 1,
        model: GLM_52_OPENROUTER_MODEL,
        gpt_fallback_allowed: false,
        no_apply: true,
        mutation_performed: false,
        wall_clock_ms: elapsedMs,
        p50_ttft_ms: null,
        p90_ttft_ms: null,
        p50_total_ms: null,
        p90_total_ms: null,
        candidate_count: null,
        gate_pass_rate: null,
        verifier_pass_rate: null,
        merge_success: null,
        patch_generated: patchGenerated,
        patch_gate_passed: patchGatePassed,
        cached_tokens_sum: null,
        cache_write_tokens_sum: null,
        reasoning_tokens_sum: null,
        metric_status: {
            latency: latencyStatus,
            usage: 'unavailable',
            candidate: 'not_applicable',
            verifier: 'not_applicable',
            merge: 'not_applicable'
        },
        artifacts: {
            case_dir: caseDir,
            trace_path: tracePath,
            mission_artifact_dir: null
        },
        blockers: directResult.blockers,
        warnings: directResult.warnings
    };
    await writeJsonAtomic(path.join(caseDir, 'case-result.json'), result);
    return result;
}
73
+ //# sourceMappingURL=glm-direct-bench-runner.js.map