sneakoscope 4.0.12 → 4.0.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. package/README.md +10 -2
  2. package/crates/sks-core/Cargo.lock +1 -1
  3. package/crates/sks-core/Cargo.toml +1 -1
  4. package/crates/sks-core/src/main.rs +1 -1
  5. package/dist/bin/sks.js +1 -1
  6. package/dist/cli/global-mode-router.js +2 -1
  7. package/dist/core/commands/glm-command.js +11 -5
  8. package/dist/core/commands/mad-sks-command.js +3 -0
  9. package/dist/core/fsx.js +1 -1
  10. package/dist/core/providers/glm/bench/glm-bench-comparison.js +48 -0
  11. package/dist/core/providers/glm/bench/glm-bench-fixture.js +65 -0
  12. package/dist/core/providers/glm/bench/glm-bench-model-lock-proof.js +53 -0
  13. package/dist/core/providers/glm/bench/glm-bench-report.js +75 -0
  14. package/dist/core/providers/glm/bench/glm-benchmark-runner.js +243 -0
  15. package/dist/core/providers/glm/bench/glm-benchmark-types.js +2 -0
  16. package/dist/core/providers/glm/bench/glm-direct-bench-runner.js +73 -0
  17. package/dist/core/providers/glm/naruto/glm-naruto-bench.js +2 -181
  18. package/dist/core/providers/glm/naruto/glm-naruto-command.js +14 -3
  19. package/dist/core/providers/glm/naruto/glm-naruto-critical-path.js +51 -0
  20. package/dist/core/providers/glm/naruto/glm-naruto-final-seal.js +9 -2
  21. package/dist/core/providers/glm/naruto/glm-naruto-orchestrator.js +101 -15
  22. package/dist/core/providers/glm/naruto/glm-naruto-parallelism-summary.js +55 -0
  23. package/dist/core/providers/glm/naruto/glm-naruto-requirement-coverage.js +92 -0
  24. package/dist/core/providers/glm/naruto/glm-naruto-requirement-ledger.js +42 -0
  25. package/dist/core/providers/glm/naruto/glm-naruto-stage-scheduler.js +85 -0
  26. package/dist/core/providers/glm/naruto/glm-naruto-task-size-classifier.js +12 -0
  27. package/dist/core/providers/glm/naruto/glm-naruto-trace.js +4 -0
  28. package/dist/core/providers/glm/naruto/glm-naruto-verifier-output.js +5 -0
  29. package/dist/core/providers/glm/naruto/glm-naruto-worker-pool.js +130 -44
  30. package/dist/core/providers/glm/naruto/glm-naruto-worker-runtime.js +6 -2
  31. package/dist/core/routes/model-mode-router.js +44 -0
  32. package/dist/core/version.js +1 -1
  33. package/package.json +24 -1
  34. package/dist/core/providers/glm/glm-bench.js +0 -127
  35. package/dist/scripts/agent-dynamic-pool-fixture.js +0 -80
  36. package/dist/scripts/agent-native-release-gate.js +0 -274
  37. package/dist/scripts/agent-patch-swarm-gate-lib.js +0 -113
  38. package/dist/scripts/agent-real-codex-patch-envelope-smoke.js +0 -126
  39. package/dist/scripts/agent-route-blackbox-lib.js +0 -132
  40. package/dist/scripts/blackbox-command-import-smoke.js +0 -143
  41. package/dist/scripts/blackbox-global-shim.js +0 -77
  42. package/dist/scripts/blackbox-matrix.js +0 -70
  43. package/dist/scripts/blackbox-npx-one-shot.js +0 -69
  44. package/dist/scripts/blackbox-pack-install.js +0 -174
  45. package/dist/scripts/build-dist.js +0 -64
  46. package/dist/scripts/check-architecture.js +0 -135
  47. package/dist/scripts/check-cli-entrypoint.js +0 -43
  48. package/dist/scripts/check-command-module-budget.js +0 -25
  49. package/dist/scripts/check-dist-runtime.js +0 -100
  50. package/dist/scripts/check-feature-quality.js +0 -53
  51. package/dist/scripts/check-legacy-free.js +0 -66
  52. package/dist/scripts/check-package-boundary.js +0 -108
  53. package/dist/scripts/check-pipeline-budget.js +0 -69
  54. package/dist/scripts/check-pipeline-runtime.js +0 -25
  55. package/dist/scripts/check-publish-tag.js +0 -30
  56. package/dist/scripts/check-route-modularity.js +0 -82
  57. package/dist/scripts/check-runtime-schemas.js +0 -87
  58. package/dist/scripts/check-source-runtime.js +0 -4
  59. package/dist/scripts/check-ts-contracts.js +0 -69
  60. package/dist/scripts/check-ts-suppressions.js +0 -58
  61. package/dist/scripts/clean-dist.js +0 -8
  62. package/dist/scripts/codex-0140-feature-gate-lib.js +0 -14
  63. package/dist/scripts/codex-config-eperm-fixture.js +0 -32
  64. package/dist/scripts/codex-lb-missing-env-regression.js +0 -40
  65. package/dist/scripts/codex-native-runtime-e2e-fixture.js +0 -75
  66. package/dist/scripts/codex-project-config-policy-merge-regression.js +0 -92
  67. package/dist/scripts/core-skill-legacy-promotion-api-audit.js +0 -54
  68. package/dist/scripts/ensure-bin-executable.js +0 -10
  69. package/dist/scripts/fixtures/fake-codex-config-loader.js +0 -51
  70. package/dist/scripts/github-release-body-helper.js +0 -65
  71. package/dist/scripts/gpt-image-2-real-file-smoke.js +0 -448
  72. package/dist/scripts/hooks-no-unsupported-handlers.js +0 -15
  73. package/dist/scripts/hooks-runtime-replay-warning-zero-v2.js +0 -26
  74. package/dist/scripts/hooks-runtime-replay-warning-zero.js +0 -10
  75. package/dist/scripts/hooks-trust-warning-zero.js +0 -14
  76. package/dist/scripts/lib/codex-sdk-gate-lib.js +0 -92
  77. package/dist/scripts/lib/ensure-dist-fresh.js +0 -142
  78. package/dist/scripts/lib/git-worktree-fixture.js +0 -33
  79. package/dist/scripts/lib/mad-sks-actual-executor-check-lib.js +0 -255
  80. package/dist/scripts/lib/native-cli-session-swarm-check-lib.js +0 -79
  81. package/dist/scripts/lib/real-codex-parallel-gate.js +0 -94
  82. package/dist/scripts/lib/real-codex-parallel-proof-fixture.js +0 -55
  83. package/dist/scripts/lib/valid-png-fixture.js +0 -25
  84. package/dist/scripts/mad-sks-live-protected-core-smoke.js +0 -5
  85. package/dist/scripts/naruto-real-local-gpt-final-smoke.js +0 -25
  86. package/dist/scripts/perf-gate.js +0 -39
  87. package/dist/scripts/prepublish-release-check-or-fast.js +0 -121
  88. package/dist/scripts/release-3112-required-gates.js +0 -30
  89. package/dist/scripts/release-3113-required-gates.js +0 -25
  90. package/dist/scripts/release-4000-required-gates.js +0 -36
  91. package/dist/scripts/release-4001-required-gates.js +0 -13
  92. package/dist/scripts/release-4002-required-gates.js +0 -14
  93. package/dist/scripts/release-check-dynamic-execute.js +0 -259
  94. package/dist/scripts/release-check-dynamic.js +0 -107
  95. package/dist/scripts/release-check-stamp.js +0 -261
  96. package/dist/scripts/release-gate-dag-runner.js +0 -56
  97. package/dist/scripts/release-gate-existence-audit.js +0 -111
  98. package/dist/scripts/release-gate-planner.js +0 -34
  99. package/dist/scripts/release-gate-worker.js +0 -10
  100. package/dist/scripts/release-speed-summary.js +0 -67
  101. package/dist/scripts/repo-audit.js +0 -83
  102. package/dist/scripts/rust-smoke.js +0 -5
  103. package/dist/scripts/sizecheck.js +0 -146
  104. package/dist/scripts/sks-1-11-gate-lib.js +0 -78
  105. package/dist/scripts/sks-1-18-gate-lib.js +0 -55
  106. package/dist/scripts/tmux-removal-inventory.js +0 -36
  107. package/dist/scripts/write-build-manifest.js +0 -71
  108. package/dist/scripts/zellij-dashboard-watch.js +0 -41
  109. package/dist/scripts/zellij-right-column-geometry-proof.js +0 -162
package/README.md CHANGED
@@ -35,9 +35,17 @@ Set up this agent project with Sneakoscope Codex. Use [[mandarange/Sneakoscope-C
35
35
 
36
36
  ## 🚀 Current Release
37
37
 
38
- SKS **4.0.12** seals GLM Naruto's production runtime path: worktree workers apply extracted unified diffs only, patch workers launch through a bounded adaptive scheduler, live bench compares true direct GLM against Naruto worker counts, final apply runs dirty-tree and targeted-check guards, and stop-gates reference a final seal artifact.
38
+ SKS **4.0.14** seals GLM Naruto real parallelism while preserving the existing GPT/Codex/MAD `sks --mad` route. GLM mode stays locked to OpenRouter `z-ai/glm-5.2`; non-GLM MAD does not require OpenRouter, does not select GLM, and does not enter the GLM Naruto scheduler.
39
39
 
40
- What changed in 4.0.12:
40
+ What changed in 4.0.14:
41
+
42
+ - **Real stage parallelism evidence.** GLM Naruto records bounded parallel stage timelines, overlap ratios, parallelism summaries, critical-path metrics, and speed diagnosis artifacts.
43
+ - **Parallel gate/verifier/worktree stages.** Candidate gate, worktree materialization, and verifier checks no longer have to run candidate-by-candidate when multiple candidates are available.
44
+ - **Requirement coverage seal.** GLM Naruto writes a requirement ledger and candidate coverage artifacts, and the final seal blocks when required requirements remain uncovered.
45
+ - **MAD route isolation.** `sks --mad` without `--glm` remains the GPT/Codex/MAD route and does not resolve OpenRouter or run GLM-specific benchmark/Naruto code.
46
+ - **Benchmark proof honesty.** GLM benchmark proof now reports request-summary availability separately from case-level model lock checks and fixes the no-mutation proof boolean.
47
+
48
+ What changed in 4.0.13:
41
49
 
42
50
  - **Extracted worktree patches.** `--worktree` parses `<sks_patch_candidate>` and records candidate/extracted patch hashes before any worker worktree apply.
43
51
  - **Adaptive scheduler.** Patch workers use a finite launch queue with provider-health backpressure and retry-once handling for retryable 429/5xx/idle-timeout failures.
@@ -76,7 +76,7 @@ dependencies = [
76
76
 
77
77
  [[package]]
78
78
  name = "sks-core"
79
- version = "4.0.12"
79
+ version = "4.0.14"
80
80
  dependencies = [
81
81
  "serde_json",
82
82
  ]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "sks-core"
3
- version = "4.0.12"
3
+ version = "4.0.14"
4
4
  edition = "2021"
5
5
 
6
6
  [dependencies]
@@ -4,7 +4,7 @@ use std::io::{self, Read, Seek, SeekFrom};
4
4
  fn main() {
5
5
  let mut args = std::env::args().skip(1);
6
6
  match args.next().as_deref() {
7
- Some("--version") => println!("sks-rs 4.0.12"),
7
+ Some("--version") => println!("sks-rs 4.0.14"),
8
8
  Some("compact-info") => {
9
9
  let mut input = String::new();
10
10
  let _ = io::stdin().read_to_string(&mut input);
package/dist/bin/sks.js CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env node
2
- const FAST_PACKAGE_VERSION = '4.0.12';
2
+ const FAST_PACKAGE_VERSION = '4.0.14';
3
3
  const args = process.argv.slice(2);
4
4
  try {
5
5
  if (args[0] === '--agent' && args[1] === 'worker') {
@@ -6,8 +6,9 @@ export function detectGlobalMode(args = []) {
6
6
  const hasGlm = args.includes('--glm');
7
7
  if (hasMad && hasGlm)
8
8
  return { kind: 'mad-glm', args: stripGlobalModeFlags(args) };
9
- if (hasGlm && !hasMad)
9
+ if (hasGlm && !hasMad && String(args[0]).startsWith('-')) {
10
10
  return { kind: 'glm-without-mad', args: stripGlobalModeFlags(args) };
11
+ }
11
12
  return null;
12
13
  }
13
14
  export function stripGlobalModeFlags(args) {
@@ -1,5 +1,5 @@
1
1
  import { flag, positionalArgs } from '../../cli/args.js';
2
- import { runGlmBench } from '../providers/glm/glm-bench.js';
2
+ import { runGlmBenchmark } from '../providers/glm/bench/glm-benchmark-runner.js';
3
3
  import { printJson } from '../../cli/output.js';
4
4
  import { runGlmDirectSpeedRun } from '../providers/glm/glm-direct-run.js';
5
5
  import { runGlmReadinessAndExit } from '../providers/glm/glm-readiness.js';
@@ -11,15 +11,21 @@ export async function glmCommand(args = []) {
11
11
  return glmNarutoCommand(narutoArgs);
12
12
  }
13
13
  if (flag(args, '--bench') && !flag(args, '--naruto')) {
14
- const result = await runGlmBench(process.cwd(), args);
14
+ const result = await runGlmBenchmark(process.cwd(), args);
15
15
  if (result.status === 'blocked')
16
16
  process.exitCode = 1;
17
17
  if (flag(args, '--json'))
18
18
  printJson(result);
19
19
  else if (result.status === 'blocked')
20
- console.error(`GLM bench blocked: ${result.warnings.join(', ')}`);
21
- else
22
- console.log(`GLM bench: dry-run p50=${result.summary.speed_p50_total_ms}ms ratio=${result.summary.speed_vs_deep_ratio}`);
20
+ console.error(`GLM benchmark blocked: ${result.warnings.join(', ')}`);
21
+ else if (result.status === 'dry_run')
22
+ console.log(`GLM benchmark: dry-run (use --live for real measurement)`);
23
+ else {
24
+ const direct = result.cases.find((c) => c.implementation_path === 'direct-glm');
25
+ if (direct)
26
+ console.log(` Direct GLM: ${direct.wall_clock_ms}ms`);
27
+ console.log(` Recommendation: ${result.comparison.recommendation}`);
28
+ }
23
29
  return result;
24
30
  }
25
31
  const task = extractGlmTask(args);
@@ -26,12 +26,15 @@ import { resolveCodexNativeInvocationPlan } from '../codex-native/codex-native-i
26
26
  import { repairZellijForSks } from '../zellij/zellij-self-heal.js';
27
27
  import { buildMadGlmLaunchArtifact, buildMadGlmLaunchProfileNoWrite, resolveMadGlmLaunchKey, writeMadGlmCodexWrapper } from '../providers/glm/glm-mad-launch.js';
28
28
  import { GLM_MAD_MODE } from '../providers/glm/glm-52-settings.js';
29
+ import { assertNonGlmMadRoute } from '../routes/model-mode-router.js';
29
30
  export async function madHighCommand(args = [], deps = {}) {
30
31
  const subcommand = firstSubcommand(args);
31
32
  if (subcommand)
32
33
  return madSksSubcommand(subcommand, args.filter((arg) => String(arg) !== subcommand));
33
34
  const rawArgs = (args || []).map((arg) => String(arg));
34
35
  const glmMadLaunch = isMadGlmLaunch(rawArgs, deps);
36
+ if (!glmMadLaunch)
37
+ assertNonGlmMadRoute(rawArgs.includes('--mad') ? rawArgs : ['--mad', ...rawArgs]);
35
38
  const glmOnlyFlagBlockers = findGlmOnlyMadFlagBlockers(rawArgs, glmMadLaunch);
36
39
  if (glmOnlyFlagBlockers.length) {
37
40
  const result = {
package/dist/core/fsx.js CHANGED
@@ -5,7 +5,7 @@ import os from 'node:os';
5
5
  import crypto from 'node:crypto';
6
6
  import { spawn } from 'node:child_process';
7
7
  import { fileURLToPath } from 'node:url';
8
- export const PACKAGE_VERSION = '4.0.12';
8
+ export const PACKAGE_VERSION = '4.0.14';
9
9
  export const DEFAULT_PROCESS_TAIL_BYTES = 256 * 1024;
10
10
  export const DEFAULT_PROCESS_TIMEOUT_MS = 30 * 60 * 1000;
11
11
  export function nowIso() {
@@ -0,0 +1,48 @@
1
/**
 * Compares the direct GLM benchmark case against the GLM Naruto cases and
 * derives a path recommendation.
 *
 * A direct case counts as successful when it generated a patch or passed the
 * patch gate. A Naruto case is eligible when its gate pass rate is above zero
 * or its merge succeeded. Cases without a finite numeric `wall_clock_ms` are
 * treated as unmeasured so that `undefined`/`NaN` never reaches the result.
 *
 * @param {Array<object>} cases - Benchmark case results; each entry is read for
 *   `implementation_path`, `wall_clock_ms`, `runner_id`, `patch_generated`,
 *   `patch_gate_passed`, `gate_pass_rate` and `merge_success`.
 * @returns {{direct_wall_clock_ms: (number|null), best_naruto_wall_clock_ms: (number|null),
 *   best_naruto_runner_id: *, naruto_speedup_vs_direct: (number|null),
 *   recommendation: string, reason: string}} Comparison summary.
 */
export function computeGlmBenchmarkComparison(cases) {
    // Naruto must beat direct GLM by at least this factor to be recommended.
    const MIN_NARUTO_SPEEDUP = 1.2;
    const measuredMs = (value) => (typeof value === 'number' && Number.isFinite(value) ? value : null);

    const directCase = cases.find((c) => c.implementation_path === 'direct-glm');
    const directSucceeded = Boolean(directCase && (directCase.patch_generated === true || directCase.patch_gate_passed === true));
    const directWallClockMs = directSucceeded ? measuredMs(directCase.wall_clock_ms) : null;

    let bestNaruto = null;
    for (const candidate of cases) {
        if (candidate.implementation_path !== 'glm-naruto') {
            continue;
        }
        const eligible = (candidate.gate_pass_rate !== null && candidate.gate_pass_rate > 0) || candidate.merge_success === true;
        // Skip unmeasured cases: comparing against undefined/NaN is always false,
        // which would otherwise freeze the selection on the first eligible case.
        if (!eligible || measuredMs(candidate.wall_clock_ms) === null) {
            continue;
        }
        if (bestNaruto === null || candidate.wall_clock_ms < bestNaruto.wall_clock_ms) {
            bestNaruto = candidate;
        }
    }
    const bestNarutoWallClockMs = bestNaruto ? bestNaruto.wall_clock_ms : null;
    const bestNarutoRunnerId = bestNaruto ? bestNaruto.runner_id : null;

    let speedup = null;
    if (directWallClockMs !== null && bestNarutoWallClockMs !== null && bestNarutoWallClockMs > 0) {
        speedup = Number((directWallClockMs / bestNarutoWallClockMs).toFixed(3));
    }

    let recommendation = 'inconclusive';
    let reason = 'Insufficient measured data to recommend a path.';
    if (directWallClockMs !== null && bestNarutoWallClockMs === null) {
        recommendation = 'direct-glm';
        reason = 'Direct GLM succeeded and no Naruto case produced gate-passed or merged results.';
    }
    else if (directWallClockMs !== null && bestNarutoWallClockMs !== null && speedup !== null) {
        if (speedup >= MIN_NARUTO_SPEEDUP) {
            recommendation = 'glm-naruto';
            reason = `GLM Naruto (${bestNarutoRunnerId}) was ${speedup.toFixed(2)}x faster than direct GLM for this task.`;
        }
        else if (speedup < 1) {
            recommendation = 'direct-glm';
            reason = `Direct GLM was faster for this tiny single-file task (speedup ratio ${speedup.toFixed(2)}).`;
        }
        else {
            // Naruto was at least as fast, just not by enough to justify it;
            // do not claim direct GLM was faster.
            recommendation = 'direct-glm';
            reason = `GLM Naruto (${bestNarutoRunnerId}) was not meaningfully faster than direct GLM (speedup ratio ${speedup.toFixed(2)}, below the ${MIN_NARUTO_SPEEDUP}x threshold).`;
        }
    }
    else if (directWallClockMs === null && bestNarutoWallClockMs !== null) {
        recommendation = 'glm-naruto';
        reason = `GLM Naruto (${bestNarutoRunnerId}) produced results while direct GLM did not complete.`;
    }

    return {
        direct_wall_clock_ms: directWallClockMs,
        best_naruto_wall_clock_ms: bestNarutoWallClockMs,
        best_naruto_runner_id: bestNarutoRunnerId,
        naruto_speedup_vs_direct: speedup,
        recommendation,
        reason
    };
}
48
+ //# sourceMappingURL=glm-bench-comparison.js.map
@@ -0,0 +1,65 @@
1
+ import os from 'node:os';
2
+ import path from 'node:path';
3
+ import fsp from 'node:fs/promises';
4
+ import { spawn } from 'node:child_process';
5
/** Repo-relative path of the file that the benchmark fixture creates. */
export const BENCH_FIXTURE_TARGET_FILE = 'src/bench-target.ts';
/** Content written to the target file when the fixture repo is created. */
export const BENCH_FIXTURE_INITIAL = 'export const value = 1;\n';
/** Content reported as `expected_content` in the fixture descriptor. */
export const BENCH_FIXTURE_EXPECTED = 'export const value = 2;\n';
/** Task text reported as `task` in the fixture descriptor. */
export const BENCH_FIXTURE_TASK = 'Change src/bench-target.ts so value is 2. Return the smallest patch only.';
9
/**
 * Creates a throwaway git repository containing the single benchmark target file.
 *
 * The repo is created in a fresh temp directory under `baseDir` (or the OS temp
 * directory when `baseDir` is falsy), the target file is written with its
 * initial content, and one commit is recorded.
 *
 * @param {string} [baseDir] - Parent directory for the fixture; defaults to `os.tmpdir()`.
 * @returns {Promise<object>} Fixture descriptor (`schema`, `fixture_dir`, `task`,
 *   `target_file`, `initial_content`, `expected_content`).
 * @throws Rejects with the underlying error when a filesystem or git step fails;
 *   the partially created directory is removed first.
 */
export async function createGlmBenchFixture(baseDir) {
    const fixtureDir = await fsp.mkdtemp(path.join(baseDir || os.tmpdir(), 'sks-glm-bench-fixture-'));
    try {
        await fsp.mkdir(path.join(fixtureDir, 'src'), { recursive: true });
        await fsp.writeFile(path.join(fixtureDir, BENCH_FIXTURE_TARGET_FILE), BENCH_FIXTURE_INITIAL, 'utf8');
        await gitInit(fixtureDir);
        await gitAdd(fixtureDir, '.');
        await gitCommit(fixtureDir, 'bench fixture initial');
    }
    catch (err) {
        // The caller never receives the path on failure, so nobody else can
        // clean it up. Removal is best-effort; the original error is what matters.
        await fsp.rm(fixtureDir, { recursive: true, force: true }).catch(() => undefined);
        throw err;
    }
    return {
        schema: 'sks.glm-bench-fixture.v1',
        fixture_dir: fixtureDir,
        task: BENCH_FIXTURE_TASK,
        target_file: BENCH_FIXTURE_TARGET_FILE,
        initial_content: BENCH_FIXTURE_INITIAL,
        expected_content: BENCH_FIXTURE_EXPECTED
    };
}
25
/**
 * Clones a fixture repository into a fresh temp directory.
 *
 * @param {object} source - Fixture descriptor whose `fixture_dir` is the repo to clone.
 * @param {string} label - Text embedded in the temp directory name.
 * @returns {Promise<object>} Copy of `source` with `fixture_dir` pointing at the clone.
 * @throws Rejects with the underlying error when cloning fails; the empty
 *   destination directory is removed first.
 */
export async function cloneFixture(source, label) {
    const cloneDir = await fsp.mkdtemp(path.join(os.tmpdir(), `sks-glm-bench-${label}-`));
    try {
        await gitClone(source.fixture_dir, cloneDir);
    }
    catch (err) {
        // The caller never learns the path on failure; remove it (best-effort).
        await fsp.rm(cloneDir, { recursive: true, force: true }).catch(() => undefined);
        throw err;
    }
    return { ...source, fixture_dir: cloneDir };
}
30
/**
 * Restores a fixture repository to its committed state: hard-resets to HEAD,
 * then removes untracked and ignored files.
 *
 * @param {object} fixture - Fixture descriptor; `fixture_dir` is the repo to reset.
 * @returns {Promise<void>}
 */
export async function resetFixture(fixture) {
    const commands = [
        ['reset', '--hard', 'HEAD'],
        ['clean', '-fdx'],
    ];
    // Run strictly in order: the clean must follow the reset.
    for (const argv of commands) {
        await runGit(argv, fixture.fixture_dir);
    }
}
34
/**
 * Deletes a fixture directory. Removal is best-effort: any failure from the
 * delete itself is swallowed.
 *
 * @param {object} fixture - Fixture descriptor; `fixture_dir` is removed recursively.
 * @returns {Promise<void>}
 */
export async function cleanupFixture(fixture) {
    const target = fixture.fixture_dir;
    try {
        await fsp.rm(target, { recursive: true, force: true });
    }
    catch {
        // Deliberately ignored: cleanup must never fail the caller.
    }
}
37
/**
 * Initializes a git repository in `dir` and sets a repo-local commit identity
 * (`sks-bench` / `bench@sks.local`).
 *
 * @param {string} dir - Directory to initialize.
 * @returns {Promise<void>}
 */
async function gitInit(dir) {
    const setupSteps = [
        ['init', '-q'],
        ['config', 'user.name', 'sks-bench'],
        ['config', 'user.email', 'bench@sks.local'],
    ];
    // Sequential on purpose: the config calls need the repo to exist.
    for (const argv of setupSteps) {
        await runGit(argv, dir);
    }
}
42
/**
 * Stages `file` in the repository at `dir` via `git add`.
 *
 * @param {string} dir - Repository working directory.
 * @param {string} file - Pathspec handed to `git add`.
 * @returns {Promise<void>}
 */
async function gitAdd(dir, file) {
    const argv = ['add', file];
    await runGit(argv, dir);
}
45
/**
 * Records a quiet commit with the given message in the repository at `dir`.
 *
 * @param {string} dir - Repository working directory.
 * @param {string} message - Commit message.
 * @returns {Promise<void>}
 */
async function gitCommit(dir, message) {
    const argv = ['commit', '-q', '-m', message];
    await runGit(argv, dir);
}
48
/**
 * Quietly clones the repository at `source` into `dest`.
 *
 * The git process is started with `dest` as its working directory.
 *
 * @param {string} source - Repository to clone from.
 * @param {string} dest - Clone destination, also used as the process cwd.
 * @returns {Promise<void>}
 */
async function gitClone(source, dest) {
    const argv = ['clone', '-q', source, dest];
    await runGit(argv, dest);
}
51
/**
 * Runs `git` with the given arguments and resolves once it exits successfully.
 *
 * @param {string[]} args - Arguments passed to the `git` executable.
 * @param {string} cwd - Working directory for the git process.
 * @returns {Promise<void>} Resolves on exit code 0.
 * @throws Rejects with an Error carrying the exit code and trimmed stderr on a
 *   non-zero exit, or with the spawn error when the process cannot be started.
 */
function runGit(args, cwd) {
    return new Promise((resolve, reject) => {
        // stdout is never read here, so it must not be a pipe: an unread pipe
        // stalls the child once its buffer fills (e.g. a verbose `git clean`).
        const child = spawn('git', args, { cwd, stdio: ['ignore', 'ignore', 'pipe'] });
        const stderrChunks = [];
        child.stderr.on('data', (chunk) => { stderrChunks.push(String(chunk)); });
        child.on('error', reject);
        child.on('close', (code) => {
            if (code === 0) {
                resolve();
                return;
            }
            reject(new Error(`git ${args.join(' ')} exited ${code}: ${stderrChunks.join('').trim()}`));
        });
    });
}
65
+ //# sourceMappingURL=glm-bench-fixture.js.map
@@ -0,0 +1,53 @@
1
+ import { GLM_52_OPENROUTER_MODEL } from '../glm-52-settings.js';
2
/**
 * Builds the model-lock proof for a benchmark run.
 *
 * Every case must report the locked GLM model and `gpt_fallback_allowed === false`.
 * Optional request summaries are additionally scanned for a differing model,
 * fallback model lists, OpenAI key usage, and a non-false fallback flag.
 *
 * @param {Array<object>} cases - Benchmark case results (`runner_id`, `model`,
 *   `gpt_fallback_allowed` are read).
 * @param {object} [proofInput] - Optional evidence: `requestSummaries` (array)
 *   and `directTraceChecked` (boolean).
 * @returns {object} Proof record; `passed` is true when no mismatch was found.
 */
export function buildGlmBenchModelLockProof(cases, proofInput = {}) {
    const summaries = proofInput.requestSummaries ?? [];
    const caseIds = cases.map((entry) => entry.runner_id);
    const problems = [];
    const summaryLabel = (entry) => `request-summary:${String(entry.worker_id ?? entry.runner_id ?? 'unknown')}`;

    // Case-level checks.
    cases.forEach((entry) => {
        if (entry.model !== GLM_52_OPENROUTER_MODEL) {
            problems.push(`${entry.runner_id}: model is ${entry.model}, expected ${GLM_52_OPENROUTER_MODEL}`);
        }
        if (entry.gpt_fallback_allowed !== false) {
            problems.push(`${entry.runner_id}: gpt_fallback_allowed is not false`);
        }
    });

    // Request-summary checks.
    let summariesWithFallbacks = 0;
    let sawOpenaiKey = false;
    summaries.forEach((entry) => {
        const hasModel = entry.model !== undefined;
        if (hasModel && entry.model !== GLM_52_OPENROUTER_MODEL) {
            problems.push(`${summaryLabel(entry)}: model is ${String(entry.model)}`);
        }
        // An explicit numeric count wins; otherwise fall back to the length of `models`.
        let fallbackCount;
        if (typeof entry.fallback_models_count === 'number') {
            fallbackCount = entry.fallback_models_count;
        }
        else {
            fallbackCount = Array.isArray(entry.models) ? entry.models.length : 0;
        }
        if (fallbackCount > 0) {
            summariesWithFallbacks += 1;
        }
        if (entry.openai_key_used === true || entry.authorization_source === 'openai') {
            sawOpenaiKey = true;
        }
        const fallbackFlag = entry.gpt_fallback_allowed;
        if (!(fallbackFlag === undefined || fallbackFlag === false)) {
            problems.push(`${summaryLabel(entry)}: gpt_fallback_allowed is not false`);
        }
    });

    if (summariesWithFallbacks > 0) {
        problems.push(`fallback_arrays_found:${summariesWithFallbacks}`);
    }
    if (sawOpenaiKey) {
        problems.push('openai_key_used');
    }

    const scanStatus = summaries.length > 0 ? 'checked' : 'unavailable';
    const narutoSummaryCount = summaries.filter((entry) => String(entry.worker_id ?? '').startsWith('worker-')).length;
    return {
        schema: 'sks.glm-bench-model-lock-proof.v1',
        checked_cases: caseIds,
        model: GLM_52_OPENROUTER_MODEL,
        gpt_fallback_allowed: false,
        request_summary_status: scanStatus,
        request_summaries_checked: summaries.length,
        request_summaries_unavailable: Math.max(0, cases.length - summaries.length),
        naruto_request_summaries_checked: narutoSummaryCount,
        direct_trace_checked: proofInput.directTraceChecked === true,
        fallback_arrays_found: summariesWithFallbacks,
        openai_key_used: sawOpenaiKey,
        fallback_array_scan: scanStatus,
        openai_key_usage_scan: scanStatus,
        mismatches: problems,
        passed: problems.length === 0
    };
}
53
+ //# sourceMappingURL=glm-bench-model-lock-proof.js.map
@@ -0,0 +1,75 @@
1
+ import { GLM_52_OPENROUTER_MODEL } from '../glm-52-settings.js';
2
+ import { writeTextAtomic, nowIso } from '../../../fsx.js';
3
+ import path from 'node:path';
4
/**
 * Renders the benchmark result as Markdown and writes it to `bench-report.md`
 * inside `benchDir`.
 *
 * The "Best Naruto" line uses the same eligibility rule as the benchmark
 * comparison (gate pass rate above zero, or a successful merge), so it cannot
 * contradict the recommendation and reason printed below it. When no Naruto
 * case qualifies, the fastest attempt is shown but labelled as such.
 *
 * @param {string} benchDir - Directory that receives the report file.
 * @param {object} result - Benchmark result; reads `generated_at`, `status`,
 *   `cases`, `comparison`, and the optional `fixture`, `model_lock_proof`
 *   and `no_mutation_proof` sections.
 * @returns {Promise<string>} Path of the written report.
 */
export async function writeGlmBenchReport(benchDir, result) {
    const reportPath = path.join(benchDir, 'bench-report.md');
    const lines = [];
    lines.push('# GLM Benchmark Report — True Direct vs Naruto', '');
    lines.push(`Generated: ${result.generated_at}`);
    lines.push(`Model: ${GLM_52_OPENROUTER_MODEL}`);
    lines.push(`GPT fallback allowed: false`);
    lines.push(`Status: ${result.status}`);
    lines.push('');
    if (result.fixture) {
        lines.push('## Fixture', '');
        lines.push(`- Task: ${result.fixture.task}`);
        lines.push(`- Target: ${result.fixture.target_file}`);
        lines.push(`- Temp repo: ${result.fixture.fixture_dir}`);
        lines.push('');
    }
    lines.push('## Cases', '');
    lines.push('| Case | Kind | Workers | Wall ms | TTFT p50 | Total p50 | Candidates | Gate pass | Verifier | Merge | Patch gen | Patch gate | Metric |');
    lines.push('| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- | --- | --- | --- |');
    for (const c of result.cases) {
        lines.push(formatCaseRow(c));
    }
    lines.push('');

    const direct = result.cases.find((c) => c.implementation_path === 'direct-glm');
    // Only measured Naruto cases can be ranked; a missing wall clock would make
    // the comparator return NaN.
    const rankedNaruto = result.cases
        .filter((c) => c.implementation_path === 'glm-naruto' && typeof c.wall_clock_ms === 'number' && Number.isFinite(c.wall_clock_ms))
        .sort((a, b) => a.wall_clock_ms - b.wall_clock_ms);
    const isEligible = (c) => (c.gate_pass_rate !== null && c.gate_pass_rate > 0) || c.merge_success === true;
    const narutoBest = rankedNaruto.find(isEligible);
    const fastestAttempt = rankedNaruto[0];

    lines.push('## Comparison', '');
    if (direct) {
        lines.push(`- Direct GLM: ${direct.wall_clock_ms}ms`);
    }
    if (narutoBest) {
        lines.push(`- Best Naruto: ${narutoBest.name} at ${narutoBest.wall_clock_ms}ms`);
    }
    else if (fastestAttempt) {
        // Do not present a run that failed its gate as the "best" result.
        lines.push(`- Best Naruto: none passed the gate or merged (fastest attempt: ${fastestAttempt.name} at ${fastestAttempt.wall_clock_ms}ms)`);
    }
    lines.push(`- Recommendation: ${result.comparison.recommendation}`);
    lines.push(`- Reason: ${result.comparison.reason}`);
    lines.push('');
    lines.push('## Limitations', '');
    lines.push('- This benchmark uses a tiny single-file task; tiny tasks may favor direct GLM.');
    lines.push('- Multi-file parallelizable tasks may favor GLM Naruto.');
    lines.push('- Missing usage metrics are reported as `unavailable` or `n/a`, never as fake zero.');
    lines.push('- Direct GLM candidate/verifier/merge metrics are `not_applicable`.');
    lines.push('');
    if (result.model_lock_proof) {
        lines.push('## Model Lock Proof', '');
        lines.push(`- Passed: ${result.model_lock_proof.passed}`);
        lines.push(`- Mismatches: ${result.model_lock_proof.mismatches.length}`);
        lines.push('');
    }
    if (result.no_mutation_proof) {
        lines.push('## No Mutation Proof', '');
        lines.push(`- Passed: ${result.no_mutation_proof.passed}`);
        lines.push(`- User CWD unchanged: ${result.no_mutation_proof.user_cwd_unchanged}`);
        lines.push('');
    }
    lines.push(`_Report generated at ${nowIso()}_`, '');
    await writeTextAtomic(reportPath, lines.join('\n'));
    return reportPath;
}
63
/**
 * Formats one benchmark case as a Markdown table row for the report.
 *
 * Missing metrics (`null` or `undefined`) are rendered as `unavailable`
 * (latency figures) or `n/a` (candidate/gate/verifier/merge/patch figures),
 * so the row never contains the text "undefined" and a missing numeric rate
 * never throws.
 *
 * @param {object} c - Benchmark case result.
 * @returns {string} A single `| ... |` table row with 13 cells.
 */
function formatCaseRow(c) {
    const textOr = (value, fallback) => (value === null || value === undefined ? fallback : String(value));
    const rateOr = (value, fallback) => (typeof value === 'number' && Number.isFinite(value) ? value.toFixed(2) : fallback);

    const wall = textOr(c.wall_clock_ms, 'unavailable');
    const ttft = textOr(c.p50_ttft_ms, 'unavailable');
    const total = textOr(c.p50_total_ms, 'unavailable');
    const candidates = textOr(c.candidate_count, 'n/a');
    const gate = rateOr(c.gate_pass_rate, 'n/a');
    const verifier = rateOr(c.verifier_pass_rate, 'n/a');
    const merge = textOr(c.merge_success, 'n/a');
    const patchGen = textOr(c.patch_generated, 'n/a');
    const patchGate = textOr(c.patch_gate_passed, 'n/a');
    // metric_status may be absent on partially populated cases.
    const metricLatency = textOr(c.metric_status ? c.metric_status.latency : undefined, 'unavailable');
    return `| ${c.name} | ${c.kind} | ${c.workers} | ${wall} | ${ttft} | ${total} | ${candidates} | ${gate} | ${verifier} | ${merge} | ${patchGen} | ${patchGate} | ${metricLatency} |`;
}
75
+ //# sourceMappingURL=glm-bench-report.js.map
@@ -0,0 +1,243 @@
1
+ import os from 'node:os';
2
+ import path from 'node:path';
3
+ import fsp from 'node:fs/promises';
4
+ import { spawn } from 'node:child_process';
5
+ import { nowIso, writeJsonAtomic } from '../../../fsx.js';
6
+ import { GLM_52_OPENROUTER_MODEL } from '../glm-52-settings.js';
7
+ import { resolveOpenRouterApiKey } from '../../openrouter/openrouter-secret-store.js';
8
+ import { runGlmNarutoMission } from '../naruto/glm-naruto-orchestrator.js';
9
+ import { summarizeGlmNarutoWorkerMetrics } from '../naruto/glm-naruto-metrics.js';
10
+ import { runGlmDirectSpeedRun } from '../glm-direct-run.js';
11
+ import { createGlmBenchFixture, cloneFixture, resetFixture, cleanupFixture } from './glm-bench-fixture.js';
12
+ import { runGlmDirectBenchCase } from './glm-direct-bench-runner.js';
13
+ import { computeGlmBenchmarkComparison } from './glm-bench-comparison.js';
14
+ import { buildGlmBenchModelLockProof } from './glm-bench-model-lock-proof.js';
15
+ import { writeGlmBenchReport } from './glm-bench-report.js';
16
/** Worker counts exercised by the GLM Naruto benchmark cases, in execution order. */
const NARUTO_WORKER_COUNTS = [1, 4, 8, 12];
/**
 * Run the GLM benchmark: one direct GLM case followed by one GLM Naruto case
 * per entry in `NARUTO_WORKER_COUNTS`, then write the result, the model-lock
 * proof and the report under `<root>/.sneakoscope/glm-bench/<benchId>`.
 *
 * Flags read from `args`:
 * - `--live`: required for a real run; without it the dry-run result is returned.
 * - `--execute`: only valid together with `--live`, otherwise the run is blocked.
 * - `--apply-temp`: only recorded in the no-mutation proof.
 * `--no-apply` is accepted but has no effect, because every case is always
 * started with `noApply: true`.
 *
 * Each case runs in its own clone of a shared fixture. Clones and the shared
 * fixture are cleaned up even when a case throws; the error still propagates.
 *
 * @param {string} root - Directory under which the benchmark artifacts are written.
 * @param {string[]} [args=[]] - Command-line style flags.
 * @param {{ runDirect?: Function, runNaruto?: Function }} [deps={}] - Optional
 *   overrides: `runNaruto` replaces `runGlmNarutoMission`, `runDirect` is
 *   forwarded to `runGlmDirectBenchCase`.
 * @returns {Promise<object>} A `sks.glm-benchmark-result.v1` object.
 */
export async function runGlmBenchmark(root, args = [], deps = {}) {
    const live = args.includes('--live');
    const execute = args.includes('--execute');
    const applyTemp = args.includes('--apply-temp');
    const started = Date.now();
    if (execute && !live) {
        return blockedResult(root, ['execute_requires_live_flag']);
    }
    if (!live) {
        return dryRunResult(root, started);
    }
    const key = await resolveOpenRouterApiKey({ env: process.env });
    if (!key.key) {
        return blockedResult(root, ['live_bench_requires_openrouter_key']);
    }
    // Snapshot the caller's working tree so the no-mutation proof can compare before/after.
    const userCwd = process.cwd();
    const userCwdBefore = await captureGitStatus(userCwd);
    const benchId = `bench-${nowIso().replace(/[:.]/g, '-')}`;
    const benchDir = path.join(root, '.sneakoscope', 'glm-bench', benchId);
    await fsp.mkdir(benchDir, { recursive: true });
    const sharedFixture = await createGlmBenchFixture();
    const cases = [];
    try {
        // Direct GLM case — does NOT call runGlmNarutoMission
        cases.push(await benchDirectCase({ root, sharedFixture, benchDir, benchId, apiKey: key.key, deps }));
        // Naruto cases — each calls runGlmNarutoMission with a different worker count
        for (const workers of NARUTO_WORKER_COUNTS) {
            cases.push(await benchNarutoCase({ sharedFixture, benchDir, benchId, workers, deps }));
        }
    }
    finally {
        await cleanupFixture(sharedFixture);
    }
    const comparison = computeGlmBenchmarkComparison(cases);
    const modelLockProof = buildGlmBenchModelLockProof(cases, {
        requestSummaries: await collectRequestSummaries(cases),
        directTraceChecked: cases.some((c) => c.runner_id === 'direct-glm-speed' && c.artifacts.trace_path !== null)
    });
    const userCwdAfter = await captureGitStatus(userCwd);
    const userCwdUnchanged = userCwdBefore === userCwdAfter;
    // Computed once so the reported flag and `passed` can never disagree.
    const casesReportNoMutation = cases.every((c) => c.mutation_performed === false);
    const noMutationProof = {
        schema: 'sks.glm-bench-no-mutation-proof.v1',
        user_cwd_unchanged: userCwdUnchanged,
        fixture_mutated_only_under_apply_temp: !applyTemp,
        cases_report_no_mutation: casesReportNoMutation,
        passed: userCwdUnchanged && casesReportNoMutation
    };
    const result = {
        schema: 'sks.glm-benchmark-result.v1',
        version: '4.0.14',
        generated_at: nowIso(),
        status: 'live',
        model: GLM_52_OPENROUTER_MODEL,
        gpt_fallback_allowed: false,
        fixture: {
            schema: 'sks.glm-bench-fixture.v1',
            fixture_dir: '(cleaned up)',
            task: sharedFixture.task,
            target_file: sharedFixture.target_file,
            initial_content: sharedFixture.initial_content,
            expected_content: sharedFixture.expected_content
        },
        cases,
        comparison,
        model_lock_proof: modelLockProof,
        no_mutation_proof: noMutationProof,
        warnings: ['live_bench_no_apply_temp_repo']
    };
    await writeJsonAtomic(path.join(benchDir, 'bench-result.json'), result);
    await writeJsonAtomic(path.join(benchDir, 'model-lock-proof.json'), modelLockProof);
    await writeGlmBenchReport(benchDir, result);
    return result;
}
/** Run the direct GLM case in its own fixture clone; the clone is always cleaned up. */
async function benchDirectCase({ root, sharedFixture, benchDir, benchId, apiKey, deps }) {
    const fixture = await cloneFixture(sharedFixture, 'direct');
    try {
        return await runGlmDirectBenchCase({
            root,
            fixture,
            apiKey,
            noApply: true,
            timeoutMs: 120_000,
            sessionId: `sks-bench-direct-${benchId}`,
            caseDir: path.join(benchDir, 'cases', 'direct-glm-speed')
        }, deps.runDirect ? { runDirect: deps.runDirect } : {});
    }
    finally {
        await cleanupFixture(fixture);
    }
}
/** Run one GLM Naruto case in its own fixture clone, persist `case-result.json`, and always clean the clone up. */
async function benchNarutoCase({ sharedFixture, benchDir, benchId, workers, deps }) {
    const fixture = await cloneFixture(sharedFixture, `naruto-${workers}`);
    try {
        const caseDir = path.join(benchDir, 'cases', `glm-naruto-${workers}`);
        await fsp.mkdir(caseDir, { recursive: true });
        const caseStarted = Date.now();
        const runNaruto = deps.runNaruto ?? runGlmNarutoMission;
        const narutoResult = await runNaruto({
            cwd: fixture.fixture_dir,
            task: sharedFixture.task,
            args: ['--bench', '--live', '--no-apply'],
            missionId: `glm-bench-naruto-${workers}-${benchId}`,
            maxWorkers: workers,
            noApply: true
        });
        const traces = await readWorkerTraces(narutoResult.artifact_dir);
        const metrics = summarizeGlmNarutoWorkerMetrics(traces);
        // Wall clock covers the mission plus reading and summarizing its traces.
        const wallClockMs = Date.now() - caseStarted;
        const narutoCase = {
            schema: 'sks.glm-benchmark-case.v1',
            name: `GLM Naruto ${workers} worker${workers === 1 ? '' : 's'}`,
            kind: 'glm-naruto',
            runner_id: `glm-naruto-${workers}`,
            implementation_path: 'glm-naruto',
            workers,
            model: GLM_52_OPENROUTER_MODEL,
            gpt_fallback_allowed: false,
            no_apply: true,
            mutation_performed: false,
            wall_clock_ms: wallClockMs,
            p50_ttft_ms: metrics.p50_ttft_ms,
            p90_ttft_ms: metrics.p90_ttft_ms,
            p50_total_ms: metrics.p50_total_ms,
            p90_total_ms: metrics.p90_total_ms,
            candidate_count: narutoResult.patch_candidates,
            // No candidates means the rate is undefined, reported as null rather than 0.
            gate_pass_rate: narutoResult.patch_candidates ? narutoResult.gate_passed_candidates / narutoResult.patch_candidates : null,
            verifier_pass_rate: metrics.verifier_pass_rate > 0 ? metrics.verifier_pass_rate : (traces.length > 0 ? 0 : null),
            merge_success: narutoResult.mergeable_candidates > 0,
            patch_generated: narutoResult.patch_candidates > 0,
            patch_gate_passed: narutoResult.gate_passed_candidates > 0,
            cached_tokens_sum: metrics.cached_tokens_sum,
            cache_write_tokens_sum: metrics.cache_write_tokens_sum,
            reasoning_tokens_sum: metrics.reasoning_tokens_sum,
            metric_status: {
                latency: metrics.p50_total_ms === null && metrics.p50_ttft_ms === null ? 'unavailable' : 'measured',
                usage: metrics.cached_tokens_sum === null && metrics.reasoning_tokens_sum === null ? 'unavailable' : 'measured',
                candidate: 'measured',
                verifier: 'measured',
                merge: 'measured'
            },
            artifacts: {
                case_dir: caseDir,
                trace_path: null,
                mission_artifact_dir: narutoResult.artifact_dir || null
            },
            blockers: narutoResult.blockers,
            warnings: narutoResult.warnings
        };
        await writeJsonAtomic(path.join(caseDir, 'case-result.json'), narutoCase);
        return narutoCase;
    }
    finally {
        await cleanupFixture(fixture);
    }
}
157
/**
 * Build the benchmark result returned when no live run was requested.
 *
 * The result has status `dry_run`, no fixture, no cases, an all-null
 * `inconclusive` comparison and no proofs. Neither parameter is read; both are
 * kept so existing call sites stay valid.
 *
 * @param {string} root - Unused.
 * @param {number} startedMs - Unused.
 * @returns {object} A `sks.glm-benchmark-result.v1` object.
 */
function dryRunResult(root, startedMs) {
    const emptyComparison = {
        direct_wall_clock_ms: null,
        best_naruto_wall_clock_ms: null,
        best_naruto_runner_id: null,
        naruto_speedup_vs_direct: null,
        recommendation: 'inconclusive',
        reason: 'Dry run — no live API calls made.'
    };
    const generatedAt = nowIso();
    const payload = {
        schema: 'sks.glm-benchmark-result.v1',
        version: '4.0.14',
        generated_at: generatedAt,
        status: 'dry_run',
        model: GLM_52_OPENROUTER_MODEL,
        gpt_fallback_allowed: false,
        fixture: null,
        cases: [],
        comparison: emptyComparison,
        model_lock_proof: null,
        no_mutation_proof: null,
        warnings: ['dry_run_no_live_api_calls']
    };
    return payload;
}
180
/**
 * Build the benchmark result returned when the run is refused.
 *
 * The result has status `blocked`, no fixture, no cases, an all-null
 * `inconclusive` comparison and no proofs. `root` is not read; it is kept so
 * existing call sites stay valid.
 *
 * @param {string} root - Unused.
 * @param {string[]} warnings - Reason codes; stored on the result as-is (same array).
 * @returns {object} A `sks.glm-benchmark-result.v1` object.
 */
function blockedResult(root, warnings) {
    const emptyComparison = {
        direct_wall_clock_ms: null,
        best_naruto_wall_clock_ms: null,
        best_naruto_runner_id: null,
        naruto_speedup_vs_direct: null,
        recommendation: 'inconclusive',
        reason: 'Benchmark blocked.'
    };
    const generatedAt = nowIso();
    const payload = {
        schema: 'sks.glm-benchmark-result.v1',
        version: '4.0.14',
        generated_at: generatedAt,
        status: 'blocked',
        model: GLM_52_OPENROUTER_MODEL,
        gpt_fallback_allowed: false,
        fixture: null,
        cases: [],
        comparison: emptyComparison,
        model_lock_proof: null,
        no_mutation_proof: null,
        warnings: warnings
    };
    return payload;
}
203
/**
 * Load the worker traces written by a mission, best-effort.
 *
 * Reads `<artifactDir>/worker-traces.json` and returns its parsed content.
 * Returns an empty array when `artifactDir` is falsy, when the file cannot be
 * read or parsed, or when the parsed JSON is not an array (for example `{}` or
 * `null`), so callers can always rely on array semantics such as `.length`.
 *
 * @param {string|null|undefined} artifactDir - Mission artifact directory.
 * @returns {Promise<Array>} The trace list, or `[]` when unavailable.
 */
async function readWorkerTraces(artifactDir) {
    if (!artifactDir) {
        return [];
    }
    try {
        const raw = await fsp.readFile(path.join(artifactDir, 'worker-traces.json'), 'utf8');
        const parsed = JSON.parse(raw);
        return Array.isArray(parsed) ? parsed : [];
    }
    catch {
        // Traces are optional: a missing or corrupt file means "no traces", not a failed run.
        return [];
    }
}
213
/**
 * Gather every readable `request-summary.json` from the cases' mission artifacts.
 *
 * For each case with a truthy `artifacts.mission_artifact_dir`, every entry of
 * `<dir>/workers` is probed for `request-summary.json`. Directories that cannot
 * be listed and summaries that cannot be read or parsed are skipped silently.
 *
 * @param {Array<object>} cases - Benchmark case results.
 * @returns {Promise<Array>} Parsed summaries, in case order then directory-listing order.
 */
async function collectRequestSummaries(cases) {
    const collected = [];
    // Append all readable summaries found under one mission directory.
    const appendFrom = async (missionDir) => {
        let workerRoot;
        let workerIds;
        try {
            workerRoot = path.join(missionDir, 'workers');
            workerIds = await fsp.readdir(workerRoot);
        }
        catch {
            return;
        }
        for (const workerId of workerIds) {
            try {
                const summaryFile = path.join(workerRoot, workerId, 'request-summary.json');
                const text = await fsp.readFile(summaryFile, 'utf8');
                collected.push(JSON.parse(text));
            }
            catch { }
        }
    };
    for (const caseResult of cases) {
        const missionDir = caseResult.artifacts.mission_artifact_dir;
        if (missionDir) {
            await appendFrom(missionDir);
        }
    }
    return collected;
}
234
/**
 * Capture the trimmed stdout of `git status --short` run in `cwd`.
 *
 * The promise never rejects because of the child process: if it cannot be
 * started the result is an empty string, and the exit code is not inspected.
 * stderr is discarded.
 *
 * @param {string} cwd - Directory in which to run git.
 * @returns {Promise<string>} Trimmed stdout, or `''` on spawn error.
 */
async function captureGitStatus(cwd) {
    const runGit = (settle) => {
        const pieces = [];
        const git = spawn('git', ['status', '--short'], { cwd, stdio: ['ignore', 'pipe', 'ignore'] });
        git.stdout.on('data', (piece) => {
            pieces.push(String(piece));
        });
        git.on('error', () => settle(''));
        git.on('close', () => settle(pieces.join('').trim()));
    };
    return new Promise(runGit);
}
243
+ //# sourceMappingURL=glm-benchmark-runner.js.map
@@ -0,0 +1,2 @@
1
// Version string exported by glm-benchmark-types.js.
// NOTE(review): glm-benchmark-runner.js writes the same '4.0.14' literal into its
// result objects without importing this constant — confirm whether they should share it.
+ export const GLM_BENCHMARK_VERSION = '4.0.14';
2
+ //# sourceMappingURL=glm-benchmark-types.js.map