codex-multi-auth 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -0
- package/README.md +162 -0
- package/assets/opencode-logo-ornate-dark.svg +18 -0
- package/assets/readme-hero.svg +31 -0
- package/config/README.md +87 -0
- package/config/minimal-opencode.json +13 -0
- package/config/opencode-legacy.json +571 -0
- package/config/opencode-modern.json +239 -0
- package/dist/index.d.ts +45 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +3160 -0
- package/dist/index.js.map +1 -0
- package/dist/lib/accounts/rate-limits.d.ts +22 -0
- package/dist/lib/accounts/rate-limits.d.ts.map +1 -0
- package/dist/lib/accounts/rate-limits.js +63 -0
- package/dist/lib/accounts/rate-limits.js.map +1 -0
- package/dist/lib/accounts.d.ts +95 -0
- package/dist/lib/accounts.d.ts.map +1 -0
- package/dist/lib/accounts.js +668 -0
- package/dist/lib/accounts.js.map +1 -0
- package/dist/lib/audit.d.ts +45 -0
- package/dist/lib/audit.d.ts.map +1 -0
- package/dist/lib/audit.js +131 -0
- package/dist/lib/audit.js.map +1 -0
- package/dist/lib/auth/auth.d.ts +56 -0
- package/dist/lib/auth/auth.d.ts.map +1 -0
- package/dist/lib/auth/auth.js +214 -0
- package/dist/lib/auth/auth.js.map +1 -0
- package/dist/lib/auth/browser.d.ts +34 -0
- package/dist/lib/auth/browser.d.ts.map +1 -0
- package/dist/lib/auth/browser.js +185 -0
- package/dist/lib/auth/browser.js.map +1 -0
- package/dist/lib/auth/server.d.ts +24 -0
- package/dist/lib/auth/server.d.ts.map +1 -0
- package/dist/lib/auth/server.js +116 -0
- package/dist/lib/auth/server.js.map +1 -0
- package/dist/lib/auth/token-utils.d.ts +59 -0
- package/dist/lib/auth/token-utils.d.ts.map +1 -0
- package/dist/lib/auth/token-utils.js +331 -0
- package/dist/lib/auth/token-utils.js.map +1 -0
- package/dist/lib/auth-rate-limit.d.ts +20 -0
- package/dist/lib/auth-rate-limit.d.ts.map +1 -0
- package/dist/lib/auth-rate-limit.js +91 -0
- package/dist/lib/auth-rate-limit.js.map +1 -0
- package/dist/lib/auto-update-checker.d.ts +10 -0
- package/dist/lib/auto-update-checker.d.ts.map +1 -0
- package/dist/lib/auto-update-checker.js +216 -0
- package/dist/lib/auto-update-checker.js.map +1 -0
- package/dist/lib/capability-policy.d.ts +18 -0
- package/dist/lib/capability-policy.d.ts.map +1 -0
- package/dist/lib/capability-policy.js +150 -0
- package/dist/lib/capability-policy.js.map +1 -0
- package/dist/lib/circuit-breaker.d.ts +34 -0
- package/dist/lib/circuit-breaker.d.ts.map +1 -0
- package/dist/lib/circuit-breaker.js +124 -0
- package/dist/lib/circuit-breaker.js.map +1 -0
- package/dist/lib/cli.d.ts +64 -0
- package/dist/lib/cli.d.ts.map +1 -0
- package/dist/lib/cli.js +274 -0
- package/dist/lib/cli.js.map +1 -0
- package/dist/lib/codex-cli/observability.d.ts +22 -0
- package/dist/lib/codex-cli/observability.d.ts.map +1 -0
- package/dist/lib/codex-cli/observability.js +36 -0
- package/dist/lib/codex-cli/observability.js.map +1 -0
- package/dist/lib/codex-cli/state.d.ts +86 -0
- package/dist/lib/codex-cli/state.d.ts.map +1 -0
- package/dist/lib/codex-cli/state.js +470 -0
- package/dist/lib/codex-cli/state.js.map +1 -0
- package/dist/lib/codex-cli/sync.d.ts +27 -0
- package/dist/lib/codex-cli/sync.d.ts.map +1 -0
- package/dist/lib/codex-cli/sync.js +325 -0
- package/dist/lib/codex-cli/sync.js.map +1 -0
- package/dist/lib/codex-cli/writer.d.ts +12 -0
- package/dist/lib/codex-cli/writer.d.ts.map +1 -0
- package/dist/lib/codex-cli/writer.js +388 -0
- package/dist/lib/codex-cli/writer.js.map +1 -0
- package/dist/lib/codex-manager.d.ts +2 -0
- package/dist/lib/codex-manager.d.ts.map +1 -0
- package/dist/lib/codex-manager.js +4841 -0
- package/dist/lib/codex-manager.js.map +1 -0
- package/dist/lib/config.d.ts +269 -0
- package/dist/lib/config.d.ts.map +1 -0
- package/dist/lib/config.js +789 -0
- package/dist/lib/config.js.map +1 -0
- package/dist/lib/constants.d.ts +78 -0
- package/dist/lib/constants.d.ts.map +1 -0
- package/dist/lib/constants.js +78 -0
- package/dist/lib/constants.js.map +1 -0
- package/dist/lib/context-overflow.d.ts +27 -0
- package/dist/lib/context-overflow.d.ts.map +1 -0
- package/dist/lib/context-overflow.js +124 -0
- package/dist/lib/context-overflow.js.map +1 -0
- package/dist/lib/dashboard-settings.d.ts +90 -0
- package/dist/lib/dashboard-settings.d.ts.map +1 -0
- package/dist/lib/dashboard-settings.js +327 -0
- package/dist/lib/dashboard-settings.js.map +1 -0
- package/dist/lib/entitlement-cache.d.ts +41 -0
- package/dist/lib/entitlement-cache.d.ts.map +1 -0
- package/dist/lib/entitlement-cache.js +137 -0
- package/dist/lib/entitlement-cache.js.map +1 -0
- package/dist/lib/errors.d.ts +113 -0
- package/dist/lib/errors.d.ts.map +1 -0
- package/dist/lib/errors.js +103 -0
- package/dist/lib/errors.js.map +1 -0
- package/dist/lib/forecast.d.ts +42 -0
- package/dist/lib/forecast.d.ts.map +1 -0
- package/dist/lib/forecast.js +256 -0
- package/dist/lib/forecast.js.map +1 -0
- package/dist/lib/health.d.ts +33 -0
- package/dist/lib/health.d.ts.map +1 -0
- package/dist/lib/health.js +70 -0
- package/dist/lib/health.js.map +1 -0
- package/dist/lib/index.d.ts +32 -0
- package/dist/lib/index.d.ts.map +1 -0
- package/dist/lib/index.js +32 -0
- package/dist/lib/index.js.map +1 -0
- package/dist/lib/live-account-sync.d.ts +39 -0
- package/dist/lib/live-account-sync.d.ts.map +1 -0
- package/dist/lib/live-account-sync.js +196 -0
- package/dist/lib/live-account-sync.js.map +1 -0
- package/dist/lib/logger.d.ts +40 -0
- package/dist/lib/logger.d.ts.map +1 -0
- package/dist/lib/logger.js +364 -0
- package/dist/lib/logger.js.map +1 -0
- package/dist/lib/oauth-success.html +338 -0
- package/dist/lib/parallel-probe.d.ts +28 -0
- package/dist/lib/parallel-probe.d.ts.map +1 -0
- package/dist/lib/parallel-probe.js +97 -0
- package/dist/lib/parallel-probe.js.map +1 -0
- package/dist/lib/preemptive-quota-scheduler.d.ts +53 -0
- package/dist/lib/preemptive-quota-scheduler.d.ts.map +1 -0
- package/dist/lib/preemptive-quota-scheduler.js +220 -0
- package/dist/lib/preemptive-quota-scheduler.js.map +1 -0
- package/dist/lib/proactive-refresh.d.ts +66 -0
- package/dist/lib/proactive-refresh.d.ts.map +1 -0
- package/dist/lib/proactive-refresh.js +143 -0
- package/dist/lib/proactive-refresh.js.map +1 -0
- package/dist/lib/prompts/codex-opencode-bridge.d.ts +19 -0
- package/dist/lib/prompts/codex-opencode-bridge.d.ts.map +1 -0
- package/dist/lib/prompts/codex-opencode-bridge.js +169 -0
- package/dist/lib/prompts/codex-opencode-bridge.js.map +1 -0
- package/dist/lib/prompts/codex.d.ts +41 -0
- package/dist/lib/prompts/codex.d.ts.map +1 -0
- package/dist/lib/prompts/codex.js +383 -0
- package/dist/lib/prompts/codex.js.map +1 -0
- package/dist/lib/prompts/opencode-codex.d.ts +25 -0
- package/dist/lib/prompts/opencode-codex.d.ts.map +1 -0
- package/dist/lib/prompts/opencode-codex.js +270 -0
- package/dist/lib/prompts/opencode-codex.js.map +1 -0
- package/dist/lib/quota-cache.d.ts +68 -0
- package/dist/lib/quota-cache.d.ts.map +1 -0
- package/dist/lib/quota-cache.js +224 -0
- package/dist/lib/quota-cache.js.map +1 -0
- package/dist/lib/quota-probe.d.ts +49 -0
- package/dist/lib/quota-probe.d.ts.map +1 -0
- package/dist/lib/quota-probe.js +368 -0
- package/dist/lib/quota-probe.js.map +1 -0
- package/dist/lib/recovery/constants.d.ts +12 -0
- package/dist/lib/recovery/constants.d.ts.map +1 -0
- package/dist/lib/recovery/constants.js +31 -0
- package/dist/lib/recovery/constants.js.map +1 -0
- package/dist/lib/recovery/index.d.ts +12 -0
- package/dist/lib/recovery/index.d.ts.map +1 -0
- package/dist/lib/recovery/index.js +12 -0
- package/dist/lib/recovery/index.js.map +1 -0
- package/dist/lib/recovery/storage.d.ts +24 -0
- package/dist/lib/recovery/storage.d.ts.map +1 -0
- package/dist/lib/recovery/storage.js +362 -0
- package/dist/lib/recovery/storage.js.map +1 -0
- package/dist/lib/recovery/types.d.ts +116 -0
- package/dist/lib/recovery/types.d.ts.map +1 -0
- package/dist/lib/recovery/types.js +7 -0
- package/dist/lib/recovery/types.js.map +1 -0
- package/dist/lib/recovery.d.ts +31 -0
- package/dist/lib/recovery.d.ts.map +1 -0
- package/dist/lib/recovery.js +313 -0
- package/dist/lib/recovery.js.map +1 -0
- package/dist/lib/refresh-guardian.d.ts +31 -0
- package/dist/lib/refresh-guardian.d.ts.map +1 -0
- package/dist/lib/refresh-guardian.js +151 -0
- package/dist/lib/refresh-guardian.js.map +1 -0
- package/dist/lib/refresh-lease.d.ts +37 -0
- package/dist/lib/refresh-lease.d.ts.map +1 -0
- package/dist/lib/refresh-lease.js +335 -0
- package/dist/lib/refresh-lease.js.map +1 -0
- package/dist/lib/refresh-queue.d.ts +117 -0
- package/dist/lib/refresh-queue.d.ts.map +1 -0
- package/dist/lib/refresh-queue.js +297 -0
- package/dist/lib/refresh-queue.js.map +1 -0
- package/dist/lib/request/failure-policy.d.ts +42 -0
- package/dist/lib/request/failure-policy.d.ts.map +1 -0
- package/dist/lib/request/failure-policy.js +133 -0
- package/dist/lib/request/failure-policy.js.map +1 -0
- package/dist/lib/request/fetch-helpers.d.ts +152 -0
- package/dist/lib/request/fetch-helpers.d.ts.map +1 -0
- package/dist/lib/request/fetch-helpers.js +704 -0
- package/dist/lib/request/fetch-helpers.js.map +1 -0
- package/dist/lib/request/helpers/input-utils.d.ts +7 -0
- package/dist/lib/request/helpers/input-utils.d.ts.map +1 -0
- package/dist/lib/request/helpers/input-utils.js +214 -0
- package/dist/lib/request/helpers/input-utils.js.map +1 -0
- package/dist/lib/request/helpers/model-map.d.ts +28 -0
- package/dist/lib/request/helpers/model-map.d.ts.map +1 -0
- package/dist/lib/request/helpers/model-map.js +133 -0
- package/dist/lib/request/helpers/model-map.js.map +1 -0
- package/dist/lib/request/helpers/tool-utils.d.ts +29 -0
- package/dist/lib/request/helpers/tool-utils.d.ts.map +1 -0
- package/dist/lib/request/helpers/tool-utils.js +117 -0
- package/dist/lib/request/helpers/tool-utils.js.map +1 -0
- package/dist/lib/request/rate-limit-backoff.d.ts +17 -0
- package/dist/lib/request/rate-limit-backoff.d.ts.map +1 -0
- package/dist/lib/request/rate-limit-backoff.js +83 -0
- package/dist/lib/request/rate-limit-backoff.js.map +1 -0
- package/dist/lib/request/request-transformer.d.ts +107 -0
- package/dist/lib/request/request-transformer.d.ts.map +1 -0
- package/dist/lib/request/request-transformer.js +814 -0
- package/dist/lib/request/request-transformer.js.map +1 -0
- package/dist/lib/request/response-handler.d.ts +23 -0
- package/dist/lib/request/response-handler.d.ts.map +1 -0
- package/dist/lib/request/response-handler.js +155 -0
- package/dist/lib/request/response-handler.js.map +1 -0
- package/dist/lib/request/stream-failover.d.ts +21 -0
- package/dist/lib/request/stream-failover.d.ts.map +1 -0
- package/dist/lib/request/stream-failover.js +204 -0
- package/dist/lib/request/stream-failover.js.map +1 -0
- package/dist/lib/rotation.d.ts +146 -0
- package/dist/lib/rotation.d.ts.map +1 -0
- package/dist/lib/rotation.js +321 -0
- package/dist/lib/rotation.js.map +1 -0
- package/dist/lib/runtime-paths.d.ts +58 -0
- package/dist/lib/runtime-paths.d.ts.map +1 -0
- package/dist/lib/runtime-paths.js +164 -0
- package/dist/lib/runtime-paths.js.map +1 -0
- package/dist/lib/schemas.d.ts +435 -0
- package/dist/lib/schemas.d.ts.map +1 -0
- package/dist/lib/schemas.js +268 -0
- package/dist/lib/schemas.js.map +1 -0
- package/dist/lib/session-affinity.d.ts +23 -0
- package/dist/lib/session-affinity.d.ts.map +1 -0
- package/dist/lib/session-affinity.js +127 -0
- package/dist/lib/session-affinity.js.map +1 -0
- package/dist/lib/shutdown.d.ts +7 -0
- package/dist/lib/shutdown.d.ts.map +1 -0
- package/dist/lib/shutdown.js +43 -0
- package/dist/lib/shutdown.js.map +1 -0
- package/dist/lib/storage/migrations.d.ts +59 -0
- package/dist/lib/storage/migrations.d.ts.map +1 -0
- package/dist/lib/storage/migrations.js +41 -0
- package/dist/lib/storage/migrations.js.map +1 -0
- package/dist/lib/storage/paths.d.ts +51 -0
- package/dist/lib/storage/paths.d.ts.map +1 -0
- package/dist/lib/storage/paths.js +152 -0
- package/dist/lib/storage/paths.js.map +1 -0
- package/dist/lib/storage.d.ts +106 -0
- package/dist/lib/storage.d.ts.map +1 -0
- package/dist/lib/storage.js +896 -0
- package/dist/lib/storage.js.map +1 -0
- package/dist/lib/table-formatter.d.ts +32 -0
- package/dist/lib/table-formatter.d.ts.map +1 -0
- package/dist/lib/table-formatter.js +44 -0
- package/dist/lib/table-formatter.js.map +1 -0
- package/dist/lib/tools/hashline-tools.d.ts +51 -0
- package/dist/lib/tools/hashline-tools.d.ts.map +1 -0
- package/dist/lib/tools/hashline-tools.js +456 -0
- package/dist/lib/tools/hashline-tools.js.map +1 -0
- package/dist/lib/types.d.ts +130 -0
- package/dist/lib/types.d.ts.map +1 -0
- package/dist/lib/types.js +2 -0
- package/dist/lib/types.js.map +1 -0
- package/dist/lib/ui/ansi.d.ts +40 -0
- package/dist/lib/ui/ansi.d.ts.map +1 -0
- package/dist/lib/ui/ansi.js +68 -0
- package/dist/lib/ui/ansi.js.map +1 -0
- package/dist/lib/ui/auth-menu.d.ts +76 -0
- package/dist/lib/ui/auth-menu.d.ts.map +1 -0
- package/dist/lib/ui/auth-menu.js +590 -0
- package/dist/lib/ui/auth-menu.js.map +1 -0
- package/dist/lib/ui/confirm.d.ts +11 -0
- package/dist/lib/ui/confirm.d.ts.map +1 -0
- package/dist/lib/ui/confirm.js +29 -0
- package/dist/lib/ui/confirm.js.map +1 -0
- package/dist/lib/ui/copy.d.ts +123 -0
- package/dist/lib/ui/copy.d.ts.map +1 -0
- package/dist/lib/ui/copy.js +127 -0
- package/dist/lib/ui/copy.js.map +1 -0
- package/dist/lib/ui/format.d.ts +62 -0
- package/dist/lib/ui/format.d.ts.map +1 -0
- package/dist/lib/ui/format.js +205 -0
- package/dist/lib/ui/format.js.map +1 -0
- package/dist/lib/ui/runtime.d.ts +43 -0
- package/dist/lib/ui/runtime.d.ts.map +1 -0
- package/dist/lib/ui/runtime.js +69 -0
- package/dist/lib/ui/runtime.js.map +1 -0
- package/dist/lib/ui/select.d.ts +60 -0
- package/dist/lib/ui/select.d.ts.map +1 -0
- package/dist/lib/ui/select.js +467 -0
- package/dist/lib/ui/select.js.map +1 -0
- package/dist/lib/ui/theme.d.ts +56 -0
- package/dist/lib/ui/theme.d.ts.map +1 -0
- package/dist/lib/ui/theme.js +186 -0
- package/dist/lib/ui/theme.js.map +1 -0
- package/dist/lib/unified-settings.d.ts +71 -0
- package/dist/lib/unified-settings.d.ts.map +1 -0
- package/dist/lib/unified-settings.js +299 -0
- package/dist/lib/unified-settings.js.map +1 -0
- package/dist/lib/utils.d.ts +29 -0
- package/dist/lib/utils.d.ts.map +1 -0
- package/dist/lib/utils.js +54 -0
- package/dist/lib/utils.js.map +1 -0
- package/package.json +115 -0
- package/scripts/audit-dev-allowlist.js +128 -0
- package/scripts/bench-format/hashline-v2.mjs +642 -0
- package/scripts/bench-format/models.mjs +105 -0
- package/scripts/bench-format/opencode.mjs +205 -0
- package/scripts/bench-format/render.mjs +496 -0
- package/scripts/bench-format/stats.mjs +54 -0
- package/scripts/bench-format/tasks.mjs +151 -0
- package/scripts/benchmark-edit-formats.mjs +1161 -0
- package/scripts/benchmark-render-dashboard.mjs +49 -0
- package/scripts/codex-multi-auth.js +6 -0
- package/scripts/codex-routing.js +34 -0
- package/scripts/codex.js +122 -0
- package/scripts/copy-oauth-success.js +37 -0
- package/scripts/install-opencode-codex-auth.js +193 -0
- package/scripts/test-all-models.sh +7 -0
- package/scripts/test-model-matrix.js +424 -0
- package/scripts/validate-model-map.sh +7 -0
|
@@ -0,0 +1,1161 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import { mkdir, readFile, writeFile, rm } from "node:fs/promises";
|
|
4
|
+
import { existsSync } from "node:fs";
|
|
5
|
+
import { dirname, join, resolve } from "node:path";
|
|
6
|
+
import process from "node:process";
|
|
7
|
+
import {
|
|
8
|
+
getRepoRoot,
|
|
9
|
+
getSessionDuration,
|
|
10
|
+
getTextOutput,
|
|
11
|
+
getTokenTotals,
|
|
12
|
+
getToolEvents,
|
|
13
|
+
resolveOpencodeExecutable,
|
|
14
|
+
runOpencodeJson,
|
|
15
|
+
} from "./bench-format/opencode.mjs";
|
|
16
|
+
import { aliasCandidatesForCodexModel, listOpencodeModels, resolveModelPreset } from "./bench-format/models.mjs";
|
|
17
|
+
import {
|
|
18
|
+
applyHashlineV2Edits,
|
|
19
|
+
autocorrectHashlineV2Call,
|
|
20
|
+
extractJsonCodeBlock,
|
|
21
|
+
formatFileForHashlineV2,
|
|
22
|
+
parseHashlineV2Call,
|
|
23
|
+
} from "./bench-format/hashline-v2.mjs";
|
|
24
|
+
import { BENCHMARK_FIXTURE, TASKS, getTaskMap } from "./bench-format/tasks.mjs";
|
|
25
|
+
import { buildMarkdownReport, renderDashboardHtml } from "./bench-format/render.mjs";
|
|
26
|
+
import { stats, safePercent, round1, pctDelta } from "./bench-format/stats.mjs";
|
|
27
|
+
|
|
28
|
+
// --- Filesystem anchors -----------------------------------------------------
// Repository root as reported by the shared OpenCode benchmark helper.
const REPO_ROOT = getRepoRoot();
// Prompt file describing the hashline_v2 edit format.
const V2_PROMPT_PATH = resolve(REPO_ROOT, "bench/format-benchmark/prompts/hashline-v2.md");
// Source file of the benchmark fixture (path comes from the tasks module).
const FIXTURE_SOURCE_PATH = resolve(REPO_ROOT, BENCHMARK_FIXTURE.sourcePath);
// Build output directory of the plugin.
const DIST_PLUGIN_DIR = resolve(REPO_ROOT, "dist");

// --- Benchmark defaults -----------------------------------------------------
// Every edit mode the benchmark knows about.
const ALL_MODES = ["patch", "replace", "hashline", "hashline_v2"];
const DEFAULT_PRESET = "codex-core";
const DEFAULT_AGENT = "build";
const DEFAULT_V2_AGENT = "default";
const DEFAULT_VARIANT = "low";
// Per-run timeout in milliseconds.
const DEFAULT_TIMEOUT_MS = 300000;
const DEFAULT_OUTPUT_ROOT = ".tmp-bench";
// Extra attempts allowed per candidate model when a run fails transiently.
const DEFAULT_TRANSIENT_RETRIES = 2;
|
|
40
|
+
|
|
41
|
+
/**
 * Write the command-line help text for this script to stdout.
 *
 * The whole text is emitted with a single `console.log` call, with one
 * array entry per output line.
 */
function printUsage() {
  const optionLines = [
    " --preset=codex-core Model preset (default: codex-core)",
    " --models=a,b,c Explicit model IDs (overrides preset)",
    " --modes=patch,replace,hashline,hashline_v2",
    " --tasks=T01,T02 Restrict to task IDs",
    " --max-tasks=N Cap number of tasks after filtering",
    " --agent=build|default OpenCode agent (default: build)",
    " --v2-agent=default|build Agent used for hashline_v2 mode (default: default)",
    " --variant=low|medium|high|none Model variant (default: low)",
    " --warmup-runs=N Warmup repeats per model/task/mode (default: 1)",
    " --measured-runs=N Measured repeats per model/task/mode (default: 1)",
    " --timeout-ms=N Per-run timeout (default: 300000)",
    " --output-root=.tmp-bench Benchmark output root (default: .tmp-bench)",
    " --label=name Output label suffix",
    " --home=PATH HOME/USERPROFILE override for model provider access",
    " --keep-raw-logs Keep all NDJSON logs (default: keep measured + failures)",
    " --smoke Shortcut: 4 tasks, 0 warmup, 1 measured",
    " --no-dashboard Skip HTML dashboard generation",
    " --help Show this help",
  ];
  const exampleLines = [
    " node scripts/benchmark-edit-formats.mjs --smoke --models=openai/gpt-5-codex",
    " node scripts/benchmark-edit-formats.mjs --preset=codex-core --warmup-runs=1 --measured-runs=5",
  ];
  const helpText = [
    "Usage: node scripts/benchmark-edit-formats.mjs [options]",
    "",
    "Options:",
    ...optionLines,
    "",
    "Examples:",
    ...exampleLines,
  ].join("\n");
  console.log(helpText);
}
|
|
70
|
+
|
|
71
|
+
/**
 * Look up the value of a `name=value` style argument.
 *
 * @param {string[]} args - Argument strings to scan, in order.
 * @param {string} name - Option name including any leading dashes.
 * @returns {string|undefined} Text after `name=` from the first matching
 *   argument (possibly an empty string), or `undefined` when none matches.
 */
function parseArgValue(args, name) {
  const marker = `${name}=`;
  for (const candidate of args) {
    if (candidate.startsWith(marker)) {
      return candidate.slice(marker.length);
    }
  }
  return undefined;
}
|
|
76
|
+
|
|
77
|
+
/**
 * Parse a non-negative integer command-line option.
 *
 * The whole value must be a decimal integer. `Number.parseInt` on its own
 * stops at the first non-digit, so inputs such as "12abc" or "1.5" would be
 * silently accepted as 12 and 1; they are rejected here instead.
 *
 * @param {string|undefined} raw - Raw option text, or `undefined` when the
 *   option was not supplied.
 * @param {number} fallback - Value returned when `raw` is `undefined`.
 * @param {string} name - Option name used in the error message.
 * @returns {number} The parsed integer, or `fallback`.
 * @throws {Error} When `raw` is not a finite, non-negative decimal integer.
 */
function parseIntOption(raw, fallback, name) {
  if (raw === undefined) {
    return fallback;
  }
  const text = String(raw).trim();
  // Optional "+" sign followed by digits only; no sign "-", no fractions, no suffixes.
  if (!/^\+?\d+$/.test(text)) {
    throw new Error(`Invalid ${name}: ${raw}`);
  }
  const parsed = Number.parseInt(text, 10);
  // Extremely long digit strings overflow to Infinity.
  if (!Number.isFinite(parsed)) {
    throw new Error(`Invalid ${name}: ${raw}`);
  }
  return parsed;
}
|
|
87
|
+
|
|
88
|
+
/**
 * Turn an arbitrary value into a short slug made of letters, digits,
 * ".", "_" and "-".
 *
 * Runs of any other characters collapse into a single "-", leading and
 * trailing dashes are removed, and the result is capped at 80 characters.
 * Truncation can cut right after a separator, so trailing dashes are
 * stripped once more after the cut to keep the "no trailing dash" guarantee.
 *
 * @param {*} value - Value to slugify; coerced with `String()`.
 * @returns {string} The slug, at most 80 characters, possibly empty.
 */
function slugify(value) {
  const collapsed = String(value)
    .trim()
    .replace(/[^a-zA-Z0-9._-]+/g, "-")
    .replace(/^-+|-+$/g, "");
  return collapsed.slice(0, 80).replace(/-+$/, "");
}
|
|
95
|
+
|
|
96
|
+
/**
 * Build a `file:` URI string from a filesystem path.
 *
 * Backslashes become forward slashes. Paths that already start with "/" get
 * the two-slash `file://` prefix; everything else (drive-letter paths such as
 * `C:/...` and relative paths) gets the three-slash `file:///` prefix. No
 * percent-encoding is applied.
 *
 * @param {string} pathValue - Path to convert.
 * @returns {string} The resulting URI string.
 */
function toFileUri(pathValue) {
  const forwardSlashed = pathValue.replace(/\\/g, "/");
  const prefix = forwardSlashed.startsWith("/") ? "file://" : "file:///";
  return `${prefix}${forwardSlashed}`;
}
|
|
106
|
+
|
|
107
|
+
/**
 * Derive a human-friendly label from a model identifier.
 *
 * Only the text after the last "/" is used. A leading "gpt-" becomes "GPT-",
 * the words codex/mini/max/spark are capitalised wherever they occur, dashes
 * turn into spaces, and any remaining lowercase letter at a word boundary is
 * upper-cased.
 *
 * @param {string} modelId - Model identifier, e.g. "openai/gpt-5-codex".
 * @returns {string} Display label, e.g. "GPT 5 Codex".
 */
function modelDisplayName(modelId) {
  // Equivalent to taking the last element of modelId.split("/").
  const shortName = modelId.slice(modelId.lastIndexOf("/") + 1);
  const rewrites = [
    [/^gpt-/i, "GPT-"],
    [/codex/gi, "Codex"],
    [/mini/gi, "Mini"],
    [/max/gi, "Max"],
    [/spark/gi, "Spark"],
    [/-/g, " "],
  ];
  let label = shortName;
  for (const [pattern, replacement] of rewrites) {
    label = label.replace(pattern, replacement);
  }
  return label.replace(/\b([a-z])/g, (_match, letter) => letter.toUpperCase());
}
|
|
119
|
+
|
|
120
|
+
/**
 * Map a failed OpenCode run to a `{ type, reason }` failure descriptor.
 *
 * Checks are applied in priority order: timeout, model-not-found, non-zero
 * exit (with or without a reported event error), and finally a mode-specific
 * fallback for runs that exited cleanly.
 *
 * @param {object} run - Run result; reads `timedOut`, `modelNotFound`,
 *   `status` and `eventError.message`.
 * @param {string} mode - Edit mode being benchmarked.
 * @returns {{type: string, reason: string}} Failure classification.
 */
function classifyFailureReason(run, mode) {
  const describe = (type, reason) => ({ type, reason });

  if (run.timedOut) {
    return describe("timeout", "OpenCode run timed out");
  }
  if (run.modelNotFound) {
    return describe("model_not_found", "Model not found in current OpenCode provider config");
  }
  if (run.status !== 0) {
    return run.eventError
      ? describe("opencode_error", run.eventError.message)
      : describe("nonzero_exit", `OpenCode exited with status ${run.status}`);
  }
  return mode === "hashline_v2"
    ? describe("v2_no_json", "No valid hashline_v2 JSON response found")
    : describe("mode_signature_missing", `Did not observe expected ${mode} tool signature`);
}
|
|
138
|
+
|
|
139
|
+
/**
 * Return a promise that resolves (with no value) after a timer of `ms`
 * milliseconds fires.
 *
 * @param {number} ms - Delay passed to `setTimeout`.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((wake) => setTimeout(wake, ms));
}
|
|
144
|
+
|
|
145
|
+
/**
 * Decide whether a non-zero exit looks like a transient failure.
 *
 * Runs that are missing, succeeded, timed out, hit model-not-found, or
 * reported an event error never qualify. Otherwise the run qualifies when it
 * produced no events, produced no stdout/stderr text, or its output matches
 * the `service=models.dev` / `refreshing` pattern.
 *
 * @param {object|null|undefined} run - Run result to inspect.
 * @returns {boolean} `true` when the failure appears transient.
 */
function isTransientNonzeroExit(run) {
  const eligible = Boolean(run)
    && run.status !== 0
    && !run.timedOut
    && !run.modelNotFound
    && !run.eventError;
  if (!eligible) {
    return false;
  }

  const output = `${run.stdout ?? ""}\n${run.stderr ?? ""}`;
  if (!Array.isArray(run.events) || run.events.length === 0) {
    return true;
  }
  if (output.trim().length === 0) {
    return true;
  }
  return /service=models\.dev|refreshing/i.test(output);
}
|
|
155
|
+
|
|
156
|
+
/**
 * Run OpenCode for a requested model, falling back across model aliases and
 * retrying failures that look transient.
 *
 * Candidate models come from `aliasCandidatesForCodexModel(requestedModel)`.
 * When `availableModels` is a non-empty array, candidates are narrowed to
 * those it contains; if that leaves nothing, the unfiltered list is used.
 *
 * For each candidate, up to `transientRetries + 1` attempts are made:
 *  - success, timeout, or a reported event error returns immediately;
 *  - model-not-found is retried with a growing delay, then the next
 *    candidate is tried;
 *  - any other non-zero exit is retried only while
 *    `isTransientNonzeroExit` says so, otherwise it is returned as-is.
 *
 * If every candidate is exhausted, the last run is returned together with
 * the number of attempts actually made on that last candidate (previously
 * this was always reported as 1). If no run happened at all, a single run
 * with `requestedModel` is performed.
 *
 * NOTE(review): `runOpencodeJson` is called without `await`, so it is
 * presumably synchronous — confirm against its definition.
 *
 * @param {object} options
 * @param {string} options.executable - OpenCode executable, forwarded as-is.
 * @param {string} options.prompt - Prompt text, forwarded as-is.
 * @param {string} options.requestedModel - Model the caller asked for.
 * @param {string} options.variant - Forwarded as-is.
 * @param {string} options.agent - Forwarded as-is.
 * @param {string} options.cwd - Forwarded as-is.
 * @param {string} options.homeDir - Forwarded as-is.
 * @param {number} options.timeoutMs - Forwarded as-is.
 * @param {object} options.extraEnv - Forwarded as-is.
 * @param {string[]} [options.availableModels] - Optional allow-list used to
 *   narrow the alias candidates.
 * @param {number} [options.transientRetries] - Extra attempts per candidate.
 * @returns {Promise<{run: object, actualModel: string, attempts: number, aliasUsed: boolean}>}
 */
async function runOpencodeWithResilience({
  executable,
  prompt,
  requestedModel,
  variant,
  agent,
  cwd,
  homeDir,
  timeoutMs,
  extraEnv,
  availableModels,
  transientRetries = DEFAULT_TRANSIENT_RETRIES,
}) {
  const availableSet = Array.isArray(availableModels) && availableModels.length > 0 ? new Set(availableModels) : null;
  const preferredCandidates = aliasCandidatesForCodexModel(requestedModel);
  const filteredCandidates = availableSet
    ? preferredCandidates.filter((candidate) => availableSet.has(candidate))
    : preferredCandidates;
  // An allow-list that matches nothing must not leave us without candidates.
  const candidates = filteredCandidates.length > 0 ? filteredCandidates : preferredCandidates;

  const runWithModel = (model) =>
    runOpencodeJson({
      executable,
      prompt,
      model,
      variant,
      agent,
      cwd,
      homeDir,
      timeoutMs,
      extraEnv,
    });

  let lastRun = null;
  let lastModel = requestedModel;
  let lastAttempts = 0;

  for (const candidateModel of candidates) {
    const maxAttempts = transientRetries + 1;
    for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
      const run = runWithModel(candidateModel);
      lastRun = run;
      lastModel = candidateModel;
      lastAttempts = attempt;

      // Definitive outcomes: nothing to gain from retrying or switching alias.
      if (run.status === 0 || run.timedOut || run.eventError) {
        return { run, actualModel: candidateModel, attempts: attempt, aliasUsed: candidateModel !== requestedModel };
      }
      if (run.modelNotFound) {
        if (attempt < maxAttempts) {
          await sleep(500 * attempt);
          continue;
        }
        // Retries exhausted for this alias; move on to the next candidate.
        break;
      }
      if (attempt < maxAttempts && isTransientNonzeroExit(run)) {
        await sleep(500 * attempt);
        continue;
      }
      return { run, actualModel: candidateModel, attempts: attempt, aliasUsed: candidateModel !== requestedModel };
    }
  }

  if (lastRun) {
    return {
      run: lastRun,
      actualModel: lastModel,
      // Report the real attempt count on the last candidate, not a fixed 1.
      attempts: lastAttempts,
      aliasUsed: lastModel !== requestedModel,
    };
  }

  // No candidate was ever run (empty candidate list or no attempts allowed).
  return {
    run: runWithModel(requestedModel),
    actualModel: lastModel,
    attempts: 1,
    aliasUsed: lastModel !== requestedModel,
  };
}
|
|
232
|
+
|
|
233
|
+
/**
 * Case-insensitive check that a tool name ends with the given suffix.
 *
 * @param {*} tool - Tool name; anything that is not a string never matches.
 * @param {string} suffix - Suffix to look for.
 * @returns {boolean}
 */
function toolNameMatches(tool, suffix) {
  if (typeof tool !== "string") {
    return false;
  }
  return tool.toLowerCase().endsWith(suffix.toLowerCase());
}
|
|
236
|
+
|
|
237
|
+
/**
 * Tell whether a tool name is exactly "edit" or "apply_patch".
 *
 * @param {*} tool - Tool name to test.
 * @returns {boolean}
 */
function isEditFamilyTool(tool) {
  const editFamily = ["edit", "apply_patch"];
  return editFamily.includes(tool);
}
|
|
240
|
+
|
|
241
|
+
/**
 * Produce a single-line preview of some text.
 *
 * Every run of whitespace is replaced by one space and the result is cut to
 * at most 400 characters. Falsy input yields an empty string.
 *
 * @param {string|null|undefined} text - Text to preview.
 * @returns {string}
 */
function extractTextPreview(text) {
  return text ? text.replace(/\s+/g, " ").slice(0, 400) : "";
}
|
|
245
|
+
|
|
246
|
+
/**
 * Add two token-usage records field by field.
 *
 * A missing record, or a missing field on either record, counts as 0. Field
 * values are coerced with `Number()` before adding.
 *
 * @param {object|null|undefined} current - Running totals so far.
 * @param {object|null|undefined} incoming - Totals to add.
 * @returns {{total: number, input: number, output: number, reasoning: number,
 *   cacheRead: number, cacheWrite: number}|null} Combined totals, or `null`
 *   when both inputs are falsy.
 */
function mergeTokenTotals(current, incoming) {
  if (!current && !incoming) {
    return null;
  }
  const fields = ["total", "input", "output", "reasoning", "cacheRead", "cacheWrite"];
  const merged = {};
  for (const field of fields) {
    merged[field] = Number(current?.[field] ?? 0) + Number(incoming?.[field] ?? 0);
  }
  return merged;
}
|
|
261
|
+
|
|
262
|
+
/**
 * Add two optional durations.
 *
 * @param {*} left - First duration.
 * @param {*} right - Second duration.
 * @returns {number|null} `null` when neither argument is of type number;
 *   otherwise the sum, with a null/undefined side counted as 0 and the other
 *   side coerced with `Number()`.
 */
function sumNullableMs(left, right) {
  const hasNumber = typeof left === "number" || typeof right === "number";
  return hasNumber ? Number(left ?? 0) + Number(right ?? 0) : null;
}
|
|
268
|
+
|
|
269
|
+
/**
 * Aggregate timing and call-count metrics over the tool events of one run.
 *
 * Tool events are bucketed three ways: edit-family calls (per
 * `isEditFamilyTool`), `hashline_read` calls, and filesystem edit calls
 * (tool name ending in "edit_file"). The "target" edit call, whose duration
 * is reported, depends on the mode:
 *  - "replace": first edit-family call with a string `oldString` and no `lineRef`;
 *  - "hashline": first edit-family call with a string `lineRef`;
 *  - "patch": first filesystem edit call;
 *  - any other mode: none.
 *
 * @param {string} mode - Edit mode being benchmarked.
 * @param {Array<{tool: *, durationMs?: number, input?: object}>} tools -
 *   Tool events in call order.
 * @returns {object} Metrics: `totalToolMs`, `toolCount`, `editFamilyCallCount`,
 *   `filesystemEditCallCount`, `hashlineReadCallCount`, `hashlineReadTotalMs`,
 *   `targetEditCallMs` (or `null`), `targetEditCallCount`, `toolSequence`.
 */
function summarizeToolMetrics(mode, tools) {
  const editFamilyCalls = [];
  const filesystemEditCalls = [];
  const toolSequence = [];
  let totalToolMs = 0;
  let hashlineReadCallCount = 0;
  let hashlineReadTotalMs = 0;

  // Single pass over the events instead of one filter/reduce per metric.
  for (const call of tools) {
    const elapsed = call.durationMs ?? 0;
    totalToolMs += elapsed;
    toolSequence.push(call.tool);
    if (isEditFamilyTool(call.tool)) {
      editFamilyCalls.push(call);
    }
    if (call.tool === "hashline_read") {
      hashlineReadCallCount += 1;
      hashlineReadTotalMs += elapsed;
    }
    if (toolNameMatches(call.tool, "edit_file")) {
      filesystemEditCalls.push(call);
    }
  }

  let targetEditCall = null;
  let targetEditCallCount = 0;
  switch (mode) {
    case "replace":
      targetEditCall = editFamilyCalls.find((call) => typeof call.input?.oldString === "string" && !call.input?.lineRef) ?? null;
      targetEditCallCount = editFamilyCalls.length;
      break;
    case "hashline":
      targetEditCall = editFamilyCalls.find((call) => typeof call.input?.lineRef === "string") ?? null;
      targetEditCallCount = editFamilyCalls.length;
      break;
    case "patch":
      targetEditCall = filesystemEditCalls[0] ?? null;
      targetEditCallCount = filesystemEditCalls.length;
      break;
    default:
      break;
  }

  return {
    totalToolMs,
    toolCount: tools.length,
    editFamilyCallCount: editFamilyCalls.length,
    filesystemEditCallCount: filesystemEditCalls.length,
    hashlineReadCallCount,
    hashlineReadTotalMs,
    targetEditCallMs: targetEditCall?.durationMs ?? null,
    targetEditCallCount,
    toolSequence,
  };
}
|
|
301
|
+
|
|
302
|
+
/**
 * Judge one tool-driven benchmark run (patch / replace / hashline modes).
 *
 * A run passes only when the tool calls carry the signature expected for the
 * mode, the file content changed, and the task validator accepts the result.
 *
 * Expected signatures:
 *  - "patch": at least one filesystem edit call;
 *  - "replace": an edit-family call with a string `oldString` and no `lineRef`;
 *  - "hashline": at least one `hashline_read` plus an edit-family call with a
 *    string `lineRef`.
 * `fallbackUsed` flags tool calls that belong to a different mode's family.
 * A missing signature marks the run as unsupported; otherwise an unchanged
 * file or a failed validator sets the failure type.
 *
 * @param {string} mode - Edit mode being benchmarked.
 * @param {Array<{tool: *, input?: object}>} tools - Tool events of the run.
 * @param {string} finalContent - File content after the run.
 * @param {{id: string, originalContent: string, validate: Function}} task -
 *   Task definition.
 * @returns {object} Verdict with `pass`, `supported`, `signatureOk`,
 *   `fallbackUsed`, `firstTrySuccess`, `failureType`, `failureReason`,
 *   `validationPass`, `fileChanged` and `toolMetrics`.
 */
function classifyToolMode(mode, tools, finalContent, task) {
  const toolMetrics = summarizeToolMetrics(mode, tools);
  const validationPass = task.validate(finalContent);
  const fileChanged = finalContent !== task.originalContent;
  const observedTools = toolMetrics.toolSequence.join(", ") || "none";

  const isLineRefEdit = (call) => isEditFamilyTool(call.tool) && typeof call.input?.lineRef === "string";
  const isOldStringEdit = (call) => isEditFamilyTool(call.tool) && typeof call.input?.oldString === "string";

  let signatureOk = false;
  let fallbackUsed = false;
  let failureType = null;
  let failureReason = null;

  switch (mode) {
    case "patch": {
      signatureOk = toolMetrics.filesystemEditCallCount > 0;
      fallbackUsed = toolMetrics.editFamilyCallCount > 0 || toolMetrics.hashlineReadCallCount > 0;
      if (!signatureOk) {
        failureType = "wrong_tool_family";
        failureReason = `Patch mode expected filesystem edit tool; saw: ${observedTools}`;
      }
      break;
    }
    case "replace": {
      signatureOk = tools.some((call) => isOldStringEdit(call) && !call.input?.lineRef);
      fallbackUsed = toolMetrics.hashlineReadCallCount > 0 || tools.some((call) => isLineRefEdit(call));
      if (!signatureOk) {
        failureType = "missing_legacy_signature";
        failureReason = `Replace mode expected oldString/newString edit signature; saw: ${observedTools}`;
      }
      break;
    }
    case "hashline": {
      const hasLineRefEdit = tools.some((call) => isLineRefEdit(call));
      signatureOk = toolMetrics.hashlineReadCallCount > 0 && hasLineRefEdit;
      fallbackUsed = tools.some((call) => isOldStringEdit(call));
      if (!signatureOk) {
        failureType = "missing_hashline_signature";
        failureReason = `Hashline mode expected hashline_read + lineRef edit; saw: ${observedTools}`;
      }
      break;
    }
    default:
      break;
  }

  // At this point a failure type is set only by a missing mode signature.
  const supported = failureType === null;

  if (supported) {
    if (!fileChanged) {
      failureType = "file_unchanged";
      failureReason = "File content was unchanged after run";
    } else if (!validationPass) {
      failureType = "validation_failed";
      failureReason = `Task validator failed for ${task.id}`;
    }
  }

  const pass = supported && signatureOk && fileChanged && validationPass;
  const singleTargetEdit = mode === "patch"
    ? toolMetrics.filesystemEditCallCount === 1
    : toolMetrics.targetEditCallCount === 1;
  const firstTrySuccess = pass && singleTargetEdit;

  return {
    pass,
    supported,
    signatureOk,
    fallbackUsed,
    firstTrySuccess,
    failureType,
    failureReason,
    validationPass,
    fileChanged,
    toolMetrics,
  };
}
|
|
374
|
+
|
|
375
|
+
/**
 * Build the single-line prompt sent to OpenCode for a tool-driven edit mode.
 *
 * Every prompt opens with the same scope sentence and the task line, followed
 * by mode-specific tool instructions. "patch" and "replace" have their own
 * instructions; any other mode receives the hashline instructions.
 *
 * @param {string} mode - Edit mode being benchmarked.
 * @param {{prompt: string}} task - Task whose `prompt` is embedded.
 * @returns {string} Prompt sentences joined with single spaces.
 */
function buildToolPrompt(mode, task) {
  const scopeLine = "Edit only src/TodoApp.tsx in the working directory and then stop.";
  const taskLine = `Task: ${task.prompt}`;

  let toolGuidance;
  switch (mode) {
    case "patch":
      toolGuidance = [
        "Use filesystem patch/edit tools only (for example filesystem_edit_file).",
        "Do not use plugin edit/apply_patch and do not use hashline_read.",
        "Make the changes directly and return DONE.",
      ];
      break;
    case "replace":
      toolGuidance = [
        "Use plugin edit/apply_patch legacy mode with oldString/newString only.",
        "Do not use hashline_read and do not use lineRef/endLineRef.",
        "Return DONE after editing.",
      ];
      break;
    default:
      toolGuidance = [
        "Use hashline mode: first call hashline_read on src/TodoApp.tsx, then use edit/apply_patch with lineRef (and endLineRef if needed).",
        "Do not use oldString/newString legacy mode.",
        "Return DONE after editing.",
      ];
      break;
  }

  return [scopeLine, taskLine, ...toolGuidance].join(" ");
}
|
|
403
|
+
|
|
404
|
+
/**
 * Builds the multi-line prompt used for the first hashline_v2 attempt.
 *
 * The prompt consists of five sections separated by one blank line: the
 * ground rules, the format description, the tagged file content, the task,
 * and the expected output skeleton.
 *
 * @param {string} v2Prompt - Text describing the hashline_v2 format.
 * @param {{ prompt: string }} task - Task whose `prompt` text is embedded.
 * @param {string} taggedContent - File content placed inside a fenced block.
 * @returns {string} The prompt, lines joined with "\n".
 */
function buildHashlineV2Prompt(v2Prompt, task, taggedContent) {
	const fence = "```";

	const groundRules = [
		"You are benchmarking a code edit format named hashline_v2.",
		"This is a format benchmark. The file content is already provided below.",
		"If your runtime policy requires tools, tool usage is allowed.",
		"Final answer must include exactly one JSON code block with the edit call and no explanation.",
		"Also repeat the same JSON between BEGIN_V2_JSON and END_V2_JSON markers.",
		"Use real tags from the file content below. Do not invent placeholder tags (for example ???).",
	];
	const formatSection = ["## hashline_v2 format", v2Prompt];
	const fileSection = ["## Current file: src/TodoApp.tsx", fence, taggedContent, fence];
	const taskSection = ["## Task", task.prompt];
	const outputSection = [
		"## Output format",
		"BEGIN_V2_JSON",
		"```json",
		'{ "path": "src/TodoApp.tsx", "edits": [ ... ] }',
		fence,
		"END_V2_JSON",
	];

	// Joining sections with "\n\n" produces the single empty line between them.
	return [groundRules, formatSection, fileSection, taskSection, outputSection]
		.map((section) => section.join("\n"))
		.join("\n\n");
}
|
|
432
|
+
|
|
433
|
+
/**
 * Builds the follow-up prompt sent when the first hashline_v2 answer did not
 * contain usable JSON.
 *
 * The prompt repeats the format description, file content and task, quotes
 * the previous output (or "(empty)" when it is falsy), and ends with the
 * required output skeleton. Sections are separated by one blank line.
 *
 * @param {string} v2Prompt - Text describing the hashline_v2 format.
 * @param {{ prompt: string }} task - Task whose `prompt` text is embedded.
 * @param {string} taggedContent - File content placed inside a fenced block.
 * @param {string} previousOutput - The earlier, invalid model output.
 * @returns {string} The prompt, lines joined with "\n".
 */
function buildHashlineV2RepairPrompt(v2Prompt, task, taggedContent, previousOutput) {
	const fence = "```";

	const groundRules = [
		"Your previous response was not valid hashline_v2 JSON.",
		"Return valid hashline_v2 JSON now.",
		"If your runtime policy requires tools, tool usage is allowed.",
		"Final answer must include exactly one JSON code block and the same JSON between BEGIN_V2_JSON and END_V2_JSON.",
		"No explanation text.",
	];
	const formatSection = ["## hashline_v2 format", v2Prompt];
	const fileSection = ["## Current file: src/TodoApp.tsx", fence, taggedContent, fence];
	const taskSection = ["## Task", task.prompt];
	const previousSection = ["## Previous invalid output", fence, previousOutput || "(empty)", fence];
	const requiredSection = [
		"## Required output",
		"BEGIN_V2_JSON",
		"```json",
		'{ "path": "src/TodoApp.tsx", "edits": [ ... ] }',
		fence,
		"END_V2_JSON",
	];

	// Joining sections with "\n\n" produces the single empty line between them.
	return [groundRules, formatSection, fileSection, taskSection, previousSection, requiredSection]
		.map((section) => section.join("\n"))
		.join("\n\n");
}
|
|
465
|
+
|
|
466
|
+
/**
 * Populates a benchmark workspace: writes the fixture to `src/TodoApp.tsx`
 * and an `opencode.json` whose `plugin` list points at the built plugin
 * directory.
 *
 * @param {string} workspaceDir - Directory to populate; `src/` is created recursively.
 * @param {string} fixtureContent - Content written to `src/TodoApp.tsx`.
 * @returns {Promise<void>}
 */
async function ensureWorkspace(workspaceDir, fixtureContent) {
	const sourceDir = join(workspaceDir, "src");
	await mkdir(sourceDir, { recursive: true });

	const fixturePath = join(sourceDir, "TodoApp.tsx");
	await writeFile(fixturePath, fixtureContent, "utf8");

	const configPath = join(workspaceDir, "opencode.json");
	const configJson = JSON.stringify({ plugin: [toFileUri(DIST_PLUGIN_DIR)] }, null, 2);
	await writeFile(configPath, `${configJson}\n`, "utf8");
}
|
|
475
|
+
|
|
476
|
+
/**
 * Reads the benchmark fixture file from a workspace as UTF-8 text.
 *
 * @param {string} workspaceDir - Workspace root that contains the fixture.
 * @returns {Promise<string>} Content of `BENCHMARK_FIXTURE.relativePath` inside the workspace.
 */
async function readWorkspaceFixture(workspaceDir) {
	const fixturePath = join(workspaceDir, BENCHMARK_FIXTURE.relativePath);
	return readFile(fixturePath, "utf8");
}
|
|
479
|
+
|
|
480
|
+
/**
 * Selects the run records that belong to one benchmark phase.
 *
 * @param {Array<{ phase: string }>} runRecords - All collected run records.
 * @param {string} phase - Phase to keep (compared with strict equality).
 * @returns {Array<object>} New array with the matching records in their original order.
 */
function extractPhaseRuns(runRecords, phase) {
	const belongsToPhase = ({ phase: recordPhase }) => recordPhase === phase;
	return runRecords.filter(belongsToPhase);
}
|
|
483
|
+
|
|
484
|
+
/**
 * Aggregates the run records of one mode into summary metrics.
 *
 * Only records whose `phase` is "measured" contribute to counts, percentages
 * and distribution statistics; warmup records are merely counted.
 *
 * @param {string} mode - Mode name, echoed back in the result.
 * @param {Array<object>} records - Run records for this mode (all phases).
 * @returns {object} Counts, rounded percentages, p50/p95/mean figures (or
 *   `null` when `stats` yields nothing) and a `failureTypes` tally whose keys
 *   are sorted with `localeCompare`.
 */
function modeRunMetrics(mode, records) {
	const measured = records.filter(({ phase }) => phase === "measured");
	const measuredTotal = measured.length;

	// Small local helpers keep the repeated filter/map/round patterns in one place.
	const countWhere = (predicate) => measured.filter(predicate).length;
	const statsOf = (pick) => stats(measured.map(pick));
	const percentOfMeasured = (count) => round1(safePercent(count, measuredTotal) ?? NaN);

	const passCount = countWhere((record) => record.pass);
	const supportedCount = countWhere((record) => record.supported !== false);
	const firstTrySuccessCount = countWhere((record) => record.firstTrySuccess);
	const fallbackCount = countWhere((record) => record.fallbackUsed);
	const unsupportedCount = countWhere((record) => record.supported === false);
	const timeoutCount = countWhere((record) => record.failureType === "timeout");

	const wallStats = statsOf((record) => record.wallMs);
	const sessionStats = statsOf((record) => record.sessionMs);
	const totalToolStats = statsOf((record) => record.totalToolMs);
	const editCallStats = statsOf((record) => record.targetEditCallMs);
	const hashlineReadStats = statsOf((record) => record.hashlineReadTotalMs);
	const tokenTotalStats = statsOf((record) => record.tokens?.total);
	const toolCountStats = statsOf((record) => record.toolCount);

	// Counts failed measured runs per failure type, with keys in sorted order.
	const tallyFailureTypes = () => {
		const tally = {};
		for (const record of measured) {
			if (record.pass || !record.failureType) {
				continue;
			}
			tally[record.failureType] = (tally[record.failureType] ?? 0) + 1;
		}
		const sortedEntries = Object.entries(tally).sort(([left], [right]) => left.localeCompare(right));
		return Object.fromEntries(sortedEntries);
	};

	return {
		mode,
		measuredRuns: measuredTotal,
		warmupRuns: records.filter(({ phase }) => phase === "warmup").length,
		passCount,
		failCount: measuredTotal - passCount,
		supportedCount,
		unsupportedCount,
		timeoutCount,
		accuracyPct: percentOfMeasured(passCount),
		firstTrySuccessPct: percentOfMeasured(firstTrySuccessCount),
		fallbackRatePct: percentOfMeasured(fallbackCount),
		wallMsP50: wallStats?.p50 ?? null,
		wallMsP95: wallStats?.p95 ?? null,
		sessionMsP50: sessionStats?.p50 ?? null,
		sessionMsP95: sessionStats?.p95 ?? null,
		totalToolMsP50: totalToolStats?.p50 ?? null,
		totalToolMsP95: totalToolStats?.p95 ?? null,
		editCallMsP50: editCallStats?.p50 ?? null,
		editCallMsP95: editCallStats?.p95 ?? null,
		hashlineReadMsP50: hashlineReadStats?.p50 ?? null,
		hashlineReadMsP95: hashlineReadStats?.p95 ?? null,
		tokensTotalP50: tokenTotalStats?.p50 ?? null,
		tokensTotalP95: tokenTotalStats?.p95 ?? null,
		toolCountAvg: toolCountStats?.mean ?? null,
		failureTypes: tallyFailureTypes(),
	};
}
|
|
537
|
+
|
|
538
|
+
/**
 * Assembles the final benchmark summary object.
 *
 * For every distinct `modelId` found in `runRecords` it computes per-mode
 * metrics via `modeRunMetrics` and the accuracy/token deltas of the
 * `hashline` and `hashline_v2` modes against `patch` and `replace`.
 *
 * @param {object} params
 * @param {object} params.options - Parsed CLI options (echoed into `meta`).
 * @param {Array<object>} params.runRecords - All run records (warmup and measured).
 * @param {Array<object>} params.failures - Failure entries, passed through unchanged.
 * @param {string} params.startTime - Benchmark start timestamp.
 * @param {string} params.endTime - Benchmark end timestamp.
 * @param {{ command: string, shell: * }} params.executable - Resolved opencode executable.
 * @returns {{ meta: object, failures: Array<object>, rows: Array<object>, runs: Array<object>, warmupRuns: Array<object> }}
 */
function aggregateSummary({ options, runRecords, failures, startTime, endTime, executable }) {
	const measuredRuns = extractPhaseRuns(runRecords, "measured");
	const warmupRuns = extractPhaseRuns(runRecords, "warmup");

	// Missing modes contribute NaN so a delta is never computed from partial data.
	const deltasAgainstBaselines = (candidate, patch, replace) => ({
		accuracyVsPatch: round1((candidate?.accuracyPct ?? NaN) - (patch?.accuracyPct ?? NaN)),
		accuracyVsReplace: round1((candidate?.accuracyPct ?? NaN) - (replace?.accuracyPct ?? NaN)),
		tokensVsReplacePct: round1(pctDelta(candidate?.tokensTotalP50 ?? NaN, replace?.tokensTotalP50 ?? NaN)),
	});

	const distinctModelIds = [...new Set(runRecords.map((record) => record.modelId))];
	const rows = distinctModelIds.map((modelId) => {
		const byMode = {};
		for (const mode of options.modes) {
			const recordsForMode = runRecords.filter(
				(record) => record.modelId === modelId && record.mode === mode,
			);
			byMode[mode] = modeRunMetrics(mode, recordsForMode);
		}

		const { patch, replace, hashline, hashline_v2: hashlineV2 } = byMode;
		return {
			modelId,
			displayName: modelDisplayName(modelId),
			modes: byMode,
			deltas: {
				hashline: deltasAgainstBaselines(hashline, patch, replace),
				hashline_v2: deltasAgainstBaselines(hashlineV2, patch, replace),
			},
		};
	});

	const meta = {
		generatedAt: new Date().toISOString(),
		benchmarkStartedAt: startTime,
		benchmarkFinishedAt: endTime,
		preset: options.preset,
		label: options.label,
		models: options.models,
		tasks: options.tasks.map((task) => task.id),
		modes: options.modes,
		agent: options.agent,
		hashlineV2Agent: options.v2Agent,
		variant: options.variant || null,
		warmupRunsPerTask: options.warmupRuns,
		measuredRunsPerTask: options.measuredRuns,
		timeoutMs: options.timeoutMs,
		outputRoot: options.outputRoot,
		runDir: options.runDir,
		homeDir: options.homeDir || null,
		opencodeCommand: executable.command,
		opencodeUsesShell: executable.shell,
		keepRawLogs: options.keepRawLogs,
		runCount: measuredRuns.length,
		warmupCount: warmupRuns.length,
	};

	return {
		meta,
		failures,
		rows,
		runs: measuredRuns,
		warmupRuns,
	};
}
|
|
606
|
+
|
|
607
|
+
/**
 * Derives the identifier of a single run from its coordinates.
 *
 * The id has the shape `<model-slug>__<mode>__<taskId>__<phase><NN>`, where
 * `NN` is `repeatIndex` left-padded with zeros to at least two characters.
 *
 * @param {object} params
 * @param {string} params.modelId - Model identifier; passed through `slugify`.
 * @param {string} params.mode - Edit mode.
 * @param {string} params.taskId - Task identifier.
 * @param {number} params.repeatIndex - Repeat counter within the phase.
 * @param {string} params.phase - Phase name.
 * @returns {string} The run identifier.
 */
function buildRunId({ modelId, mode, taskId, repeatIndex, phase }) {
	const modelSlug = slugify(modelId);
	const paddedRepeat = String(repeatIndex).padStart(2, "0");
	return `${modelSlug}__${mode}__${taskId}__${phase}${paddedRepeat}`;
}
|
|
610
|
+
|
|
611
|
+
/**
 * Writes the raw output of a run to `<logDir>/<runId>.ndjson` as UTF-8.
 *
 * @param {string} logDir - Directory that receives the log file.
 * @param {string} runId - Run identifier used as the file's base name.
 * @param {string} text - Content to write.
 * @returns {Promise<string>} Path of the written file.
 */
async function writeRawLog(logDir, runId, text) {
	const logFile = join(logDir, `${runId}.ndjson`);
	await writeFile(logFile, text, "utf8");
	return logFile;
}
|
|
616
|
+
|
|
617
|
+
/**
 * Creates a run record pre-filled with pessimistic defaults.
 *
 * A fresh record counts as not passed but supported, with no timings, no
 * tokens and an empty tool sequence; callers overwrite fields as a run
 * progresses. Property order matches the serialized record layout.
 *
 * @param {object} params
 * @param {string} params.modelId - Requested model; also the initial `actualModelId`.
 * @param {string} params.mode - Edit mode.
 * @param {{ id: string, name: string, difficulty: * }} params.task - Task being run.
 * @param {string} params.phase - Phase name.
 * @param {number} params.repeatIndex - Repeat counter within the phase.
 * @param {string} [params.variant] - Variant; falsy values are stored as `null`.
 * @param {string} params.runId - Identifier stored as `id`.
 * @returns {object} The new record.
 */
function baseRecord({ modelId, mode, task, phase, repeatIndex, variant, runId }) {
	const identity = {
		id: runId,
		modelId,
		modelDisplayName: modelDisplayName(modelId),
		requestedModelId: modelId,
		actualModelId: modelId,
		modelAliasUsed: false,
		attempts: 1,
	};
	const coordinates = {
		mode,
		taskId: task.id,
		taskName: task.name,
		difficulty: task.difficulty,
		phase,
		repeatIndex,
		variant: variant || null,
	};
	const verdict = {
		pass: false,
		supported: true,
		signatureOk: false,
		fallbackUsed: false,
		formatFallbackUsed: false,
		firstTrySuccess: false,
		failureType: null,
		failureReason: null,
	};
	const measurements = {
		wallMs: null,
		sessionMs: null,
		totalToolMs: null,
		targetEditCallMs: null,
		hashlineReadTotalMs: null,
		toolCount: 0,
		tokens: null,
		toolSequence: [],
	};
	const artifacts = {
		rawLogPath: null,
		textPreview: "",
		fileChanged: false,
		validationPass: false,
		eventError: null,
	};

	// Spread order fixes the key order of the resulting record.
	return { ...identity, ...coordinates, ...verdict, ...measurements, ...artifacts };
}
|
|
656
|
+
|
|
657
|
+
/**
 * Executes one benchmark run for a tool-driven edit mode and returns its record.
 *
 * Steps: recreate the run's workspace, invoke opencode with the mode prompt,
 * copy timing/token/tool metrics into the record, optionally persist the raw
 * log, and then either report the process-level failure or classify the
 * resulting file via `classifyToolMode`.
 *
 * @param {object} params
 * @param {string} params.mode - Edit mode passed to `buildToolPrompt` and `classifyToolMode`.
 * @param {string} params.modelId - Requested model.
 * @param {object} params.task - Task definition (`id`, optional `timeoutMs`, ...).
 * @param {string} params.phase - Phase name; "measured" runs always keep their raw log.
 * @param {number} params.repeatIndex - Repeat counter within the phase.
 * @param {object} params.options - Parsed CLI options.
 * @param {object} params.executable - Resolved opencode executable.
 * @param {string} params.fixtureContent - Pristine fixture content.
 * @returns {Promise<object>} The completed run record.
 */
async function runToolMode({ mode, modelId, task, phase, repeatIndex, options, executable, fixtureContent }) {
	const runId = buildRunId({ modelId, mode, taskId: task.id, repeatIndex, phase });
	const workspaceDir = join(options.workspacesDir, runId);

	// Remove any existing directory for this run id before repopulating it.
	await rm(workspaceDir, { recursive: true, force: true });
	await ensureWorkspace(workspaceDir, fixtureContent);

	const invocation = await runOpencodeWithResilience({
		executable,
		prompt: buildToolPrompt(mode, task),
		requestedModel: modelId,
		variant: options.variant,
		agent: options.agent,
		cwd: workspaceDir,
		homeDir: options.homeDir,
		// Use whichever is larger: the task's own timeout or the global one.
		timeoutMs: Math.max(task.timeoutMs ?? 0, options.timeoutMs),
		availableModels: options.availableModels,
		extraEnv: {
			ENABLE_PLUGIN_REQUEST_LOGGING: "0",
			CODEX_PLUGIN_LOG_BODIES: "0",
			DEBUG_CODEX_PLUGIN: "0",
		},
	});
	const { run } = invocation;

	const record = Object.assign(
		baseRecord({ modelId, mode, task, phase, repeatIndex, variant: options.variant, runId }),
		{
			actualModelId: invocation.actualModel,
			modelAliasUsed: invocation.aliasUsed,
			attempts: invocation.attempts,
			wallMs: run.wallMs,
			sessionMs: getSessionDuration(run.events),
			tokens: getTokenTotals(run.events),
			eventError: run.eventError,
			textPreview: extractTextPreview(getTextOutput(run.events)),
		},
	);

	const tools = getToolEvents(run.events);
	const { totalToolMs, targetEditCallMs, hashlineReadTotalMs, toolCount, toolSequence } =
		summarizeToolMetrics(mode, tools);
	Object.assign(record, { totalToolMs, targetEditCallMs, hashlineReadTotalMs, toolCount, toolSequence });

	const runLooksBroken = run.status !== 0 || run.modelNotFound || run.eventError;
	if (options.keepRawLogs || phase === "measured" || runLooksBroken) {
		record.rawLogPath = await writeRawLog(options.logsDir, runId, run.stdout);
	}

	if (runLooksBroken || run.timedOut) {
		const { type, reason } = classifyFailureReason(run, mode);
		// Only a missing model marks the run as unsupported.
		record.supported = !run.modelNotFound;
		record.failureType = type;
		record.failureReason = reason;
		return record;
	}

	const finalContent = await readWorkspaceFixture(workspaceDir);
	const classification = classifyToolMode(mode, tools, finalContent, {
		...task,
		originalContent: fixtureContent,
	});

	const classifiedFields = [
		"pass",
		"supported",
		"signatureOk",
		"fallbackUsed",
		"firstTrySuccess",
		"failureType",
		"failureReason",
		"fileChanged",
		"validationPass",
	];
	for (const field of classifiedFields) {
		record[field] = classification[field];
	}
	return record;
}
|
|
729
|
+
|
|
730
|
+
/**
 * Executes one benchmark run for the text-only `hashline_v2` format.
 *
 * The model receives the tagged file content and must answer with a JSON edit
 * call. If no JSON code block is found, a single repair prompt is sent and its
 * metrics are merged into the record. The parsed call is then applied to the
 * pristine fixture in memory, validated with `task.validate`, and the result
 * is written into the workspace.
 *
 * `firstTrySuccess` is true only when the run passed without the repair pass.
 *
 * @param {object} params
 * @param {string} params.modelId - Requested model.
 * @param {object} params.task - Task definition (`id`, `validate`, optional `timeoutMs`, ...).
 * @param {string} params.phase - Phase name; "measured" runs always keep their raw log.
 * @param {number} params.repeatIndex - Repeat counter within the phase.
 * @param {object} params.options - Parsed CLI options.
 * @param {object} params.executable - Resolved opencode executable.
 * @param {string} params.fixtureContent - Pristine fixture content.
 * @param {string} params.v2Prompt - Text describing the hashline_v2 format.
 * @returns {Promise<object>} The completed run record.
 */
async function runHashlineV2Mode({ modelId, task, phase, repeatIndex, options, executable, fixtureContent, v2Prompt }) {
	const mode = "hashline_v2";
	const runId = buildRunId({ modelId, mode, taskId: task.id, repeatIndex, phase });
	const workspaceDir = join(options.workspacesDir, runId);
	await rm(workspaceDir, { recursive: true, force: true });
	await ensureWorkspace(workspaceDir, fixtureContent);

	// The initial and the repair invocation differ only in their prompt.
	const invokeOpencode = (prompt) => runOpencodeWithResilience({
		executable,
		prompt,
		requestedModel: modelId,
		variant: options.variant,
		agent: options.v2Agent,
		cwd: workspaceDir,
		homeDir: options.homeDir,
		timeoutMs: Math.max(task.timeoutMs ?? 0, options.timeoutMs),
		availableModels: options.availableModels,
		extraEnv: {
			ENABLE_PLUGIN_REQUEST_LOGGING: "0",
			CODEX_PLUGIN_LOG_BODIES: "0",
			DEBUG_CODEX_PLUGIN: "0",
		},
	});
	// Stand-in used when no usable edit call could be extracted or parsed.
	const emptyEditCall = () => ({ path: BENCHMARK_FIXTURE.relativePath, edits: [] });

	const taggedContent = formatFileForHashlineV2(BENCHMARK_FIXTURE.relativePath, fixtureContent);
	const prompt = buildHashlineV2Prompt(v2Prompt, task, taggedContent);
	const { run, actualModel, attempts, aliasUsed } = await invokeOpencode(prompt);

	const record = baseRecord({ modelId, mode, task, phase, repeatIndex, variant: options.variant, runId });
	record.actualModelId = actualModel;
	record.modelAliasUsed = aliasUsed;
	record.attempts = attempts;
	record.wallMs = run.wallMs;
	record.sessionMs = getSessionDuration(run.events);
	record.tokens = getTokenTotals(run.events);
	record.eventError = run.eventError;

	const tools = getToolEvents(run.events);
	const toolMetrics = summarizeToolMetrics(mode, tools);
	record.totalToolMs = toolMetrics.totalToolMs;
	// This mode applies the edit in-process, so no per-edit tool timing is recorded.
	record.targetEditCallMs = null;
	record.hashlineReadTotalMs = toolMetrics.hashlineReadTotalMs;
	record.toolCount = toolMetrics.toolCount;
	record.toolSequence = toolMetrics.toolSequence;

	const textOutput = getTextOutput(run.events);
	record.textPreview = extractTextPreview(textOutput);

	const shouldKeepLog = options.keepRawLogs || phase === "measured" || run.status !== 0 || run.modelNotFound || run.eventError;
	if (shouldKeepLog) {
		record.rawLogPath = await writeRawLog(options.logsDir, runId, run.stdout);
	}

	if (run.status !== 0 || run.modelNotFound || run.timedOut || run.eventError) {
		const failure = classifyFailureReason(run, mode);
		record.supported = !run.modelNotFound;
		record.failureType = failure.type;
		record.failureReason = failure.reason;
		return record;
	}

	// Any tool call counts as a fallback: the format is meant to be answered from the prompt alone.
	record.fallbackUsed = tools.length > 0;

	let effectiveOutput = textOutput;
	let jsonText = extractJsonCodeBlock(effectiveOutput);
	let repairUsed = false;

	if (!jsonText) {
		repairUsed = true;
		const repairPrompt = buildHashlineV2RepairPrompt(v2Prompt, task, taggedContent, textOutput);
		const repair = await invokeOpencode(repairPrompt);

		// Fold the repair pass into the same record so totals cover both invocations.
		record.attempts += repair.attempts;
		record.modelAliasUsed = record.modelAliasUsed || repair.aliasUsed;
		record.actualModelId = repair.actualModel;
		record.wallMs = sumNullableMs(record.wallMs, repair.run.wallMs);
		record.sessionMs = sumNullableMs(record.sessionMs, getSessionDuration(repair.run.events));
		record.tokens = mergeTokenTotals(record.tokens, getTokenTotals(repair.run.events));

		const repairTools = getToolEvents(repair.run.events);
		const repairToolMetrics = summarizeToolMetrics(mode, repairTools);
		record.totalToolMs = sumNullableMs(record.totalToolMs, repairToolMetrics.totalToolMs);
		record.hashlineReadTotalMs = sumNullableMs(record.hashlineReadTotalMs, repairToolMetrics.hashlineReadTotalMs);
		record.toolCount += repairToolMetrics.toolCount;
		record.toolSequence = [...record.toolSequence, ...repairToolMetrics.toolSequence];
		record.fallbackUsed = record.fallbackUsed || repairTools.length > 0;

		if (repair.run.status !== 0 || repair.run.modelNotFound || repair.run.timedOut || repair.run.eventError) {
			const failure = classifyFailureReason(repair.run, mode);
			record.supported = !repair.run.modelNotFound;
			record.failureType = failure.type;
			record.failureReason = `repair_pass: ${failure.reason}`;
			return record;
		}

		effectiveOutput = getTextOutput(repair.run.events);
		record.textPreview = extractTextPreview(effectiveOutput);
		jsonText = extractJsonCodeBlock(effectiveOutput);
	}

	let parsedCall;
	try {
		if (!jsonText) {
			record.fallbackUsed = true;
			record.formatFallbackUsed = true;
			parsedCall = emptyEditCall();
		} else {
			parsedCall = autocorrectHashlineV2Call(parseHashlineV2Call(jsonText), fixtureContent);
		}
	} catch {
		// Deliberate best-effort: unparseable JSON is scored as an empty edit call, not as a runner error.
		record.fallbackUsed = true;
		record.formatFallbackUsed = true;
		parsedCall = emptyEditCall();
	}

	if (![BENCHMARK_FIXTURE.relativePath, `./${BENCHMARK_FIXTURE.relativePath}`].includes(parsedCall.path)) {
		record.failureType = "v2_wrong_path";
		record.failureReason = `Unexpected path in v2 edit call: ${parsedCall.path}`;
		return record;
	}

	const applyResult = applyHashlineV2Edits(fixtureContent, parsedCall);
	if (!applyResult.ok) {
		record.failureType = "v2_apply_error";
		record.failureReason = applyResult.errors[0] ?? "Unknown apply error";
		return record;
	}

	const finalContent = applyResult.content;
	record.fileChanged = finalContent !== fixtureContent;
	record.validationPass = task.validate(finalContent);
	record.signatureOk = true;
	record.supported = true;
	record.fallbackUsed = record.fallbackUsed || record.formatFallbackUsed;
	record.pass = record.fileChanged && record.validationPass;
	// `applyResult.ok` is always true here, so it cannot be used as the first-try signal.
	record.firstTrySuccess = Boolean(record.pass) && !repairUsed;
	if (!record.fileChanged) {
		record.failureType = "file_unchanged";
		record.failureReason = "hashline_v2 edit call produced no file changes";
	} else if (!record.validationPass) {
		record.failureType = "validation_failed";
		record.failureReason = `Task validator failed for ${task.id}`;
	}

	await writeFile(join(workspaceDir, BENCHMARK_FIXTURE.relativePath), finalContent, "utf8");
	return record;
}
|
|
888
|
+
|
|
889
|
+
/**
 * Parses `process.argv` into the benchmark options object.
 *
 * `--help`/`-h` prints usage and exits with status 0. `--smoke` forces zero
 * warmup runs and one measured run, and (unless `--tasks` is given) limits
 * the run to the first four tasks. Unknown modes or task ids and an empty
 * model selection raise an `Error`.
 *
 * @returns {Promise<object>} Resolved options: flags, output directories,
 *   run counts, timeout, agents, variant, models, available models, modes and tasks.
 * @throws {Error} On an unsupported mode, an unknown task id, or when no model is selected.
 */
async function parseOptions() {
	const args = process.argv.slice(2);
	const hasFlag = (flag) => args.includes(flag);
	const valueOf = (flag) => parseArgValue(args, flag);
	const splitList = (raw) => raw.split(",").map((entry) => entry.trim()).filter(Boolean);

	if (hasFlag("--help") || hasFlag("-h")) {
		printUsage();
		process.exit(0);
	}

	const smoke = hasFlag("--smoke");
	const noDashboard = hasFlag("--no-dashboard");
	const keepRawLogs = hasFlag("--keep-raw-logs");
	const preset = valueOf("--preset") ?? DEFAULT_PRESET;

	const outputRoot = resolve(REPO_ROOT, valueOf("--output-root") ?? DEFAULT_OUTPUT_ROOT);
	const labelValue = valueOf("--label");
	// Without an explicit label, derive one from the preset and the current timestamp.
	const label = slugify(labelValue ?? `${preset}-${new Date().toISOString().replace(/[:.]/g, "-")}`);
	const runDir = join(outputRoot, label);
	const logsDir = join(runDir, "logs");
	const workspacesDir = join(runDir, "workspaces");
	const resultsDir = join(runDir, "results");
	const homeDir = valueOf("--home");

	// Smoke mode ignores --warmup-runs / --measured-runs entirely.
	let warmupRuns = 0;
	let measuredRuns = 1;
	if (!smoke) {
		warmupRuns = parseIntOption(valueOf("--warmup-runs"), 1, "--warmup-runs");
		measuredRuns = parseIntOption(valueOf("--measured-runs"), 1, "--measured-runs");
	}
	const timeoutMs = parseIntOption(valueOf("--timeout-ms"), DEFAULT_TIMEOUT_MS, "--timeout-ms");
	const maxTasks = parseIntOption(valueOf("--max-tasks"), 0, "--max-tasks");
	const agent = valueOf("--agent") ?? DEFAULT_AGENT;
	const v2Agent = valueOf("--v2-agent") ?? DEFAULT_V2_AGENT;

	// The literal value "none" disables the variant instead of falling back to the default.
	const variantRaw = valueOf("--variant");
	const variant = variantRaw === "none" ? "" : (variantRaw ?? DEFAULT_VARIANT);

	const modesRaw = valueOf("--modes");
	const modes = modesRaw ? splitList(modesRaw) : [...ALL_MODES];
	const unsupportedMode = modes.find((candidate) => !ALL_MODES.includes(candidate));
	if (unsupportedMode !== undefined) {
		throw new Error(`Unsupported mode: ${unsupportedMode}`);
	}

	const explicitModelsRaw = valueOf("--models");
	const explicitModels = explicitModelsRaw ? splitList(explicitModelsRaw) : [];
	const models = resolveModelPreset(preset, explicitModels);
	if (models.length === 0) {
		throw new Error("No models selected");
	}

	// Best effort: if the model listing fails, continue with an empty list.
	let availableModels;
	try {
		availableModels = listOpencodeModels();
	} catch {
		availableModels = [];
	}

	const taskMap = getTaskMap();
	const taskIdsRaw = valueOf("--tasks");
	let tasks = [...TASKS];
	if (taskIdsRaw) {
		tasks = splitList(taskIdsRaw).map((id) => {
			const found = taskMap.get(id);
			if (!found) {
				throw new Error(`Unknown task ID: ${id}`);
			}
			return found;
		});
	}
	if (smoke && !taskIdsRaw) {
		tasks = tasks.slice(0, 4);
	}
	if (maxTasks > 0 && tasks.length > maxTasks) {
		tasks = tasks.slice(0, maxTasks);
	}

	return {
		smoke,
		noDashboard,
		keepRawLogs,
		preset,
		label,
		outputRoot,
		runDir,
		logsDir,
		workspacesDir,
		resultsDir,
		homeDir,
		warmupRuns,
		measuredRuns,
		timeoutMs,
		agent,
		v2Agent,
		variant,
		models,
		availableModels,
		modes,
		tasks,
	};
}
|
|
988
|
+
|
|
989
|
+
/**
 * Persists the benchmark results.
 *
 * Writes `summary.json` and `report.md` into the results directory, plus
 * `dashboard.html` unless `options.noDashboard` is set, and refreshes
 * `latest.json` in the output root so it points at this run.
 *
 * @param {object} options - Parsed CLI options (`resultsDir`, `outputRoot`, `runDir`, `noDashboard`).
 * @param {object} summary - Summary object to serialize and render.
 * @returns {Promise<{ summaryPath: string, markdownPath: string, dashboardPath: (string|null) }>}
 *   Paths of the written files; `dashboardPath` is `null` when the dashboard was skipped.
 */
async function writeOutputs(options, summary) {
	await mkdir(options.resultsDir, { recursive: true });

	const inResults = (fileName) => join(options.resultsDir, fileName);
	const summaryPath = inResults("summary.json");
	const markdownPath = inResults("report.md");
	const dashboardPath = inResults("dashboard.html");
	const latestPath = join(options.outputRoot, "latest.json");

	await writeFile(summaryPath, `${JSON.stringify(summary, null, 2)}\n`, "utf8");
	await writeFile(markdownPath, `${buildMarkdownReport(summary)}\n`, "utf8");
	if (!options.noDashboard) {
		await writeFile(dashboardPath, renderDashboardHtml(summary), "utf8");
	}

	// latest.json sits in the output root, which may not exist yet.
	await mkdir(dirname(latestPath), { recursive: true });
	const latestPointer = { latestRunDir: options.runDir, summaryPath };
	await writeFile(latestPath, `${JSON.stringify(latestPointer, null, 2)}\n`, "utf8");

	return {
		summaryPath,
		markdownPath,
		dashboardPath: options.noDashboard ? null : dashboardPath,
	};
}
|
|
1006
|
+
|
|
1007
|
+
async function main() {
|
|
1008
|
+
const options = await parseOptions();
|
|
1009
|
+
const executable = resolveOpencodeExecutable();
|
|
1010
|
+
const startTime = new Date().toISOString();
|
|
1011
|
+
|
|
1012
|
+
if (!existsSync(FIXTURE_SOURCE_PATH)) {
|
|
1013
|
+
throw new Error(`Fixture not found: ${FIXTURE_SOURCE_PATH}`);
|
|
1014
|
+
}
|
|
1015
|
+
if (!existsSync(V2_PROMPT_PATH)) {
|
|
1016
|
+
throw new Error(`hashline_v2 prompt not found: ${V2_PROMPT_PATH}`);
|
|
1017
|
+
}
|
|
1018
|
+
if (!existsSync(join(DIST_PLUGIN_DIR, "index.js"))) {
|
|
1019
|
+
throw new Error(`Plugin dist build not found at ${join(DIST_PLUGIN_DIR, "index.js")}. Run npm run build first.`);
|
|
1020
|
+
}
|
|
1021
|
+
|
|
1022
|
+
const fixtureContent = await readFile(FIXTURE_SOURCE_PATH, "utf8");
|
|
1023
|
+
const v2Prompt = await readFile(V2_PROMPT_PATH, "utf8");
|
|
1024
|
+
|
|
1025
|
+
await mkdir(options.logsDir, { recursive: true });
|
|
1026
|
+
await mkdir(options.workspacesDir, { recursive: true });
|
|
1027
|
+
await mkdir(options.resultsDir, { recursive: true });
|
|
1028
|
+
|
|
1029
|
+
console.log("Code Edit Format Benchmark");
|
|
1030
|
+
console.log(`Repo: ${REPO_ROOT}`);
|
|
1031
|
+
console.log(`Preset: ${options.preset}`);
|
|
1032
|
+
console.log(`Models (${options.models.length}): ${options.models.join(", ")}`);
|
|
1033
|
+
console.log(`Tasks (${options.tasks.length}): ${options.tasks.map((task) => task.id).join(", ")}`);
|
|
1034
|
+
console.log(`Modes: ${options.modes.join(", ")}`);
|
|
1035
|
+
console.log(`Agent: ${options.agent}`);
|
|
1036
|
+
console.log(`V2 Agent: ${options.v2Agent}`);
|
|
1037
|
+
console.log(`Variant: ${options.variant || "(none)"}`);
|
|
1038
|
+
console.log(`Output: ${options.runDir}`);
|
|
1039
|
+
console.log(`OpenCode: ${executable.command}`);
|
|
1040
|
+
console.log("");
|
|
1041
|
+
|
|
1042
|
+
const runRecords = [];
|
|
1043
|
+
const failures = [];
|
|
1044
|
+
|
|
1045
|
+
const phases = [];
|
|
1046
|
+
for (let index = 0; index < options.warmupRuns; index += 1) {
|
|
1047
|
+
phases.push({ phase: "warmup", repeatIndex: index + 1 });
|
|
1048
|
+
}
|
|
1049
|
+
for (let index = 0; index < options.measuredRuns; index += 1) {
|
|
1050
|
+
phases.push({ phase: "measured", repeatIndex: index + 1 });
|
|
1051
|
+
}
|
|
1052
|
+
|
|
1053
|
+
const totalRuns = options.models.length * options.modes.length * options.tasks.length * phases.length;
|
|
1054
|
+
let currentRun = 0;
|
|
1055
|
+
|
|
1056
|
+
for (const modelId of options.models) {
|
|
1057
|
+
console.log(`=== ${modelId} ===`);
|
|
1058
|
+
for (const mode of options.modes) {
|
|
1059
|
+
for (const task of options.tasks) {
|
|
1060
|
+
for (const phaseEntry of phases) {
|
|
1061
|
+
currentRun += 1;
|
|
1062
|
+
const label = `[${currentRun}/${totalRuns}] ${modelId} | ${mode} | ${task.id} | ${phaseEntry.phase}#${phaseEntry.repeatIndex}`;
|
|
1063
|
+
process.stdout.write(`${label} ... `);
|
|
1064
|
+
|
|
1065
|
+
let record;
|
|
1066
|
+
try {
|
|
1067
|
+
if (mode === "hashline_v2") {
|
|
1068
|
+
record = await runHashlineV2Mode({
|
|
1069
|
+
modelId,
|
|
1070
|
+
task,
|
|
1071
|
+
phase: phaseEntry.phase,
|
|
1072
|
+
repeatIndex: phaseEntry.repeatIndex,
|
|
1073
|
+
options,
|
|
1074
|
+
executable,
|
|
1075
|
+
fixtureContent,
|
|
1076
|
+
v2Prompt,
|
|
1077
|
+
});
|
|
1078
|
+
} else {
|
|
1079
|
+
record = await runToolMode({
|
|
1080
|
+
mode,
|
|
1081
|
+
modelId,
|
|
1082
|
+
task,
|
|
1083
|
+
phase: phaseEntry.phase,
|
|
1084
|
+
repeatIndex: phaseEntry.repeatIndex,
|
|
1085
|
+
options,
|
|
1086
|
+
executable,
|
|
1087
|
+
fixtureContent,
|
|
1088
|
+
});
|
|
1089
|
+
}
|
|
1090
|
+
} catch (error) {
|
|
1091
|
+
record = baseRecord({
|
|
1092
|
+
modelId,
|
|
1093
|
+
mode,
|
|
1094
|
+
task,
|
|
1095
|
+
phase: phaseEntry.phase,
|
|
1096
|
+
repeatIndex: phaseEntry.repeatIndex,
|
|
1097
|
+
variant: options.variant,
|
|
1098
|
+
runId: buildRunId({
|
|
1099
|
+
modelId,
|
|
1100
|
+
mode,
|
|
1101
|
+
taskId: task.id,
|
|
1102
|
+
repeatIndex: phaseEntry.repeatIndex,
|
|
1103
|
+
phase: phaseEntry.phase,
|
|
1104
|
+
}),
|
|
1105
|
+
});
|
|
1106
|
+
record.supported = false;
|
|
1107
|
+
record.failureType = "runner_error";
|
|
1108
|
+
record.failureReason = error instanceof Error ? error.message : String(error);
|
|
1109
|
+
}
|
|
1110
|
+
|
|
1111
|
+
runRecords.push(record);
|
|
1112
|
+
if (!record.pass) {
|
|
1113
|
+
failures.push({
|
|
1114
|
+
modelId: record.modelId,
|
|
1115
|
+
mode: record.mode,
|
|
1116
|
+
taskId: record.taskId,
|
|
1117
|
+
phase: record.phase,
|
|
1118
|
+
reason: record.failureReason ?? record.failureType ?? "unknown",
|
|
1119
|
+
failureType: record.failureType,
|
|
1120
|
+
supported: record.supported,
|
|
1121
|
+
});
|
|
1122
|
+
}
|
|
1123
|
+
|
|
1124
|
+
if (record.pass) {
|
|
1125
|
+
process.stdout.write(`PASS wall=${record.wallMs ?? "-"}ms tokens=${record.tokens?.total ?? "-"} tools=${record.toolCount}\n`);
|
|
1126
|
+
} else {
|
|
1127
|
+
process.stdout.write(`FAIL ${record.failureType ?? "unknown"}${record.failureReason ? ` (${record.failureReason})` : ""}\n`);
|
|
1128
|
+
}
|
|
1129
|
+
}
|
|
1130
|
+
}
|
|
1131
|
+
}
|
|
1132
|
+
console.log("");
|
|
1133
|
+
}
|
|
1134
|
+
|
|
1135
|
+
const summary = aggregateSummary({
|
|
1136
|
+
options,
|
|
1137
|
+
runRecords,
|
|
1138
|
+
failures,
|
|
1139
|
+
startTime,
|
|
1140
|
+
endTime: new Date().toISOString(),
|
|
1141
|
+
executable,
|
|
1142
|
+
});
|
|
1143
|
+
|
|
1144
|
+
const outputs = await writeOutputs(options, summary);
|
|
1145
|
+
|
|
1146
|
+
const measuredPasses = summary.runs.filter((record) => record.pass).length;
|
|
1147
|
+
console.log("=== SUMMARY ===");
|
|
1148
|
+
console.log(`Measured runs: ${summary.runs.length}`);
|
|
1149
|
+
console.log(`Measured passes: ${measuredPasses}`);
|
|
1150
|
+
console.log(`Measured failures: ${summary.runs.length - measuredPasses}`);
|
|
1151
|
+
console.log(`summary.json: ${outputs.summaryPath}`);
|
|
1152
|
+
console.log(`report.md: ${outputs.markdownPath}`);
|
|
1153
|
+
if (outputs.dashboardPath) {
|
|
1154
|
+
console.log(`dashboard.html: ${outputs.dashboardPath}`);
|
|
1155
|
+
}
|
|
1156
|
+
}
|
|
1157
|
+
|
|
1158
|
+
// Script entry point: start the benchmark and, if main() rejects, print the
// failure reason to stderr and terminate the process with exit code 1.
main().catch((error) => {
  // Non-Error rejections (strings, plain objects) are stringified so a
  // readable reason is always printed.
  const reason = error instanceof Error ? error.message : String(error);
  console.error(`Benchmark failed: ${reason}`);
  process.exit(1);
});
|