@interf/compiler 0.3.4 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +131 -149
- package/builtin-workflows/interf/README.md +19 -0
- package/builtin-workflows/interf/compile/stages/shape/SKILL.md +28 -0
- package/builtin-workflows/interf/compile/stages/structure/SKILL.md +18 -0
- package/builtin-workflows/interf/compile/stages/summarize/SKILL.md +18 -0
- package/builtin-workflows/interf/improve/SKILL.md +18 -0
- package/builtin-workflows/interf/use/query/SKILL.md +28 -0
- package/builtin-workflows/interf/workflow.json +120 -0
- package/builtin-workflows/interf/workspace.schema.json +75 -0
- package/dist/bin.d.ts +0 -1
- package/dist/bin.js +0 -1
- package/dist/commands/compile-controller.d.ts +14 -0
- package/dist/commands/compile-controller.js +409 -0
- package/dist/commands/compile.d.ts +8 -9
- package/dist/commands/compile.js +151 -153
- package/dist/commands/create-workflow-wizard.d.ts +4 -2
- package/dist/commands/create-workflow-wizard.js +33 -31
- package/dist/commands/create.d.ts +0 -1
- package/dist/commands/create.js +15 -11
- package/dist/commands/default.d.ts +0 -1
- package/dist/commands/default.js +0 -1
- package/dist/commands/doctor.d.ts +0 -1
- package/dist/commands/doctor.js +1 -16
- package/dist/commands/executor-flow.d.ts +0 -1
- package/dist/commands/executor-flow.js +0 -1
- package/dist/commands/init.d.ts +0 -1
- package/dist/commands/init.js +71 -14
- package/dist/commands/list.d.ts +0 -1
- package/dist/commands/list.js +0 -1
- package/dist/commands/reset.d.ts +0 -1
- package/dist/commands/reset.js +0 -1
- package/dist/commands/source-config-wizard.d.ts +28 -7
- package/dist/commands/source-config-wizard.js +159 -63
- package/dist/commands/status.d.ts +0 -1
- package/dist/commands/status.js +0 -1
- package/dist/commands/test-flow.d.ts +20 -10
- package/dist/commands/test-flow.js +89 -23
- package/dist/commands/test.d.ts +0 -1
- package/dist/commands/test.js +36 -72
- package/dist/commands/verify.d.ts +0 -1
- package/dist/commands/verify.js +0 -1
- package/dist/commands/workspace-flow.d.ts +3 -3
- package/dist/commands/workspace-flow.js +30 -12
- package/dist/index.d.ts +5 -6
- package/dist/index.js +3 -4
- package/dist/lib/agent-args.d.ts +0 -1
- package/dist/lib/agent-args.js +0 -1
- package/dist/lib/agent-constants.d.ts +0 -1
- package/dist/lib/agent-constants.js +0 -1
- package/dist/lib/agent-detection.d.ts +0 -1
- package/dist/lib/agent-detection.js +0 -1
- package/dist/lib/agent-execution.d.ts +0 -1
- package/dist/lib/agent-execution.js +47 -12
- package/dist/lib/agent-logs.d.ts +0 -1
- package/dist/lib/agent-logs.js +0 -1
- package/dist/lib/agent-preflight.d.ts +0 -1
- package/dist/lib/agent-preflight.js +1 -2
- package/dist/lib/agent-render.d.ts +0 -1
- package/dist/lib/agent-render.js +0 -1
- package/dist/lib/agent-shells.d.ts +30 -3
- package/dist/lib/agent-shells.js +517 -54
- package/dist/lib/agent-status.d.ts +0 -1
- package/dist/lib/agent-status.js +0 -1
- package/dist/lib/agent-types.d.ts +0 -1
- package/dist/lib/agent-types.js +0 -1
- package/dist/lib/agents.d.ts +0 -9
- package/dist/lib/agents.js +0 -9
- package/dist/lib/chart-guidance.d.ts +1 -0
- package/dist/lib/chart-guidance.js +8 -0
- package/dist/lib/config.d.ts +0 -3
- package/dist/lib/config.js +0 -5
- package/dist/lib/discovery.d.ts +0 -1
- package/dist/lib/discovery.js +0 -1
- package/dist/lib/execution-profile.d.ts +0 -1
- package/dist/lib/execution-profile.js +0 -1
- package/dist/lib/executors.d.ts +0 -1
- package/dist/lib/executors.js +0 -1
- package/dist/lib/filesystem.d.ts +0 -1
- package/dist/lib/filesystem.js +0 -1
- package/dist/lib/interf-bootstrap.d.ts +0 -1
- package/dist/lib/interf-bootstrap.js +6 -2
- package/dist/lib/interf-detect.d.ts +2 -2
- package/dist/lib/interf-detect.js +25 -8
- package/dist/lib/interf-scaffold.d.ts +0 -1
- package/dist/lib/interf-scaffold.js +54 -21
- package/dist/lib/interf-workflow-package.d.ts +18 -1
- package/dist/lib/interf-workflow-package.js +164 -23
- package/dist/lib/interf.d.ts +1 -2
- package/dist/lib/interf.js +1 -2
- package/dist/lib/local-workflows.d.ts +10 -5
- package/dist/lib/local-workflows.js +208 -97
- package/dist/lib/logger.d.ts +0 -1
- package/dist/lib/logger.js +0 -1
- package/dist/lib/obsidian.d.ts +0 -1
- package/dist/lib/obsidian.js +0 -1
- package/dist/lib/parse.d.ts +0 -1
- package/dist/lib/parse.js +0 -1
- package/dist/lib/registry.d.ts +0 -1
- package/dist/lib/registry.js +0 -1
- package/dist/lib/runtime-acceptance.d.ts +0 -1
- package/dist/lib/runtime-acceptance.js +1 -2
- package/dist/lib/runtime-contracts.d.ts +0 -1
- package/dist/lib/runtime-contracts.js +14 -8
- package/dist/lib/runtime-paths.d.ts +0 -1
- package/dist/lib/runtime-paths.js +5 -10
- package/dist/lib/runtime-prompt.d.ts +0 -1
- package/dist/lib/runtime-prompt.js +8 -23
- package/dist/lib/runtime-reconcile.d.ts +0 -1
- package/dist/lib/runtime-reconcile.js +7 -3
- package/dist/lib/runtime-runs.d.ts +0 -1
- package/dist/lib/runtime-runs.js +4 -5
- package/dist/lib/runtime-types.d.ts +0 -1
- package/dist/lib/runtime-types.js +0 -1
- package/dist/lib/runtime.d.ts +0 -1
- package/dist/lib/runtime.js +0 -1
- package/dist/lib/schema.d.ts +356 -51
- package/dist/lib/schema.js +189 -54
- package/dist/lib/source-config.d.ts +17 -8
- package/dist/lib/source-config.js +125 -11
- package/dist/lib/state-artifacts.d.ts +0 -1
- package/dist/lib/state-artifacts.js +0 -1
- package/dist/lib/state-health.d.ts +0 -1
- package/dist/lib/state-health.js +2 -3
- package/dist/lib/state-io.d.ts +1 -1
- package/dist/lib/state-io.js +9 -10
- package/dist/lib/state-paths.d.ts +0 -1
- package/dist/lib/state-paths.js +5 -11
- package/dist/lib/state-view.d.ts +0 -1
- package/dist/lib/state-view.js +7 -8
- package/dist/lib/state.d.ts +0 -1
- package/dist/lib/state.js +0 -1
- package/dist/lib/summarize-plan.d.ts +0 -1
- package/dist/lib/summarize-plan.js +3 -3
- package/dist/lib/test-execution.d.ts +14 -0
- package/dist/lib/{benchmark-execution.js → test-execution.js} +122 -120
- package/dist/lib/test-matrices.d.ts +90 -0
- package/dist/lib/test-matrices.js +96 -0
- package/dist/lib/test-paths.d.ts +12 -0
- package/dist/lib/test-paths.js +44 -0
- package/dist/lib/test-profile-presets.d.ts +57 -0
- package/dist/lib/test-profile-presets.js +50 -0
- package/dist/lib/test-sandbox.d.ts +11 -0
- package/dist/lib/{benchmark-sandbox.js → test-sandbox.js} +18 -13
- package/dist/lib/test-specs.d.ts +7 -0
- package/dist/lib/test-specs.js +114 -0
- package/dist/lib/test-targets.d.ts +5 -0
- package/dist/lib/test-targets.js +38 -0
- package/dist/lib/test-types.d.ts +17 -0
- package/dist/lib/test-types.js +1 -0
- package/dist/lib/test.d.ts +4 -0
- package/dist/lib/test.js +3 -0
- package/dist/lib/user-config.d.ts +0 -1
- package/dist/lib/user-config.js +0 -1
- package/dist/lib/util.d.ts +0 -2
- package/dist/lib/util.js +0 -2
- package/dist/lib/validate-helpers.d.ts +0 -1
- package/dist/lib/validate-helpers.js +0 -1
- package/dist/lib/validate-workspace.d.ts +0 -1
- package/dist/lib/validate-workspace.js +34 -25
- package/dist/lib/validate.d.ts +0 -1
- package/dist/lib/validate.js +55 -9
- package/dist/lib/workflow-abi.d.ts +138 -0
- package/dist/lib/workflow-abi.js +181 -0
- package/dist/lib/workflow-definitions.d.ts +26 -5
- package/dist/lib/workflow-definitions.js +105 -168
- package/dist/lib/workflow-helpers.d.ts +1 -2
- package/dist/lib/workflow-helpers.js +32 -21
- package/dist/lib/workflow-improvement.d.ts +22 -0
- package/dist/lib/workflow-improvement.js +396 -0
- package/dist/lib/workflow-review-paths.d.ts +10 -0
- package/dist/lib/workflow-review-paths.js +27 -0
- package/dist/lib/workflow-stage-runner.d.ts +1 -1
- package/dist/lib/workflow-stage-runner.js +4 -1
- package/dist/lib/workflows.d.ts +1 -2
- package/dist/lib/workflows.js +1 -2
- package/dist/lib/workspace-compile.d.ts +0 -1
- package/dist/lib/workspace-compile.js +146 -109
- package/dist/lib/workspace-home.d.ts +5 -0
- package/dist/lib/workspace-home.js +32 -0
- package/dist/lib/workspace-layout.d.ts +2 -0
- package/dist/lib/workspace-layout.js +60 -0
- package/dist/lib/workspace-paths.d.ts +41 -0
- package/dist/lib/workspace-paths.js +107 -0
- package/dist/lib/workspace-raw.d.ts +20 -2
- package/dist/lib/workspace-raw.js +6 -8
- package/dist/lib/workspace-reset.d.ts +0 -1
- package/dist/lib/workspace-reset.js +27 -5
- package/dist/lib/workspace-schema.d.ts +1 -10
- package/dist/lib/workspace-schema.js +16 -74
- package/package.json +16 -15
- package/dist/bin.d.ts.map +0 -1
- package/dist/bin.js.map +0 -1
- package/dist/commands/compile.d.ts.map +0 -1
- package/dist/commands/compile.js.map +0 -1
- package/dist/commands/create-workflow-wizard.d.ts.map +0 -1
- package/dist/commands/create-workflow-wizard.js.map +0 -1
- package/dist/commands/create.d.ts.map +0 -1
- package/dist/commands/create.js.map +0 -1
- package/dist/commands/default.d.ts.map +0 -1
- package/dist/commands/default.js.map +0 -1
- package/dist/commands/doctor.d.ts.map +0 -1
- package/dist/commands/doctor.js.map +0 -1
- package/dist/commands/executor-flow.d.ts.map +0 -1
- package/dist/commands/executor-flow.js.map +0 -1
- package/dist/commands/init.d.ts.map +0 -1
- package/dist/commands/init.js.map +0 -1
- package/dist/commands/list.d.ts.map +0 -1
- package/dist/commands/list.js.map +0 -1
- package/dist/commands/reset.d.ts.map +0 -1
- package/dist/commands/reset.js.map +0 -1
- package/dist/commands/source-config-wizard.d.ts.map +0 -1
- package/dist/commands/source-config-wizard.js.map +0 -1
- package/dist/commands/status.d.ts.map +0 -1
- package/dist/commands/status.js.map +0 -1
- package/dist/commands/test-flow.d.ts.map +0 -1
- package/dist/commands/test-flow.js.map +0 -1
- package/dist/commands/test.d.ts.map +0 -1
- package/dist/commands/test.js.map +0 -1
- package/dist/commands/verify.d.ts.map +0 -1
- package/dist/commands/verify.js.map +0 -1
- package/dist/commands/workspace-flow.d.ts.map +0 -1
- package/dist/commands/workspace-flow.js.map +0 -1
- package/dist/index.d.ts.map +0 -1
- package/dist/index.js.map +0 -1
- package/dist/lib/agent-args.d.ts.map +0 -1
- package/dist/lib/agent-args.js.map +0 -1
- package/dist/lib/agent-constants.d.ts.map +0 -1
- package/dist/lib/agent-constants.js.map +0 -1
- package/dist/lib/agent-detection.d.ts.map +0 -1
- package/dist/lib/agent-detection.js.map +0 -1
- package/dist/lib/agent-execution.d.ts.map +0 -1
- package/dist/lib/agent-execution.js.map +0 -1
- package/dist/lib/agent-logs.d.ts.map +0 -1
- package/dist/lib/agent-logs.js.map +0 -1
- package/dist/lib/agent-preflight.d.ts.map +0 -1
- package/dist/lib/agent-preflight.js.map +0 -1
- package/dist/lib/agent-render.d.ts.map +0 -1
- package/dist/lib/agent-render.js.map +0 -1
- package/dist/lib/agent-shells.d.ts.map +0 -1
- package/dist/lib/agent-shells.js.map +0 -1
- package/dist/lib/agent-skills.d.ts +0 -21
- package/dist/lib/agent-skills.d.ts.map +0 -1
- package/dist/lib/agent-skills.js +0 -215
- package/dist/lib/agent-skills.js.map +0 -1
- package/dist/lib/agent-status.d.ts.map +0 -1
- package/dist/lib/agent-status.js.map +0 -1
- package/dist/lib/agent-types.d.ts.map +0 -1
- package/dist/lib/agent-types.js.map +0 -1
- package/dist/lib/agents.d.ts.map +0 -1
- package/dist/lib/agents.js.map +0 -1
- package/dist/lib/benchmark-execution.d.ts +0 -13
- package/dist/lib/benchmark-execution.d.ts.map +0 -1
- package/dist/lib/benchmark-execution.js.map +0 -1
- package/dist/lib/benchmark-paths.d.ts +0 -13
- package/dist/lib/benchmark-paths.d.ts.map +0 -1
- package/dist/lib/benchmark-paths.js +0 -44
- package/dist/lib/benchmark-paths.js.map +0 -1
- package/dist/lib/benchmark-sandbox.d.ts +0 -12
- package/dist/lib/benchmark-sandbox.d.ts.map +0 -1
- package/dist/lib/benchmark-sandbox.js.map +0 -1
- package/dist/lib/benchmark-specs.d.ts +0 -8
- package/dist/lib/benchmark-specs.d.ts.map +0 -1
- package/dist/lib/benchmark-specs.js +0 -115
- package/dist/lib/benchmark-specs.js.map +0 -1
- package/dist/lib/benchmark-targets.d.ts +0 -5
- package/dist/lib/benchmark-targets.d.ts.map +0 -1
- package/dist/lib/benchmark-targets.js +0 -38
- package/dist/lib/benchmark-targets.js.map +0 -1
- package/dist/lib/benchmark-types.d.ts +0 -18
- package/dist/lib/benchmark-types.d.ts.map +0 -1
- package/dist/lib/benchmark-types.js +0 -2
- package/dist/lib/benchmark-types.js.map +0 -1
- package/dist/lib/benchmark.d.ts +0 -5
- package/dist/lib/benchmark.d.ts.map +0 -1
- package/dist/lib/benchmark.js +0 -4
- package/dist/lib/benchmark.js.map +0 -1
- package/dist/lib/config.d.ts.map +0 -1
- package/dist/lib/config.js.map +0 -1
- package/dist/lib/discovery.d.ts.map +0 -1
- package/dist/lib/discovery.js.map +0 -1
- package/dist/lib/eval-packs.d.ts +0 -158
- package/dist/lib/eval-packs.d.ts.map +0 -1
- package/dist/lib/eval-packs.js +0 -149
- package/dist/lib/eval-packs.js.map +0 -1
- package/dist/lib/execution-profile.d.ts.map +0 -1
- package/dist/lib/execution-profile.js.map +0 -1
- package/dist/lib/executors.d.ts.map +0 -1
- package/dist/lib/executors.js.map +0 -1
- package/dist/lib/filesystem.d.ts.map +0 -1
- package/dist/lib/filesystem.js.map +0 -1
- package/dist/lib/interf-bootstrap.d.ts.map +0 -1
- package/dist/lib/interf-bootstrap.js.map +0 -1
- package/dist/lib/interf-detect.d.ts.map +0 -1
- package/dist/lib/interf-detect.js.map +0 -1
- package/dist/lib/interf-scaffold.d.ts.map +0 -1
- package/dist/lib/interf-scaffold.js.map +0 -1
- package/dist/lib/interf-workflow-package.d.ts.map +0 -1
- package/dist/lib/interf-workflow-package.js.map +0 -1
- package/dist/lib/interf.d.ts.map +0 -1
- package/dist/lib/interf.js.map +0 -1
- package/dist/lib/local-workflows.d.ts.map +0 -1
- package/dist/lib/local-workflows.js.map +0 -1
- package/dist/lib/logger.d.ts.map +0 -1
- package/dist/lib/logger.js.map +0 -1
- package/dist/lib/obsidian.d.ts.map +0 -1
- package/dist/lib/obsidian.js.map +0 -1
- package/dist/lib/parse.d.ts.map +0 -1
- package/dist/lib/parse.js.map +0 -1
- package/dist/lib/registry.d.ts.map +0 -1
- package/dist/lib/registry.js.map +0 -1
- package/dist/lib/runtime-acceptance.d.ts.map +0 -1
- package/dist/lib/runtime-acceptance.js.map +0 -1
- package/dist/lib/runtime-contracts.d.ts.map +0 -1
- package/dist/lib/runtime-contracts.js.map +0 -1
- package/dist/lib/runtime-paths.d.ts.map +0 -1
- package/dist/lib/runtime-paths.js.map +0 -1
- package/dist/lib/runtime-prompt.d.ts.map +0 -1
- package/dist/lib/runtime-prompt.js.map +0 -1
- package/dist/lib/runtime-reconcile.d.ts.map +0 -1
- package/dist/lib/runtime-reconcile.js.map +0 -1
- package/dist/lib/runtime-runs.d.ts.map +0 -1
- package/dist/lib/runtime-runs.js.map +0 -1
- package/dist/lib/runtime-types.d.ts.map +0 -1
- package/dist/lib/runtime-types.js.map +0 -1
- package/dist/lib/runtime.d.ts.map +0 -1
- package/dist/lib/runtime.js.map +0 -1
- package/dist/lib/schema.d.ts.map +0 -1
- package/dist/lib/schema.js.map +0 -1
- package/dist/lib/source-config.d.ts.map +0 -1
- package/dist/lib/source-config.js.map +0 -1
- package/dist/lib/state-artifacts.d.ts.map +0 -1
- package/dist/lib/state-artifacts.js.map +0 -1
- package/dist/lib/state-health.d.ts.map +0 -1
- package/dist/lib/state-health.js.map +0 -1
- package/dist/lib/state-io.d.ts.map +0 -1
- package/dist/lib/state-io.js.map +0 -1
- package/dist/lib/state-paths.d.ts.map +0 -1
- package/dist/lib/state-paths.js.map +0 -1
- package/dist/lib/state-view.d.ts.map +0 -1
- package/dist/lib/state-view.js.map +0 -1
- package/dist/lib/state.d.ts.map +0 -1
- package/dist/lib/state.js.map +0 -1
- package/dist/lib/summarize-plan.d.ts.map +0 -1
- package/dist/lib/summarize-plan.js.map +0 -1
- package/dist/lib/user-config.d.ts.map +0 -1
- package/dist/lib/user-config.js.map +0 -1
- package/dist/lib/util.d.ts.map +0 -1
- package/dist/lib/util.js.map +0 -1
- package/dist/lib/validate-helpers.d.ts.map +0 -1
- package/dist/lib/validate-helpers.js.map +0 -1
- package/dist/lib/validate-workspace.d.ts.map +0 -1
- package/dist/lib/validate-workspace.js.map +0 -1
- package/dist/lib/validate.d.ts.map +0 -1
- package/dist/lib/validate.js.map +0 -1
- package/dist/lib/workflow-definitions.d.ts.map +0 -1
- package/dist/lib/workflow-definitions.js.map +0 -1
- package/dist/lib/workflow-helpers.d.ts.map +0 -1
- package/dist/lib/workflow-helpers.js.map +0 -1
- package/dist/lib/workflow-stage-runner.d.ts.map +0 -1
- package/dist/lib/workflow-stage-runner.js.map +0 -1
- package/dist/lib/workflow-starter-docs.d.ts +0 -7
- package/dist/lib/workflow-starter-docs.d.ts.map +0 -1
- package/dist/lib/workflow-starter-docs.js +0 -3
- package/dist/lib/workflow-starter-docs.js.map +0 -1
- package/dist/lib/workflows.d.ts.map +0 -1
- package/dist/lib/workflows.js.map +0 -1
- package/dist/lib/workspace-compile.d.ts.map +0 -1
- package/dist/lib/workspace-compile.js.map +0 -1
- package/dist/lib/workspace-docs.d.ts +0 -3
- package/dist/lib/workspace-docs.d.ts.map +0 -1
- package/dist/lib/workspace-docs.js +0 -82
- package/dist/lib/workspace-docs.js.map +0 -1
- package/dist/lib/workspace-raw.d.ts.map +0 -1
- package/dist/lib/workspace-raw.js.map +0 -1
- package/dist/lib/workspace-reset.d.ts.map +0 -1
- package/dist/lib/workspace-reset.js.map +0 -1
- package/dist/lib/workspace-schema.d.ts.map +0 -1
- package/dist/lib/workspace-schema.js.map +0 -1
- package/skills/benchmark/SKILL.md +0 -122
- package/skills/workflow/create/SKILL.md +0 -141
- package/skills/workspace/shape/SKILL.md +0 -15
- package/skills/workspace/structure/SKILL.md +0 -15
- package/skills/workspace/summarize/SKILL.md +0 -15
- package/templates/workspace/README.md +0 -24
- package/templates/workspace/interfignore +0 -2
|
@@ -2,19 +2,19 @@ import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync
|
|
|
2
2
|
import { join } from "node:path";
|
|
3
3
|
import { tmpdir } from "node:os";
|
|
4
4
|
import { buildRuntimeExecutorInfo } from "./executors.js";
|
|
5
|
-
import {
|
|
6
|
-
import {
|
|
5
|
+
import { targetTestRunGitignorePath, targetTestRunsPath, targetTestSandboxGitignorePath, targetTestSandboxesPath, normalizeTestId, } from "./test-paths.js";
|
|
6
|
+
import { createTestSandbox, } from "./test-sandbox.js";
|
|
7
7
|
function parseWords(content) {
|
|
8
8
|
return content.trim().split(/\s+/).filter(Boolean).length;
|
|
9
9
|
}
|
|
10
10
|
function normalizeText(content) {
|
|
11
11
|
return content.toLowerCase().replace(/\s+/g, " ").trim();
|
|
12
12
|
}
|
|
13
|
-
function evaluateTextExpect(
|
|
13
|
+
function evaluateTextExpect(testCase, content) {
|
|
14
14
|
const checks = [];
|
|
15
15
|
const normalized = normalizeText(content);
|
|
16
16
|
const wordCount = parseWords(content);
|
|
17
|
-
for (const phrase of
|
|
17
|
+
for (const phrase of testCase.expect?.must_include ?? []) {
|
|
18
18
|
const ok = normalized.includes(normalizeText(phrase));
|
|
19
19
|
checks.push({
|
|
20
20
|
label: `must include "${phrase}"`,
|
|
@@ -22,7 +22,7 @@ function evaluateTextExpect(benchmarkCase, content) {
|
|
|
22
22
|
detail: ok ? "present" : "missing",
|
|
23
23
|
});
|
|
24
24
|
}
|
|
25
|
-
for (const options of
|
|
25
|
+
for (const options of testCase.expect?.must_include_one_of ?? []) {
|
|
26
26
|
const ok = options.some((phrase) => normalized.includes(normalizeText(phrase)));
|
|
27
27
|
checks.push({
|
|
28
28
|
label: `must include one of [${options.map((phrase) => `"${phrase}"`).join(", ")}]`,
|
|
@@ -30,7 +30,7 @@ function evaluateTextExpect(benchmarkCase, content) {
|
|
|
30
30
|
detail: ok ? "present" : "missing",
|
|
31
31
|
});
|
|
32
32
|
}
|
|
33
|
-
for (const phrase of
|
|
33
|
+
for (const phrase of testCase.expect?.must_not_include ?? []) {
|
|
34
34
|
const ok = !normalized.includes(normalizeText(phrase));
|
|
35
35
|
checks.push({
|
|
36
36
|
label: `must not include "${phrase}"`,
|
|
@@ -38,18 +38,18 @@ function evaluateTextExpect(benchmarkCase, content) {
|
|
|
38
38
|
detail: ok ? "absent" : "present",
|
|
39
39
|
});
|
|
40
40
|
}
|
|
41
|
-
if (typeof
|
|
42
|
-
const ok = wordCount >=
|
|
41
|
+
if (typeof testCase.expect?.min_words === "number") {
|
|
42
|
+
const ok = wordCount >= testCase.expect.min_words;
|
|
43
43
|
checks.push({
|
|
44
|
-
label: `min words ${
|
|
44
|
+
label: `min words ${testCase.expect.min_words}`,
|
|
45
45
|
ok,
|
|
46
46
|
detail: `${wordCount} words`,
|
|
47
47
|
});
|
|
48
48
|
}
|
|
49
|
-
if (typeof
|
|
50
|
-
const ok = wordCount <=
|
|
49
|
+
if (typeof testCase.expect?.max_words === "number") {
|
|
50
|
+
const ok = wordCount <= testCase.expect.max_words;
|
|
51
51
|
checks.push({
|
|
52
|
-
label: `max words ${
|
|
52
|
+
label: `max words ${testCase.expect.max_words}`,
|
|
53
53
|
ok,
|
|
54
54
|
detail: `${wordCount} words`,
|
|
55
55
|
});
|
|
@@ -59,25 +59,25 @@ function evaluateTextExpect(benchmarkCase, content) {
|
|
|
59
59
|
wordCount,
|
|
60
60
|
};
|
|
61
61
|
}
|
|
62
|
-
function
|
|
63
|
-
return !
|
|
62
|
+
function testCaseNeedsExecutor(testCase) {
|
|
63
|
+
return !testCase.file || Boolean(testCase.answer);
|
|
64
64
|
}
|
|
65
|
-
function
|
|
66
|
-
if (!
|
|
67
|
-
throw new Error(`
|
|
65
|
+
function runTestCase(target, testCase) {
|
|
66
|
+
if (!testCase.file) {
|
|
67
|
+
throw new Error(`Test case "${testCase.id}" requires an executor because it has no file target.`);
|
|
68
68
|
}
|
|
69
|
-
const outputPath = join(target.path,
|
|
69
|
+
const outputPath = join(target.path, testCase.file);
|
|
70
70
|
const checks = [];
|
|
71
71
|
if (!existsSync(outputPath)) {
|
|
72
72
|
checks.push({
|
|
73
73
|
label: "file exists",
|
|
74
74
|
ok: false,
|
|
75
|
-
detail: `Missing ${
|
|
75
|
+
detail: `Missing ${testCase.file}`,
|
|
76
76
|
});
|
|
77
77
|
return {
|
|
78
|
-
caseId:
|
|
79
|
-
question:
|
|
80
|
-
...(
|
|
78
|
+
caseId: testCase.id,
|
|
79
|
+
question: testCase.question,
|
|
80
|
+
...(testCase.file ? { file: testCase.file } : {}),
|
|
81
81
|
ok: false,
|
|
82
82
|
wordCount: 0,
|
|
83
83
|
passedChecks: 0,
|
|
@@ -88,17 +88,17 @@ function runBenchmarkCase(target, benchmarkCase) {
|
|
|
88
88
|
checks.push({
|
|
89
89
|
label: "file exists",
|
|
90
90
|
ok: true,
|
|
91
|
-
detail: `Found ${
|
|
91
|
+
detail: `Found ${testCase.file}`,
|
|
92
92
|
});
|
|
93
93
|
const content = readFileSync(outputPath, "utf8");
|
|
94
|
-
const evaluated = evaluateTextExpect(
|
|
94
|
+
const evaluated = evaluateTextExpect(testCase, content);
|
|
95
95
|
const wordCount = evaluated.wordCount;
|
|
96
96
|
checks.push(...evaluated.checks);
|
|
97
97
|
const passedChecks = checks.filter((check) => check.ok).length;
|
|
98
98
|
return {
|
|
99
|
-
caseId:
|
|
100
|
-
question:
|
|
101
|
-
...(
|
|
99
|
+
caseId: testCase.id,
|
|
100
|
+
question: testCase.question,
|
|
101
|
+
...(testCase.file ? { file: testCase.file } : {}),
|
|
102
102
|
ok: passedChecks === checks.length,
|
|
103
103
|
wordCount,
|
|
104
104
|
passedChecks,
|
|
@@ -106,25 +106,25 @@ function runBenchmarkCase(target, benchmarkCase) {
|
|
|
106
106
|
checks,
|
|
107
107
|
};
|
|
108
108
|
}
|
|
109
|
-
function
|
|
109
|
+
function buildTestJudgePrompt(testCase, candidateLabel, candidateContent, verdictPath) {
|
|
110
110
|
return [
|
|
111
|
-
"You are judging whether one Interf
|
|
111
|
+
"You are judging whether one Interf test answer passes.",
|
|
112
112
|
"Do not browse other files or ask follow-up questions.",
|
|
113
|
-
"Judge only from the
|
|
113
|
+
"Judge only from the truth-check rule and the candidate answer below.",
|
|
114
114
|
"Emit only STATUS:, DONE:, BLOCKED:, or ERROR: lines.",
|
|
115
115
|
`Write JSON to ${JSON.stringify(verdictPath)} with keys: pass (boolean), summary (string).`,
|
|
116
116
|
"Before finishing, write the JSON verdict file.",
|
|
117
117
|
"Final line must be `DONE: pass=true - <short summary>` or `DONE: pass=false - <short summary>`.",
|
|
118
|
-
`Question: ${
|
|
119
|
-
`Expected answer: ${
|
|
120
|
-
`Strictness: ${
|
|
118
|
+
`Question: ${testCase.question}`,
|
|
119
|
+
`Expected answer: ${testCase.answer ?? "The answer clearly satisfies the question."}`,
|
|
120
|
+
`Strictness: ${testCase.strictness ?? "approximate"}`,
|
|
121
121
|
`Candidate: ${candidateLabel}`,
|
|
122
122
|
"Candidate answer starts after the next line and ends at `END CANDIDATE`.",
|
|
123
123
|
candidateContent,
|
|
124
124
|
"END CANDIDATE",
|
|
125
125
|
].join("\n");
|
|
126
126
|
}
|
|
127
|
-
function
|
|
127
|
+
function readTestJudgeVerdict(verdictPath) {
|
|
128
128
|
if (!existsSync(verdictPath))
|
|
129
129
|
return null;
|
|
130
130
|
const raw = JSON.parse(readFileSync(verdictPath, "utf8"));
|
|
@@ -133,7 +133,7 @@ function readBenchmarkJudgeVerdict(verdictPath) {
|
|
|
133
133
|
summary: typeof raw.summary === "string" ? raw.summary : "",
|
|
134
134
|
};
|
|
135
135
|
}
|
|
136
|
-
function
|
|
136
|
+
function readTestJudgeVerdictFromStatus(statusPath) {
|
|
137
137
|
if (!existsSync(statusPath))
|
|
138
138
|
return null;
|
|
139
139
|
const lines = readFileSync(statusPath, "utf8")
|
|
@@ -163,14 +163,14 @@ function readBenchmarkJudgeVerdictFromStatus(statusPath) {
|
|
|
163
163
|
}
|
|
164
164
|
return null;
|
|
165
165
|
}
|
|
166
|
-
async function
|
|
167
|
-
const tempDir = mkdtempSync(join(tmpdir(), "interf-
|
|
166
|
+
async function runTargetTestsJudge(testCase, executor, candidateLabel, candidateContent) {
|
|
167
|
+
const tempDir = mkdtempSync(join(tmpdir(), "interf-test-judge-"));
|
|
168
168
|
let executionError = null;
|
|
169
169
|
let verdict = null;
|
|
170
170
|
try {
|
|
171
171
|
const verdictPath = join(tempDir, "verdict.json");
|
|
172
172
|
const statusPath = join(tempDir, "judge.status.log");
|
|
173
|
-
const prompt =
|
|
173
|
+
const prompt = buildTestJudgePrompt(testCase, candidateLabel, candidateContent, verdictPath);
|
|
174
174
|
try {
|
|
175
175
|
await executor.execute(tempDir, prompt, {
|
|
176
176
|
statusLogPath: statusPath,
|
|
@@ -180,9 +180,9 @@ async function runBenchmarkJudge(benchmarkCase, executor, candidateLabel, candid
|
|
|
180
180
|
executionError = error instanceof Error ? error.message : String(error);
|
|
181
181
|
}
|
|
182
182
|
try {
|
|
183
|
-
verdict =
|
|
183
|
+
verdict = readTestJudgeVerdict(verdictPath);
|
|
184
184
|
if (!verdict) {
|
|
185
|
-
verdict =
|
|
185
|
+
verdict = readTestJudgeVerdictFromStatus(statusPath);
|
|
186
186
|
}
|
|
187
187
|
}
|
|
188
188
|
catch (error) {
|
|
@@ -194,22 +194,23 @@ async function runBenchmarkJudge(benchmarkCase, executor, candidateLabel, candid
|
|
|
194
194
|
}
|
|
195
195
|
return { verdict, error: executionError };
|
|
196
196
|
}
|
|
197
|
-
function
|
|
197
|
+
function buildTestQueryPrompt(target, testCase, answerPath, tracePath) {
|
|
198
198
|
const header = target.type === "workspace"
|
|
199
199
|
? [
|
|
200
|
-
"You are running an Interf
|
|
200
|
+
"You are running an Interf test inside an isolated sandboxed compiled workspace.",
|
|
201
201
|
"Read `AGENTS.md` first.",
|
|
202
202
|
"Use the local native `interf-query` skill available in this workspace.",
|
|
203
|
-
"
|
|
204
|
-
"Answer the benchmark question the same way you would answer a real user inside this compiled workspace.",
|
|
203
|
+
"Answer the truth-check question the same way you would answer a real user inside this compiled workspace.",
|
|
205
204
|
"Prefer `home.md`, `knowledge/`, and `summaries/` before raw fallback.",
|
|
206
|
-
"This sandbox is self-contained: the copied workspace has its own sanitized `raw/` fallback via
|
|
205
|
+
"This sandbox is self-contained: the copied workspace has its own sanitized `raw/` fallback via `.interf/interf.json` `source.path`.",
|
|
207
206
|
"The source-folder control plane is intentionally absent from this sandbox. Work only from this sandboxed workspace and its embedded raw files.",
|
|
208
207
|
]
|
|
209
208
|
: [
|
|
210
|
-
"You are running an Interf baseline test inside an isolated
|
|
209
|
+
"You are running an Interf baseline test inside an isolated raw test shell.",
|
|
210
|
+
"Read `AGENTS.md` first.",
|
|
211
|
+
"Use the local native `interf-query` skill available in this shell.",
|
|
211
212
|
"There is no compiled workspace in this sandbox.",
|
|
212
|
-
"Answer only from
|
|
213
|
+
"Answer only from `raw/` inside this shell.",
|
|
213
214
|
"The source-folder control plane is intentionally absent from this sandbox.",
|
|
214
215
|
];
|
|
215
216
|
return [
|
|
@@ -218,18 +219,18 @@ function buildBenchmarkQueryPrompt(target, benchmarkCase, answerPath, tracePath)
|
|
|
218
219
|
"Do not ask follow-up questions.",
|
|
219
220
|
`Write the answer to ${JSON.stringify(answerPath)}.`,
|
|
220
221
|
`Write the trace to ${JSON.stringify(tracePath)} with keys: case_id, target, artifacts_consulted, raw_paths_read, used_raw_fallback, answer_summary.`,
|
|
221
|
-
`Set \`case_id\` to ${JSON.stringify(
|
|
222
|
+
`Set \`case_id\` to ${JSON.stringify(testCase.id)}.`,
|
|
222
223
|
`Set \`target\` to ${JSON.stringify(target.type)}.`,
|
|
223
|
-
`Question: ${
|
|
224
|
+
`Question: ${testCase.question}`,
|
|
224
225
|
].join("\n");
|
|
225
226
|
}
|
|
226
|
-
async function
|
|
227
|
-
const tempDir = mkdtempSync(join(tmpdir(), "interf-
|
|
227
|
+
async function runLiveTestCase(target, testCase, executor) {
|
|
228
|
+
const tempDir = mkdtempSync(join(tmpdir(), "interf-test-live-"));
|
|
228
229
|
const answerPath = join(tempDir, "answer.md");
|
|
229
230
|
const tracePath = join(tempDir, "trace.json");
|
|
230
231
|
const statusPath = join(tempDir, "status.log");
|
|
231
232
|
const eventPath = join(tempDir, "events.ndjson");
|
|
232
|
-
const prompt =
|
|
233
|
+
const prompt = buildTestQueryPrompt(target, testCase, answerPath, tracePath);
|
|
233
234
|
let executionError = null;
|
|
234
235
|
let code = -1;
|
|
235
236
|
try {
|
|
@@ -250,8 +251,8 @@ async function runLiveBenchmarkCase(target, benchmarkCase, executor) {
|
|
|
250
251
|
detail: executionError ? `missing answer file (${executionError})` : "missing answer file",
|
|
251
252
|
});
|
|
252
253
|
return {
|
|
253
|
-
caseId:
|
|
254
|
-
question:
|
|
254
|
+
caseId: testCase.id,
|
|
255
|
+
question: testCase.question,
|
|
255
256
|
ok: false,
|
|
256
257
|
wordCount: 0,
|
|
257
258
|
passedChecks: 0,
|
|
@@ -260,7 +261,7 @@ async function runLiveBenchmarkCase(target, benchmarkCase, executor) {
|
|
|
260
261
|
};
|
|
261
262
|
}
|
|
262
263
|
const answer = readFileSync(answerPath, "utf8");
|
|
263
|
-
const evaluated = evaluateTextExpect(
|
|
264
|
+
const evaluated = evaluateTextExpect(testCase, answer);
|
|
264
265
|
checks.push({
|
|
265
266
|
label: "answer exists",
|
|
266
267
|
ok: true,
|
|
@@ -292,8 +293,8 @@ async function runLiveBenchmarkCase(target, benchmarkCase, executor) {
|
|
|
292
293
|
detail: "missing trace file",
|
|
293
294
|
});
|
|
294
295
|
}
|
|
295
|
-
if (
|
|
296
|
-
const judged = await
|
|
296
|
+
if (testCase.answer) {
|
|
297
|
+
const judged = await runTargetTestsJudge(testCase, executor, `generated answer for ${testCase.id}`, answer);
|
|
297
298
|
checks.push({
|
|
298
299
|
label: "judge verdict",
|
|
299
300
|
ok: judged.verdict?.pass === true,
|
|
@@ -304,8 +305,8 @@ async function runLiveBenchmarkCase(target, benchmarkCase, executor) {
|
|
|
304
305
|
}
|
|
305
306
|
const passedChecks = checks.filter((check) => check.ok).length;
|
|
306
307
|
return {
|
|
307
|
-
caseId:
|
|
308
|
-
question:
|
|
308
|
+
caseId: testCase.id,
|
|
309
|
+
question: testCase.question,
|
|
309
310
|
ok: code === 0 && passedChecks === checks.length,
|
|
310
311
|
wordCount: evaluated.wordCount,
|
|
311
312
|
passedChecks,
|
|
@@ -319,22 +320,22 @@ async function runLiveBenchmarkCase(target, benchmarkCase, executor) {
|
|
|
319
320
|
rmSync(tempDir, { recursive: true, force: true });
|
|
320
321
|
}
|
|
321
322
|
}
|
|
322
|
-
async function
|
|
323
|
-
if (!
|
|
324
|
-
return
|
|
323
|
+
async function runTestCaseWithJudge(target, testCase, executor) {
|
|
324
|
+
if (!testCase.file) {
|
|
325
|
+
return runLiveTestCase(target, testCase, executor);
|
|
325
326
|
}
|
|
326
|
-
const outputPath = join(target.path,
|
|
327
|
+
const outputPath = join(target.path, testCase.file);
|
|
327
328
|
const checks = [];
|
|
328
329
|
if (!existsSync(outputPath)) {
|
|
329
330
|
checks.push({
|
|
330
331
|
label: "file exists",
|
|
331
332
|
ok: false,
|
|
332
|
-
detail: `Missing ${
|
|
333
|
+
detail: `Missing ${testCase.file}`,
|
|
333
334
|
});
|
|
334
335
|
return {
|
|
335
|
-
caseId:
|
|
336
|
-
question:
|
|
337
|
-
...(
|
|
336
|
+
caseId: testCase.id,
|
|
337
|
+
question: testCase.question,
|
|
338
|
+
...(testCase.file ? { file: testCase.file } : {}),
|
|
338
339
|
ok: false,
|
|
339
340
|
wordCount: 0,
|
|
340
341
|
passedChecks: 0,
|
|
@@ -343,16 +344,16 @@ async function runBenchmarkCaseWithJudge(target, benchmarkCase, executor) {
|
|
|
343
344
|
};
|
|
344
345
|
}
|
|
345
346
|
const content = readFileSync(outputPath, "utf8");
|
|
346
|
-
const evaluated = evaluateTextExpect(
|
|
347
|
+
const evaluated = evaluateTextExpect(testCase, content);
|
|
347
348
|
const wordCount = evaluated.wordCount;
|
|
348
349
|
checks.push({
|
|
349
350
|
label: "file exists",
|
|
350
351
|
ok: true,
|
|
351
|
-
detail: `Found ${
|
|
352
|
+
detail: `Found ${testCase.file}`,
|
|
352
353
|
});
|
|
353
354
|
checks.push(...evaluated.checks);
|
|
354
|
-
if (
|
|
355
|
-
const judged = await
|
|
355
|
+
if (testCase.answer) {
|
|
356
|
+
const judged = await runTargetTestsJudge(testCase, executor, `compiled file ${outputPath}`, content);
|
|
356
357
|
checks.push({
|
|
357
358
|
label: "judge verdict",
|
|
358
359
|
ok: judged.verdict?.pass === true,
|
|
@@ -363,9 +364,9 @@ async function runBenchmarkCaseWithJudge(target, benchmarkCase, executor) {
|
|
|
363
364
|
}
|
|
364
365
|
const passedChecks = checks.filter((check) => check.ok).length;
|
|
365
366
|
return {
|
|
366
|
-
caseId:
|
|
367
|
-
question:
|
|
368
|
-
...(
|
|
367
|
+
caseId: testCase.id,
|
|
368
|
+
question: testCase.question,
|
|
369
|
+
...(testCase.file ? { file: testCase.file } : {}),
|
|
369
370
|
ok: passedChecks === checks.length,
|
|
370
371
|
wordCount,
|
|
371
372
|
passedChecks,
|
|
@@ -373,7 +374,7 @@ async function runBenchmarkCaseWithJudge(target, benchmarkCase, executor) {
|
|
|
373
374
|
checks,
|
|
374
375
|
};
|
|
375
376
|
}
|
|
376
|
-
function
|
|
377
|
+
function buildTestTargetResult(target, caseResults, options = {}) {
|
|
377
378
|
const passedCases = caseResults.filter((result) => result.ok).length;
|
|
378
379
|
const passedChecks = caseResults.reduce((total, result) => total + result.passedChecks, 0);
|
|
379
380
|
const totalChecks = caseResults.reduce((total, result) => total + result.totalChecks, 0);
|
|
@@ -393,25 +394,25 @@ function buildBenchmarkTargetResult(target, caseResults, options = {}) {
|
|
|
393
394
|
caseResults,
|
|
394
395
|
};
|
|
395
396
|
}
|
|
396
|
-
function
|
|
397
|
+
function buildTestTargetRun(sourcePath, spec, results, executor, generatedAt) {
|
|
397
398
|
return {
|
|
398
|
-
kind: "interf-
|
|
399
|
+
kind: "interf-test-target-run",
|
|
399
400
|
version: 1,
|
|
400
401
|
generated_at: generatedAt ?? new Date().toISOString(),
|
|
401
|
-
|
|
402
|
+
spec: {
|
|
402
403
|
id: spec.id,
|
|
403
404
|
name: spec.name,
|
|
404
405
|
type: spec.type,
|
|
405
406
|
file: spec.filePath,
|
|
406
407
|
...(spec.description ? { description: spec.description } : {}),
|
|
407
408
|
case_count: spec.cases.length,
|
|
408
|
-
cases: spec.cases.map((
|
|
409
|
-
id:
|
|
410
|
-
question:
|
|
411
|
-
...(
|
|
412
|
-
...(
|
|
413
|
-
...(
|
|
414
|
-
...(
|
|
409
|
+
cases: spec.cases.map((testCase) => ({
|
|
410
|
+
id: testCase.id,
|
|
411
|
+
question: testCase.question,
|
|
412
|
+
...(testCase.file ? { file: testCase.file } : {}),
|
|
413
|
+
...(testCase.answer ? { answer: testCase.answer } : {}),
|
|
414
|
+
...(testCase.strictness ? { strictness: testCase.strictness } : {}),
|
|
415
|
+
...(testCase.expect ? { expect: testCase.expect } : {}),
|
|
415
416
|
})),
|
|
416
417
|
},
|
|
417
418
|
source_path: sourcePath,
|
|
@@ -420,97 +421,99 @@ function buildBenchmarkRunResult(sourcePath, spec, results, executor, generatedA
|
|
|
420
421
|
results,
|
|
421
422
|
};
|
|
422
423
|
}
|
|
423
|
-
export function
|
|
424
|
-
if (spec.cases.some((
|
|
425
|
-
throw new Error("This
|
|
424
|
+
export function runTargetTests(sourcePath, spec, targets) {
|
|
425
|
+
if (spec.cases.some((testCase) => testCaseNeedsExecutor(testCase))) {
|
|
426
|
+
throw new Error("This test needs a live executor. Use runTargetTestsWithJudge instead.");
|
|
426
427
|
}
|
|
427
428
|
for (const target of targets) {
|
|
428
429
|
if (target.type !== spec.type) {
|
|
429
|
-
throw new Error(`
|
|
430
|
+
throw new Error(`Test target type mismatch: expected ${spec.type}, got ${target.type}`);
|
|
430
431
|
}
|
|
431
432
|
}
|
|
432
|
-
const results = targets.map((target) =>
|
|
433
|
-
return
|
|
433
|
+
const results = targets.map((target) => buildTestTargetResult(target, spec.cases.map((testCase) => runTestCase(target, testCase))));
|
|
434
|
+
return buildTestTargetRun(sourcePath, spec, results);
|
|
434
435
|
}
|
|
435
|
-
export async function
|
|
436
|
+
export async function runTargetTestsWithJudge(sourcePath, spec, targets, executor, options = {}) {
|
|
436
437
|
const preserveMode = options.preserveSandboxes ?? "on-failure";
|
|
438
|
+
const artifactRootPath = options.artifactRootPath ?? sourcePath;
|
|
437
439
|
const generatedAt = new Date().toISOString();
|
|
438
440
|
const sandboxRunId = `${generatedAt.replace(/[:.]/g, "-")}-${spec.id}`;
|
|
439
441
|
for (const target of targets) {
|
|
440
442
|
if (target.type !== spec.type) {
|
|
441
|
-
throw new Error(`
|
|
443
|
+
throw new Error(`Test target type mismatch: expected ${spec.type}, got ${target.type}`);
|
|
442
444
|
}
|
|
443
445
|
}
|
|
444
446
|
const results = [];
|
|
445
447
|
for (const [index, target] of targets.entries()) {
|
|
446
|
-
const sandbox =
|
|
448
|
+
const sandbox = createTestSandbox(target);
|
|
447
449
|
try {
|
|
448
450
|
const sandboxTarget = {
|
|
449
451
|
...target,
|
|
450
452
|
path: sandbox.targetPath,
|
|
451
453
|
};
|
|
452
454
|
const caseResults = [];
|
|
453
|
-
for (const
|
|
454
|
-
if (
|
|
455
|
-
caseResults.push(await
|
|
455
|
+
for (const testCase of spec.cases) {
|
|
456
|
+
if (testCaseNeedsExecutor(testCase)) {
|
|
457
|
+
caseResults.push(await runLiveTestCase(sandboxTarget, testCase, executor));
|
|
456
458
|
}
|
|
457
459
|
else {
|
|
458
|
-
caseResults.push(await
|
|
460
|
+
caseResults.push(await runTestCaseWithJudge(sandboxTarget, testCase, executor));
|
|
459
461
|
}
|
|
460
462
|
}
|
|
461
463
|
let sandboxPath;
|
|
462
|
-
const targetResult =
|
|
464
|
+
const targetResult = buildTestTargetResult(target, caseResults);
|
|
463
465
|
const shouldPreserveSandbox = preserveMode === "always" || !targetResult.ok;
|
|
464
466
|
if (shouldPreserveSandbox) {
|
|
465
|
-
const sandboxRoot =
|
|
467
|
+
const sandboxRoot = targetTestSandboxesPath(artifactRootPath, target.type);
|
|
466
468
|
mkdirSync(sandboxRoot, { recursive: true });
|
|
467
|
-
const gitignorePath =
|
|
469
|
+
const gitignorePath = targetTestSandboxGitignorePath(artifactRootPath, target.type);
|
|
468
470
|
if (!existsSync(gitignorePath)) {
|
|
469
471
|
writeFileSync(gitignorePath, "*\n!.gitignore\n");
|
|
470
472
|
}
|
|
471
|
-
const sandboxPathName = `${String(index + 1).padStart(2, "0")}-${
|
|
473
|
+
const sandboxPathName = `${String(index + 1).padStart(2, "0")}-${normalizeTestId(target.name) || target.type}`;
|
|
472
474
|
sandbox.preserve(join(sandboxRoot, sandboxRunId, sandboxPathName));
|
|
473
475
|
sandboxPath = sandbox.targetPath;
|
|
474
476
|
}
|
|
475
|
-
results.push(
|
|
477
|
+
results.push(buildTestTargetResult(target, caseResults, { sandboxPath }));
|
|
476
478
|
}
|
|
477
479
|
finally {
|
|
478
480
|
sandbox.cleanup();
|
|
479
481
|
}
|
|
480
482
|
}
|
|
481
|
-
return
|
|
483
|
+
return buildTestTargetRun(sourcePath, spec, results, executor, generatedAt);
|
|
482
484
|
}
|
|
483
|
-
export async function
|
|
484
|
-
if (spec.cases.some((
|
|
485
|
+
export async function runTargetTestsAuto(sourcePath, spec, targets, options) {
|
|
486
|
+
if (spec.cases.some((testCase) => testCaseNeedsExecutor(testCase))) {
|
|
485
487
|
if (!options?.executor) {
|
|
486
|
-
throw new Error("This
|
|
488
|
+
throw new Error("This test needs a live local executor, but no executor was provided.");
|
|
487
489
|
}
|
|
488
|
-
return
|
|
490
|
+
return runTargetTestsWithJudge(sourcePath, spec, targets, options.executor, {
|
|
489
491
|
preserveSandboxes: options.preserveSandboxes,
|
|
492
|
+
artifactRootPath: options.artifactRootPath,
|
|
490
493
|
});
|
|
491
494
|
}
|
|
492
|
-
return
|
|
495
|
+
return runTargetTests(sourcePath, spec, targets);
|
|
493
496
|
}
|
|
494
|
-
export function
|
|
495
|
-
const dirPath =
|
|
497
|
+
export function saveTargetTestRun(artifactRootPath, result) {
|
|
498
|
+
const dirPath = targetTestRunsPath(artifactRootPath, result.spec.type);
|
|
496
499
|
mkdirSync(dirPath, { recursive: true });
|
|
497
|
-
const gitignorePath =
|
|
500
|
+
const gitignorePath = targetTestRunGitignorePath(artifactRootPath, result.spec.type);
|
|
498
501
|
if (!existsSync(gitignorePath)) {
|
|
499
502
|
writeFileSync(gitignorePath, "*\n!.gitignore\n");
|
|
500
503
|
}
|
|
501
504
|
const timestamp = result.generated_at.replace(/[:.]/g, "-");
|
|
502
|
-
const runDirPath = join(dirPath, `${timestamp}-${result.
|
|
505
|
+
const runDirPath = join(dirPath, `${timestamp}-${result.spec.id}`);
|
|
503
506
|
mkdirSync(runDirPath, { recursive: true });
|
|
504
507
|
const manifestPath = join(runDirPath, "manifest.json");
|
|
505
508
|
writeFileSync(manifestPath, `${JSON.stringify({
|
|
506
|
-
kind: "interf-
|
|
509
|
+
kind: "interf-test-target-run-manifest",
|
|
507
510
|
version: 1,
|
|
508
511
|
generated_at: result.generated_at,
|
|
509
|
-
|
|
510
|
-
id: result.
|
|
511
|
-
name: result.
|
|
512
|
-
type: result.
|
|
513
|
-
case_count: result.
|
|
512
|
+
spec: {
|
|
513
|
+
id: result.spec.id,
|
|
514
|
+
name: result.spec.name,
|
|
515
|
+
type: result.spec.type,
|
|
516
|
+
case_count: result.spec.case_count,
|
|
514
517
|
},
|
|
515
518
|
result_file: "run.json",
|
|
516
519
|
target_count: result.target_count,
|
|
@@ -520,4 +523,3 @@ export function saveBenchmarkRun(sourcePath, result) {
|
|
|
520
523
|
writeFileSync(runPath, `${JSON.stringify(result, null, 2)}\n`);
|
|
521
524
|
return runPath;
|
|
522
525
|
}
|
|
523
|
-
//# sourceMappingURL=benchmark-execution.js.map
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
import type { WorkflowExecutionProfile } from "./executors.js";
|
|
3
|
+
export declare const TestMatrixAgentSchema: z.ZodEnum<{
|
|
4
|
+
"claude-code": "claude-code";
|
|
5
|
+
codex: "codex";
|
|
6
|
+
}>;
|
|
7
|
+
export declare const TestMatrixProfileSchema: z.ZodPipe<z.ZodObject<{
|
|
8
|
+
id: z.ZodString;
|
|
9
|
+
preset: z.ZodOptional<z.ZodEnum<{
|
|
10
|
+
"release-claude-high": "release-claude-high";
|
|
11
|
+
"release-codex-high": "release-codex-high";
|
|
12
|
+
"tier1-max-claude": "tier1-max-claude";
|
|
13
|
+
"tier1-max-codex": "tier1-max-codex";
|
|
14
|
+
}>>;
|
|
15
|
+
agent: z.ZodOptional<z.ZodEnum<{
|
|
16
|
+
"claude-code": "claude-code";
|
|
17
|
+
codex: "codex";
|
|
18
|
+
}>>;
|
|
19
|
+
model: z.ZodOptional<z.ZodString>;
|
|
20
|
+
profile: z.ZodOptional<z.ZodString>;
|
|
21
|
+
effort: z.ZodOptional<z.ZodString>;
|
|
22
|
+
timeout_ms: z.ZodOptional<z.ZodNumber>;
|
|
23
|
+
}, z.core.$strip>, z.ZodTransform<{
|
|
24
|
+
agent: "claude-code" | "codex";
|
|
25
|
+
id: string;
|
|
26
|
+
preset?: "release-claude-high" | "release-codex-high" | "tier1-max-claude" | "tier1-max-codex" | undefined;
|
|
27
|
+
model?: string | undefined;
|
|
28
|
+
profile?: string | undefined;
|
|
29
|
+
effort?: string | undefined;
|
|
30
|
+
timeout_ms?: number | undefined;
|
|
31
|
+
}, {
|
|
32
|
+
id: string;
|
|
33
|
+
preset?: "release-claude-high" | "release-codex-high" | "tier1-max-claude" | "tier1-max-codex" | undefined;
|
|
34
|
+
agent?: "claude-code" | "codex" | undefined;
|
|
35
|
+
model?: string | undefined;
|
|
36
|
+
profile?: string | undefined;
|
|
37
|
+
effort?: string | undefined;
|
|
38
|
+
timeout_ms?: number | undefined;
|
|
39
|
+
}>>;
|
|
40
|
+
export declare const TestMatrixRetryPolicySchema: z.ZodObject<{
|
|
41
|
+
max_attempts_per_profile: z.ZodOptional<z.ZodNumber>;
|
|
42
|
+
}, z.core.$strip>;
|
|
43
|
+
export declare const TestMatrixWorkspaceSchema: z.ZodString;
|
|
44
|
+
export declare const TestMatrixSchema: z.ZodObject<{
|
|
45
|
+
id: z.ZodString;
|
|
46
|
+
name: z.ZodString;
|
|
47
|
+
source_path: z.ZodString;
|
|
48
|
+
compile_profiles: z.ZodArray<z.ZodPipe<z.ZodObject<{
|
|
49
|
+
id: z.ZodString;
|
|
50
|
+
preset: z.ZodOptional<z.ZodEnum<{
|
|
51
|
+
"release-claude-high": "release-claude-high";
|
|
52
|
+
"release-codex-high": "release-codex-high";
|
|
53
|
+
"tier1-max-claude": "tier1-max-claude";
|
|
54
|
+
"tier1-max-codex": "tier1-max-codex";
|
|
55
|
+
}>>;
|
|
56
|
+
agent: z.ZodOptional<z.ZodEnum<{
|
|
57
|
+
"claude-code": "claude-code";
|
|
58
|
+
codex: "codex";
|
|
59
|
+
}>>;
|
|
60
|
+
model: z.ZodOptional<z.ZodString>;
|
|
61
|
+
profile: z.ZodOptional<z.ZodString>;
|
|
62
|
+
effort: z.ZodOptional<z.ZodString>;
|
|
63
|
+
timeout_ms: z.ZodOptional<z.ZodNumber>;
|
|
64
|
+
}, z.core.$strip>, z.ZodTransform<{
|
|
65
|
+
agent: "claude-code" | "codex";
|
|
66
|
+
id: string;
|
|
67
|
+
preset?: "release-claude-high" | "release-codex-high" | "tier1-max-claude" | "tier1-max-codex" | undefined;
|
|
68
|
+
model?: string | undefined;
|
|
69
|
+
profile?: string | undefined;
|
|
70
|
+
effort?: string | undefined;
|
|
71
|
+
timeout_ms?: number | undefined;
|
|
72
|
+
}, {
|
|
73
|
+
id: string;
|
|
74
|
+
preset?: "release-claude-high" | "release-codex-high" | "tier1-max-claude" | "tier1-max-codex" | undefined;
|
|
75
|
+
agent?: "claude-code" | "codex" | undefined;
|
|
76
|
+
model?: string | undefined;
|
|
77
|
+
profile?: string | undefined;
|
|
78
|
+
effort?: string | undefined;
|
|
79
|
+
timeout_ms?: number | undefined;
|
|
80
|
+
}>>>;
|
|
81
|
+
retry_policy: z.ZodOptional<z.ZodObject<{
|
|
82
|
+
max_attempts_per_profile: z.ZodOptional<z.ZodNumber>;
|
|
83
|
+
}, z.core.$strip>>;
|
|
84
|
+
workspaces: z.ZodArray<z.ZodString>;
|
|
85
|
+
}, z.core.$strip>;
|
|
86
|
+
export type TestMatrixProfile = z.infer<typeof TestMatrixProfileSchema>;
|
|
87
|
+
export type TestMatrixRetryPolicy = z.infer<typeof TestMatrixRetryPolicySchema>;
|
|
88
|
+
export type TestMatrixWorkspace = z.infer<typeof TestMatrixWorkspaceSchema>;
|
|
89
|
+
export type TestMatrix = z.infer<typeof TestMatrixSchema>;
|
|
90
|
+
export declare function testMatrixProfileToExecutionProfile(profile: TestMatrixProfile): WorkflowExecutionProfile;
|