npm - @interf/compiler - Versions diffs - 0.33.0 → 0.50.0 - Mend

@interf/compiler 0.33.0 → 0.50.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (234)

package/dist/packages/runtime/service/server-routes-project-context.js CHANGED Viewed

@@ -80,7 +80,7 @@ export async function tryHandleProjectContext(req, res, runtime, ctx, method) {
         if (method === "GET") {
             const context = runtime.getLatestContextGraph(storedProject.projectDataDir, storedProject.id);
             if (!context?.build_evidence) {
-                sendError(res, 404, "Build evidence not found.");
+                sendError(res, 404, "Graph diagnostics not found.");
             }
             else {
                 sendJson(res, 200, context.build_evidence);
@@ -88,6 +88,46 @@ export async function tryHandleProjectContext(req, res, runtime, ctx, method) {
             return true;
         }
     }
+    if (subPath === PROJECT_SUBRESOURCES.contextGraphSessions) {
+        if (method === "GET") {
+            const sessions = runtime.listLatestContextGraphStageSessions(storedProject.projectDataDir, storedProject.id);
+            sendJson(res, 200, { sessions });
+            return true;
+        }
+    }
+    if (subPath.startsWith(`${PROJECT_SUBRESOURCES.contextGraphSessions}/`)) {
+        if (method === "GET") {
+            const rawStageRunId = subPath.slice(PROJECT_SUBRESOURCES.contextGraphSessions.length + 1);
+            let stageRunId;
+            try {
+                stageRunId = decodeURIComponent(rawStageRunId);
+            }
+            catch {
+                sendError(res, 400, "Stage execution session id is not valid URI-encoded UTF-8.");
+                return true;
+            }
+            const session = runtime
+                .listLatestContextGraphStageSessions(storedProject.projectDataDir, storedProject.id)
+                .find((entry) => entry.stage_run_id === stageRunId) ?? null;
+            if (!session)
+                sendError(res, 404, `Stage execution session not found: ${stageRunId}`);
+            else
+                sendJson(res, 200, { session });
+            return true;
+        }
+    }
+    if (subPath === PROJECT_SUBRESOURCES.contextGraphEntrypoint) {
+        if (method === "GET") {
+            const handoff = runtime.getContextGraphHandoff(storedProject.projectDataDir, storedProject.id);
+            if (!handoff) {
+                sendError(res, 404, "Context Graph handoff not found.");
+            }
+            else {
+                sendJson(res, 200, handoff);
+            }
+            return true;
+        }
+    }
     // GET /v1/projects/<id>/context-graph/traces — first-class traces surface.
     // Returns a derived Traces wire shape rolled up from the current
     // Context Graph's artifacts + Checks + source file index.
@@ -117,20 +157,75 @@ export async function tryHandleProjectContext(req, res, runtime, ctx, method) {
             return true;
         }
     }
+    if (subPath === PROJECT_SUBRESOURCES.contextGraphStages) {
+        if (method === "GET") {
+            const context = runtime.getLatestContextGraph(storedProject.projectDataDir, storedProject.id);
+            if (!context) {
+                sendError(res, 404, "Context Graph not found.");
+            }
+            else {
+                sendJson(res, 200, {
+                    stages: context.stage_summaries,
+                    readiness: context.readiness_rollup,
+                    primary_metrics: context.primary_metrics,
+                });
+            }
+            return true;
+        }
+    }
+    if (subPath === PROJECT_SUBRESOURCES.contextGraphResources) {
+        if (method === "GET") {
+            const context = runtime.getLatestContextGraph(storedProject.projectDataDir, storedProject.id);
+            if (!context) {
+                sendError(res, 404, "Context Graph not found.");
+            }
+            else {
+                sendJson(res, 200, {
+                    resources: context.resources,
+                    entrypoints: context.entrypoints,
+                    graph_outputs: context.graph_manifest?.graph_outputs ?? null,
+                });
+            }
+            return true;
+        }
+    }
+    if (subPath.startsWith(`${PROJECT_SUBRESOURCES.contextGraphResources}/`)) {
+        if (method === "GET") {
+            const rawResourceId = subPath.slice(PROJECT_SUBRESOURCES.contextGraphResources.length + 1);
+            let resourceId;
+            try {
+                resourceId = decodeURIComponent(rawResourceId);
+            }
+            catch {
+                sendError(res, 400, "Resource id is not valid URI-encoded UTF-8.");
+                return true;
+            }
+            const context = runtime.getLatestContextGraph(storedProject.projectDataDir, storedProject.id);
+            const resource = context?.resources.find((entry) => entry.id === resourceId) ?? null;
+            if (!resource) {
+                sendError(res, 404, `Context Graph resource not found: ${resourceId}`);
+            }
+            else {
+                const stage = resource.stage_id
+                    ? context?.stage_summaries.find((entry) => entry.stage_id === resource.stage_id) ?? null
+                    : null;
+                sendJson(res, 200, {
+                    resource,
+                    stage,
+                    linked_resources: context?.resources.filter((entry) => resource.links.includes(entry.path ?? entry.id)) ?? [],
+                });
+            }
+            return true;
+        }
+    }
     // GET /v1/projects/<id>/source-state — manifest-backed drift verdict for
     // the Project's Source binding.
     if (subPath === PROJECT_SUBRESOURCES.sourceState) {
         if (method === "GET") {
-            // The per-run SourceState record carries the historical file index;
-            // for v1 we treat the latest Build id as the graph pointer and
-            // leave the hash comparison to a follow-up enhancement. For now the
-            // verdict is `unknown` until a Build has produced a Context Graph.
             const latestContext = runtime.getLatestContextGraph(storedProject.projectDataDir, storedProject.id);
             const state = computeProjectSourceState({
                 projectId: storedProject.id,
-                sourceFolderPath: storedProject.source.locator,
                 contextGraphPath: latestContext?.path ?? storedProject.contextGraphPath,
-                lastGraphIndexHash: null,
             });
             sendJson(res, 200, state);
             return true;

package/dist/packages/runtime/service/server-routes-project-jobs.js CHANGED Viewed

@@ -2,6 +2,19 @@ import { PROJECT_SUBRESOURCES } from "./routes.js";
 import { readJsonBody, sendError, sendJson, } from "./server-helpers.js";
 import { writeBenchmarkSpecsForProject } from "../../projects/source-config.js";
 import { loadTestSpec } from "../verify/verify-specs.js";
+import { ProjectBenchmarkQuestionDraftCreateRequestSchema, ProjectBuildPlanAuthoringCreateRequestSchema, ProjectResetRequestSchema, } from "../schemas/index.js";
+function projectScopedJobRequest(storedProject, body) {
+    const scopedRequest = ProjectBuildPlanAuthoringCreateRequestSchema.parse(body ?? {});
+    const explicitIntent = typeof scopedRequest.intent === "string" && scopedRequest.intent.trim().length > 0
+        ? scopedRequest.intent
+        : null;
+    return {
+        ...scopedRequest,
+        project: storedProject.id,
+        source_folder_path: storedProject.source.locator,
+        intent: explicitIntent ?? storedProject.intent,
+    };
+}
 /**
  * Async/structured mutations on a project: Build Plan authoring
  * and improvement jobs, benchmark-question drafts, Build Plan change deltas,
@@ -14,11 +27,7 @@ export async function tryHandleProjectJobs(req, res, runtime, ctx, method) {
         if (method === "POST") {
             try {
                 const body = (await readJsonBody(req));
-                const job = await runtime.createBuildPlanAuthoringRun(storedProject.projectDataDir, {
-                    ...(body ?? {}),
-                    project: storedProject.id,
-                    source_folder_path: storedProject.source.locator,
-                });
+                const job = await runtime.createBuildPlanAuthoringRun(storedProject.projectDataDir, projectScopedJobRequest(storedProject, body));
                 sendJson(res, 202, job);
             }
             catch (error) {
@@ -31,11 +40,7 @@ export async function tryHandleProjectJobs(req, res, runtime, ctx, method) {
         if (method === "POST") {
             try {
                 const body = (await readJsonBody(req));
-                const job = await runtime.createBuildPlanAuthoringRun(storedProject.projectDataDir, {
-                    ...(body ?? {}),
-                    project: storedProject.id,
-                    source_folder_path: storedProject.source.locator,
-                }, "build-plan-improvement");
+                const job = await runtime.createBuildPlanAuthoringRun(storedProject.projectDataDir, projectScopedJobRequest(storedProject, body), "build-plan-improvement");
                 sendJson(res, 202, job);
             }
             catch (error) {
@@ -48,8 +53,9 @@ export async function tryHandleProjectJobs(req, res, runtime, ctx, method) {
         if (method === "POST") {
             try {
                 const body = (await readJsonBody(req));
+                const scopedRequest = ProjectBenchmarkQuestionDraftCreateRequestSchema.parse(body ?? {});
                 const job = await runtime.createBenchmarkQuestionDraftRun(storedProject.projectDataDir, {
-                    ...(body ?? {}),
+                    ...scopedRequest,
                     project: storedProject.id,
                     source_folder_path: storedProject.source.locator,
                 });
@@ -116,7 +122,8 @@ export async function tryHandleProjectJobs(req, res, runtime, ctx, method) {
         if (method === "POST") {
             try {
                 const body = (await readJsonBody(req));
-                const request = { project: storedProject.id, scope: "build", ...(body ?? {}) };
+                const scopedRequest = ProjectResetRequestSchema.parse(body ?? {});
+                const request = { ...scopedRequest, project: storedProject.id };
                 const result = runtime.applyReset(storedProject.projectDataDir, request);
                 sendJson(res, 200, result);
             }

package/dist/packages/runtime/service/server-routes-project-runs.js CHANGED Viewed

@@ -1,6 +1,7 @@
 import { PROJECT_SUBRESOURCES } from "./routes.js";
 import { readJsonBody, sendError, sendErrorResponse, sendJson, } from "./server-helpers.js";
 import { assertCanRunBenchmark } from "../entitlement-guard.js";
+import { ProjectBuildRunCreateRequestSchema, ProjectVerifyRunCreateRequestSchema, } from "../schemas/index.js";
 /**
  * Build/verify run mutations and the per-Project runs listing.
  * Run-id lookups for an unknown project are handled by the
@@ -16,7 +17,8 @@ export async function tryHandleProjectRuns(req, res, runtime, ctx, method) {
             }
             try {
                 const body = (await readJsonBody(req));
-                const request = { project: storedProject.id, ...(body ?? {}) };
+                const scopedRequest = ProjectBuildRunCreateRequestSchema.parse(body ?? {});
+                const request = { ...scopedRequest, project: storedProject.id };
                 const idempotencyKeyRaw = req.headers["x-interf-idempotency-key"];
                 const idempotencyKey = Array.isArray(idempotencyKeyRaw)
                     ? idempotencyKeyRaw[0]
@@ -64,7 +66,8 @@ export async function tryHandleProjectRuns(req, res, runtime, ctx, method) {
                     .filter((run) => !run.parent_run_id).length;
                 assertCanRunBenchmark(runtime.currentAccount, existingCount);
                 const body = (await readJsonBody(req));
-                const request = { project: storedProject.id, ...(body ?? {}) };
+                const scopedRequest = ProjectVerifyRunCreateRequestSchema.parse(body ?? {});
+                const request = { ...scopedRequest, project: storedProject.id };
                 const resource = await runtime.createVerifyRun(storedProject.projectDataDir, request);
                 sendJson(res, 201, resource);
             }

package/dist/packages/runtime/service/server-routes-projects.js CHANGED Viewed

@@ -1,6 +1,6 @@
 import { ProjectCreateRequestSchema, ProjectUpdateRequestSchema, } from "../schemas/index.js";
 import { LOCAL_SERVICE_ROUTES } from "./routes.js";
-import { readJsonBody, sendError, sendErrorResponse, sendJson, } from "./server-helpers.js";
+import { decodeOr400, readJsonBody, sendError, sendErrorResponse, sendJson, } from "./server-helpers.js";
 import { createStoredProject, deleteStoredProject, getStoredProject, listStoredProjects, projectWireShape, updateStoredProject, } from "../project-store.js";
 import { tryHandleProjectRuns, } from "./server-routes-project-runs.js";
 import { tryHandleProjectJobs } from "./server-routes-project-jobs.js";
@@ -19,6 +19,7 @@ export async function tryHandleProjectCollection(req, res, runtime, path, method
             const stored = createStoredProject(runtime, {
                 id: body.id,
                 source: { kind: "local-folder", locator: body.source.locator },
+                intent: body.intent,
                 build_plan_id: body.build_plan_id,
             });
             sendJson(res, 201, projectWireShape(stored));
@@ -41,7 +42,9 @@ export async function tryHandlePerProject(req, res, runtime, path, method) {
     const slashIndex = tail.indexOf("/");
     const projectId = slashIndex === -1 ? tail : tail.slice(0, slashIndex);
     const subPath = slashIndex === -1 ? "" : tail.slice(slashIndex + 1);
-    const decodedProjectId = decodeURIComponent(projectId);
+    const decodedProjectId = decodeOr400(res, projectId, "Project id");
+    if (decodedProjectId === null)
+        return true;
     const storedProject = getStoredProject(decodedProjectId);
     if (!storedProject) {
         sendError(res, 404, `Project not found: ${decodedProjectId}`);
@@ -63,6 +66,7 @@ export async function tryHandlePerProject(req, res, runtime, path, method) {
             try {
                 const body = ProjectUpdateRequestSchema.parse(await readJsonBody(req));
                 const updated = updateStoredProject(decodedProjectId, {
+                    intent: body.intent,
                     build_plan_id: body.build_plan_id,
                 });
                 sendJson(res, 200, projectWireShape(updated));

package/dist/packages/runtime/service/server-routes-runs.js CHANGED Viewed

@@ -1,5 +1,5 @@
 import { LOCAL_SERVICE_ROUTES, RUN_SUBRESOURCES } from "./routes.js";
-import { parseRequestUrl, sendError, sendJson } from "./server-helpers.js";
+import { decodeOr400, parseRequestUrl, sendError, sendJson } from "./server-helpers.js";
 import { isTraversalRelativePath, safeApiFilePath, sendApiFile, } from "./server-api-files.js";
 import { findInstanceBuildRun, findInstanceJob, findInstanceRun, findInstanceVerifyRun, listInstanceRuns, } from "./server-instance-helpers.js";
 import { runObservabilityToStatus } from "../run-observability.js";
@@ -10,7 +10,9 @@ export function tryHandleRuns(req, res, runtime, path, method) {
     }
     const observableRunMatch = path.match(/^\/v1\/runs\/([^/]+)(?:\/([^/]+))?$/);
     if (observableRunMatch?.[1]) {
-        const runId = decodeURIComponent(observableRunMatch[1]);
+        const runId = decodeOr400(res, observableRunMatch[1], "Run id");
+        if (runId === null)
+            return true;
         const child = observableRunMatch[2];
         if (method === "GET" && !child) {
             const run = findInstanceRun(runtime, runId);
@@ -80,7 +82,10 @@ export function tryHandleRuns(req, res, runtime, path, method) {
     }
     const verifyRunMatch = path.match(/^\/v1\/verify-runs\/([^/]+)$/);
     if (method === "GET" && verifyRunMatch?.[1]) {
-        const found = findInstanceVerifyRun(runtime, decodeURIComponent(verifyRunMatch[1]));
+        const verifyRunId = decodeOr400(res, verifyRunMatch[1], "Verify run id");
+        if (verifyRunId === null)
+            return true;
+        const found = findInstanceVerifyRun(runtime, verifyRunId);
         if (!found)
             sendError(res, 404, "Verify run not found.");
         else
@@ -89,7 +94,9 @@ export function tryHandleRuns(req, res, runtime, path, method) {
     }
     const jobMatch = path.match(/^\/v1\/jobs\/([^/]+)(?:\/([^/]+))?$/);
     if (jobMatch?.[1]) {
-        const runId = decodeURIComponent(jobMatch[1]);
+        const runId = decodeOr400(res, jobMatch[1], "Job run id");
+        if (runId === null)
+            return true;
         const child = jobMatch[2];
         const found = findInstanceJob(runtime, runId);
         if (method === "GET" && !child) {

package/dist/packages/runtime/verify/lib/schema.js CHANGED Viewed

@@ -1,5 +1,6 @@
 import { z } from "zod";
 import { RuntimeExecutorInfoSchema, TestCaseExpectSchema, TestTargetTypeSchema, ProjectIdSchema, } from "../../../contracts/lib/schema.js";
+import { isSafeRelativeTestFile } from "../test-file-guard.js";
 const TestCaseCoreSchema = z.object({
     id: z.string().regex(/^[a-z0-9][a-z0-9-]{0,79}$/),
     question: z.string().min(1),
@@ -14,6 +15,17 @@ const TestCaseCoreSchema = z.object({
             message: "Test cases need at least one of file, answer, or expect.",
         });
     }
+    // H2: a `file` target is a project-relative output path that is later joined
+    // onto the (sandboxed) target directory. Reject absolute or `..`-escaping
+    // values at parse time so they can never reach `path.join`. Centralizing the
+    // guard here means H1 (verify-execution join) and H3 (spec writer) inherit it.
+    if (value.file !== undefined && !isSafeRelativeTestFile(value.file)) {
+        ctx.addIssue({
+            code: z.ZodIssueCode.custom,
+            path: ["file"],
+            message: "Test case file must be a relative path without '..' or a leading '/'.",
+        });
+    }
 });
 export const TestCaseSchema = TestCaseCoreSchema;
 const TestSpecCoreSchema = z.object({

package/dist/packages/runtime/verify/test-file-guard.d.ts ADDED Viewed

@@ -0,0 +1,2 @@
+export declare function isSafeRelativeTestFile(file: string): boolean;
+export declare function assertSafeRelativeTestFile(file: string): void;

package/dist/packages/runtime/verify/test-file-guard.js ADDED Viewed

@@ -0,0 +1,29 @@
+// Pure, Node-free guard for a `testCase.file` value. A test file is a
+// project-relative output path inside the (sandboxed) target directory, so it
+// must never be absolute, never escape via `..`, and stay within a conservative
+// character set. This is the single guard both the schema (H2) and the spec
+// writer (H3) consume so a malicious `file` cannot reach `path.join` and a later
+// `existsSync`/`readFileSync`.
+//
+// This module imports no Node builtins so it is safe to pull into the schemas
+// barrel (`@interf/compiler/schemas`) and bundle into the renderer. The
+// filesystem-aware path helpers live in `verify-paths.ts`, which re-exports
+// these guards for server-side callers.
+const TEST_FILE_PATTERN = /^[A-Za-z0-9._/-]+$/;
+export function isSafeRelativeTestFile(file) {
+    if (file.length === 0)
+        return false;
+    if (file.startsWith("/"))
+        return false;
+    if (!TEST_FILE_PATTERN.test(file))
+        return false;
+    // Reject `..` as a whole path segment (e.g. `..`, `../x`, `a/../b`, `a/..`).
+    if (file.split("/").some((segment) => segment === ".."))
+        return false;
+    return true;
+}
+export function assertSafeRelativeTestFile(file) {
+    if (!isSafeRelativeTestFile(file)) {
+        throw new Error(`Invalid test case file: ${file}`);
+    }
+}

package/dist/packages/runtime/verify/verify-execution.d.ts CHANGED Viewed

@@ -1,6 +1,12 @@
 import { type AgentExecutor } from "../agents/lib/executors.js";
 import type { TestTargetRun, TestTargetCandidate, LoadedTestSpec } from "./verify-types.js";
 import { type TestSandboxRetentionMode } from "./verify-sandbox.js";
+export declare function resolveTargetFilePath(targetPath: string, file: string): string;
+interface TestJudgeVerdict {
+    pass: boolean;
+    summary: string;
+}
+export declare function readTestJudgeVerdictFromStatus(statusPath: string): TestJudgeVerdict | null;
 export declare function runTargetTests(sourcePath: string, spec: LoadedTestSpec, targets: TestTargetCandidate[]): TestTargetRun;
 export declare function runTargetTestsWithJudge(sourcePath: string, spec: LoadedTestSpec, targets: TestTargetCandidate[], executor: AgentExecutor, options?: {
     preserveSandboxes?: TestSandboxRetentionMode;
@@ -12,3 +18,4 @@ export declare function runTargetTestsAuto(sourcePath: string, spec: LoadedTestS
     artifactRootPath?: string;
 }): Promise<TestTargetRun>;
 export declare function saveTargetTestRun(artifactRootPath: string, result: TestTargetRun): string;
+export {};

package/dist/packages/runtime/verify/verify-execution.js CHANGED Viewed

@@ -1,9 +1,21 @@
 import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync, } from "node:fs";
 import { join } from "node:path";
 import { tmpdir } from "node:os";
+import { assertPathWithinRoot } from "../../contracts/utils/path-guards.js";
 import { buildRuntimeExecutorInfo } from "../agents/lib/executors.js";
 import { targetTestRunGitignorePath, targetTestRunsPath, targetTestSandboxGitignorePath, targetTestSandboxesPath, normalizeTestId, } from "./verify-paths.js";
 import { createTestSandbox, } from "./verify-sandbox.js";
+import { freezePreservedShell } from "../agents/lib/shell-fs.js";
+// H1: `testCase.file` is a project-relative output path joined onto the target
+// directory. `path.join` resolves `..` at string level, so a malicious file can
+// escape the (sandboxed) target and be read via existsSync/readFileSync. Resolve
+// the join and assert it stays within target.path before any filesystem access.
+// Centralized so every read of a test-case file inherits the bound (ENGINE-POLICY
+// rule 7). The schema/spec-writer guards (H2/H3) are defense-in-depth ahead of
+// this; this is the last line before the filesystem.
+export function resolveTargetFilePath(targetPath, file) {
+    return assertPathWithinRoot(targetPath, join(targetPath, file), "test case file");
+}
 function parseWords(content) {
     return content.trim().split(/\s+/).filter(Boolean).length;
 }
@@ -66,7 +78,7 @@ function runTestCase(target, testCase) {
     if (!testCase.file) {
         throw new Error(`Test case "${testCase.id}" requires an executor because it has no file target.`);
     }
-    const outputPath = join(target.path, testCase.file);
+    const outputPath = resolveTargetFilePath(target.path, testCase.file);
     const checks = [];
     if (!existsSync(outputPath)) {
         checks.push({
@@ -133,7 +145,13 @@ function readTestJudgeVerdict(verdictPath) {
         summary: typeof raw.summary === "string" ? raw.summary : "",
     };
 }
-function readTestJudgeVerdictFromStatus(statusPath) {
+// M13: verdict.json is the single source of truth for a judge pass/fail. This is
+// a strict fallback for when the agent omitted the JSON file: only an explicit
+// `pass=true` / `pass=false` token on a terminal DONE:/BLOCKED:/ERROR: line is
+// honored. The prompt mandates `DONE: pass=true|false - <summary>`, so a benign
+// status line that merely mentions a match must NOT be read as a pass. A terminal
+// line with no explicit pass token yields no verdict (treated as missing/invalid).
+export function readTestJudgeVerdictFromStatus(statusPath) {
     if (!existsSync(statusPath))
         return null;
     const lines = readFileSync(statusPath, "utf8")
@@ -148,53 +166,98 @@ function readTestJudgeVerdictFromStatus(statusPath) {
             continue;
         const normalized = line.toLowerCase();
         const summary = line.replace(/^(DONE|BLOCKED|ERROR):\s*/i, "").trim();
-        if (normalized.includes("pass=true")) {
-            return { pass: true, summary };
-        }
-        if (normalized.includes("pass=false")) {
-            return { pass: false, summary };
-        }
-        if (/values match expected|matches expected|candidate matches|answer matches|expected values match/i.test(line)) {
+        if (/\bpass=true\b/.test(normalized)) {
             return { pass: true, summary };
         }
-        if (/does not match|do not match|mismatch|candidate fails|answer fails|expected values do not match/i.test(line)) {
+        if (/\bpass=false\b/.test(normalized)) {
             return { pass: false, summary };
         }
+        // Terminal line reached without an explicit pass token: no trustworthy
+        // verdict. Stop scanning so an earlier, benign line cannot be misread.
+        return null;
     }
     return null;
 }
-async function runTargetTestsJudge(testCase, executor, candidateLabel, candidateContent) {
-    const tempDir = mkdtempSync(join(tmpdir(), "interf-test-judge-"));
+async function runTargetTestsJudge(testCase, executor, candidateLabel, candidateContent,
+// When supplied, the judge shell is created at this durable path and preserved
+// (frozen) on finish so the judge execution is inspectable: prompt, reasoning
+// transcript, the candidate it judged, and the JSON verdict all survive. Lives
+// under the benchmark sandbox so it is preserved with the sandbox on failure.
+// When omitted, the shell is an ephemeral /tmp dir removed on finish.
+preservedShellRoot) {
+    const preserve = Boolean(preservedShellRoot);
+    const tempDir = preservedShellRoot
+        ? (mkdirSync(preservedShellRoot, { recursive: true }), preservedShellRoot)
+        : mkdtempSync(join(tmpdir(), "interf-test-judge-"));
+    // The preserved shell needs a runtime/ dir up front: freezePreservedShell
+    // writes its manifest to runtime/preserved-shell.json, and reasoning is teed
+    // there too. (The ephemeral path doesn't need it.)
+    if (preserve) {
+        mkdirSync(join(tempDir, "runtime"), { recursive: true });
+    }
     let executionError = null;
     let verdict = null;
     try {
         const verdictPath = join(tempDir, "verdict.json");
         const statusPath = join(tempDir, "judge.status.log");
         const prompt = buildTestJudgePrompt(testCase, candidateLabel, candidateContent, verdictPath);
+        // Preserve the rendered prompt and the candidate it judged so the preserved
+        // shell holds everything needed to reproduce the verdict, not just the output.
+        if (preserve) {
+            writeFileSync(join(tempDir, "prompt.txt"), `${prompt}\n`);
+            writeFileSync(join(tempDir, "candidate.txt"), `${candidateContent}\n`);
+        }
+        // Same canonical convention as the stage path: reasoning is teed into the
+        // shell's runtime/ dir so it is preserved when the shell is frozen.
+        const reasoningLogPath = preserve ? join(tempDir, "runtime", "agent-reasoning.jsonl") : null;
         try {
             await executor.execute(tempDir, prompt, {
                 statusLogPath: statusPath,
+                reasoningLogPath,
             });
         }
         catch (error) {
             executionError = error instanceof Error ? error.message : String(error);
         }
-        try {
-            verdict = readTestJudgeVerdict(verdictPath);
-            if (!verdict) {
-                verdict = readTestJudgeVerdictFromStatus(statusPath);
+        // M12: first error wins. If execute() already failed, do not read verdict
+        // files — a throw in verdict-reading would otherwise overwrite the real
+        // execution failure and mask why the judge never ran. Only read the verdict
+        // when execution itself reported no error.
+        if (!executionError) {
+            try {
+                verdict = readTestJudgeVerdict(verdictPath);
+                if (!verdict) {
+                    verdict = readTestJudgeVerdictFromStatus(statusPath);
+                }
+            }
+            catch (error) {
+                executionError = error instanceof Error ? error.message : String(error);
             }
-        }
-        catch (error) {
-            executionError = error instanceof Error ? error.message : String(error);
         }
     }
     finally {
-        rmSync(tempDir, { recursive: true, force: true });
+        // Preserve a durable judge shell (freeze materializes symlinks + writes the
+        // preserved-shell manifest, path unchanged); only remove an ephemeral one.
+        if (preserve) {
+            freezePreservedShell(tempDir, "test-judge");
+        }
+        else {
+            rmSync(tempDir, { recursive: true, force: true });
+        }
     }
     return { verdict, error: executionError };
 }
-function buildTestQueryPrompt(target, testCase, answerPath, tracePath) {
+/**
+ * Durable judge-shell root under a benchmark sandbox target. Lives beside the
+ * sandbox's other preserved runtime artifacts so a preserved sandbox carries the
+ * judge execution too. Keyed by case + candidate so multiple judged cases in one
+ * target don't collide.
+ */
+function judgeShellRoot(targetPath, testCase, candidateLabel) {
+    const slug = `${normalizeTestId(testCase.id) || "case"}-${normalizeTestId(candidateLabel) || "candidate"}`;
+    return join(targetPath, ".interf", "runtime", "test-judge", slug);
+}
+function buildTestQueryPrompt(target, testCase, answerPath, tracePath, retryReason) {
     const header = target.type === "context-graph"
         ? [
             "You are running an Interf benchmark inside an isolated sandboxed context-graph copy.",
@@ -217,10 +280,13 @@ function buildTestQueryPrompt(target, testCase, answerPath, tracePath) {
         ...header,
         "Emit only STATUS:, DONE:, BLOCKED:, or ERROR: lines.",
         "Do not ask follow-up questions.",
+        "Create both required output files before printing DONE.",
+        "The output paths are absolute temp file paths outside the Context Graph sandbox; write them exactly as given.",
         `Write the answer to ${JSON.stringify(answerPath)}.`,
         `Write the trace to ${JSON.stringify(tracePath)} with keys: case_id, target, artifacts_consulted, source_paths_read, used_source_files, answer_summary.`,
         `Set \`case_id\` to ${JSON.stringify(testCase.id)}.`,
         `Set \`target\` to ${JSON.stringify(target.type)}.`,
+        ...(retryReason ? [`Retry reason: ${retryReason}`] : []),
         `Question: ${testCase.question}`,
     ].join("\n");
 }
@@ -228,20 +294,26 @@ async function runLiveTestCase(target, testCase, executor) {
     const tempDir = mkdtempSync(join(tmpdir(), "interf-test-live-"));
     const answerPath = join(tempDir, "answer.md");
     const tracePath = join(tempDir, "trace.json");
-    const statusPath = join(tempDir, "status.log");
-    const eventPath = join(tempDir, "events.ndjson");
-    const prompt = buildTestQueryPrompt(target, testCase, answerPath, tracePath);
     let executionError = null;
     let code = -1;
     try {
-        try {
-            code = await executor.execute(target.path, prompt, {
-                eventLogPath: eventPath,
-                statusLogPath: statusPath,
-            });
-        }
-        catch (error) {
-            executionError = error instanceof Error ? error.message : String(error);
+        for (let attempt = 1; attempt <= 2; attempt += 1) {
+            const statusPath = join(tempDir, `status-${attempt}.log`);
+            const eventPath = join(tempDir, `events-${attempt}.ndjson`);
+            const prompt = buildTestQueryPrompt(target, testCase, answerPath, tracePath, attempt === 1
+                ? undefined
+                : `Attempt ${attempt - 1} exited without writing ${!existsSync(answerPath) ? "the answer file" : "the trace file"}. Write both files before DONE.`);
+            try {
+                code = await executor.execute(target.path, prompt, {
+                    eventLogPath: eventPath,
+                    statusLogPath: statusPath,
+                });
+            }
+            catch (error) {
+                executionError = error instanceof Error ? error.message : String(error);
+            }
+            if (existsSync(answerPath) && existsSync(tracePath))
+                break;
         }
         const checks = [];
         if (!existsSync(answerPath)) {
@@ -294,7 +366,8 @@ async function runLiveTestCase(target, testCase, executor) {
             });
         }
         if (testCase.answer) {
-            const judged = await runTargetTestsJudge(testCase, executor, `generated answer for ${testCase.id}`, answer);
+            const candidateLabel = `generated answer for ${testCase.id}`;
+            const judged = await runTargetTestsJudge(testCase, executor, candidateLabel, answer, judgeShellRoot(target.path, testCase, candidateLabel));
             checks.push({
                 label: "judge verdict",
                 ok: judged.verdict?.pass === true,
@@ -324,7 +397,7 @@ async function runTestCaseWithJudge(target, testCase, executor) {
     if (!testCase.file) {
         return runLiveTestCase(target, testCase, executor);
     }
-    const outputPath = join(target.path, testCase.file);
+    const outputPath = resolveTargetFilePath(target.path, testCase.file);
     const checks = [];
     if (!existsSync(outputPath)) {
         checks.push({
@@ -353,7 +426,8 @@ async function runTestCaseWithJudge(target, testCase, executor) {
     });
     checks.push(...evaluated.checks);
     if (testCase.answer) {
-        const judged = await runTargetTestsJudge(testCase, executor, `Context Graph file ${outputPath}`, content);
+        const candidateLabel = `Context Graph file ${outputPath}`;
+        const judged = await runTargetTestsJudge(testCase, executor, candidateLabel, content, judgeShellRoot(target.path, testCase, candidateLabel));
         checks.push({
             label: "judge verdict",
             ok: judged.verdict?.pass === true,

package/dist/packages/runtime/verify/verify-paths.d.ts CHANGED Viewed

@@ -1,5 +1,6 @@
 import type { TestTargetType } from "./verify-types.js";
 export declare const TEST_SPEC_EXTENSIONS: Set<string>;
+export { isSafeRelativeTestFile, assertSafeRelativeTestFile, } from "./test-file-guard.js";
 export declare function testSpecRootPath(sourcePath: string): string;
 export declare function testSpecTypePath(sourcePath: string, type: TestTargetType): string;
 export declare function targetTestRunsPath(contextGraphPath: string, type: TestTargetType): string;

package/dist/packages/runtime/verify/verify-paths.js CHANGED Viewed

@@ -4,6 +4,10 @@ import { targetTestRunsRootForContextGraph, targetTestSandboxesRootForContextGra
 import { asProjectDataDir, projectTestsSpecsRoot, } from "../../contracts/lib/project-paths.js";
 const TEST_ID_PATTERN = /^[a-z0-9][a-z0-9-]{0,79}$/;
 export const TEST_SPEC_EXTENSIONS = new Set([".json"]);
+// The `testCase.file` safelist guard lives in a Node-free module so the schemas
+// barrel can import it without pulling Node builtins into the renderer bundle.
+// Re-exported here for server-side path callers.
+export { isSafeRelativeTestFile, assertSafeRelativeTestFile, } from "./test-file-guard.js";
 export function testSpecRootPath(sourcePath) {
     return projectTestsSpecsRoot(asProjectDataDir(sourcePath));
 }