npm - @riddledc/riddle-proof - Versions diffs - 0.8.7 → 0.8.9 - Mend

@riddledc/riddle-proof 0.8.7 → 0.8.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

package/dist/adapters/codex-exec-agent.cjs +75 -10
package/dist/adapters/codex-exec-agent.js +1 -1
package/dist/adapters/codex.cjs +75 -10
package/dist/adapters/codex.js +1 -1
package/dist/adapters/local-agent.cjs +75 -10
package/dist/adapters/local-agent.js +1 -1
package/dist/{chunk-PYCQNK66.js → chunk-EEIYUZXE.js} +75 -10
package/dist/{chunk-V6VZ3CAI.js → chunk-RTWGGKS3.js} +1 -1
package/dist/cli/index.js +2 -2
package/dist/cli.cjs +75 -10
package/dist/cli.js +2 -2
package/dist/codex-exec-agent.cjs +75 -10
package/dist/codex-exec-agent.js +1 -1
package/dist/index.cjs +75 -10
package/dist/index.js +1 -1
package/dist/local-agent.cjs +75 -10
package/dist/local-agent.js +1 -1
package/package.json +1 -1
package/runtime/lib/verify.py +204 -5
package/runtime/tests/recon_verify_smoke.py +19 -12

package/dist/index.cjs CHANGED

@@ -6531,6 +6531,8 @@ var import_node_child_process3 = require("child_process");
 var import_node_fs4 = require("fs");
 var import_node_os = __toESM(require("os"), 1);
 var import_node_path4 = __toESM(require("path"), 1);
+var DEFAULT_CODEX_TIMEOUT_MS = 6e5;
+var DEFAULT_PROOF_PACKET_AUTHOR_TIMEOUT_MS = 18e4;
 var REFINED_INPUTS_SCHEMA = {
   type: "object",
   additionalProperties: false,
@@ -6874,6 +6876,46 @@ function parseJsonFromRunnerOutputs(outputs, schema) {
   if (!combined.trim() || seen.has(combined)) return { parsed: null, source: "" };
   return { parsed: parseJsonObject(combined, schema), source: "combined_output" };
 }
+function resolveCodexTimeoutMs(config, request) {
+  if (typeof config.codexTimeoutMs === "number" && Number.isFinite(config.codexTimeoutMs) && config.codexTimeoutMs > 0) {
+    return Number(config.codexTimeoutMs);
+  }
+  return request.purpose === "proof packet authoring" ? DEFAULT_PROOF_PACKET_AUTHOR_TIMEOUT_MS : DEFAULT_CODEX_TIMEOUT_MS;
+}
+function isCodexLifecycleEvent(value) {
+  if (!value || typeof value !== "object" || Array.isArray(value)) return false;
+  const type = value.type;
+  return typeof type === "string" && (type.startsWith("thread.") || type.startsWith("turn.") || type.startsWith("exec.") || type.startsWith("agent.") || type.startsWith("token.") || type.startsWith("reasoning.") || type.startsWith("error."));
+}
+function analyzeCodexRunnerOutput(outputs) {
+  const eventTypes = /* @__PURE__ */ new Set();
+  let eventLineCount = 0;
+  let nonEventLineCount = 0;
+  const nonEventSamples = [];
+  for (const output of outputs) {
+    const lines = output.text.split(/\r?\n/).map((line) => line.trim()).filter(Boolean);
+    for (const line of lines) {
+      try {
+        const parsed = JSON.parse(line);
+        if (isCodexLifecycleEvent(parsed)) {
+          eventLineCount += 1;
+          eventTypes.add(parsed.type);
+          continue;
+        }
+      } catch {
+      }
+      nonEventLineCount += 1;
+      if (nonEventSamples.length < 3) nonEventSamples.push(line.slice(0, 240));
+    }
+  }
+  return {
+    eventLineCount,
+    eventTypes: Array.from(eventTypes),
+    nonEventLineCount,
+    nonEventSamples,
+    onlyLifecycleEvents: eventLineCount > 0 && nonEventLineCount === 0
+  };
+}
 function isHarnessVerificationOnlyBlocker(blocker) {
   const text = blocker.toLowerCase();
   return (text.includes("erofs") || text.includes("read-only file system")) && text.includes("node_modules") && (text.includes(".vite-temp") || text.includes("vite.config"));
@@ -6897,21 +6939,25 @@ function runnerMetrics(input) {
     exit_status: input.status ?? null,
     timed_out: input.timedOut || false,
     error_code: input.errorCode,
+    codex_event_types: input.codexEventTypes && input.codexEventTypes.length ? input.codexEventTypes : void 0,
+    codex_event_line_count: input.codexEventLineCount,
+    codex_non_event_line_count: input.codexNonEventLineCount,
     codex_command: input.config.codexCommand || "codex",
     codex_model: input.config.codexModel,
     codex_sandbox: input.config.codexSandbox || "workspace-write",
     codex_full_auto: input.config.codexFullAuto !== false,
-    timeout_ms: Number(input.config.codexTimeoutMs || 6e5)
+    timeout_ms: input.timeoutMs ?? DEFAULT_CODEX_TIMEOUT_MS
   });
 }
 function createCodexExecJsonRunner(config = {}) {
   return (request) => {
     const startedAt = (/* @__PURE__ */ new Date()).toISOString();
     const startedMs = Date.now();
+    const timeoutMs = resolveCodexTimeoutMs(config, request);
     if (!request.workdir || !(0, import_node_fs4.existsSync)(request.workdir)) {
       return {
         ok: false,
-        metrics: runnerMetrics({ request, config, startedAt, startedMs, errorCode: "workdir_missing" }),
+        metrics: runnerMetrics({ request, config, startedAt, startedMs, timeoutMs, errorCode: "workdir_missing" }),
         blocker: {
           code: "codex_workdir_missing",
           message: `Codex workdir does not exist for ${request.purpose}.`,
@@ -6946,7 +6992,7 @@ function createCodexExecJsonRunner(config = {}) {
       const proc = (0, import_node_child_process3.spawnSync)(config.codexCommand || "codex", args, {
         input: request.prompt,
         encoding: "utf-8",
-        timeout: Number(config.codexTimeoutMs || 6e5),
+        timeout: timeoutMs,
         maxBuffer: 10 * 1024 * 1024,
         env
       });
@@ -6965,6 +7011,7 @@ function createCodexExecJsonRunner(config = {}) {
             stderr: proc.stderr || "",
             status: proc.status,
             timedOut,
+            timeoutMs,
             errorCode: proc.error.code || "spawn_error"
           }),
           blocker: {
@@ -6987,6 +7034,7 @@ function createCodexExecJsonRunner(config = {}) {
             stdout: proc.stdout || "",
             stderr: proc.stderr || "",
             status: proc.status,
+            timeoutMs,
             errorCode: "nonzero_exit"
           }),
           blocker: {
@@ -6999,12 +7047,15 @@ function createCodexExecJsonRunner(config = {}) {
       const finalText = (0, import_node_fs4.existsSync)(lastMessagePath) ? (0, import_node_fs4.readFileSync)(lastMessagePath, "utf-8") : String(proc.stdout || "");
       const stdoutText = String(proc.stdout || "");
       const stderrText = String(proc.stderr || "");
-      const { parsed, source: parsedJsonSource } = parseJsonFromRunnerOutputs([
+      const runnerOutputs = [
         { source: (0, import_node_fs4.existsSync)(lastMessagePath) ? "last_message" : "stdout", text: finalText },
         { source: "stdout", text: stdoutText },
         { source: "stderr", text: stderrText }
-      ], request.schema);
+      ];
+      const { parsed, source: parsedJsonSource } = parseJsonFromRunnerOutputs(runnerOutputs, request.schema);
       if (!parsed) {
+        const outputAnalysis = analyzeCodexRunnerOutput(runnerOutputs);
+        const errorCode = outputAnalysis.onlyLifecycleEvents ? "no_final_response" : "invalid_json";
         return {
           ok: false,
           stdout: stdoutText,
@@ -7018,12 +7069,24 @@ function createCodexExecJsonRunner(config = {}) {
             stderr: stderrText,
             finalText,
             status: proc.status,
-            errorCode: "invalid_json"
+            timeoutMs,
+            errorCode,
+            codexEventTypes: outputAnalysis.eventTypes,
+            codexEventLineCount: outputAnalysis.eventLineCount,
+            codexNonEventLineCount: outputAnalysis.nonEventLineCount
           }),
           blocker: {
-            code: "codex_invalid_json",
-            message: `Codex completed ${request.purpose}, but did not return valid JSON.`,
-            details: { finalText, stdout: stdoutText, stderr: stderrText }
+            code: outputAnalysis.onlyLifecycleEvents ? "codex_no_final_response" : "codex_invalid_json",
+            message: outputAnalysis.onlyLifecycleEvents ? `Codex emitted lifecycle events during ${request.purpose}, but did not produce a final JSON response.` : `Codex completed ${request.purpose}, but did not return valid JSON.`,
+            details: {
+              finalText,
+              stdout: stdoutText,
+              stderr: stderrText,
+              event_types: outputAnalysis.eventTypes,
+              event_line_count: outputAnalysis.eventLineCount,
+              non_event_line_count: outputAnalysis.nonEventLineCount,
+              non_event_samples: outputAnalysis.nonEventSamples
+            }
           }
         };
       }
@@ -7041,7 +7104,8 @@ function createCodexExecJsonRunner(config = {}) {
           stderr: stderrText,
           finalText,
           parsedJsonSource,
-          status: proc.status
+          status: proc.status,
+          timeoutMs
         })
       };
     } finally {
@@ -7150,6 +7214,7 @@ function createCodexExecAgentAdapter(config = {}, runner = createCodexExecJsonRu
           "Write a proof_plan and capture_script that will verify the exact user-facing change.",
           "Use recon_assessment.baseline_understanding as the source of truth. Do not author a proof plan unless it names the observed before state and the requested delta from that state.",
           "Use the recon-approved route and baseline context; make the plan name the concrete target, expected before state, expected after state, and stop condition.",
+          "Do not leave this authoring stage pending for external investigation. Keep any repo inspection brief, do not modify files, and return the JSON proof packet from the available state.",
           "Choose the evidence modality from verification_mode and success_criteria: screenshots for visual/UI proof, interactions plus screenshots for interaction proof, structured metrics/logs/JSON/audio analysis for non-visual proof.",
           "For playable/gameplay proof, treat screenshots as supporting artifacts only: start the game, send keyboard or pointer input, measure state before/after, measure non-HUD canvas/playfield pixel deltas across time, and return playability evidence with version riddle-proof.playability.v1.",
           "For interaction proof, return a structured evidence object with start route/state, terminal route/state, action, assertions, and matched UI text. Catch waitForURL or selector timeouts and record them as failed assertions instead of throwing before evidence is emitted.",

package/dist/index.js CHANGED

@@ -134,7 +134,7 @@ import {
   createCodexExecAgentAdapter,
   createCodexExecJsonRunner,
   runCodexExecAgentDoctor
-} from "./chunk-PYCQNK66.js";
+} from "./chunk-EEIYUZXE.js";
 import {
   applyTerminalMetadata,
   compactRecord,

package/dist/local-agent.cjs CHANGED

@@ -48,6 +48,8 @@ function compactRecord(input) {
 }
 // src/codex-exec-agent.ts
+var DEFAULT_CODEX_TIMEOUT_MS = 6e5;
+var DEFAULT_PROOF_PACKET_AUTHOR_TIMEOUT_MS = 18e4;
 var REFINED_INPUTS_SCHEMA = {
   type: "object",
   additionalProperties: false,
@@ -391,6 +393,46 @@ function parseJsonFromRunnerOutputs(outputs, schema) {
   if (!combined.trim() || seen.has(combined)) return { parsed: null, source: "" };
   return { parsed: parseJsonObject(combined, schema), source: "combined_output" };
 }
+function resolveCodexTimeoutMs(config, request) {
+  if (typeof config.codexTimeoutMs === "number" && Number.isFinite(config.codexTimeoutMs) && config.codexTimeoutMs > 0) {
+    return Number(config.codexTimeoutMs);
+  }
+  return request.purpose === "proof packet authoring" ? DEFAULT_PROOF_PACKET_AUTHOR_TIMEOUT_MS : DEFAULT_CODEX_TIMEOUT_MS;
+}
+function isCodexLifecycleEvent(value) {
+  if (!value || typeof value !== "object" || Array.isArray(value)) return false;
+  const type = value.type;
+  return typeof type === "string" && (type.startsWith("thread.") || type.startsWith("turn.") || type.startsWith("exec.") || type.startsWith("agent.") || type.startsWith("token.") || type.startsWith("reasoning.") || type.startsWith("error."));
+}
+function analyzeCodexRunnerOutput(outputs) {
+  const eventTypes = /* @__PURE__ */ new Set();
+  let eventLineCount = 0;
+  let nonEventLineCount = 0;
+  const nonEventSamples = [];
+  for (const output of outputs) {
+    const lines = output.text.split(/\r?\n/).map((line) => line.trim()).filter(Boolean);
+    for (const line of lines) {
+      try {
+        const parsed = JSON.parse(line);
+        if (isCodexLifecycleEvent(parsed)) {
+          eventLineCount += 1;
+          eventTypes.add(parsed.type);
+          continue;
+        }
+      } catch {
+      }
+      nonEventLineCount += 1;
+      if (nonEventSamples.length < 3) nonEventSamples.push(line.slice(0, 240));
+    }
+  }
+  return {
+    eventLineCount,
+    eventTypes: Array.from(eventTypes),
+    nonEventLineCount,
+    nonEventSamples,
+    onlyLifecycleEvents: eventLineCount > 0 && nonEventLineCount === 0
+  };
+}
 function isHarnessVerificationOnlyBlocker(blocker) {
   const text = blocker.toLowerCase();
   return (text.includes("erofs") || text.includes("read-only file system")) && text.includes("node_modules") && (text.includes(".vite-temp") || text.includes("vite.config"));
@@ -414,21 +456,25 @@ function runnerMetrics(input) {
     exit_status: input.status ?? null,
     timed_out: input.timedOut || false,
     error_code: input.errorCode,
+    codex_event_types: input.codexEventTypes && input.codexEventTypes.length ? input.codexEventTypes : void 0,
+    codex_event_line_count: input.codexEventLineCount,
+    codex_non_event_line_count: input.codexNonEventLineCount,
     codex_command: input.config.codexCommand || "codex",
     codex_model: input.config.codexModel,
     codex_sandbox: input.config.codexSandbox || "workspace-write",
     codex_full_auto: input.config.codexFullAuto !== false,
-    timeout_ms: Number(input.config.codexTimeoutMs || 6e5)
+    timeout_ms: input.timeoutMs ?? DEFAULT_CODEX_TIMEOUT_MS
   });
 }
 function createCodexExecJsonRunner(config = {}) {
   return (request) => {
     const startedAt = (/* @__PURE__ */ new Date()).toISOString();
     const startedMs = Date.now();
+    const timeoutMs = resolveCodexTimeoutMs(config, request);
     if (!request.workdir || !(0, import_node_fs.existsSync)(request.workdir)) {
       return {
         ok: false,
-        metrics: runnerMetrics({ request, config, startedAt, startedMs, errorCode: "workdir_missing" }),
+        metrics: runnerMetrics({ request, config, startedAt, startedMs, timeoutMs, errorCode: "workdir_missing" }),
         blocker: {
           code: "codex_workdir_missing",
           message: `Codex workdir does not exist for ${request.purpose}.`,
@@ -463,7 +509,7 @@ function createCodexExecJsonRunner(config = {}) {
       const proc = (0, import_node_child_process.spawnSync)(config.codexCommand || "codex", args, {
         input: request.prompt,
         encoding: "utf-8",
-        timeout: Number(config.codexTimeoutMs || 6e5),
+        timeout: timeoutMs,
         maxBuffer: 10 * 1024 * 1024,
         env
       });
@@ -482,6 +528,7 @@ function createCodexExecJsonRunner(config = {}) {
             stderr: proc.stderr || "",
             status: proc.status,
             timedOut,
+            timeoutMs,
             errorCode: proc.error.code || "spawn_error"
           }),
           blocker: {
@@ -504,6 +551,7 @@ function createCodexExecJsonRunner(config = {}) {
             stdout: proc.stdout || "",
             stderr: proc.stderr || "",
             status: proc.status,
+            timeoutMs,
             errorCode: "nonzero_exit"
           }),
           blocker: {
@@ -516,12 +564,15 @@ function createCodexExecJsonRunner(config = {}) {
       const finalText = (0, import_node_fs.existsSync)(lastMessagePath) ? (0, import_node_fs.readFileSync)(lastMessagePath, "utf-8") : String(proc.stdout || "");
       const stdoutText = String(proc.stdout || "");
       const stderrText = String(proc.stderr || "");
-      const { parsed, source: parsedJsonSource } = parseJsonFromRunnerOutputs([
+      const runnerOutputs = [
         { source: (0, import_node_fs.existsSync)(lastMessagePath) ? "last_message" : "stdout", text: finalText },
         { source: "stdout", text: stdoutText },
         { source: "stderr", text: stderrText }
-      ], request.schema);
+      ];
+      const { parsed, source: parsedJsonSource } = parseJsonFromRunnerOutputs(runnerOutputs, request.schema);
       if (!parsed) {
+        const outputAnalysis = analyzeCodexRunnerOutput(runnerOutputs);
+        const errorCode = outputAnalysis.onlyLifecycleEvents ? "no_final_response" : "invalid_json";
         return {
           ok: false,
           stdout: stdoutText,
@@ -535,12 +586,24 @@ function createCodexExecJsonRunner(config = {}) {
             stderr: stderrText,
             finalText,
             status: proc.status,
-            errorCode: "invalid_json"
+            timeoutMs,
+            errorCode,
+            codexEventTypes: outputAnalysis.eventTypes,
+            codexEventLineCount: outputAnalysis.eventLineCount,
+            codexNonEventLineCount: outputAnalysis.nonEventLineCount
           }),
           blocker: {
-            code: "codex_invalid_json",
-            message: `Codex completed ${request.purpose}, but did not return valid JSON.`,
-            details: { finalText, stdout: stdoutText, stderr: stderrText }
+            code: outputAnalysis.onlyLifecycleEvents ? "codex_no_final_response" : "codex_invalid_json",
+            message: outputAnalysis.onlyLifecycleEvents ? `Codex emitted lifecycle events during ${request.purpose}, but did not produce a final JSON response.` : `Codex completed ${request.purpose}, but did not return valid JSON.`,
+            details: {
+              finalText,
+              stdout: stdoutText,
+              stderr: stderrText,
+              event_types: outputAnalysis.eventTypes,
+              event_line_count: outputAnalysis.eventLineCount,
+              non_event_line_count: outputAnalysis.nonEventLineCount,
+              non_event_samples: outputAnalysis.nonEventSamples
+            }
           }
         };
       }
@@ -558,7 +621,8 @@ function createCodexExecJsonRunner(config = {}) {
           stderr: stderrText,
           finalText,
           parsedJsonSource,
-          status: proc.status
+          status: proc.status,
+          timeoutMs
         })
       };
     } finally {
@@ -667,6 +731,7 @@ function createCodexExecAgentAdapter(config = {}, runner = createCodexExecJsonRu
           "Write a proof_plan and capture_script that will verify the exact user-facing change.",
           "Use recon_assessment.baseline_understanding as the source of truth. Do not author a proof plan unless it names the observed before state and the requested delta from that state.",
           "Use the recon-approved route and baseline context; make the plan name the concrete target, expected before state, expected after state, and stop condition.",
+          "Do not leave this authoring stage pending for external investigation. Keep any repo inspection brief, do not modify files, and return the JSON proof packet from the available state.",
           "Choose the evidence modality from verification_mode and success_criteria: screenshots for visual/UI proof, interactions plus screenshots for interaction proof, structured metrics/logs/JSON/audio analysis for non-visual proof.",
           "For playable/gameplay proof, treat screenshots as supporting artifacts only: start the game, send keyboard or pointer input, measure state before/after, measure non-HUD canvas/playfield pixel deltas across time, and return playability evidence with version riddle-proof.playability.v1.",
           "For interaction proof, return a structured evidence object with start route/state, terminal route/state, action, assertions, and matched UI text. Catch waitForURL or selector timeouts and record them as failed assertions instead of throwing before evidence is emitted.",

package/dist/local-agent.js CHANGED

@@ -3,7 +3,7 @@ import {
   createCodexExecAgentAdapter,
   createCodexExecJsonRunner,
   runCodexExecAgentDoctor
-} from "./chunk-PYCQNK66.js";
+} from "./chunk-EEIYUZXE.js";
 import "./chunk-VY4Y5U57.js";
 import "./chunk-MLKGABMK.js";
 export {

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@riddledc/riddle-proof",
-  "version": "0.8.7",
+  "version": "0.8.9",
   "description": "Reusable Riddle Proof contracts and helpers for evidence-backed agent changes.",
   "license": "MIT",
   "author": "RiddleDC",

package/runtime/lib/verify.py CHANGED

@@ -2158,6 +2158,170 @@ def interaction_assertions_pass(value):
     return False
+INTERACTION_ASSERTION_CONTAINER_KEYS = ('assertions', 'checks', 'predicates', 'expectations')
+INTERACTION_FAILURE_FLAG_KEYS = (
+    'passed',
+    'ok',
+    'valid',
+    'success',
+    'proofReady',
+    'proof_ready',
+    'interactionPassed',
+    'interaction_passed',
+    'routeMatches',
+    'route_matches',
+)
+INTERACTION_FAILURE_STATUS_VALUES = {'fail', 'failed', 'failure', 'error', 'errored', 'timeout', 'timed_out'}
+INTERACTION_ASSERTION_NAME_KEYS = ('name', 'id', 'key', 'label', 'assertion', 'check', 'field')
+INTERACTION_ROUTE_CONTEXT_KEYS = (
+    'expected',
+    'observed',
+    'actual',
+    'start',
+    'before',
+    'after',
+    'terminal',
+    'final',
+    'expected_after',
+    'expectedAfter',
+    'expected_terminal',
+    'expectedTerminal',
+    'expected_final',
+    'expectedFinal',
+)
+def failure_label(prefix, key):
+    key = str(key or '').strip()
+    prefix = str(prefix or '').strip()
+    if prefix and key:
+        return prefix + '.' + key
+    return key or prefix or 'failed'
+def assertion_item_label(item, fallback):
+    if isinstance(item, dict):
+        for key in INTERACTION_ASSERTION_NAME_KEYS:
+            value = str(item.get(key) or '').strip()
+            if value:
+                return value
+    return fallback
+def collect_interaction_failed_assertions(value, prefix='', depth=0):
+    if depth > 6:
+        return []
+    failures = []
+    if isinstance(value, dict):
+        for key in INTERACTION_FAILURE_FLAG_KEYS:
+            if value.get(key) is False:
+                failures.append(failure_label(prefix, key))
+        status = str(value.get('status') or value.get('result') or '').strip().lower()
+        if status in INTERACTION_FAILURE_STATUS_VALUES:
+            failures.append(failure_label(prefix, assertion_item_label(value, 'status')))
+        for key in INTERACTION_ASSERTION_CONTAINER_KEYS:
+            checks = value.get(key)
+            container_prefix = failure_label(prefix, key)
+            if isinstance(checks, dict):
+                for check_key, check_value in checks.items():
+                    if check_value is False:
+                        failures.append(failure_label(container_prefix, check_key))
+                    elif isinstance(check_value, dict):
+                        nested = collect_interaction_failed_assertions(
+                            check_value,
+                            failure_label(container_prefix, check_key),
+                            depth + 1,
+                        )
+                        failures.extend(nested)
+                    elif isinstance(check_value, list):
+                        failures.extend(collect_interaction_failed_assertions(
+                            check_value,
+                            failure_label(container_prefix, check_key),
+                            depth + 1,
+                        ))
+            elif isinstance(checks, list):
+                for index, item in enumerate(checks):
+                    if item is False:
+                        failures.append(failure_label(container_prefix, str(index)))
+                    elif isinstance(item, dict):
+                        item_label = assertion_item_label(item, str(index))
+                        failures.extend(collect_interaction_failed_assertions(
+                            item,
+                            failure_label(container_prefix, item_label),
+                            depth + 1,
+                        ))
+        for key in EVIDENCE_CONTAINER_KEYS:
+            nested = value.get(key)
+            if isinstance(nested, (dict, list)):
+                failures.extend(collect_interaction_failed_assertions(nested, failure_label(prefix, key), depth + 1))
+    elif isinstance(value, list):
+        for index, item in enumerate(value):
+            if item is False:
+                failures.append(failure_label(prefix, str(index)))
+            elif isinstance(item, (dict, list)):
+                failures.extend(collect_interaction_failed_assertions(item, prefix, depth + 1))
+    deduped = []
+    seen = set()
+    for failure in failures:
+        failure = str(failure or '').strip()
+        if not failure or failure in seen:
+            continue
+        seen.add(failure)
+        deduped.append(failure)
+    return deduped
+def interaction_route_context_present(value, depth=0):
+    if depth > 6:
+        return False
+    if isinstance(value, dict):
+        if terminal_path_from_record(value):
+            return True
+        for key in INTERACTION_ROUTE_CONTEXT_KEYS:
+            nested = value.get(key)
+            if isinstance(nested, dict):
+                if record_path_candidate(nested, allow_location_keys=True):
+                    return True
+                query = str(nested.get('query') or nested.get('search') or '').strip()
+                hash_value = str(nested.get('hash') or nested.get('fragment') or '').strip()
+                if query or hash_value:
+                    return True
+                if interaction_route_context_present(nested, depth + 1):
+                    return True
+            elif isinstance(nested, str) and path_candidate(nested):
+                return True
+        for key in EVIDENCE_CONTAINER_KEYS:
+            nested = value.get(key)
+            if isinstance(nested, (dict, list)) and interaction_route_context_present(nested, depth + 1):
+                return True
+    elif isinstance(value, list):
+        return any(interaction_route_context_present(item, depth + 1) for item in value)
+    return False
+def failed_interaction_evidence_summary(proof_evidence):
+    failures = []
+    for record in proof_evidence_records(proof_evidence):
+        failures.extend(collect_interaction_failed_assertions(record))
+    deduped = []
+    seen = set()
+    for failure in failures:
+        if failure not in seen:
+            seen.add(failure)
+            deduped.append(failure)
+    if not deduped or not interaction_route_context_present(proof_evidence):
+        return ''
+    summary = 'Structured interaction proof evidence captured failed assertion(s): ' + ', '.join(deduped[:8]) + '.'
+    capture_errors = []
+    for record in proof_evidence_records(proof_evidence):
+        error = str(record.get('capture_error') or record.get('error') or '').strip()
+        if error:
+            capture_errors.append(error)
+    if capture_errors:
+        summary += ' Capture script error: ' + capture_errors[0][:300]
+    return summary
 def interaction_terminal_path_from_evidence(proof_evidence):
     for record in proof_evidence_records(proof_evidence):
         candidate = terminal_path_from_record(record)
@@ -2903,6 +3067,9 @@ def build_supervisor_assessment_request(state, payload, after_observation, requi
         evidence_basis.append('structured-artifacts')
     if supporting.get('playability_ready'):
         evidence_basis.append('playability')
+    interaction_failure_summary = str(state.get('structured_interaction_failure_summary') or '').strip()
+    if interaction_failure_summary:
+        evidence_basis.append('structured-interaction-failure')
     visual_delta = ((evidence_bundle or {}).get('after') or {}).get('visual_delta') or {}
     if visual_delta.get('status') == 'measured':
         evidence_basis.append('visual-delta')
@@ -2936,6 +3103,8 @@ def build_supervisor_assessment_request(state, payload, after_observation, requi
         evidence_bundle['artifact_usage'] = artifact_usage
     visual_delta_blocker = '' if audit_no_diff_mode(state) else visual_delta_blocker_for_mode(verification_mode, visual_delta)
     hard_blockers = [visual_delta_blocker] if visual_delta_blocker else []
+    if interaction_failure_summary:
+        hard_blockers.append(interaction_failure_summary)
     if verification_mode in PLAYABILITY_MODES and not supporting.get('playability_ready'):
         assessment = supporting.get('playability_assessment') or {}
         concerns = assessment.get('concerns') if isinstance(assessment, dict) else []
@@ -2961,6 +3130,10 @@ def build_supervisor_assessment_request(state, payload, after_observation, requi
         instructions.append(
             'For visual/UI polish, capture success is not proof. If visual_delta.status is unmeasured, missing, not_applicable, or measured with passed=false, choose needs_implementation or needs_richer_proof instead of ready_to_ship.'
         )
+    if interaction_failure_summary:
+        instructions.append(
+            'The structured interaction evidence contains failed assertions. Treat those failed assertions as a hard blocker for ready_to_ship; do not send this back to author unless the capture script itself is missing the needed evidence.'
+        )
     instructions.extend([
         'For playable/gameplay proof, screenshots are supporting evidence only. Do not mark ready_to_ship unless playability_assessment.passed is true and the proof shows accepted input, state/time progression, and playfield/canvas pixel motion.',
         'For data/audio/log/metrics/custom modes, judge the structured evidence bundle and proof_evidence_sample directly; screenshots are optional supporting context.',
@@ -2983,6 +3156,7 @@ def build_supervisor_assessment_request(state, payload, after_observation, requi
         'viewport_matrix': viewport_matrix,
         'evidence_bundle': evidence_bundle or {},
         'evidence_basis': evidence_basis,
+        'structured_interaction_failure_summary': interaction_failure_summary,
         'artifact_contract': artifact_contract,
         'artifact_production': artifact_production,
         'artifact_usage': artifact_usage,
@@ -3384,6 +3558,14 @@ if proof_evidence_required_for_mode(s.get('verification_mode')):
     if proof_evidence_blocker:
         summary_lines.append('Structured proof evidence gate: ' + proof_evidence_blocker)
+structured_interaction_failure_summary = ''
+proof_evidence = evidence_bundle.get('proof_evidence')
+if verification_mode in INTERACTION_MODES and proof_evidence is not None:
+    structured_interaction_failure_summary = failed_interaction_evidence_summary(proof_evidence)
+    if structured_interaction_failure_summary:
+        summary_lines.append('Structured interaction evidence gate: ' + structured_interaction_failure_summary)
+s['structured_interaction_failure_summary'] = structured_interaction_failure_summary
 visual_delta_recovery = build_visual_delta_recovery_decision(
     s.get('verification_mode'),
     visual_delta,
@@ -3392,14 +3574,20 @@ visual_delta_recovery = build_visual_delta_recovery_decision(
 if visual_delta_recovery:
     summary_lines.append('Visual delta recovery: ' + visual_delta_recovery['summary'])
+has_judgable_failed_interaction_evidence = (
+    bool(structured_interaction_failure_summary)
+    and required_baseline_present
+    and not proof_evidence_blocker
+    and not visual_delta_recovery
+)
 has_good_evidence = (
     required_baseline_present
-    and after_observation.get('valid')
+    and (after_observation.get('valid') or has_judgable_failed_interaction_evidence)
     and not proof_evidence_blocker
     and not visual_delta_recovery
 )
-if has_good_evidence:
+if has_good_evidence and after_observation.get('valid'):
     s['capture_hint_saved'] = record_successful_capture_hint(
         s,
         server_path=s.get('expected_start_path') or expected_path or s.get('server_path') or '/',
@@ -3410,9 +3598,12 @@ if has_good_evidence:
     )
 if has_good_evidence:
+    if has_judgable_failed_interaction_evidence and isinstance(evidence_bundle.get('proof_session'), dict):
+        evidence_bundle['proof_session']['status'] = 'evidence_captured'
+        s['proof_session'] = evidence_bundle.get('proof_session') or {}
     supervisor_request = build_supervisor_assessment_request(s, after_payload, after_observation, required_baseline_present, expected_path, evidence_bundle)
     s['verify_status'] = 'evidence_captured'
-    s['merge_recommendation'] = 'pending-supervisor-judgment'
+    s['merge_recommendation'] = 'do-not-merge' if has_judgable_failed_interaction_evidence else 'pending-supervisor-judgment'
     s['proof_assessment'] = {}
     s['proof_assessment_source'] = None
     s['proof_assessment_request'] = supervisor_request
@@ -3422,11 +3613,16 @@ if has_good_evidence:
         fields_agent_may_update.append('implementation_notes')
     s['verify_decision_request'] = {
         'status': s['verify_status'],
-        'summary': 'Verify captured usable evidence and is waiting for supervising-agent proof assessment.',
+        'summary': (
+            'Verify captured structured interaction evidence with failed assertions and is waiting for supervising-agent proof assessment.'
+            if has_judgable_failed_interaction_evidence
+            else 'Verify captured usable evidence and is waiting for supervising-agent proof assessment.'
+        ),
         'expected_path': expected_path,
         'expected_start_path': s.get('expected_start_path') or expected_path,
         'route_expectation': s.get('route_expectation') or {},
         'latest_observation': after_observation,
+        'structured_interaction_failure_summary': structured_interaction_failure_summary,
         'next_stage_options': next_stage_options,
         'recommended_stage': None,
         'continue_with_stage': None,
@@ -3438,7 +3634,10 @@ if has_good_evidence:
             'Do not escalate to the human unless the supervising agent concludes the workflow is genuinely stuck or not converging.',
         ],
     }
-    summary_lines.append('Proof assessment: awaiting supervising agent judgment')
+    if has_judgable_failed_interaction_evidence:
+        summary_lines.append('Proof assessment: awaiting supervising agent judgment on failed interaction evidence')
+    else:
+        summary_lines.append('Proof assessment: awaiting supervising agent judgment')
     summary_lines.append('Proof next stage: supervising agent decides after reviewing the evidence packet')
 else:
     capture_retry = visual_delta_recovery or build_capture_retry_decision(after_observation, required_baseline_present, proof_evidence_blocker, s.get('route_expectation') or {})