@riddledc/riddle-proof 0.8.7 → 0.8.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -6531,6 +6531,8 @@ var import_node_child_process3 = require("child_process");
6531
6531
  var import_node_fs4 = require("fs");
6532
6532
  var import_node_os = __toESM(require("os"), 1);
6533
6533
  var import_node_path4 = __toESM(require("path"), 1);
// Fallback timeout for a Codex exec run when no usable codexTimeoutMs is configured: 10 minutes.
var DEFAULT_CODEX_TIMEOUT_MS = 10 * 60 * 1e3;
// Shorter fallback used by resolveCodexTimeoutMs for "proof packet authoring" requests: 3 minutes.
var DEFAULT_PROOF_PACKET_AUTHOR_TIMEOUT_MS = 3 * 60 * 1e3;
6534
6536
  var REFINED_INPUTS_SCHEMA = {
6535
6537
  type: "object",
6536
6538
  additionalProperties: false,
@@ -6874,6 +6876,46 @@ function parseJsonFromRunnerOutputs(outputs, schema) {
6874
6876
  if (!combined.trim() || seen.has(combined)) return { parsed: null, source: "" };
6875
6877
  return { parsed: parseJsonObject(combined, schema), source: "combined_output" };
6876
6878
  }
/**
 * Resolve the timeout, in milliseconds, for a single Codex exec run.
 *
 * An explicit `config.codexTimeoutMs` wins when it is a positive, finite number.
 * Numeric strings (for example values forwarded from environment variables or
 * CLI flags) are coerced too, which keeps parity with the earlier
 * `Number(config.codexTimeoutMs || 6e5)` behaviour. Any other value falls back
 * to a default chosen from `request.purpose`.
 *
 * @param {{ codexTimeoutMs?: number | string }} [config] Runner configuration.
 * @param {{ purpose?: string }} [request] Runner request; only `purpose` is read.
 * @returns {number} Timeout in milliseconds.
 */
function resolveCodexTimeoutMs(config, request) {
  const raw = config?.codexTimeoutMs;
  // Blank strings would coerce to 0 and be rejected anyway; skip them explicitly.
  const isNumericString = typeof raw === "string" && raw.trim() !== "";
  if (typeof raw === "number" || isNumericString) {
    const configured = Number(raw);
    if (Number.isFinite(configured) && configured > 0) return configured;
  }
  return request?.purpose === "proof packet authoring"
    ? DEFAULT_PROOF_PACKET_AUTHOR_TIMEOUT_MS
    : DEFAULT_CODEX_TIMEOUT_MS;
}
/**
 * Report whether a parsed JSON value looks like a Codex lifecycle event.
 *
 * A value qualifies when it is a non-array object whose string `type` starts
 * with one of the recognised event-family prefixes.
 *
 * @param {unknown} value Parsed JSON value from one output line.
 * @returns {boolean} True when the value is shaped like a lifecycle event.
 */
function isCodexLifecycleEvent(value) {
  const isPlainObject = Boolean(value) && typeof value === "object" && !Array.isArray(value);
  if (!isPlainObject) return false;
  const eventType = value.type;
  if (typeof eventType !== "string") return false;
  const lifecyclePrefixes = ["thread.", "turn.", "exec.", "agent.", "token.", "reasoning.", "error."];
  return lifecyclePrefixes.some((prefix) => eventType.startsWith(prefix));
}
/**
 * Classify every non-blank line of the runner outputs as either a Codex
 * lifecycle event (a JSON object accepted by `isCodexLifecycleEvent`) or
 * anything else, so callers can tell "only events, no final answer" apart
 * from "some other, unparseable output".
 *
 * Outputs with identical text are analysed once. The runner passes stdout
 * twice when no last-message file exists, which would otherwise double every
 * count. Entries whose `text` is not a string are treated as empty.
 *
 * @param {Array<{ source?: string, text?: string }>} outputs Captured runner outputs.
 * @returns {{
 *   eventLineCount: number,
 *   eventTypes: string[],
 *   nonEventLineCount: number,
 *   nonEventSamples: string[],
 *   onlyLifecycleEvents: boolean
 * }} Line counts, distinct event types in first-seen order, up to three
 *   non-event samples (each truncated to 240 characters), and whether the
 *   output consisted solely of lifecycle events.
 */
function analyzeCodexRunnerOutput(outputs) {
  const eventTypes = /* @__PURE__ */ new Set();
  const analyzedTexts = /* @__PURE__ */ new Set();
  const nonEventSamples = [];
  let eventLineCount = 0;
  let nonEventLineCount = 0;
  for (const output of outputs ?? []) {
    const text = typeof output?.text === "string" ? output.text : "";
    if (!text.trim() || analyzedTexts.has(text)) continue;
    analyzedTexts.add(text);
    for (const rawLine of text.split(/\r?\n/)) {
      const line = rawLine.trim();
      if (!line) continue;
      let parsed;
      try {
        parsed = JSON.parse(line);
      } catch {
        // Not JSON: deliberately counted as a non-event line below.
        parsed = void 0;
      }
      // Kept outside the try so a failure in the predicate is not swallowed.
      if (isCodexLifecycleEvent(parsed)) {
        eventLineCount += 1;
        eventTypes.add(parsed.type);
        continue;
      }
      nonEventLineCount += 1;
      if (nonEventSamples.length < 3) nonEventSamples.push(line.slice(0, 240));
    }
  }
  return {
    eventLineCount,
    eventTypes: Array.from(eventTypes),
    nonEventLineCount,
    nonEventSamples,
    onlyLifecycleEvents: eventLineCount > 0 && nonEventLineCount === 0
  };
}
6877
6919
  function isHarnessVerificationOnlyBlocker(blocker) {
6878
6920
  const text = blocker.toLowerCase();
6879
6921
  return (text.includes("erofs") || text.includes("read-only file system")) && text.includes("node_modules") && (text.includes(".vite-temp") || text.includes("vite.config"));
@@ -6897,21 +6939,25 @@ function runnerMetrics(input) {
6897
6939
  exit_status: input.status ?? null,
6898
6940
  timed_out: input.timedOut || false,
6899
6941
  error_code: input.errorCode,
6942
+ codex_event_types: input.codexEventTypes && input.codexEventTypes.length ? input.codexEventTypes : void 0,
6943
+ codex_event_line_count: input.codexEventLineCount,
6944
+ codex_non_event_line_count: input.codexNonEventLineCount,
6900
6945
  codex_command: input.config.codexCommand || "codex",
6901
6946
  codex_model: input.config.codexModel,
6902
6947
  codex_sandbox: input.config.codexSandbox || "workspace-write",
6903
6948
  codex_full_auto: input.config.codexFullAuto !== false,
6904
- timeout_ms: Number(input.config.codexTimeoutMs || 6e5)
6949
+ timeout_ms: input.timeoutMs ?? DEFAULT_CODEX_TIMEOUT_MS
6905
6950
  });
6906
6951
  }
6907
6952
  function createCodexExecJsonRunner(config = {}) {
6908
6953
  return (request) => {
6909
6954
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
6910
6955
  const startedMs = Date.now();
6956
+ const timeoutMs = resolveCodexTimeoutMs(config, request);
6911
6957
  if (!request.workdir || !(0, import_node_fs4.existsSync)(request.workdir)) {
6912
6958
  return {
6913
6959
  ok: false,
6914
- metrics: runnerMetrics({ request, config, startedAt, startedMs, errorCode: "workdir_missing" }),
6960
+ metrics: runnerMetrics({ request, config, startedAt, startedMs, timeoutMs, errorCode: "workdir_missing" }),
6915
6961
  blocker: {
6916
6962
  code: "codex_workdir_missing",
6917
6963
  message: `Codex workdir does not exist for ${request.purpose}.`,
@@ -6946,7 +6992,7 @@ function createCodexExecJsonRunner(config = {}) {
6946
6992
  const proc = (0, import_node_child_process3.spawnSync)(config.codexCommand || "codex", args, {
6947
6993
  input: request.prompt,
6948
6994
  encoding: "utf-8",
6949
- timeout: Number(config.codexTimeoutMs || 6e5),
6995
+ timeout: timeoutMs,
6950
6996
  maxBuffer: 10 * 1024 * 1024,
6951
6997
  env
6952
6998
  });
@@ -6965,6 +7011,7 @@ function createCodexExecJsonRunner(config = {}) {
6965
7011
  stderr: proc.stderr || "",
6966
7012
  status: proc.status,
6967
7013
  timedOut,
7014
+ timeoutMs,
6968
7015
  errorCode: proc.error.code || "spawn_error"
6969
7016
  }),
6970
7017
  blocker: {
@@ -6987,6 +7034,7 @@ function createCodexExecJsonRunner(config = {}) {
6987
7034
  stdout: proc.stdout || "",
6988
7035
  stderr: proc.stderr || "",
6989
7036
  status: proc.status,
7037
+ timeoutMs,
6990
7038
  errorCode: "nonzero_exit"
6991
7039
  }),
6992
7040
  blocker: {
@@ -6999,12 +7047,15 @@ function createCodexExecJsonRunner(config = {}) {
6999
7047
  const finalText = (0, import_node_fs4.existsSync)(lastMessagePath) ? (0, import_node_fs4.readFileSync)(lastMessagePath, "utf-8") : String(proc.stdout || "");
7000
7048
  const stdoutText = String(proc.stdout || "");
7001
7049
  const stderrText = String(proc.stderr || "");
7002
- const { parsed, source: parsedJsonSource } = parseJsonFromRunnerOutputs([
7050
+ const runnerOutputs = [
7003
7051
  { source: (0, import_node_fs4.existsSync)(lastMessagePath) ? "last_message" : "stdout", text: finalText },
7004
7052
  { source: "stdout", text: stdoutText },
7005
7053
  { source: "stderr", text: stderrText }
7006
- ], request.schema);
7054
+ ];
7055
+ const { parsed, source: parsedJsonSource } = parseJsonFromRunnerOutputs(runnerOutputs, request.schema);
7007
7056
  if (!parsed) {
7057
+ const outputAnalysis = analyzeCodexRunnerOutput(runnerOutputs);
7058
+ const errorCode = outputAnalysis.onlyLifecycleEvents ? "no_final_response" : "invalid_json";
7008
7059
  return {
7009
7060
  ok: false,
7010
7061
  stdout: stdoutText,
@@ -7018,12 +7069,24 @@ function createCodexExecJsonRunner(config = {}) {
7018
7069
  stderr: stderrText,
7019
7070
  finalText,
7020
7071
  status: proc.status,
7021
- errorCode: "invalid_json"
7072
+ timeoutMs,
7073
+ errorCode,
7074
+ codexEventTypes: outputAnalysis.eventTypes,
7075
+ codexEventLineCount: outputAnalysis.eventLineCount,
7076
+ codexNonEventLineCount: outputAnalysis.nonEventLineCount
7022
7077
  }),
7023
7078
  blocker: {
7024
- code: "codex_invalid_json",
7025
- message: `Codex completed ${request.purpose}, but did not return valid JSON.`,
7026
- details: { finalText, stdout: stdoutText, stderr: stderrText }
7079
+ code: outputAnalysis.onlyLifecycleEvents ? "codex_no_final_response" : "codex_invalid_json",
7080
+ message: outputAnalysis.onlyLifecycleEvents ? `Codex emitted lifecycle events during ${request.purpose}, but did not produce a final JSON response.` : `Codex completed ${request.purpose}, but did not return valid JSON.`,
7081
+ details: {
7082
+ finalText,
7083
+ stdout: stdoutText,
7084
+ stderr: stderrText,
7085
+ event_types: outputAnalysis.eventTypes,
7086
+ event_line_count: outputAnalysis.eventLineCount,
7087
+ non_event_line_count: outputAnalysis.nonEventLineCount,
7088
+ non_event_samples: outputAnalysis.nonEventSamples
7089
+ }
7027
7090
  }
7028
7091
  };
7029
7092
  }
@@ -7041,7 +7104,8 @@ function createCodexExecJsonRunner(config = {}) {
7041
7104
  stderr: stderrText,
7042
7105
  finalText,
7043
7106
  parsedJsonSource,
7044
- status: proc.status
7107
+ status: proc.status,
7108
+ timeoutMs
7045
7109
  })
7046
7110
  };
7047
7111
  } finally {
@@ -7150,6 +7214,7 @@ function createCodexExecAgentAdapter(config = {}, runner = createCodexExecJsonRu
7150
7214
  "Write a proof_plan and capture_script that will verify the exact user-facing change.",
7151
7215
  "Use recon_assessment.baseline_understanding as the source of truth. Do not author a proof plan unless it names the observed before state and the requested delta from that state.",
7152
7216
  "Use the recon-approved route and baseline context; make the plan name the concrete target, expected before state, expected after state, and stop condition.",
7217
+ "Do not leave this authoring stage pending for external investigation. Keep any repo inspection brief, do not modify files, and return the JSON proof packet from the available state.",
7153
7218
  "Choose the evidence modality from verification_mode and success_criteria: screenshots for visual/UI proof, interactions plus screenshots for interaction proof, structured metrics/logs/JSON/audio analysis for non-visual proof.",
7154
7219
  "For playable/gameplay proof, treat screenshots as supporting artifacts only: start the game, send keyboard or pointer input, measure state before/after, measure non-HUD canvas/playfield pixel deltas across time, and return playability evidence with version riddle-proof.playability.v1.",
7155
7220
  "For interaction proof, return a structured evidence object with start route/state, terminal route/state, action, assertions, and matched UI text. Catch waitForURL or selector timeouts and record them as failed assertions instead of throwing before evidence is emitted.",
package/dist/index.js CHANGED
@@ -134,7 +134,7 @@ import {
134
134
  createCodexExecAgentAdapter,
135
135
  createCodexExecJsonRunner,
136
136
  runCodexExecAgentDoctor
137
- } from "./chunk-PYCQNK66.js";
137
+ } from "./chunk-EEIYUZXE.js";
138
138
  import {
139
139
  applyTerminalMetadata,
140
140
  compactRecord,
@@ -48,6 +48,8 @@ function compactRecord(input) {
48
48
  }
49
49
 
50
50
  // src/codex-exec-agent.ts
// Fallback timeout for a Codex exec run when no usable codexTimeoutMs is configured: 10 minutes.
var DEFAULT_CODEX_TIMEOUT_MS = 10 * 60 * 1e3;
// Shorter fallback used by resolveCodexTimeoutMs for "proof packet authoring" requests: 3 minutes.
var DEFAULT_PROOF_PACKET_AUTHOR_TIMEOUT_MS = 3 * 60 * 1e3;
51
53
  var REFINED_INPUTS_SCHEMA = {
52
54
  type: "object",
53
55
  additionalProperties: false,
@@ -391,6 +393,46 @@ function parseJsonFromRunnerOutputs(outputs, schema) {
391
393
  if (!combined.trim() || seen.has(combined)) return { parsed: null, source: "" };
392
394
  return { parsed: parseJsonObject(combined, schema), source: "combined_output" };
393
395
  }
/**
 * Resolve the timeout, in milliseconds, for a single Codex exec run.
 *
 * An explicit `config.codexTimeoutMs` wins when it is a positive, finite number.
 * Numeric strings (for example values forwarded from environment variables or
 * CLI flags) are coerced too, which keeps parity with the earlier
 * `Number(config.codexTimeoutMs || 6e5)` behaviour. Any other value falls back
 * to a default chosen from `request.purpose`.
 *
 * @param {{ codexTimeoutMs?: number | string }} [config] Runner configuration.
 * @param {{ purpose?: string }} [request] Runner request; only `purpose` is read.
 * @returns {number} Timeout in milliseconds.
 */
function resolveCodexTimeoutMs(config, request) {
  const raw = config?.codexTimeoutMs;
  // Blank strings would coerce to 0 and be rejected anyway; skip them explicitly.
  const isNumericString = typeof raw === "string" && raw.trim() !== "";
  if (typeof raw === "number" || isNumericString) {
    const configured = Number(raw);
    if (Number.isFinite(configured) && configured > 0) return configured;
  }
  return request?.purpose === "proof packet authoring"
    ? DEFAULT_PROOF_PACKET_AUTHOR_TIMEOUT_MS
    : DEFAULT_CODEX_TIMEOUT_MS;
}
/**
 * Report whether a parsed JSON value looks like a Codex lifecycle event.
 *
 * A value qualifies when it is a non-array object whose string `type` starts
 * with one of the recognised event-family prefixes.
 *
 * @param {unknown} value Parsed JSON value from one output line.
 * @returns {boolean} True when the value is shaped like a lifecycle event.
 */
function isCodexLifecycleEvent(value) {
  const isPlainObject = Boolean(value) && typeof value === "object" && !Array.isArray(value);
  if (!isPlainObject) return false;
  const eventType = value.type;
  if (typeof eventType !== "string") return false;
  const lifecyclePrefixes = ["thread.", "turn.", "exec.", "agent.", "token.", "reasoning.", "error."];
  return lifecyclePrefixes.some((prefix) => eventType.startsWith(prefix));
}
/**
 * Classify every non-blank line of the runner outputs as either a Codex
 * lifecycle event (a JSON object accepted by `isCodexLifecycleEvent`) or
 * anything else, so callers can tell "only events, no final answer" apart
 * from "some other, unparseable output".
 *
 * Outputs with identical text are analysed once. The runner passes stdout
 * twice when no last-message file exists, which would otherwise double every
 * count. Entries whose `text` is not a string are treated as empty.
 *
 * @param {Array<{ source?: string, text?: string }>} outputs Captured runner outputs.
 * @returns {{
 *   eventLineCount: number,
 *   eventTypes: string[],
 *   nonEventLineCount: number,
 *   nonEventSamples: string[],
 *   onlyLifecycleEvents: boolean
 * }} Line counts, distinct event types in first-seen order, up to three
 *   non-event samples (each truncated to 240 characters), and whether the
 *   output consisted solely of lifecycle events.
 */
function analyzeCodexRunnerOutput(outputs) {
  const eventTypes = /* @__PURE__ */ new Set();
  const analyzedTexts = /* @__PURE__ */ new Set();
  const nonEventSamples = [];
  let eventLineCount = 0;
  let nonEventLineCount = 0;
  for (const output of outputs ?? []) {
    const text = typeof output?.text === "string" ? output.text : "";
    if (!text.trim() || analyzedTexts.has(text)) continue;
    analyzedTexts.add(text);
    for (const rawLine of text.split(/\r?\n/)) {
      const line = rawLine.trim();
      if (!line) continue;
      let parsed;
      try {
        parsed = JSON.parse(line);
      } catch {
        // Not JSON: deliberately counted as a non-event line below.
        parsed = void 0;
      }
      // Kept outside the try so a failure in the predicate is not swallowed.
      if (isCodexLifecycleEvent(parsed)) {
        eventLineCount += 1;
        eventTypes.add(parsed.type);
        continue;
      }
      nonEventLineCount += 1;
      if (nonEventSamples.length < 3) nonEventSamples.push(line.slice(0, 240));
    }
  }
  return {
    eventLineCount,
    eventTypes: Array.from(eventTypes),
    nonEventLineCount,
    nonEventSamples,
    onlyLifecycleEvents: eventLineCount > 0 && nonEventLineCount === 0
  };
}
394
436
  function isHarnessVerificationOnlyBlocker(blocker) {
395
437
  const text = blocker.toLowerCase();
396
438
  return (text.includes("erofs") || text.includes("read-only file system")) && text.includes("node_modules") && (text.includes(".vite-temp") || text.includes("vite.config"));
@@ -414,21 +456,25 @@ function runnerMetrics(input) {
414
456
  exit_status: input.status ?? null,
415
457
  timed_out: input.timedOut || false,
416
458
  error_code: input.errorCode,
459
+ codex_event_types: input.codexEventTypes && input.codexEventTypes.length ? input.codexEventTypes : void 0,
460
+ codex_event_line_count: input.codexEventLineCount,
461
+ codex_non_event_line_count: input.codexNonEventLineCount,
417
462
  codex_command: input.config.codexCommand || "codex",
418
463
  codex_model: input.config.codexModel,
419
464
  codex_sandbox: input.config.codexSandbox || "workspace-write",
420
465
  codex_full_auto: input.config.codexFullAuto !== false,
421
- timeout_ms: Number(input.config.codexTimeoutMs || 6e5)
466
+ timeout_ms: input.timeoutMs ?? DEFAULT_CODEX_TIMEOUT_MS
422
467
  });
423
468
  }
424
469
  function createCodexExecJsonRunner(config = {}) {
425
470
  return (request) => {
426
471
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
427
472
  const startedMs = Date.now();
473
+ const timeoutMs = resolveCodexTimeoutMs(config, request);
428
474
  if (!request.workdir || !(0, import_node_fs.existsSync)(request.workdir)) {
429
475
  return {
430
476
  ok: false,
431
- metrics: runnerMetrics({ request, config, startedAt, startedMs, errorCode: "workdir_missing" }),
477
+ metrics: runnerMetrics({ request, config, startedAt, startedMs, timeoutMs, errorCode: "workdir_missing" }),
432
478
  blocker: {
433
479
  code: "codex_workdir_missing",
434
480
  message: `Codex workdir does not exist for ${request.purpose}.`,
@@ -463,7 +509,7 @@ function createCodexExecJsonRunner(config = {}) {
463
509
  const proc = (0, import_node_child_process.spawnSync)(config.codexCommand || "codex", args, {
464
510
  input: request.prompt,
465
511
  encoding: "utf-8",
466
- timeout: Number(config.codexTimeoutMs || 6e5),
512
+ timeout: timeoutMs,
467
513
  maxBuffer: 10 * 1024 * 1024,
468
514
  env
469
515
  });
@@ -482,6 +528,7 @@ function createCodexExecJsonRunner(config = {}) {
482
528
  stderr: proc.stderr || "",
483
529
  status: proc.status,
484
530
  timedOut,
531
+ timeoutMs,
485
532
  errorCode: proc.error.code || "spawn_error"
486
533
  }),
487
534
  blocker: {
@@ -504,6 +551,7 @@ function createCodexExecJsonRunner(config = {}) {
504
551
  stdout: proc.stdout || "",
505
552
  stderr: proc.stderr || "",
506
553
  status: proc.status,
554
+ timeoutMs,
507
555
  errorCode: "nonzero_exit"
508
556
  }),
509
557
  blocker: {
@@ -516,12 +564,15 @@ function createCodexExecJsonRunner(config = {}) {
516
564
  const finalText = (0, import_node_fs.existsSync)(lastMessagePath) ? (0, import_node_fs.readFileSync)(lastMessagePath, "utf-8") : String(proc.stdout || "");
517
565
  const stdoutText = String(proc.stdout || "");
518
566
  const stderrText = String(proc.stderr || "");
519
- const { parsed, source: parsedJsonSource } = parseJsonFromRunnerOutputs([
567
+ const runnerOutputs = [
520
568
  { source: (0, import_node_fs.existsSync)(lastMessagePath) ? "last_message" : "stdout", text: finalText },
521
569
  { source: "stdout", text: stdoutText },
522
570
  { source: "stderr", text: stderrText }
523
- ], request.schema);
571
+ ];
572
+ const { parsed, source: parsedJsonSource } = parseJsonFromRunnerOutputs(runnerOutputs, request.schema);
524
573
  if (!parsed) {
574
+ const outputAnalysis = analyzeCodexRunnerOutput(runnerOutputs);
575
+ const errorCode = outputAnalysis.onlyLifecycleEvents ? "no_final_response" : "invalid_json";
525
576
  return {
526
577
  ok: false,
527
578
  stdout: stdoutText,
@@ -535,12 +586,24 @@ function createCodexExecJsonRunner(config = {}) {
535
586
  stderr: stderrText,
536
587
  finalText,
537
588
  status: proc.status,
538
- errorCode: "invalid_json"
589
+ timeoutMs,
590
+ errorCode,
591
+ codexEventTypes: outputAnalysis.eventTypes,
592
+ codexEventLineCount: outputAnalysis.eventLineCount,
593
+ codexNonEventLineCount: outputAnalysis.nonEventLineCount
539
594
  }),
540
595
  blocker: {
541
- code: "codex_invalid_json",
542
- message: `Codex completed ${request.purpose}, but did not return valid JSON.`,
543
- details: { finalText, stdout: stdoutText, stderr: stderrText }
596
+ code: outputAnalysis.onlyLifecycleEvents ? "codex_no_final_response" : "codex_invalid_json",
597
+ message: outputAnalysis.onlyLifecycleEvents ? `Codex emitted lifecycle events during ${request.purpose}, but did not produce a final JSON response.` : `Codex completed ${request.purpose}, but did not return valid JSON.`,
598
+ details: {
599
+ finalText,
600
+ stdout: stdoutText,
601
+ stderr: stderrText,
602
+ event_types: outputAnalysis.eventTypes,
603
+ event_line_count: outputAnalysis.eventLineCount,
604
+ non_event_line_count: outputAnalysis.nonEventLineCount,
605
+ non_event_samples: outputAnalysis.nonEventSamples
606
+ }
544
607
  }
545
608
  };
546
609
  }
@@ -558,7 +621,8 @@ function createCodexExecJsonRunner(config = {}) {
558
621
  stderr: stderrText,
559
622
  finalText,
560
623
  parsedJsonSource,
561
- status: proc.status
624
+ status: proc.status,
625
+ timeoutMs
562
626
  })
563
627
  };
564
628
  } finally {
@@ -667,6 +731,7 @@ function createCodexExecAgentAdapter(config = {}, runner = createCodexExecJsonRu
667
731
  "Write a proof_plan and capture_script that will verify the exact user-facing change.",
668
732
  "Use recon_assessment.baseline_understanding as the source of truth. Do not author a proof plan unless it names the observed before state and the requested delta from that state.",
669
733
  "Use the recon-approved route and baseline context; make the plan name the concrete target, expected before state, expected after state, and stop condition.",
734
+ "Do not leave this authoring stage pending for external investigation. Keep any repo inspection brief, do not modify files, and return the JSON proof packet from the available state.",
670
735
  "Choose the evidence modality from verification_mode and success_criteria: screenshots for visual/UI proof, interactions plus screenshots for interaction proof, structured metrics/logs/JSON/audio analysis for non-visual proof.",
671
736
  "For playable/gameplay proof, treat screenshots as supporting artifacts only: start the game, send keyboard or pointer input, measure state before/after, measure non-HUD canvas/playfield pixel deltas across time, and return playability evidence with version riddle-proof.playability.v1.",
672
737
  "For interaction proof, return a structured evidence object with start route/state, terminal route/state, action, assertions, and matched UI text. Catch waitForURL or selector timeouts and record them as failed assertions instead of throwing before evidence is emitted.",
@@ -3,7 +3,7 @@ import {
3
3
  createCodexExecAgentAdapter,
4
4
  createCodexExecJsonRunner,
5
5
  runCodexExecAgentDoctor
6
- } from "./chunk-PYCQNK66.js";
6
+ } from "./chunk-EEIYUZXE.js";
7
7
  import "./chunk-VY4Y5U57.js";
8
8
  import "./chunk-MLKGABMK.js";
9
9
  export {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@riddledc/riddle-proof",
3
- "version": "0.8.7",
3
+ "version": "0.8.9",
4
4
  "description": "Reusable Riddle Proof contracts and helpers for evidence-backed agent changes.",
5
5
  "license": "MIT",
6
6
  "author": "RiddleDC",
@@ -2158,6 +2158,170 @@ def interaction_assertions_pass(value):
2158
2158
  return False
2159
2159
 
2160
2160
 
# Keys under which interaction evidence groups its individual assertion results.
INTERACTION_ASSERTION_CONTAINER_KEYS = ('assertions', 'checks', 'predicates', 'expectations')

# Flag keys that count as a failed assertion when their value is exactly False.
INTERACTION_FAILURE_FLAG_KEYS = (
    'passed', 'ok', 'valid', 'success',
    'proofReady', 'proof_ready',
    'interactionPassed', 'interaction_passed',
    'routeMatches', 'route_matches',
)

# Lower-cased 'status' / 'result' values that are treated as a failure.
INTERACTION_FAILURE_STATUS_VALUES = {
    'fail', 'failed', 'failure',
    'error', 'errored',
    'timeout', 'timed_out',
}

# Keys probed, in this order, for a human-readable assertion label.
INTERACTION_ASSERTION_NAME_KEYS = ('name', 'id', 'key', 'label', 'assertion', 'check', 'field')

# Keys inspected for route information when looking for route context.
INTERACTION_ROUTE_CONTEXT_KEYS = (
    'expected', 'observed', 'actual',
    'start', 'before', 'after', 'terminal', 'final',
    'expected_after', 'expectedAfter',
    'expected_terminal', 'expectedTerminal',
    'expected_final', 'expectedFinal',
)
2192
+
2193
+
def failure_label(prefix, key):
    """Build a dotted label for a failed assertion.

    Both parts are stringified and stripped; falsy inputs become empty.
    Returns 'prefix.key' when both are non-empty, otherwise whichever part
    is non-empty, and the literal 'failed' when neither is.
    """
    head = str(prefix or '').strip()
    tail = str(key or '').strip()
    if head and tail:
        return '.'.join((head, tail))
    if tail:
        return tail
    if head:
        return head
    return 'failed'
2200
+
2201
+
def assertion_item_label(item, fallback):
    """Pick a display label for an assertion record.

    For a dict, returns the first non-empty (after str() and strip()) value
    found under INTERACTION_ASSERTION_NAME_KEYS, in that order. For anything
    else, or when no such value exists, returns `fallback` unchanged.
    """
    if not isinstance(item, dict):
        return fallback
    candidates = (
        str(item.get(name_key) or '').strip()
        for name_key in INTERACTION_ASSERTION_NAME_KEYS
    )
    return next((label for label in candidates if label), fallback)
2209
+
2210
+
def collect_interaction_failed_assertions(value, prefix='', depth=0):
    """Walk interaction evidence and return labels of failed assertions.

    A failure is recorded when:
      * a dict has one of INTERACTION_FAILURE_FLAG_KEYS set to exactly False;
      * a dict's lower-cased 'status' (or 'result') is in
        INTERACTION_FAILURE_STATUS_VALUES;
      * an entry of an assertion container (dict or list) is exactly False;
      * a list element is exactly False.
    Nested dicts/lists inside assertion containers and under
    EVIDENCE_CONTAINER_KEYS are searched recursively. Recursion stops once
    `depth` exceeds 6. Labels are dotted paths built with failure_label() and
    are returned de-duplicated in first-seen order.
    """
    if depth > 6:
        return []

    def descend(node, label):
        # Single place that bumps the recursion depth.
        return collect_interaction_failed_assertions(node, label, depth + 1)

    found = []
    if isinstance(value, dict):
        # 1. Explicit boolean failure flags on this record.
        found.extend(
            failure_label(prefix, flag)
            for flag in INTERACTION_FAILURE_FLAG_KEYS
            if value.get(flag) is False
        )
        # 2. Textual status / result on this record.
        outcome = str(value.get('status') or value.get('result') or '').strip().lower()
        if outcome in INTERACTION_FAILURE_STATUS_VALUES:
            found.append(failure_label(prefix, assertion_item_label(value, 'status')))
        # 3. Assertion containers held by this record.
        for container_key in INTERACTION_ASSERTION_CONTAINER_KEYS:
            container = value.get(container_key)
            container_label = failure_label(prefix, container_key)
            if isinstance(container, dict):
                for entry_name, entry in container.items():
                    if entry is False:
                        found.append(failure_label(container_label, entry_name))
                    elif isinstance(entry, (dict, list)):
                        found.extend(descend(entry, failure_label(container_label, entry_name)))
            elif isinstance(container, list):
                for position, entry in enumerate(container):
                    if entry is False:
                        found.append(failure_label(container_label, str(position)))
                    elif isinstance(entry, dict):
                        entry_name = assertion_item_label(entry, str(position))
                        found.extend(descend(entry, failure_label(container_label, entry_name)))
        # 4. Generic nested evidence containers.
        for evidence_key in EVIDENCE_CONTAINER_KEYS:
            child = value.get(evidence_key)
            if isinstance(child, (dict, list)):
                found.extend(descend(child, failure_label(prefix, evidence_key)))
    elif isinstance(value, list):
        for position, entry in enumerate(value):
            if entry is False:
                found.append(failure_label(prefix, str(position)))
            elif isinstance(entry, (dict, list)):
                found.extend(descend(entry, prefix))

    # Normalise, drop empties, and de-duplicate while keeping first-seen order.
    normalised = (str(label or '').strip() for label in found)
    return list(dict.fromkeys(label for label in normalised if label))
2272
+
2273
+
def interaction_route_context_present(value, depth=0):
    """Return True when the evidence carries any route information.

    A dict qualifies when terminal_path_from_record() finds a path on it, or
    when one of INTERACTION_ROUTE_CONTEXT_KEYS holds either a string accepted
    by path_candidate() or a dict that has a path candidate, a non-empty
    query/search or hash/fragment, or nested route context. Containers under
    EVIDENCE_CONTAINER_KEYS and list elements are searched recursively.
    Recursion stops once `depth` exceeds 6.
    """
    if depth > 6:
        return False
    if isinstance(value, list):
        return any(interaction_route_context_present(entry, depth + 1) for entry in value)
    if not isinstance(value, dict):
        return False

    if terminal_path_from_record(value):
        return True

    for context_key in INTERACTION_ROUTE_CONTEXT_KEYS:
        candidate = value.get(context_key)
        if isinstance(candidate, str):
            if path_candidate(candidate):
                return True
            continue
        if not isinstance(candidate, dict):
            continue
        if record_path_candidate(candidate, allow_location_keys=True):
            return True
        query = str(candidate.get('query') or candidate.get('search') or '').strip()
        fragment = str(candidate.get('hash') or candidate.get('fragment') or '').strip()
        if query or fragment:
            return True
        if interaction_route_context_present(candidate, depth + 1):
            return True

    return any(
        isinstance(child, (dict, list)) and interaction_route_context_present(child, depth + 1)
        for child in map(value.get, EVIDENCE_CONTAINER_KEYS)
    )
2300
+
2301
+
def failed_interaction_evidence_summary(proof_evidence):
    """Summarise failed assertions found in structured interaction evidence.

    Returns '' when no failed assertion is found or when the evidence has no
    route context (per interaction_route_context_present). Otherwise returns
    one sentence listing up to eight distinct failure labels, followed by the
    first non-empty 'capture_error' / 'error' text (truncated to 300
    characters) found on a dict record.

    Records are materialised once and reused for both passes. Non-dict
    records are skipped when looking for capture errors, because the failure
    collector accepts list records but `.get` would raise on them.
    """
    records = list(proof_evidence_records(proof_evidence) or [])

    # Distinct failure labels in first-seen order across all records.
    failures = []
    seen = set()
    for record in records:
        for failure in collect_interaction_failed_assertions(record):
            if failure not in seen:
                seen.add(failure)
                failures.append(failure)

    if not failures or not interaction_route_context_present(proof_evidence):
        return ''

    summary = 'Structured interaction proof evidence captured failed assertion(s): ' + ', '.join(failures[:8]) + '.'
    for record in records:
        if not isinstance(record, dict):
            continue
        error = str(record.get('capture_error') or record.get('error') or '').strip()
        if error:
            # Only the first capture error is reported.
            summary += ' Capture script error: ' + error[:300]
            break
    return summary
2323
+
2324
+
2161
2325
  def interaction_terminal_path_from_evidence(proof_evidence):
2162
2326
  for record in proof_evidence_records(proof_evidence):
2163
2327
  candidate = terminal_path_from_record(record)
@@ -2903,6 +3067,9 @@ def build_supervisor_assessment_request(state, payload, after_observation, requi
2903
3067
  evidence_basis.append('structured-artifacts')
2904
3068
  if supporting.get('playability_ready'):
2905
3069
  evidence_basis.append('playability')
3070
+ interaction_failure_summary = str(state.get('structured_interaction_failure_summary') or '').strip()
3071
+ if interaction_failure_summary:
3072
+ evidence_basis.append('structured-interaction-failure')
2906
3073
  visual_delta = ((evidence_bundle or {}).get('after') or {}).get('visual_delta') or {}
2907
3074
  if visual_delta.get('status') == 'measured':
2908
3075
  evidence_basis.append('visual-delta')
@@ -2936,6 +3103,8 @@ def build_supervisor_assessment_request(state, payload, after_observation, requi
2936
3103
  evidence_bundle['artifact_usage'] = artifact_usage
2937
3104
  visual_delta_blocker = '' if audit_no_diff_mode(state) else visual_delta_blocker_for_mode(verification_mode, visual_delta)
2938
3105
  hard_blockers = [visual_delta_blocker] if visual_delta_blocker else []
3106
+ if interaction_failure_summary:
3107
+ hard_blockers.append(interaction_failure_summary)
2939
3108
  if verification_mode in PLAYABILITY_MODES and not supporting.get('playability_ready'):
2940
3109
  assessment = supporting.get('playability_assessment') or {}
2941
3110
  concerns = assessment.get('concerns') if isinstance(assessment, dict) else []
@@ -2961,6 +3130,10 @@ def build_supervisor_assessment_request(state, payload, after_observation, requi
2961
3130
  instructions.append(
2962
3131
  'For visual/UI polish, capture success is not proof. If visual_delta.status is unmeasured, missing, not_applicable, or measured with passed=false, choose needs_implementation or needs_richer_proof instead of ready_to_ship.'
2963
3132
  )
3133
+ if interaction_failure_summary:
3134
+ instructions.append(
3135
+ 'The structured interaction evidence contains failed assertions. Treat those failed assertions as a hard blocker for ready_to_ship; do not send this back to author unless the capture script itself is missing the needed evidence.'
3136
+ )
2964
3137
  instructions.extend([
2965
3138
  'For playable/gameplay proof, screenshots are supporting evidence only. Do not mark ready_to_ship unless playability_assessment.passed is true and the proof shows accepted input, state/time progression, and playfield/canvas pixel motion.',
2966
3139
  'For data/audio/log/metrics/custom modes, judge the structured evidence bundle and proof_evidence_sample directly; screenshots are optional supporting context.',
@@ -2983,6 +3156,7 @@ def build_supervisor_assessment_request(state, payload, after_observation, requi
2983
3156
  'viewport_matrix': viewport_matrix,
2984
3157
  'evidence_bundle': evidence_bundle or {},
2985
3158
  'evidence_basis': evidence_basis,
3159
+ 'structured_interaction_failure_summary': interaction_failure_summary,
2986
3160
  'artifact_contract': artifact_contract,
2987
3161
  'artifact_production': artifact_production,
2988
3162
  'artifact_usage': artifact_usage,
@@ -3384,6 +3558,14 @@ if proof_evidence_required_for_mode(s.get('verification_mode')):
3384
3558
  if proof_evidence_blocker:
3385
3559
  summary_lines.append('Structured proof evidence gate: ' + proof_evidence_blocker)
3386
3560
 
3561
+ structured_interaction_failure_summary = ''
3562
+ proof_evidence = evidence_bundle.get('proof_evidence')
3563
+ if verification_mode in INTERACTION_MODES and proof_evidence is not None:
3564
+ structured_interaction_failure_summary = failed_interaction_evidence_summary(proof_evidence)
3565
+ if structured_interaction_failure_summary:
3566
+ summary_lines.append('Structured interaction evidence gate: ' + structured_interaction_failure_summary)
3567
+ s['structured_interaction_failure_summary'] = structured_interaction_failure_summary
3568
+
3387
3569
  visual_delta_recovery = build_visual_delta_recovery_decision(
3388
3570
  s.get('verification_mode'),
3389
3571
  visual_delta,
@@ -3392,14 +3574,20 @@ visual_delta_recovery = build_visual_delta_recovery_decision(
3392
3574
  if visual_delta_recovery:
3393
3575
  summary_lines.append('Visual delta recovery: ' + visual_delta_recovery['summary'])
3394
3576
 
3577
+ has_judgable_failed_interaction_evidence = (
3578
+ bool(structured_interaction_failure_summary)
3579
+ and required_baseline_present
3580
+ and not proof_evidence_blocker
3581
+ and not visual_delta_recovery
3582
+ )
3395
3583
  has_good_evidence = (
3396
3584
  required_baseline_present
3397
- and after_observation.get('valid')
3585
+ and (after_observation.get('valid') or has_judgable_failed_interaction_evidence)
3398
3586
  and not proof_evidence_blocker
3399
3587
  and not visual_delta_recovery
3400
3588
  )
3401
3589
 
3402
- if has_good_evidence:
3590
+ if has_good_evidence and after_observation.get('valid'):
3403
3591
  s['capture_hint_saved'] = record_successful_capture_hint(
3404
3592
  s,
3405
3593
  server_path=s.get('expected_start_path') or expected_path or s.get('server_path') or '/',
@@ -3410,9 +3598,12 @@ if has_good_evidence:
3410
3598
  )
3411
3599
 
3412
3600
  if has_good_evidence:
3601
+ if has_judgable_failed_interaction_evidence and isinstance(evidence_bundle.get('proof_session'), dict):
3602
+ evidence_bundle['proof_session']['status'] = 'evidence_captured'
3603
+ s['proof_session'] = evidence_bundle.get('proof_session') or {}
3413
3604
  supervisor_request = build_supervisor_assessment_request(s, after_payload, after_observation, required_baseline_present, expected_path, evidence_bundle)
3414
3605
  s['verify_status'] = 'evidence_captured'
3415
- s['merge_recommendation'] = 'pending-supervisor-judgment'
3606
+ s['merge_recommendation'] = 'do-not-merge' if has_judgable_failed_interaction_evidence else 'pending-supervisor-judgment'
3416
3607
  s['proof_assessment'] = {}
3417
3608
  s['proof_assessment_source'] = None
3418
3609
  s['proof_assessment_request'] = supervisor_request
@@ -3422,11 +3613,16 @@ if has_good_evidence:
3422
3613
  fields_agent_may_update.append('implementation_notes')
3423
3614
  s['verify_decision_request'] = {
3424
3615
  'status': s['verify_status'],
3425
- 'summary': 'Verify captured usable evidence and is waiting for supervising-agent proof assessment.',
3616
+ 'summary': (
3617
+ 'Verify captured structured interaction evidence with failed assertions and is waiting for supervising-agent proof assessment.'
3618
+ if has_judgable_failed_interaction_evidence
3619
+ else 'Verify captured usable evidence and is waiting for supervising-agent proof assessment.'
3620
+ ),
3426
3621
  'expected_path': expected_path,
3427
3622
  'expected_start_path': s.get('expected_start_path') or expected_path,
3428
3623
  'route_expectation': s.get('route_expectation') or {},
3429
3624
  'latest_observation': after_observation,
3625
+ 'structured_interaction_failure_summary': structured_interaction_failure_summary,
3430
3626
  'next_stage_options': next_stage_options,
3431
3627
  'recommended_stage': None,
3432
3628
  'continue_with_stage': None,
@@ -3438,7 +3634,10 @@ if has_good_evidence:
3438
3634
  'Do not escalate to the human unless the supervising agent concludes the workflow is genuinely stuck or not converging.',
3439
3635
  ],
3440
3636
  }
3441
- summary_lines.append('Proof assessment: awaiting supervising agent judgment')
3637
+ if has_judgable_failed_interaction_evidence:
3638
+ summary_lines.append('Proof assessment: awaiting supervising agent judgment on failed interaction evidence')
3639
+ else:
3640
+ summary_lines.append('Proof assessment: awaiting supervising agent judgment')
3442
3641
  summary_lines.append('Proof next stage: supervising agent decides after reviewing the evidence packet')
3443
3642
  else:
3444
3643
  capture_retry = visual_delta_recovery or build_capture_retry_decision(after_observation, required_baseline_present, proof_evidence_blocker, s.get('route_expectation') or {})