@riddledc/riddle-proof 0.8.8 → 0.8.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/codex-exec-agent.cjs +75 -10
- package/dist/adapters/codex-exec-agent.js +1 -1
- package/dist/adapters/codex.cjs +75 -10
- package/dist/adapters/codex.js +1 -1
- package/dist/adapters/local-agent.cjs +75 -10
- package/dist/adapters/local-agent.js +1 -1
- package/dist/advanced/engine-harness.cjs +12 -0
- package/dist/advanced/engine-harness.js +1 -1
- package/dist/advanced/index.cjs +12 -0
- package/dist/advanced/index.d.cts +2 -2
- package/dist/advanced/index.d.ts +2 -2
- package/dist/advanced/index.js +1 -1
- package/dist/advanced/proof-run-core.d.cts +1 -1
- package/dist/advanced/proof-run-core.d.ts +1 -1
- package/dist/advanced/proof-run-engine.d.cts +2 -2
- package/dist/advanced/proof-run-engine.d.ts +2 -2
- package/dist/{chunk-V6VZ3CAI.js → chunk-2PXL3RDB.js} +2 -2
- package/dist/{chunk-E7ATYSYS.js → chunk-BBUO7HM4.js} +12 -0
- package/dist/{chunk-PYCQNK66.js → chunk-EEIYUZXE.js} +75 -10
- package/dist/cli/index.js +3 -3
- package/dist/cli.cjs +87 -10
- package/dist/cli.js +3 -3
- package/dist/codex-exec-agent.cjs +75 -10
- package/dist/codex-exec-agent.js +1 -1
- package/dist/engine-harness.cjs +12 -0
- package/dist/engine-harness.js +1 -1
- package/dist/index.cjs +87 -10
- package/dist/index.js +2 -2
- package/dist/local-agent.cjs +75 -10
- package/dist/local-agent.js +1 -1
- package/dist/{proof-run-core-CE0jx7wL.d.ts → proof-run-core-Ci9uFxMc.d.cts} +1 -1
- package/dist/{proof-run-core-CE0jx7wL.d.cts → proof-run-core-Ci9uFxMc.d.ts} +1 -1
- package/dist/proof-run-core.d.cts +1 -1
- package/dist/proof-run-core.d.ts +1 -1
- package/dist/{proof-run-engine-BlocjMni.d.cts → proof-run-engine-Bd1T43Dy.d.cts} +4 -4
- package/dist/{proof-run-engine-C_m8WJmX.d.ts → proof-run-engine-CXyhB-io.d.ts} +4 -4
- package/dist/proof-run-engine.d.cts +2 -2
- package/dist/proof-run-engine.d.ts +2 -2
- package/package.json +2 -2
- package/runtime/lib/verify.py +88 -2
- package/runtime/tests/recon_verify_smoke.py +147 -24
- package/runtime/tests/trust_boundary_regression.py +143 -0
|
@@ -22,14 +22,14 @@ import {
|
|
|
22
22
|
createDisabledRiddleProofAgentAdapter,
|
|
23
23
|
readRiddleProofRunStatus,
|
|
24
24
|
runRiddleProofEngineHarness
|
|
25
|
-
} from "./chunk-E7ATYSYS.js";
|
|
25
|
+
} from "./chunk-BBUO7HM4.js";
|
|
26
26
|
import {
|
|
27
27
|
createCheckpointResponseTemplate
|
|
28
28
|
} from "./chunk-4FOHZ7JG.js";
|
|
29
29
|
import {
|
|
30
30
|
createCodexExecAgentAdapter,
|
|
31
31
|
runCodexExecAgentDoctor
|
|
32
|
-
} from "./chunk-PYCQNK66.js";
|
|
32
|
+
} from "./chunk-EEIYUZXE.js";
|
|
33
33
|
|
|
34
34
|
// src/cli.ts
|
|
35
35
|
import { existsSync, mkdirSync, readdirSync, readFileSync, statSync, writeFileSync } from "fs";
|
|
@@ -1331,6 +1331,18 @@ async function routeCheckpoint(request, state, result, agent, input) {
|
|
|
1331
1331
|
if (checkpoint === "verify_agent_retry") {
|
|
1332
1332
|
const next = recommendedContinuation(result);
|
|
1333
1333
|
if (next) return { next };
|
|
1334
|
+
return {
|
|
1335
|
+
blocker: {
|
|
1336
|
+
code: "proof_assessment_blocked",
|
|
1337
|
+
checkpoint,
|
|
1338
|
+
message: result.summary || "The supervising proof assessment did not approve shipping and did not provide a safe retry continuation.",
|
|
1339
|
+
details: compactRecord({
|
|
1340
|
+
proofAssessment: result.proofAssessment || result.checkpointContract?.proof_assessment || recordValue(result.raw)?.proofAssessment || null,
|
|
1341
|
+
verifyDecisionRequest: result.verifyDecisionRequest || result.checkpointContract?.verify_decision_request || null,
|
|
1342
|
+
checkpointContract: result.checkpointContract || null
|
|
1343
|
+
})
|
|
1344
|
+
}
|
|
1345
|
+
};
|
|
1334
1346
|
}
|
|
1335
1347
|
if (checkpoint === "awaiting_stage_advance") {
|
|
1336
1348
|
const next = recommendedContinuation(result) || defaultAwaitingStageContinuation(result);
|
|
@@ -7,6 +7,8 @@ import { execFileSync, spawnSync } from "child_process";
|
|
|
7
7
|
import { existsSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from "fs";
|
|
8
8
|
import os from "os";
|
|
9
9
|
import path from "path";
|
|
10
|
+
var DEFAULT_CODEX_TIMEOUT_MS = 6e5;
|
|
11
|
+
var DEFAULT_PROOF_PACKET_AUTHOR_TIMEOUT_MS = 18e4;
|
|
10
12
|
var REFINED_INPUTS_SCHEMA = {
|
|
11
13
|
type: "object",
|
|
12
14
|
additionalProperties: false,
|
|
@@ -350,6 +352,46 @@ function parseJsonFromRunnerOutputs(outputs, schema) {
|
|
|
350
352
|
if (!combined.trim() || seen.has(combined)) return { parsed: null, source: "" };
|
|
351
353
|
return { parsed: parseJsonObject(combined, schema), source: "combined_output" };
|
|
352
354
|
}
|
|
355
|
+
/**
 * Resolve the timeout, in milliseconds, for a single Codex run.
 *
 * An explicit `config.codexTimeoutMs` wins when it is a finite, positive
 * number. Otherwise the default depends on the request purpose: requests whose
 * purpose is exactly "proof packet authoring" use
 * DEFAULT_PROOF_PACKET_AUTHOR_TIMEOUT_MS, everything else uses
 * DEFAULT_CODEX_TIMEOUT_MS.
 *
 * @param {{ codexTimeoutMs?: number }} config Runner configuration.
 * @param {{ purpose?: string }} request Request about to be executed.
 * @returns {number} Timeout in whole milliseconds.
 */
function resolveCodexTimeoutMs(config, request) {
  const override = config.codexTimeoutMs;
  if (typeof override === "number" && Number.isFinite(override) && override > 0) {
    // The value is passed to spawnSync as its `timeout` option, which Node
    // validates as an integer; round a fractional override up so it cannot
    // make the spawn call throw (and so 0.5 does not collapse to 0).
    return Math.ceil(override);
  }
  return request.purpose === "proof packet authoring" ? DEFAULT_PROOF_PACKET_AUTHOR_TIMEOUT_MS : DEFAULT_CODEX_TIMEOUT_MS;
}
|
|
361
|
+
/**
 * Report whether a parsed JSON value looks like a Codex lifecycle event.
 *
 * A value qualifies when it is a non-null, non-array object whose `type`
 * property is a string starting with one of the known event prefixes.
 *
 * @param {unknown} value Parsed JSON value to inspect.
 * @returns {boolean} True when the value matches the lifecycle-event shape.
 */
function isCodexLifecycleEvent(value) {
  const lifecyclePrefixes = ["thread.", "turn.", "exec.", "agent.", "token.", "reasoning.", "error."];
  const isNonArrayObject = Boolean(value) && typeof value === "object" && !Array.isArray(value);
  if (!isNonArrayObject) {
    return false;
  }
  const eventType = value.type;
  if (typeof eventType !== "string") {
    return false;
  }
  return lifecyclePrefixes.some((prefix) => eventType.startsWith(prefix));
}
|
|
366
|
+
/**
 * Classify every non-blank line of the runner outputs as either a Codex
 * lifecycle event (a JSON line accepted by isCodexLifecycleEvent) or other
 * text, and summarize the result.
 *
 * Identical output texts are analyzed only once. The runner hands over stdout
 * twice when no last-message file exists, and counting it twice would double
 * every reported line count and duplicate the samples.
 *
 * @param {Array<{ source?: string, text?: string }>} outputs Captured outputs.
 * @returns {{
 *   eventLineCount: number,
 *   eventTypes: string[],
 *   nonEventLineCount: number,
 *   nonEventSamples: string[],
 *   onlyLifecycleEvents: boolean
 * }} Counts, distinct event types, up to three non-event samples (each capped
 *   at 240 characters), and whether at least one event line and no non-event
 *   lines were seen.
 */
function analyzeCodexRunnerOutput(outputs) {
  const eventTypes = /* @__PURE__ */ new Set();
  const analyzedTexts = /* @__PURE__ */ new Set();
  const nonEventSamples = [];
  let eventLineCount = 0;
  let nonEventLineCount = 0;
  for (const output of Array.isArray(outputs) ? outputs : []) {
    // A missing or non-string text is treated as empty instead of throwing.
    const text = typeof output?.text === "string" ? output.text : "";
    if (analyzedTexts.has(text)) continue;
    analyzedTexts.add(text);
    const lines = text.split(/\r?\n/).map((line) => line.trim()).filter(Boolean);
    for (const line of lines) {
      let parsed;
      try {
        parsed = JSON.parse(line);
      } catch {
        // Not JSON: deliberately ignored, the line is counted as non-event text below.
        parsed = void 0;
      }
      if (isCodexLifecycleEvent(parsed)) {
        eventLineCount += 1;
        eventTypes.add(parsed.type);
        continue;
      }
      nonEventLineCount += 1;
      if (nonEventSamples.length < 3) nonEventSamples.push(line.slice(0, 240));
    }
  }
  return {
    eventLineCount,
    eventTypes: Array.from(eventTypes),
    nonEventLineCount,
    nonEventSamples,
    onlyLifecycleEvents: eventLineCount > 0 && nonEventLineCount === 0
  };
}
|
|
353
395
|
function isHarnessVerificationOnlyBlocker(blocker) {
|
|
354
396
|
const text = blocker.toLowerCase();
|
|
355
397
|
return (text.includes("erofs") || text.includes("read-only file system")) && text.includes("node_modules") && (text.includes(".vite-temp") || text.includes("vite.config"));
|
|
@@ -373,21 +415,25 @@ function runnerMetrics(input) {
|
|
|
373
415
|
exit_status: input.status ?? null,
|
|
374
416
|
timed_out: input.timedOut || false,
|
|
375
417
|
error_code: input.errorCode,
|
|
418
|
+
codex_event_types: input.codexEventTypes && input.codexEventTypes.length ? input.codexEventTypes : void 0,
|
|
419
|
+
codex_event_line_count: input.codexEventLineCount,
|
|
420
|
+
codex_non_event_line_count: input.codexNonEventLineCount,
|
|
376
421
|
codex_command: input.config.codexCommand || "codex",
|
|
377
422
|
codex_model: input.config.codexModel,
|
|
378
423
|
codex_sandbox: input.config.codexSandbox || "workspace-write",
|
|
379
424
|
codex_full_auto: input.config.codexFullAuto !== false,
|
|
380
|
-
timeout_ms:
|
|
425
|
+
timeout_ms: input.timeoutMs ?? DEFAULT_CODEX_TIMEOUT_MS
|
|
381
426
|
});
|
|
382
427
|
}
|
|
383
428
|
function createCodexExecJsonRunner(config = {}) {
|
|
384
429
|
return (request) => {
|
|
385
430
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
386
431
|
const startedMs = Date.now();
|
|
432
|
+
const timeoutMs = resolveCodexTimeoutMs(config, request);
|
|
387
433
|
if (!request.workdir || !existsSync(request.workdir)) {
|
|
388
434
|
return {
|
|
389
435
|
ok: false,
|
|
390
|
-
metrics: runnerMetrics({ request, config, startedAt, startedMs, errorCode: "workdir_missing" }),
|
|
436
|
+
metrics: runnerMetrics({ request, config, startedAt, startedMs, timeoutMs, errorCode: "workdir_missing" }),
|
|
391
437
|
blocker: {
|
|
392
438
|
code: "codex_workdir_missing",
|
|
393
439
|
message: `Codex workdir does not exist for ${request.purpose}.`,
|
|
@@ -422,7 +468,7 @@ function createCodexExecJsonRunner(config = {}) {
|
|
|
422
468
|
const proc = spawnSync(config.codexCommand || "codex", args, {
|
|
423
469
|
input: request.prompt,
|
|
424
470
|
encoding: "utf-8",
|
|
425
|
-
timeout:
|
|
471
|
+
timeout: timeoutMs,
|
|
426
472
|
maxBuffer: 10 * 1024 * 1024,
|
|
427
473
|
env
|
|
428
474
|
});
|
|
@@ -441,6 +487,7 @@ function createCodexExecJsonRunner(config = {}) {
|
|
|
441
487
|
stderr: proc.stderr || "",
|
|
442
488
|
status: proc.status,
|
|
443
489
|
timedOut,
|
|
490
|
+
timeoutMs,
|
|
444
491
|
errorCode: proc.error.code || "spawn_error"
|
|
445
492
|
}),
|
|
446
493
|
blocker: {
|
|
@@ -463,6 +510,7 @@ function createCodexExecJsonRunner(config = {}) {
|
|
|
463
510
|
stdout: proc.stdout || "",
|
|
464
511
|
stderr: proc.stderr || "",
|
|
465
512
|
status: proc.status,
|
|
513
|
+
timeoutMs,
|
|
466
514
|
errorCode: "nonzero_exit"
|
|
467
515
|
}),
|
|
468
516
|
blocker: {
|
|
@@ -475,12 +523,15 @@ function createCodexExecJsonRunner(config = {}) {
|
|
|
475
523
|
const finalText = existsSync(lastMessagePath) ? readFileSync(lastMessagePath, "utf-8") : String(proc.stdout || "");
|
|
476
524
|
const stdoutText = String(proc.stdout || "");
|
|
477
525
|
const stderrText = String(proc.stderr || "");
|
|
478
|
-
const
|
|
526
|
+
const runnerOutputs = [
|
|
479
527
|
{ source: existsSync(lastMessagePath) ? "last_message" : "stdout", text: finalText },
|
|
480
528
|
{ source: "stdout", text: stdoutText },
|
|
481
529
|
{ source: "stderr", text: stderrText }
|
|
482
|
-
]
|
|
530
|
+
];
|
|
531
|
+
const { parsed, source: parsedJsonSource } = parseJsonFromRunnerOutputs(runnerOutputs, request.schema);
|
|
483
532
|
if (!parsed) {
|
|
533
|
+
const outputAnalysis = analyzeCodexRunnerOutput(runnerOutputs);
|
|
534
|
+
const errorCode = outputAnalysis.onlyLifecycleEvents ? "no_final_response" : "invalid_json";
|
|
484
535
|
return {
|
|
485
536
|
ok: false,
|
|
486
537
|
stdout: stdoutText,
|
|
@@ -494,12 +545,24 @@ function createCodexExecJsonRunner(config = {}) {
|
|
|
494
545
|
stderr: stderrText,
|
|
495
546
|
finalText,
|
|
496
547
|
status: proc.status,
|
|
497
|
-
|
|
548
|
+
timeoutMs,
|
|
549
|
+
errorCode,
|
|
550
|
+
codexEventTypes: outputAnalysis.eventTypes,
|
|
551
|
+
codexEventLineCount: outputAnalysis.eventLineCount,
|
|
552
|
+
codexNonEventLineCount: outputAnalysis.nonEventLineCount
|
|
498
553
|
}),
|
|
499
554
|
blocker: {
|
|
500
|
-
code: "codex_invalid_json",
|
|
501
|
-
message: `Codex completed ${request.purpose}, but did not return valid JSON.`,
|
|
502
|
-
details: {
|
|
555
|
+
code: outputAnalysis.onlyLifecycleEvents ? "codex_no_final_response" : "codex_invalid_json",
|
|
556
|
+
message: outputAnalysis.onlyLifecycleEvents ? `Codex emitted lifecycle events during ${request.purpose}, but did not produce a final JSON response.` : `Codex completed ${request.purpose}, but did not return valid JSON.`,
|
|
557
|
+
details: {
|
|
558
|
+
finalText,
|
|
559
|
+
stdout: stdoutText,
|
|
560
|
+
stderr: stderrText,
|
|
561
|
+
event_types: outputAnalysis.eventTypes,
|
|
562
|
+
event_line_count: outputAnalysis.eventLineCount,
|
|
563
|
+
non_event_line_count: outputAnalysis.nonEventLineCount,
|
|
564
|
+
non_event_samples: outputAnalysis.nonEventSamples
|
|
565
|
+
}
|
|
503
566
|
}
|
|
504
567
|
};
|
|
505
568
|
}
|
|
@@ -517,7 +580,8 @@ function createCodexExecJsonRunner(config = {}) {
|
|
|
517
580
|
stderr: stderrText,
|
|
518
581
|
finalText,
|
|
519
582
|
parsedJsonSource,
|
|
520
|
-
status: proc.status
|
|
583
|
+
status: proc.status,
|
|
584
|
+
timeoutMs
|
|
521
585
|
})
|
|
522
586
|
};
|
|
523
587
|
} finally {
|
|
@@ -626,6 +690,7 @@ function createCodexExecAgentAdapter(config = {}, runner = createCodexExecJsonRu
|
|
|
626
690
|
"Write a proof_plan and capture_script that will verify the exact user-facing change.",
|
|
627
691
|
"Use recon_assessment.baseline_understanding as the source of truth. Do not author a proof plan unless it names the observed before state and the requested delta from that state.",
|
|
628
692
|
"Use the recon-approved route and baseline context; make the plan name the concrete target, expected before state, expected after state, and stop condition.",
|
|
693
|
+
"Do not leave this authoring stage pending for external investigation. Keep any repo inspection brief, do not modify files, and return the JSON proof packet from the available state.",
|
|
629
694
|
"Choose the evidence modality from verification_mode and success_criteria: screenshots for visual/UI proof, interactions plus screenshots for interaction proof, structured metrics/logs/JSON/audio analysis for non-visual proof.",
|
|
630
695
|
"For playable/gameplay proof, treat screenshots as supporting artifacts only: start the game, send keyboard or pointer input, measure state before/after, measure non-HUD canvas/playfield pixel deltas across time, and return playability evidence with version riddle-proof.playability.v1.",
|
|
631
696
|
"For interaction proof, return a structured evidence object with start route/state, terminal route/state, action, assertions, and matched UI text. Catch waitForURL or selector timeouts and record them as failed assertions instead of throwing before evidence is emitted.",
|
package/dist/cli/index.js
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
|
-
import "../chunk-V6VZ3CAI.js";
|
|
1
|
+
import "../chunk-2PXL3RDB.js";
|
|
2
2
|
import "../chunk-PEWAIEER.js";
|
|
3
3
|
import "../chunk-TWTEUS7R.js";
|
|
4
|
-
import "../chunk-E7ATYSYS.js";
|
|
4
|
+
import "../chunk-BBUO7HM4.js";
|
|
5
5
|
import "../chunk-YZUVEJ5B.js";
|
|
6
6
|
import "../chunk-FMOYUYH2.js";
|
|
7
7
|
import "../chunk-5N5QFI2S.js";
|
|
8
8
|
import "../chunk-4FOHZ7JG.js";
|
|
9
9
|
import "../chunk-JFQXAJH2.js";
|
|
10
|
-
import "../chunk-PYCQNK66.js";
|
|
10
|
+
import "../chunk-EEIYUZXE.js";
|
|
11
11
|
import "../chunk-VY4Y5U57.js";
|
|
12
12
|
import "../chunk-MLKGABMK.js";
|
package/dist/cli.cjs
CHANGED
|
@@ -5656,6 +5656,18 @@ async function routeCheckpoint(request, state, result, agent, input) {
|
|
|
5656
5656
|
if (checkpoint === "verify_agent_retry") {
|
|
5657
5657
|
const next = recommendedContinuation(result);
|
|
5658
5658
|
if (next) return { next };
|
|
5659
|
+
return {
|
|
5660
|
+
blocker: {
|
|
5661
|
+
code: "proof_assessment_blocked",
|
|
5662
|
+
checkpoint,
|
|
5663
|
+
message: result.summary || "The supervising proof assessment did not approve shipping and did not provide a safe retry continuation.",
|
|
5664
|
+
details: compactRecord({
|
|
5665
|
+
proofAssessment: result.proofAssessment || result.checkpointContract?.proof_assessment || recordValue(result.raw)?.proofAssessment || null,
|
|
5666
|
+
verifyDecisionRequest: result.verifyDecisionRequest || result.checkpointContract?.verify_decision_request || null,
|
|
5667
|
+
checkpointContract: result.checkpointContract || null
|
|
5668
|
+
})
|
|
5669
|
+
}
|
|
5670
|
+
};
|
|
5659
5671
|
}
|
|
5660
5672
|
if (checkpoint === "awaiting_stage_advance") {
|
|
5661
5673
|
const next = recommendedContinuation(result) || defaultAwaitingStageContinuation(result);
|
|
@@ -5868,6 +5880,8 @@ var import_node_child_process3 = require("child_process");
|
|
|
5868
5880
|
var import_node_fs4 = require("fs");
|
|
5869
5881
|
var import_node_os = __toESM(require("os"), 1);
|
|
5870
5882
|
var import_node_path4 = __toESM(require("path"), 1);
|
|
5883
|
+
var DEFAULT_CODEX_TIMEOUT_MS = 6e5;
|
|
5884
|
+
var DEFAULT_PROOF_PACKET_AUTHOR_TIMEOUT_MS = 18e4;
|
|
5871
5885
|
var REFINED_INPUTS_SCHEMA = {
|
|
5872
5886
|
type: "object",
|
|
5873
5887
|
additionalProperties: false,
|
|
@@ -6211,6 +6225,46 @@ function parseJsonFromRunnerOutputs(outputs, schema) {
|
|
|
6211
6225
|
if (!combined.trim() || seen.has(combined)) return { parsed: null, source: "" };
|
|
6212
6226
|
return { parsed: parseJsonObject(combined, schema), source: "combined_output" };
|
|
6213
6227
|
}
|
|
6228
|
+
/**
 * Resolve the timeout, in milliseconds, for a single Codex run.
 *
 * An explicit `config.codexTimeoutMs` wins when it is a finite, positive
 * number. Otherwise the default depends on the request purpose: requests whose
 * purpose is exactly "proof packet authoring" use
 * DEFAULT_PROOF_PACKET_AUTHOR_TIMEOUT_MS, everything else uses
 * DEFAULT_CODEX_TIMEOUT_MS.
 *
 * @param {{ codexTimeoutMs?: number }} config Runner configuration.
 * @param {{ purpose?: string }} request Request about to be executed.
 * @returns {number} Timeout in whole milliseconds.
 */
function resolveCodexTimeoutMs(config, request) {
  const override = config.codexTimeoutMs;
  if (typeof override === "number" && Number.isFinite(override) && override > 0) {
    // The value is passed to spawnSync as its `timeout` option, which Node
    // validates as an integer; round a fractional override up so it cannot
    // make the spawn call throw (and so 0.5 does not collapse to 0).
    return Math.ceil(override);
  }
  return request.purpose === "proof packet authoring" ? DEFAULT_PROOF_PACKET_AUTHOR_TIMEOUT_MS : DEFAULT_CODEX_TIMEOUT_MS;
}
|
|
6234
|
+
/**
 * Report whether a parsed JSON value looks like a Codex lifecycle event.
 *
 * A value qualifies when it is a non-null, non-array object whose `type`
 * property is a string starting with one of the known event prefixes.
 *
 * @param {unknown} value Parsed JSON value to inspect.
 * @returns {boolean} True when the value matches the lifecycle-event shape.
 */
function isCodexLifecycleEvent(value) {
  const lifecyclePrefixes = ["thread.", "turn.", "exec.", "agent.", "token.", "reasoning.", "error."];
  const isNonArrayObject = Boolean(value) && typeof value === "object" && !Array.isArray(value);
  if (!isNonArrayObject) {
    return false;
  }
  const eventType = value.type;
  if (typeof eventType !== "string") {
    return false;
  }
  return lifecyclePrefixes.some((prefix) => eventType.startsWith(prefix));
}
|
|
6239
|
+
/**
 * Classify every non-blank line of the runner outputs as either a Codex
 * lifecycle event (a JSON line accepted by isCodexLifecycleEvent) or other
 * text, and summarize the result.
 *
 * Identical output texts are analyzed only once. The runner hands over stdout
 * twice when no last-message file exists, and counting it twice would double
 * every reported line count and duplicate the samples.
 *
 * @param {Array<{ source?: string, text?: string }>} outputs Captured outputs.
 * @returns {{
 *   eventLineCount: number,
 *   eventTypes: string[],
 *   nonEventLineCount: number,
 *   nonEventSamples: string[],
 *   onlyLifecycleEvents: boolean
 * }} Counts, distinct event types, up to three non-event samples (each capped
 *   at 240 characters), and whether at least one event line and no non-event
 *   lines were seen.
 */
function analyzeCodexRunnerOutput(outputs) {
  const eventTypes = /* @__PURE__ */ new Set();
  const analyzedTexts = /* @__PURE__ */ new Set();
  const nonEventSamples = [];
  let eventLineCount = 0;
  let nonEventLineCount = 0;
  for (const output of Array.isArray(outputs) ? outputs : []) {
    // A missing or non-string text is treated as empty instead of throwing.
    const text = typeof output?.text === "string" ? output.text : "";
    if (analyzedTexts.has(text)) continue;
    analyzedTexts.add(text);
    const lines = text.split(/\r?\n/).map((line) => line.trim()).filter(Boolean);
    for (const line of lines) {
      let parsed;
      try {
        parsed = JSON.parse(line);
      } catch {
        // Not JSON: deliberately ignored, the line is counted as non-event text below.
        parsed = void 0;
      }
      if (isCodexLifecycleEvent(parsed)) {
        eventLineCount += 1;
        eventTypes.add(parsed.type);
        continue;
      }
      nonEventLineCount += 1;
      if (nonEventSamples.length < 3) nonEventSamples.push(line.slice(0, 240));
    }
  }
  return {
    eventLineCount,
    eventTypes: Array.from(eventTypes),
    nonEventLineCount,
    nonEventSamples,
    onlyLifecycleEvents: eventLineCount > 0 && nonEventLineCount === 0
  };
}
|
|
6214
6268
|
function isHarnessVerificationOnlyBlocker(blocker) {
|
|
6215
6269
|
const text = blocker.toLowerCase();
|
|
6216
6270
|
return (text.includes("erofs") || text.includes("read-only file system")) && text.includes("node_modules") && (text.includes(".vite-temp") || text.includes("vite.config"));
|
|
@@ -6234,21 +6288,25 @@ function runnerMetrics(input) {
|
|
|
6234
6288
|
exit_status: input.status ?? null,
|
|
6235
6289
|
timed_out: input.timedOut || false,
|
|
6236
6290
|
error_code: input.errorCode,
|
|
6291
|
+
codex_event_types: input.codexEventTypes && input.codexEventTypes.length ? input.codexEventTypes : void 0,
|
|
6292
|
+
codex_event_line_count: input.codexEventLineCount,
|
|
6293
|
+
codex_non_event_line_count: input.codexNonEventLineCount,
|
|
6237
6294
|
codex_command: input.config.codexCommand || "codex",
|
|
6238
6295
|
codex_model: input.config.codexModel,
|
|
6239
6296
|
codex_sandbox: input.config.codexSandbox || "workspace-write",
|
|
6240
6297
|
codex_full_auto: input.config.codexFullAuto !== false,
|
|
6241
|
-
timeout_ms:
|
|
6298
|
+
timeout_ms: input.timeoutMs ?? DEFAULT_CODEX_TIMEOUT_MS
|
|
6242
6299
|
});
|
|
6243
6300
|
}
|
|
6244
6301
|
function createCodexExecJsonRunner(config = {}) {
|
|
6245
6302
|
return (request) => {
|
|
6246
6303
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
6247
6304
|
const startedMs = Date.now();
|
|
6305
|
+
const timeoutMs = resolveCodexTimeoutMs(config, request);
|
|
6248
6306
|
if (!request.workdir || !(0, import_node_fs4.existsSync)(request.workdir)) {
|
|
6249
6307
|
return {
|
|
6250
6308
|
ok: false,
|
|
6251
|
-
metrics: runnerMetrics({ request, config, startedAt, startedMs, errorCode: "workdir_missing" }),
|
|
6309
|
+
metrics: runnerMetrics({ request, config, startedAt, startedMs, timeoutMs, errorCode: "workdir_missing" }),
|
|
6252
6310
|
blocker: {
|
|
6253
6311
|
code: "codex_workdir_missing",
|
|
6254
6312
|
message: `Codex workdir does not exist for ${request.purpose}.`,
|
|
@@ -6283,7 +6341,7 @@ function createCodexExecJsonRunner(config = {}) {
|
|
|
6283
6341
|
const proc = (0, import_node_child_process3.spawnSync)(config.codexCommand || "codex", args, {
|
|
6284
6342
|
input: request.prompt,
|
|
6285
6343
|
encoding: "utf-8",
|
|
6286
|
-
timeout:
|
|
6344
|
+
timeout: timeoutMs,
|
|
6287
6345
|
maxBuffer: 10 * 1024 * 1024,
|
|
6288
6346
|
env
|
|
6289
6347
|
});
|
|
@@ -6302,6 +6360,7 @@ function createCodexExecJsonRunner(config = {}) {
|
|
|
6302
6360
|
stderr: proc.stderr || "",
|
|
6303
6361
|
status: proc.status,
|
|
6304
6362
|
timedOut,
|
|
6363
|
+
timeoutMs,
|
|
6305
6364
|
errorCode: proc.error.code || "spawn_error"
|
|
6306
6365
|
}),
|
|
6307
6366
|
blocker: {
|
|
@@ -6324,6 +6383,7 @@ function createCodexExecJsonRunner(config = {}) {
|
|
|
6324
6383
|
stdout: proc.stdout || "",
|
|
6325
6384
|
stderr: proc.stderr || "",
|
|
6326
6385
|
status: proc.status,
|
|
6386
|
+
timeoutMs,
|
|
6327
6387
|
errorCode: "nonzero_exit"
|
|
6328
6388
|
}),
|
|
6329
6389
|
blocker: {
|
|
@@ -6336,12 +6396,15 @@ function createCodexExecJsonRunner(config = {}) {
|
|
|
6336
6396
|
const finalText = (0, import_node_fs4.existsSync)(lastMessagePath) ? (0, import_node_fs4.readFileSync)(lastMessagePath, "utf-8") : String(proc.stdout || "");
|
|
6337
6397
|
const stdoutText = String(proc.stdout || "");
|
|
6338
6398
|
const stderrText = String(proc.stderr || "");
|
|
6339
|
-
const
|
|
6399
|
+
const runnerOutputs = [
|
|
6340
6400
|
{ source: (0, import_node_fs4.existsSync)(lastMessagePath) ? "last_message" : "stdout", text: finalText },
|
|
6341
6401
|
{ source: "stdout", text: stdoutText },
|
|
6342
6402
|
{ source: "stderr", text: stderrText }
|
|
6343
|
-
]
|
|
6403
|
+
];
|
|
6404
|
+
const { parsed, source: parsedJsonSource } = parseJsonFromRunnerOutputs(runnerOutputs, request.schema);
|
|
6344
6405
|
if (!parsed) {
|
|
6406
|
+
const outputAnalysis = analyzeCodexRunnerOutput(runnerOutputs);
|
|
6407
|
+
const errorCode = outputAnalysis.onlyLifecycleEvents ? "no_final_response" : "invalid_json";
|
|
6345
6408
|
return {
|
|
6346
6409
|
ok: false,
|
|
6347
6410
|
stdout: stdoutText,
|
|
@@ -6355,12 +6418,24 @@ function createCodexExecJsonRunner(config = {}) {
|
|
|
6355
6418
|
stderr: stderrText,
|
|
6356
6419
|
finalText,
|
|
6357
6420
|
status: proc.status,
|
|
6358
|
-
|
|
6421
|
+
timeoutMs,
|
|
6422
|
+
errorCode,
|
|
6423
|
+
codexEventTypes: outputAnalysis.eventTypes,
|
|
6424
|
+
codexEventLineCount: outputAnalysis.eventLineCount,
|
|
6425
|
+
codexNonEventLineCount: outputAnalysis.nonEventLineCount
|
|
6359
6426
|
}),
|
|
6360
6427
|
blocker: {
|
|
6361
|
-
code: "codex_invalid_json",
|
|
6362
|
-
message: `Codex completed ${request.purpose}, but did not return valid JSON.`,
|
|
6363
|
-
details: {
|
|
6428
|
+
code: outputAnalysis.onlyLifecycleEvents ? "codex_no_final_response" : "codex_invalid_json",
|
|
6429
|
+
message: outputAnalysis.onlyLifecycleEvents ? `Codex emitted lifecycle events during ${request.purpose}, but did not produce a final JSON response.` : `Codex completed ${request.purpose}, but did not return valid JSON.`,
|
|
6430
|
+
details: {
|
|
6431
|
+
finalText,
|
|
6432
|
+
stdout: stdoutText,
|
|
6433
|
+
stderr: stderrText,
|
|
6434
|
+
event_types: outputAnalysis.eventTypes,
|
|
6435
|
+
event_line_count: outputAnalysis.eventLineCount,
|
|
6436
|
+
non_event_line_count: outputAnalysis.nonEventLineCount,
|
|
6437
|
+
non_event_samples: outputAnalysis.nonEventSamples
|
|
6438
|
+
}
|
|
6364
6439
|
}
|
|
6365
6440
|
};
|
|
6366
6441
|
}
|
|
@@ -6378,7 +6453,8 @@ function createCodexExecJsonRunner(config = {}) {
|
|
|
6378
6453
|
stderr: stderrText,
|
|
6379
6454
|
finalText,
|
|
6380
6455
|
parsedJsonSource,
|
|
6381
|
-
status: proc.status
|
|
6456
|
+
status: proc.status,
|
|
6457
|
+
timeoutMs
|
|
6382
6458
|
})
|
|
6383
6459
|
};
|
|
6384
6460
|
} finally {
|
|
@@ -6487,6 +6563,7 @@ function createCodexExecAgentAdapter(config = {}, runner = createCodexExecJsonRu
|
|
|
6487
6563
|
"Write a proof_plan and capture_script that will verify the exact user-facing change.",
|
|
6488
6564
|
"Use recon_assessment.baseline_understanding as the source of truth. Do not author a proof plan unless it names the observed before state and the requested delta from that state.",
|
|
6489
6565
|
"Use the recon-approved route and baseline context; make the plan name the concrete target, expected before state, expected after state, and stop condition.",
|
|
6566
|
+
"Do not leave this authoring stage pending for external investigation. Keep any repo inspection brief, do not modify files, and return the JSON proof packet from the available state.",
|
|
6490
6567
|
"Choose the evidence modality from verification_mode and success_criteria: screenshots for visual/UI proof, interactions plus screenshots for interaction proof, structured metrics/logs/JSON/audio analysis for non-visual proof.",
|
|
6491
6568
|
"For playable/gameplay proof, treat screenshots as supporting artifacts only: start the game, send keyboard or pointer input, measure state before/after, measure non-HUD canvas/playfield pixel deltas across time, and return playability evidence with version riddle-proof.playability.v1.",
|
|
6492
6569
|
"For interaction proof, return a structured evidence object with start route/state, terminal route/state, action, assertions, and matched UI text. Catch waitForURL or selector timeouts and record them as failed assertions instead of throwing before evidence is emitted.",
|
package/dist/cli.js
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
import "./chunk-V6VZ3CAI.js";
|
|
2
|
+
import "./chunk-2PXL3RDB.js";
|
|
3
3
|
import "./chunk-PEWAIEER.js";
|
|
4
4
|
import "./chunk-TWTEUS7R.js";
|
|
5
|
-
import "./chunk-E7ATYSYS.js";
|
|
5
|
+
import "./chunk-BBUO7HM4.js";
|
|
6
6
|
import "./chunk-YZUVEJ5B.js";
|
|
7
7
|
import "./chunk-FMOYUYH2.js";
|
|
8
8
|
import "./chunk-5N5QFI2S.js";
|
|
9
9
|
import "./chunk-4FOHZ7JG.js";
|
|
10
10
|
import "./chunk-JFQXAJH2.js";
|
|
11
|
-
import "./chunk-PYCQNK66.js";
|
|
11
|
+
import "./chunk-EEIYUZXE.js";
|
|
12
12
|
import "./chunk-VY4Y5U57.js";
|
|
13
13
|
import "./chunk-MLKGABMK.js";
|
|
@@ -46,6 +46,8 @@ function compactRecord(input) {
|
|
|
46
46
|
}
|
|
47
47
|
|
|
48
48
|
// src/codex-exec-agent.ts
|
|
49
|
+
var DEFAULT_CODEX_TIMEOUT_MS = 6e5;
|
|
50
|
+
var DEFAULT_PROOF_PACKET_AUTHOR_TIMEOUT_MS = 18e4;
|
|
49
51
|
var REFINED_INPUTS_SCHEMA = {
|
|
50
52
|
type: "object",
|
|
51
53
|
additionalProperties: false,
|
|
@@ -389,6 +391,46 @@ function parseJsonFromRunnerOutputs(outputs, schema) {
|
|
|
389
391
|
if (!combined.trim() || seen.has(combined)) return { parsed: null, source: "" };
|
|
390
392
|
return { parsed: parseJsonObject(combined, schema), source: "combined_output" };
|
|
391
393
|
}
|
|
394
|
+
/**
 * Resolve the timeout, in milliseconds, for a single Codex run.
 *
 * An explicit `config.codexTimeoutMs` wins when it is a finite, positive
 * number. Otherwise the default depends on the request purpose: requests whose
 * purpose is exactly "proof packet authoring" use
 * DEFAULT_PROOF_PACKET_AUTHOR_TIMEOUT_MS, everything else uses
 * DEFAULT_CODEX_TIMEOUT_MS.
 *
 * @param {{ codexTimeoutMs?: number }} config Runner configuration.
 * @param {{ purpose?: string }} request Request about to be executed.
 * @returns {number} Timeout in whole milliseconds.
 */
function resolveCodexTimeoutMs(config, request) {
  const override = config.codexTimeoutMs;
  if (typeof override === "number" && Number.isFinite(override) && override > 0) {
    // The value is passed to spawnSync as its `timeout` option, which Node
    // validates as an integer; round a fractional override up so it cannot
    // make the spawn call throw (and so 0.5 does not collapse to 0).
    return Math.ceil(override);
  }
  return request.purpose === "proof packet authoring" ? DEFAULT_PROOF_PACKET_AUTHOR_TIMEOUT_MS : DEFAULT_CODEX_TIMEOUT_MS;
}
|
|
400
|
+
/**
 * Report whether a parsed JSON value looks like a Codex lifecycle event.
 *
 * A value qualifies when it is a non-null, non-array object whose `type`
 * property is a string starting with one of the known event prefixes.
 *
 * @param {unknown} value Parsed JSON value to inspect.
 * @returns {boolean} True when the value matches the lifecycle-event shape.
 */
function isCodexLifecycleEvent(value) {
  const lifecyclePrefixes = ["thread.", "turn.", "exec.", "agent.", "token.", "reasoning.", "error."];
  const isNonArrayObject = Boolean(value) && typeof value === "object" && !Array.isArray(value);
  if (!isNonArrayObject) {
    return false;
  }
  const eventType = value.type;
  if (typeof eventType !== "string") {
    return false;
  }
  return lifecyclePrefixes.some((prefix) => eventType.startsWith(prefix));
}
|
|
405
|
+
/**
 * Classify every non-blank line of the runner outputs as either a Codex
 * lifecycle event (a JSON line accepted by isCodexLifecycleEvent) or other
 * text, and summarize the result.
 *
 * Identical output texts are analyzed only once. The runner hands over stdout
 * twice when no last-message file exists, and counting it twice would double
 * every reported line count and duplicate the samples.
 *
 * @param {Array<{ source?: string, text?: string }>} outputs Captured outputs.
 * @returns {{
 *   eventLineCount: number,
 *   eventTypes: string[],
 *   nonEventLineCount: number,
 *   nonEventSamples: string[],
 *   onlyLifecycleEvents: boolean
 * }} Counts, distinct event types, up to three non-event samples (each capped
 *   at 240 characters), and whether at least one event line and no non-event
 *   lines were seen.
 */
function analyzeCodexRunnerOutput(outputs) {
  const eventTypes = /* @__PURE__ */ new Set();
  const analyzedTexts = /* @__PURE__ */ new Set();
  const nonEventSamples = [];
  let eventLineCount = 0;
  let nonEventLineCount = 0;
  for (const output of Array.isArray(outputs) ? outputs : []) {
    // A missing or non-string text is treated as empty instead of throwing.
    const text = typeof output?.text === "string" ? output.text : "";
    if (analyzedTexts.has(text)) continue;
    analyzedTexts.add(text);
    const lines = text.split(/\r?\n/).map((line) => line.trim()).filter(Boolean);
    for (const line of lines) {
      let parsed;
      try {
        parsed = JSON.parse(line);
      } catch {
        // Not JSON: deliberately ignored, the line is counted as non-event text below.
        parsed = void 0;
      }
      if (isCodexLifecycleEvent(parsed)) {
        eventLineCount += 1;
        eventTypes.add(parsed.type);
        continue;
      }
      nonEventLineCount += 1;
      if (nonEventSamples.length < 3) nonEventSamples.push(line.slice(0, 240));
    }
  }
  return {
    eventLineCount,
    eventTypes: Array.from(eventTypes),
    nonEventLineCount,
    nonEventSamples,
    onlyLifecycleEvents: eventLineCount > 0 && nonEventLineCount === 0
  };
}
|
|
392
434
|
function isHarnessVerificationOnlyBlocker(blocker) {
|
|
393
435
|
const text = blocker.toLowerCase();
|
|
394
436
|
return (text.includes("erofs") || text.includes("read-only file system")) && text.includes("node_modules") && (text.includes(".vite-temp") || text.includes("vite.config"));
|
|
@@ -412,21 +454,25 @@ function runnerMetrics(input) {
|
|
|
412
454
|
exit_status: input.status ?? null,
|
|
413
455
|
timed_out: input.timedOut || false,
|
|
414
456
|
error_code: input.errorCode,
|
|
457
|
+
codex_event_types: input.codexEventTypes && input.codexEventTypes.length ? input.codexEventTypes : void 0,
|
|
458
|
+
codex_event_line_count: input.codexEventLineCount,
|
|
459
|
+
codex_non_event_line_count: input.codexNonEventLineCount,
|
|
415
460
|
codex_command: input.config.codexCommand || "codex",
|
|
416
461
|
codex_model: input.config.codexModel,
|
|
417
462
|
codex_sandbox: input.config.codexSandbox || "workspace-write",
|
|
418
463
|
codex_full_auto: input.config.codexFullAuto !== false,
|
|
419
|
-
timeout_ms:
|
|
464
|
+
timeout_ms: input.timeoutMs ?? DEFAULT_CODEX_TIMEOUT_MS
|
|
420
465
|
});
|
|
421
466
|
}
|
|
422
467
|
function createCodexExecJsonRunner(config = {}) {
|
|
423
468
|
return (request) => {
|
|
424
469
|
const startedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
425
470
|
const startedMs = Date.now();
|
|
471
|
+
const timeoutMs = resolveCodexTimeoutMs(config, request);
|
|
426
472
|
if (!request.workdir || !(0, import_node_fs.existsSync)(request.workdir)) {
|
|
427
473
|
return {
|
|
428
474
|
ok: false,
|
|
429
|
-
metrics: runnerMetrics({ request, config, startedAt, startedMs, errorCode: "workdir_missing" }),
|
|
475
|
+
metrics: runnerMetrics({ request, config, startedAt, startedMs, timeoutMs, errorCode: "workdir_missing" }),
|
|
430
476
|
blocker: {
|
|
431
477
|
code: "codex_workdir_missing",
|
|
432
478
|
message: `Codex workdir does not exist for ${request.purpose}.`,
|
|
@@ -461,7 +507,7 @@ function createCodexExecJsonRunner(config = {}) {
|
|
|
461
507
|
const proc = (0, import_node_child_process.spawnSync)(config.codexCommand || "codex", args, {
|
|
462
508
|
input: request.prompt,
|
|
463
509
|
encoding: "utf-8",
|
|
464
|
-
timeout:
|
|
510
|
+
timeout: timeoutMs,
|
|
465
511
|
maxBuffer: 10 * 1024 * 1024,
|
|
466
512
|
env
|
|
467
513
|
});
|
|
@@ -480,6 +526,7 @@ function createCodexExecJsonRunner(config = {}) {
|
|
|
480
526
|
stderr: proc.stderr || "",
|
|
481
527
|
status: proc.status,
|
|
482
528
|
timedOut,
|
|
529
|
+
timeoutMs,
|
|
483
530
|
errorCode: proc.error.code || "spawn_error"
|
|
484
531
|
}),
|
|
485
532
|
blocker: {
|
|
@@ -502,6 +549,7 @@ function createCodexExecJsonRunner(config = {}) {
|
|
|
502
549
|
stdout: proc.stdout || "",
|
|
503
550
|
stderr: proc.stderr || "",
|
|
504
551
|
status: proc.status,
|
|
552
|
+
timeoutMs,
|
|
505
553
|
errorCode: "nonzero_exit"
|
|
506
554
|
}),
|
|
507
555
|
blocker: {
|
|
@@ -514,12 +562,15 @@ function createCodexExecJsonRunner(config = {}) {
|
|
|
514
562
|
const finalText = (0, import_node_fs.existsSync)(lastMessagePath) ? (0, import_node_fs.readFileSync)(lastMessagePath, "utf-8") : String(proc.stdout || "");
|
|
515
563
|
const stdoutText = String(proc.stdout || "");
|
|
516
564
|
const stderrText = String(proc.stderr || "");
|
|
517
|
-
const
|
|
565
|
+
const runnerOutputs = [
|
|
518
566
|
{ source: (0, import_node_fs.existsSync)(lastMessagePath) ? "last_message" : "stdout", text: finalText },
|
|
519
567
|
{ source: "stdout", text: stdoutText },
|
|
520
568
|
{ source: "stderr", text: stderrText }
|
|
521
|
-
]
|
|
569
|
+
];
|
|
570
|
+
const { parsed, source: parsedJsonSource } = parseJsonFromRunnerOutputs(runnerOutputs, request.schema);
|
|
522
571
|
if (!parsed) {
|
|
572
|
+
const outputAnalysis = analyzeCodexRunnerOutput(runnerOutputs);
|
|
573
|
+
const errorCode = outputAnalysis.onlyLifecycleEvents ? "no_final_response" : "invalid_json";
|
|
523
574
|
return {
|
|
524
575
|
ok: false,
|
|
525
576
|
stdout: stdoutText,
|
|
@@ -533,12 +584,24 @@ function createCodexExecJsonRunner(config = {}) {
|
|
|
533
584
|
stderr: stderrText,
|
|
534
585
|
finalText,
|
|
535
586
|
status: proc.status,
|
|
536
|
-
|
|
587
|
+
timeoutMs,
|
|
588
|
+
errorCode,
|
|
589
|
+
codexEventTypes: outputAnalysis.eventTypes,
|
|
590
|
+
codexEventLineCount: outputAnalysis.eventLineCount,
|
|
591
|
+
codexNonEventLineCount: outputAnalysis.nonEventLineCount
|
|
537
592
|
}),
|
|
538
593
|
blocker: {
|
|
539
|
-
code: "codex_invalid_json",
|
|
540
|
-
message: `Codex completed ${request.purpose}, but did not return valid JSON.`,
|
|
541
|
-
details: {
|
|
594
|
+
code: outputAnalysis.onlyLifecycleEvents ? "codex_no_final_response" : "codex_invalid_json",
|
|
595
|
+
message: outputAnalysis.onlyLifecycleEvents ? `Codex emitted lifecycle events during ${request.purpose}, but did not produce a final JSON response.` : `Codex completed ${request.purpose}, but did not return valid JSON.`,
|
|
596
|
+
details: {
|
|
597
|
+
finalText,
|
|
598
|
+
stdout: stdoutText,
|
|
599
|
+
stderr: stderrText,
|
|
600
|
+
event_types: outputAnalysis.eventTypes,
|
|
601
|
+
event_line_count: outputAnalysis.eventLineCount,
|
|
602
|
+
non_event_line_count: outputAnalysis.nonEventLineCount,
|
|
603
|
+
non_event_samples: outputAnalysis.nonEventSamples
|
|
604
|
+
}
|
|
542
605
|
}
|
|
543
606
|
};
|
|
544
607
|
}
|
|
@@ -556,7 +619,8 @@ function createCodexExecJsonRunner(config = {}) {
|
|
|
556
619
|
stderr: stderrText,
|
|
557
620
|
finalText,
|
|
558
621
|
parsedJsonSource,
|
|
559
|
-
status: proc.status
|
|
622
|
+
status: proc.status,
|
|
623
|
+
timeoutMs
|
|
560
624
|
})
|
|
561
625
|
};
|
|
562
626
|
} finally {
|
|
@@ -665,6 +729,7 @@ function createCodexExecAgentAdapter(config = {}, runner = createCodexExecJsonRu
|
|
|
665
729
|
"Write a proof_plan and capture_script that will verify the exact user-facing change.",
|
|
666
730
|
"Use recon_assessment.baseline_understanding as the source of truth. Do not author a proof plan unless it names the observed before state and the requested delta from that state.",
|
|
667
731
|
"Use the recon-approved route and baseline context; make the plan name the concrete target, expected before state, expected after state, and stop condition.",
|
|
732
|
+
"Do not leave this authoring stage pending for external investigation. Keep any repo inspection brief, do not modify files, and return the JSON proof packet from the available state.",
|
|
668
733
|
"Choose the evidence modality from verification_mode and success_criteria: screenshots for visual/UI proof, interactions plus screenshots for interaction proof, structured metrics/logs/JSON/audio analysis for non-visual proof.",
|
|
669
734
|
"For playable/gameplay proof, treat screenshots as supporting artifacts only: start the game, send keyboard or pointer input, measure state before/after, measure non-HUD canvas/playfield pixel deltas across time, and return playability evidence with version riddle-proof.playability.v1.",
|
|
670
735
|
"For interaction proof, return a structured evidence object with start route/state, terminal route/state, action, assertions, and matched UI text. Catch waitForURL or selector timeouts and record them as failed assertions instead of throwing before evidence is emitted.",
|
package/dist/codex-exec-agent.js
CHANGED
package/dist/engine-harness.cjs
CHANGED
|
@@ -5585,6 +5585,18 @@ async function routeCheckpoint(request, state, result, agent, input) {
|
|
|
5585
5585
|
if (checkpoint === "verify_agent_retry") {
|
|
5586
5586
|
const next = recommendedContinuation(result);
|
|
5587
5587
|
if (next) return { next };
|
|
5588
|
+
return {
|
|
5589
|
+
blocker: {
|
|
5590
|
+
code: "proof_assessment_blocked",
|
|
5591
|
+
checkpoint,
|
|
5592
|
+
message: result.summary || "The supervising proof assessment did not approve shipping and did not provide a safe retry continuation.",
|
|
5593
|
+
details: compactRecord({
|
|
5594
|
+
proofAssessment: result.proofAssessment || result.checkpointContract?.proof_assessment || recordValue(result.raw)?.proofAssessment || null,
|
|
5595
|
+
verifyDecisionRequest: result.verifyDecisionRequest || result.checkpointContract?.verify_decision_request || null,
|
|
5596
|
+
checkpointContract: result.checkpointContract || null
|
|
5597
|
+
})
|
|
5598
|
+
}
|
|
5599
|
+
};
|
|
5588
5600
|
}
|
|
5589
5601
|
if (checkpoint === "awaiting_stage_advance") {
|
|
5590
5602
|
const next = recommendedContinuation(result) || defaultAwaitingStageContinuation(result);
|
package/dist/engine-harness.js
CHANGED