@riddledc/riddle-proof 0.8.9 → 0.8.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/advanced/engine-harness.cjs +12 -0
- package/dist/advanced/engine-harness.js +1 -1
- package/dist/advanced/index.cjs +12 -0
- package/dist/advanced/index.js +1 -1
- package/dist/{chunk-RTWGGKS3.js → chunk-2PXL3RDB.js} +1 -1
- package/dist/{chunk-E7ATYSYS.js → chunk-BBUO7HM4.js} +12 -0
- package/dist/cli/index.js +2 -2
- package/dist/cli.cjs +12 -0
- package/dist/cli.js +2 -2
- package/dist/engine-harness.cjs +12 -0
- package/dist/engine-harness.js +1 -1
- package/dist/index.cjs +12 -0
- package/dist/index.js +1 -1
- package/package.json +2 -2
- package/runtime/lib/verify.py +110 -10
- package/runtime/tests/recon_verify_smoke.py +201 -24
- package/runtime/tests/trust_boundary_regression.py +149 -0
|
@@ -5587,6 +5587,18 @@ async function routeCheckpoint(request, state, result, agent, input) {
|
|
|
5587
5587
|
if (checkpoint === "verify_agent_retry") {
|
|
5588
5588
|
const next = recommendedContinuation(result);
|
|
5589
5589
|
if (next) return { next };
|
|
5590
|
+
return {
|
|
5591
|
+
blocker: {
|
|
5592
|
+
code: "proof_assessment_blocked",
|
|
5593
|
+
checkpoint,
|
|
5594
|
+
message: result.summary || "The supervising proof assessment did not approve shipping and did not provide a safe retry continuation.",
|
|
5595
|
+
details: compactRecord({
|
|
5596
|
+
proofAssessment: result.proofAssessment || result.checkpointContract?.proof_assessment || recordValue(result.raw)?.proofAssessment || null,
|
|
5597
|
+
verifyDecisionRequest: result.verifyDecisionRequest || result.checkpointContract?.verify_decision_request || null,
|
|
5598
|
+
checkpointContract: result.checkpointContract || null
|
|
5599
|
+
})
|
|
5600
|
+
}
|
|
5601
|
+
};
|
|
5590
5602
|
}
|
|
5591
5603
|
if (checkpoint === "awaiting_stage_advance") {
|
|
5592
5604
|
const next = recommendedContinuation(result) || defaultAwaitingStageContinuation(result);
|
|
@@ -2,7 +2,7 @@ import {
|
|
|
2
2
|
createDisabledRiddleProofAgentAdapter,
|
|
3
3
|
readRiddleProofRunStatus,
|
|
4
4
|
runRiddleProofEngineHarness
|
|
5
|
-
} from "../chunk-E7ATYSYS.js";
|
|
5
|
+
} from "../chunk-BBUO7HM4.js";
|
|
6
6
|
import "../chunk-YZUVEJ5B.js";
|
|
7
7
|
import "../chunk-FMOYUYH2.js";
|
|
8
8
|
import "../chunk-5N5QFI2S.js";
|
package/dist/advanced/index.cjs
CHANGED
|
@@ -6123,6 +6123,18 @@ async function routeCheckpoint(request, state, result, agent, input) {
|
|
|
6123
6123
|
if (checkpoint === "verify_agent_retry") {
|
|
6124
6124
|
const next = recommendedContinuation(result);
|
|
6125
6125
|
if (next) return { next };
|
|
6126
|
+
return {
|
|
6127
|
+
blocker: {
|
|
6128
|
+
code: "proof_assessment_blocked",
|
|
6129
|
+
checkpoint,
|
|
6130
|
+
message: result.summary || "The supervising proof assessment did not approve shipping and did not provide a safe retry continuation.",
|
|
6131
|
+
details: compactRecord({
|
|
6132
|
+
proofAssessment: result.proofAssessment || result.checkpointContract?.proof_assessment || recordValue(result.raw)?.proofAssessment || null,
|
|
6133
|
+
verifyDecisionRequest: result.verifyDecisionRequest || result.checkpointContract?.verify_decision_request || null,
|
|
6134
|
+
checkpointContract: result.checkpointContract || null
|
|
6135
|
+
})
|
|
6136
|
+
}
|
|
6137
|
+
};
|
|
6126
6138
|
}
|
|
6127
6139
|
if (checkpoint === "awaiting_stage_advance") {
|
|
6128
6140
|
const next = recommendedContinuation(result) || defaultAwaitingStageContinuation(result);
|
package/dist/advanced/index.js
CHANGED
|
@@ -1331,6 +1331,18 @@ async function routeCheckpoint(request, state, result, agent, input) {
|
|
|
1331
1331
|
if (checkpoint === "verify_agent_retry") {
|
|
1332
1332
|
const next = recommendedContinuation(result);
|
|
1333
1333
|
if (next) return { next };
|
|
1334
|
+
return {
|
|
1335
|
+
blocker: {
|
|
1336
|
+
code: "proof_assessment_blocked",
|
|
1337
|
+
checkpoint,
|
|
1338
|
+
message: result.summary || "The supervising proof assessment did not approve shipping and did not provide a safe retry continuation.",
|
|
1339
|
+
details: compactRecord({
|
|
1340
|
+
proofAssessment: result.proofAssessment || result.checkpointContract?.proof_assessment || recordValue(result.raw)?.proofAssessment || null,
|
|
1341
|
+
verifyDecisionRequest: result.verifyDecisionRequest || result.checkpointContract?.verify_decision_request || null,
|
|
1342
|
+
checkpointContract: result.checkpointContract || null
|
|
1343
|
+
})
|
|
1344
|
+
}
|
|
1345
|
+
};
|
|
1334
1346
|
}
|
|
1335
1347
|
if (checkpoint === "awaiting_stage_advance") {
|
|
1336
1348
|
const next = recommendedContinuation(result) || defaultAwaitingStageContinuation(result);
|
package/dist/cli/index.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import "../chunk-RTWGGKS3.js";
|
|
1
|
+
import "../chunk-2PXL3RDB.js";
|
|
2
2
|
import "../chunk-PEWAIEER.js";
|
|
3
3
|
import "../chunk-TWTEUS7R.js";
|
|
4
|
-
import "../chunk-E7ATYSYS.js";
|
|
4
|
+
import "../chunk-BBUO7HM4.js";
|
|
5
5
|
import "../chunk-YZUVEJ5B.js";
|
|
6
6
|
import "../chunk-FMOYUYH2.js";
|
|
7
7
|
import "../chunk-5N5QFI2S.js";
|
package/dist/cli.cjs
CHANGED
|
@@ -5656,6 +5656,18 @@ async function routeCheckpoint(request, state, result, agent, input) {
|
|
|
5656
5656
|
if (checkpoint === "verify_agent_retry") {
|
|
5657
5657
|
const next = recommendedContinuation(result);
|
|
5658
5658
|
if (next) return { next };
|
|
5659
|
+
return {
|
|
5660
|
+
blocker: {
|
|
5661
|
+
code: "proof_assessment_blocked",
|
|
5662
|
+
checkpoint,
|
|
5663
|
+
message: result.summary || "The supervising proof assessment did not approve shipping and did not provide a safe retry continuation.",
|
|
5664
|
+
details: compactRecord({
|
|
5665
|
+
proofAssessment: result.proofAssessment || result.checkpointContract?.proof_assessment || recordValue(result.raw)?.proofAssessment || null,
|
|
5666
|
+
verifyDecisionRequest: result.verifyDecisionRequest || result.checkpointContract?.verify_decision_request || null,
|
|
5667
|
+
checkpointContract: result.checkpointContract || null
|
|
5668
|
+
})
|
|
5669
|
+
}
|
|
5670
|
+
};
|
|
5659
5671
|
}
|
|
5660
5672
|
if (checkpoint === "awaiting_stage_advance") {
|
|
5661
5673
|
const next = recommendedContinuation(result) || defaultAwaitingStageContinuation(result);
|
package/dist/cli.js
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
import "./chunk-RTWGGKS3.js";
|
|
2
|
+
import "./chunk-2PXL3RDB.js";
|
|
3
3
|
import "./chunk-PEWAIEER.js";
|
|
4
4
|
import "./chunk-TWTEUS7R.js";
|
|
5
|
-
import "./chunk-E7ATYSYS.js";
|
|
5
|
+
import "./chunk-BBUO7HM4.js";
|
|
6
6
|
import "./chunk-YZUVEJ5B.js";
|
|
7
7
|
import "./chunk-FMOYUYH2.js";
|
|
8
8
|
import "./chunk-5N5QFI2S.js";
|
package/dist/engine-harness.cjs
CHANGED
|
@@ -5585,6 +5585,18 @@ async function routeCheckpoint(request, state, result, agent, input) {
|
|
|
5585
5585
|
if (checkpoint === "verify_agent_retry") {
|
|
5586
5586
|
const next = recommendedContinuation(result);
|
|
5587
5587
|
if (next) return { next };
|
|
5588
|
+
return {
|
|
5589
|
+
blocker: {
|
|
5590
|
+
code: "proof_assessment_blocked",
|
|
5591
|
+
checkpoint,
|
|
5592
|
+
message: result.summary || "The supervising proof assessment did not approve shipping and did not provide a safe retry continuation.",
|
|
5593
|
+
details: compactRecord({
|
|
5594
|
+
proofAssessment: result.proofAssessment || result.checkpointContract?.proof_assessment || recordValue(result.raw)?.proofAssessment || null,
|
|
5595
|
+
verifyDecisionRequest: result.verifyDecisionRequest || result.checkpointContract?.verify_decision_request || null,
|
|
5596
|
+
checkpointContract: result.checkpointContract || null
|
|
5597
|
+
})
|
|
5598
|
+
}
|
|
5599
|
+
};
|
|
5588
5600
|
}
|
|
5589
5601
|
if (checkpoint === "awaiting_stage_advance") {
|
|
5590
5602
|
const next = recommendedContinuation(result) || defaultAwaitingStageContinuation(result);
|
package/dist/engine-harness.js
CHANGED
package/dist/index.cjs
CHANGED
|
@@ -6319,6 +6319,18 @@ async function routeCheckpoint(request, state, result, agent, input) {
|
|
|
6319
6319
|
if (checkpoint === "verify_agent_retry") {
|
|
6320
6320
|
const next = recommendedContinuation(result);
|
|
6321
6321
|
if (next) return { next };
|
|
6322
|
+
return {
|
|
6323
|
+
blocker: {
|
|
6324
|
+
code: "proof_assessment_blocked",
|
|
6325
|
+
checkpoint,
|
|
6326
|
+
message: result.summary || "The supervising proof assessment did not approve shipping and did not provide a safe retry continuation.",
|
|
6327
|
+
details: compactRecord({
|
|
6328
|
+
proofAssessment: result.proofAssessment || result.checkpointContract?.proof_assessment || recordValue(result.raw)?.proofAssessment || null,
|
|
6329
|
+
verifyDecisionRequest: result.verifyDecisionRequest || result.checkpointContract?.verify_decision_request || null,
|
|
6330
|
+
checkpointContract: result.checkpointContract || null
|
|
6331
|
+
})
|
|
6332
|
+
}
|
|
6333
|
+
};
|
|
6322
6334
|
}
|
|
6323
6335
|
if (checkpoint === "awaiting_stage_advance") {
|
|
6324
6336
|
const next = recommendedContinuation(result) || defaultAwaitingStageContinuation(result);
|
package/dist/index.js
CHANGED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@riddledc/riddle-proof",
|
|
3
|
-
"version": "0.8.9",
|
|
3
|
+
"version": "0.8.11",
|
|
4
4
|
"description": "Reusable Riddle Proof contracts and helpers for evidence-backed agent changes.",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"author": "RiddleDC",
|
|
@@ -227,6 +227,6 @@
|
|
|
227
227
|
"build": "tsup src/index.ts src/types.ts src/result.ts src/state.ts src/checkpoint.ts src/run-card.ts src/runner.ts src/engine-harness.ts src/codex-exec-agent.ts src/local-agent.ts src/cli.ts src/cli/index.ts src/diagnostics.ts src/proof-session.ts src/playability.ts src/basic-gameplay.ts src/profile.ts src/profile/index.ts src/openclaw.ts src/proof-run-core.ts src/proof-run-engine.ts src/riddle-client.ts src/runtime/riddle-client.ts src/spec/index.ts src/spec/types.ts src/spec/result.ts src/spec/state.ts src/spec/checkpoint.ts src/spec/run-card.ts src/runtime/index.ts src/app-contract/index.ts src/advanced/index.ts src/advanced/runner.ts src/advanced/engine-harness.ts src/advanced/proof-run-core.ts src/advanced/proof-run-engine.ts src/adapters/openclaw.ts src/adapters/local-agent.ts src/adapters/codex-exec-agent.ts src/adapters/codex.ts --format cjs,esm --dts --out-dir dist --clean",
|
|
228
228
|
"clean": "rm -rf dist",
|
|
229
229
|
"lint": "echo 'lint: (not configured)'",
|
|
230
|
-
"test": "npm run build && node test.js && node proof-run.test.js"
|
|
230
|
+
"test": "npm run build && node test.js && node proof-run.test.js && node trust-boundary.test.js && python3 runtime/tests/trust_boundary_regression.py"
|
|
231
231
|
}
|
|
232
232
|
}
|
package/runtime/lib/verify.py
CHANGED
|
@@ -646,6 +646,24 @@ def proof_evidence_records(value):
|
|
|
646
646
|
return []
|
|
647
647
|
|
|
648
648
|
|
|
649
|
+
def proof_evidence_records_deep(value, depth=0):
|
|
650
|
+
if depth > 6:
|
|
651
|
+
return []
|
|
652
|
+
if isinstance(value, dict):
|
|
653
|
+
records = [value]
|
|
654
|
+
for key in EVIDENCE_CONTAINER_KEYS:
|
|
655
|
+
nested = value.get(key)
|
|
656
|
+
if isinstance(nested, (dict, list)):
|
|
657
|
+
records.extend(proof_evidence_records_deep(nested, depth + 1))
|
|
658
|
+
return records
|
|
659
|
+
if isinstance(value, list):
|
|
660
|
+
records = []
|
|
661
|
+
for item in value:
|
|
662
|
+
records.extend(proof_evidence_records_deep(item, depth + 1))
|
|
663
|
+
return records
|
|
664
|
+
return []
|
|
665
|
+
|
|
666
|
+
|
|
649
667
|
def static_audit_evidence_support(value):
|
|
650
668
|
for record in proof_evidence_records(value):
|
|
651
669
|
explicit_static = (
|
|
@@ -1993,6 +2011,36 @@ def route_parts(value):
|
|
|
1993
2011
|
}
|
|
1994
2012
|
|
|
1995
2013
|
|
|
2014
|
+
def explicit_route_match_flag(record):
|
|
2015
|
+
if not isinstance(record, dict):
|
|
2016
|
+
return None
|
|
2017
|
+
true_keys = ('routeMatched', 'route_matched', 'routeMatches', 'route_matches')
|
|
2018
|
+
false_keys = true_keys + ('passed', 'ok', 'proofReady', 'proof_ready', 'interactionPassed', 'interaction_passed')
|
|
2019
|
+
if any(record.get(key) is False for key in false_keys):
|
|
2020
|
+
return False
|
|
2021
|
+
if any(record.get(key) is True for key in true_keys):
|
|
2022
|
+
return True
|
|
2023
|
+
return None
|
|
2024
|
+
|
|
2025
|
+
|
|
2026
|
+
def interaction_proof_route_match(expected_path, proof_evidence):
|
|
2027
|
+
expected = normalize_observed_path(expected_path)
|
|
2028
|
+
if not expected or proof_evidence is None:
|
|
2029
|
+
return None
|
|
2030
|
+
for record in proof_evidence_records_deep(proof_evidence):
|
|
2031
|
+
flag = explicit_route_match_flag(record)
|
|
2032
|
+
candidate = terminal_path_from_record(record)
|
|
2033
|
+
if candidate and route_matches_expected(expected, candidate):
|
|
2034
|
+
return {
|
|
2035
|
+
'matched': True,
|
|
2036
|
+
'observed_path': normalize_observed_path(candidate),
|
|
2037
|
+
'observed_path_raw': candidate,
|
|
2038
|
+
'source': 'proof_evidence_terminal_route',
|
|
2039
|
+
'route_match_flag': flag,
|
|
2040
|
+
}
|
|
2041
|
+
return None
|
|
2042
|
+
|
|
2043
|
+
|
|
1996
2044
|
EXPLICIT_TERMINAL_PATH_KEYS = (
|
|
1997
2045
|
'expected_terminal_path', 'expectedTerminalPath',
|
|
1998
2046
|
'expected_terminal_url', 'expectedTerminalUrl',
|
|
@@ -2110,18 +2158,29 @@ def text_path_candidate(value):
|
|
|
2110
2158
|
return path_candidate(raw)
|
|
2111
2159
|
|
|
2112
2160
|
|
|
2161
|
+
def text_route_candidate(value):
|
|
2162
|
+
candidate = text_path_candidate(value)
|
|
2163
|
+
if not candidate:
|
|
2164
|
+
return ''
|
|
2165
|
+
parsed = urlparse(candidate)
|
|
2166
|
+
first_segment = next((part for part in (parsed.path or '').split('/') if part), '')
|
|
2167
|
+
if first_segment and first_segment[:1].isupper():
|
|
2168
|
+
return ''
|
|
2169
|
+
return candidate
|
|
2170
|
+
|
|
2171
|
+
|
|
2113
2172
|
def terminal_path_from_text(value):
|
|
2114
2173
|
if not isinstance(value, str):
|
|
2115
2174
|
return ''
|
|
2116
2175
|
for match in re.findall(r"""['"`](/[^'"`\s]+[?#][^'"`\s]*)['"`]""", value):
|
|
2117
|
-
candidate =
|
|
2176
|
+
candidate = text_route_candidate(match)
|
|
2118
2177
|
if candidate:
|
|
2119
2178
|
return candidate
|
|
2120
2179
|
context_pattern = re.compile(
|
|
2121
|
-
r"""(?is)\b(?:expected\s+(?:terminal|after|final)|terminal|after|final)\
|
|
2180
|
+
r"""(?is)\b(?:expected\s+(?:terminal|after|final)(?:\s+(?:route|path|url))?|terminal(?:\s+(?:route|path|url))?|after(?:\s+(?:route|path|url))?|final(?:\s+(?:route|path|url))?)\s*(?:should\s+(?:be|equal|match)|must\s+(?:be|equal|match)|is|as|to|=|:)?\s*['"`]?(/[^'"`\s,;)]*)"""
|
|
2122
2181
|
)
|
|
2123
2182
|
for match in context_pattern.findall(value):
|
|
2124
|
-
candidate =
|
|
2183
|
+
candidate = text_route_candidate(match)
|
|
2125
2184
|
if candidate:
|
|
2126
2185
|
return candidate
|
|
2127
2186
|
return ''
|
|
@@ -2168,6 +2227,8 @@ INTERACTION_FAILURE_FLAG_KEYS = (
|
|
|
2168
2227
|
'proof_ready',
|
|
2169
2228
|
'interactionPassed',
|
|
2170
2229
|
'interaction_passed',
|
|
2230
|
+
'routeMatched',
|
|
2231
|
+
'route_matched',
|
|
2171
2232
|
'routeMatches',
|
|
2172
2233
|
'route_matches',
|
|
2173
2234
|
)
|
|
@@ -2339,6 +2400,13 @@ def interaction_terminal_path_from_evidence(proof_evidence):
|
|
|
2339
2400
|
|
|
2340
2401
|
|
|
2341
2402
|
def interaction_terminal_path_from_state(state):
|
|
2403
|
+
for key in (
|
|
2404
|
+
'expected_terminal_path',
|
|
2405
|
+
'expected_after_path',
|
|
2406
|
+
):
|
|
2407
|
+
candidate = path_candidate(state.get(key))
|
|
2408
|
+
if candidate:
|
|
2409
|
+
return candidate, key
|
|
2342
2410
|
for key in (
|
|
2343
2411
|
'interaction_contract',
|
|
2344
2412
|
'proof_contract',
|
|
@@ -2351,14 +2419,10 @@ def interaction_terminal_path_from_state(state):
|
|
|
2351
2419
|
if candidate:
|
|
2352
2420
|
return candidate, key
|
|
2353
2421
|
for key in (
|
|
2354
|
-
'expected_terminal_path',
|
|
2355
|
-
'expected_after_path',
|
|
2356
2422
|
'capture_script',
|
|
2357
2423
|
'proof_plan',
|
|
2358
|
-
'success_criteria',
|
|
2359
|
-
'change_request',
|
|
2360
2424
|
):
|
|
2361
|
-
candidate =
|
|
2425
|
+
candidate = terminal_path_from_text(state.get(key))
|
|
2362
2426
|
if candidate:
|
|
2363
2427
|
return candidate, key
|
|
2364
2428
|
return '', ''
|
|
@@ -2649,6 +2713,21 @@ def evaluate_capture_quality(payload, expected_path, verification_mode='proof'):
|
|
|
2649
2713
|
'observed_path_raw': expected_path,
|
|
2650
2714
|
})
|
|
2651
2715
|
|
|
2716
|
+
proof_route_match = (
|
|
2717
|
+
interaction_proof_route_match(expected_path, proof_evidence)
|
|
2718
|
+
if mode in INTERACTION_MODES
|
|
2719
|
+
else None
|
|
2720
|
+
)
|
|
2721
|
+
if isinstance(proof_route_match, dict):
|
|
2722
|
+
details['proof_evidence_route_matched'] = bool(proof_route_match.get('matched'))
|
|
2723
|
+
details['proof_evidence_route_match_source'] = proof_route_match.get('source') or ''
|
|
2724
|
+
details['proof_evidence_observed_path'] = proof_route_match.get('observed_path') or ''
|
|
2725
|
+
details['proof_evidence_observed_path_raw'] = proof_route_match.get('observed_path_raw') or ''
|
|
2726
|
+
if proof_route_match.get('matched') and proof_route_match.get('observed_path'):
|
|
2727
|
+
details['observed_path'] = proof_route_match.get('observed_path')
|
|
2728
|
+
details['observed_path_raw'] = proof_route_match.get('observed_path_raw') or proof_route_match.get('observed_path')
|
|
2729
|
+
details['observed_path_source'] = 'proof_evidence'
|
|
2730
|
+
|
|
2652
2731
|
console = payload.get('console') or []
|
|
2653
2732
|
for text in iter_console_messages(console):
|
|
2654
2733
|
if is_proof_telemetry_console_message(text):
|
|
@@ -2698,7 +2777,14 @@ def evaluate_capture_quality(payload, expected_path, verification_mode='proof'):
|
|
|
2698
2777
|
reasons.append('page has console/runtime errors')
|
|
2699
2778
|
|
|
2700
2779
|
observed_path = normalize_observed_path(details.get('observed_path'))
|
|
2701
|
-
|
|
2780
|
+
proof_route_matched = isinstance(proof_route_match, dict) and proof_route_match.get('matched')
|
|
2781
|
+
if (
|
|
2782
|
+
isinstance(page_state, dict)
|
|
2783
|
+
and expected_path
|
|
2784
|
+
and observed_path
|
|
2785
|
+
and not proof_route_matched
|
|
2786
|
+
and not route_matches_expected(expected_path, observed_path)
|
|
2787
|
+
):
|
|
2702
2788
|
raw_observed = details.get('observed_path_raw') or details.get('observed_path') or observed_path
|
|
2703
2789
|
reasons.append(f'wrong route: expected {expected_path}, got {raw_observed}')
|
|
2704
2790
|
|
|
@@ -3640,7 +3726,21 @@ if has_good_evidence:
|
|
|
3640
3726
|
summary_lines.append('Proof assessment: awaiting supervising agent judgment')
|
|
3641
3727
|
summary_lines.append('Proof next stage: supervising agent decides after reviewing the evidence packet')
|
|
3642
3728
|
else:
|
|
3643
|
-
capture_retry =
|
|
3729
|
+
capture_retry = build_capture_retry_decision(after_observation, required_baseline_present, proof_evidence_blocker, s.get('route_expectation') or {})
|
|
3730
|
+
if visual_delta_recovery:
|
|
3731
|
+
observation_reason = str(after_observation.get('reason') or '')
|
|
3732
|
+
observation_details = after_observation.get('details') if isinstance(after_observation.get('details'), dict) else {}
|
|
3733
|
+
has_primary_capture_failure = bool(
|
|
3734
|
+
'wrong route' in observation_reason
|
|
3735
|
+
or 'console/runtime errors' in observation_reason
|
|
3736
|
+
or (observation_details.get('capture_error_messages') or [])
|
|
3737
|
+
or proof_evidence_blocker
|
|
3738
|
+
)
|
|
3739
|
+
if has_primary_capture_failure:
|
|
3740
|
+
capture_retry['visual_delta_recovery'] = visual_delta_recovery
|
|
3741
|
+
capture_retry.setdefault('reasons', []).append('Visual delta recovery also needed: ' + str(visual_delta_recovery.get('summary') or visual_delta_recovery.get('reason') or 'visual delta incomplete'))
|
|
3742
|
+
else:
|
|
3743
|
+
capture_retry = visual_delta_recovery
|
|
3644
3744
|
next_stage_options = ['author', 'verify', 'recon'] if no_implementation_mode else ['author', 'verify', 'implement', 'recon']
|
|
3645
3745
|
s['verify_status'] = 'capture_incomplete'
|
|
3646
3746
|
s['merge_recommendation'] = 'do-not-merge'
|
|
@@ -325,6 +325,51 @@ class FakeRiddle:
|
|
|
325
325
|
'proof.json': {'script_error': message},
|
|
326
326
|
},
|
|
327
327
|
}
|
|
328
|
+
if 'pricingQueryHashPassesWithPageStateHashGap' in script:
|
|
329
|
+
page_state = {
|
|
330
|
+
'bodyTextLength': 260,
|
|
331
|
+
'visibleTextSample': 'Pricing One rate Browser Compute Example Costs',
|
|
332
|
+
'interactiveElements': 8,
|
|
333
|
+
'visibleInteractiveElements': 8,
|
|
334
|
+
'pathname': '/pricing/',
|
|
335
|
+
'search': '?rp_probe=1',
|
|
336
|
+
'hash': '',
|
|
337
|
+
'title': 'Pricing',
|
|
338
|
+
'buttons': [],
|
|
339
|
+
'headings': ['Pricing', 'Browser Compute'],
|
|
340
|
+
'links': [{'text': 'Pricing', 'href': '/pricing/?rp_probe=1#pricing-probe'}],
|
|
341
|
+
'canvasCount': 0,
|
|
342
|
+
'largeVisibleElements': [{'tag': 'main', 'text': 'Pricing'}],
|
|
343
|
+
}
|
|
344
|
+
proof_evidence = {
|
|
345
|
+
'version': 'riddle-proof.interaction.v1',
|
|
346
|
+
'start': {'href': 'https://riddledc.com/'},
|
|
347
|
+
'action': {'type': 'click', 'target': 'Pricing'},
|
|
348
|
+
'terminal': {'href': 'https://riddledc.com/pricing/?rp_probe=1#pricing-probe'},
|
|
349
|
+
'afterUrl': 'https://riddledc.com/pricing/?rp_probe=1#pricing-probe',
|
|
350
|
+
'routeMatched': True,
|
|
351
|
+
'assertions': {
|
|
352
|
+
'startedOnHome': True,
|
|
353
|
+
'clickedPricingNavigation': True,
|
|
354
|
+
'terminalUrlPreserved': True,
|
|
355
|
+
'pricingContentVisible': True,
|
|
356
|
+
},
|
|
357
|
+
}
|
|
358
|
+
return {
|
|
359
|
+
'ok': True,
|
|
360
|
+
'screenshots': [{'url': 'https://cdn.example.com/pricing-query-hash.png'}],
|
|
361
|
+
'outputs': [{'name': 'after-pricing-query-hash.png', 'url': 'https://cdn.example.com/pricing-query-hash.png'}],
|
|
362
|
+
'result': {'pageState': page_state, 'proofEvidence': proof_evidence},
|
|
363
|
+
'console': [
|
|
364
|
+
'RIDDLE_PROOF_STATE:' + json.dumps(page_state),
|
|
365
|
+
'RIDDLE_PROOF_EVIDENCE:' + json.dumps(proof_evidence),
|
|
366
|
+
],
|
|
367
|
+
'visual_diff': {
|
|
368
|
+
'diffPercentage': 1.2,
|
|
369
|
+
'differentPixels': 12000,
|
|
370
|
+
'totalPixels': 972000,
|
|
371
|
+
},
|
|
372
|
+
}
|
|
328
373
|
if 'clickedProofNavigation' in script:
|
|
329
374
|
page_state = {
|
|
330
375
|
'bodyTextLength': 180,
|
|
@@ -584,6 +629,26 @@ def write_state(path: Path, payload: dict):
|
|
|
584
629
|
path.write_text(json.dumps(payload, indent=2))
|
|
585
630
|
|
|
586
631
|
|
|
632
|
+
def evidence_records(value):
|
|
633
|
+
if isinstance(value, dict):
|
|
634
|
+
records = [value]
|
|
635
|
+
for key in (
|
|
636
|
+
'proofEvidence', 'proof_evidence',
|
|
637
|
+
'interactionEvidence', 'interaction_evidence',
|
|
638
|
+
'evidence',
|
|
639
|
+
):
|
|
640
|
+
nested = value.get(key)
|
|
641
|
+
if isinstance(nested, (dict, list)):
|
|
642
|
+
records.extend(evidence_records(nested))
|
|
643
|
+
return records
|
|
644
|
+
if isinstance(value, list):
|
|
645
|
+
records = []
|
|
646
|
+
for item in value:
|
|
647
|
+
records.extend(evidence_records(item))
|
|
648
|
+
return records
|
|
649
|
+
return []
|
|
650
|
+
|
|
651
|
+
|
|
587
652
|
def run_capture_artifact_enrichment():
|
|
588
653
|
util = load_module('util_artifact_enrichment', UTIL_PATH)
|
|
589
654
|
fixtures = {
|
|
@@ -2189,8 +2254,10 @@ def run_verify_structured_evidence_without_screenshot():
|
|
|
2189
2254
|
assert '__riddleProofEvidenceRoot.__riddleProofEvidence' not in capture_script
|
|
2190
2255
|
assert '__riddleProofCaptureScriptResult = await (async () =>' in capture_script
|
|
2191
2256
|
assert 'attack_ms_after' in supporting['proof_evidence_sample']
|
|
2192
|
-
|
|
2193
|
-
|
|
2257
|
+
proof_evidence_records = evidence_records(after_verify['evidence_bundle']['proof_evidence'])
|
|
2258
|
+
after_proof_evidence_records = evidence_records(after_verify['evidence_bundle']['after']['proof_evidence'])
|
|
2259
|
+
assert any(record.get('attack_ms_after') == 12 for record in proof_evidence_records)
|
|
2260
|
+
assert any(record.get('attack_ms_after') == 12 for record in after_proof_evidence_records)
|
|
2194
2261
|
assert after_verify['proof_assessment_request']['evidence_bundle']['after']['supporting_artifacts']['proof_evidence_present'] is True
|
|
2195
2262
|
assert 'structured-artifacts' in after_verify['proof_assessment_request']['evidence_basis']
|
|
2196
2263
|
assert 'semantic-context' in after_verify['proof_assessment_request']['evidence_basis']
|
|
@@ -2487,7 +2554,6 @@ def run_verify_interaction_terminal_route_from_proof_evidence():
|
|
|
2487
2554
|
assert after_verify['verify_status'] == 'evidence_captured'
|
|
2488
2555
|
assert after_verify['route_expectation']['start_path'] == '/'
|
|
2489
2556
|
assert after_verify['route_expectation']['expected_path'] == '/proof'
|
|
2490
|
-
assert after_verify['route_expectation']['source'] == 'proof_evidence_contract'
|
|
2491
2557
|
route = after_verify['proof_assessment_request']['semantic_context']['route']
|
|
2492
2558
|
assert route['expected_start_path'] == '/'
|
|
2493
2559
|
assert route['expected_after_path'] == '/proof'
|
|
@@ -2546,6 +2612,59 @@ def run_verify_interaction_reverse_terminal_route_from_proof_evidence():
|
|
|
2546
2612
|
shutil.rmtree(tempdir, ignore_errors=True)
|
|
2547
2613
|
|
|
2548
2614
|
|
|
2615
|
+
def run_verify_interaction_prose_route_noise_uses_proof_evidence():
|
|
2616
|
+
tempdir = Path(tempfile.mkdtemp(prefix='riddle-proof-interaction-prose-noise-'))
|
|
2617
|
+
state_path = tempdir / 'state.json'
|
|
2618
|
+
try:
|
|
2619
|
+
state = base_state(tempdir, reference='before')
|
|
2620
|
+
state.update({
|
|
2621
|
+
'recon_status': 'ready_for_proof_plan',
|
|
2622
|
+
'author_status': 'ready',
|
|
2623
|
+
'proof_plan_status': 'ready',
|
|
2624
|
+
'implementation_status': 'changes_detected',
|
|
2625
|
+
'verification_mode': 'interaction',
|
|
2626
|
+
'server_path': '/proof/',
|
|
2627
|
+
'before_cdn': 'https://cdn.example.com/before-proof.png',
|
|
2628
|
+
'proof_plan': 'Start on the proof page, click Home, and confirm the home page content is visible.',
|
|
2629
|
+
'capture_script': "clickedHomeNavigation(); await saveScreenshot('after-home');",
|
|
2630
|
+
'change_request': (
|
|
2631
|
+
'Prior wrapper notes mentioned terminal drift to /Your and package '
|
|
2632
|
+
'@riddledc/openclaw-riddle-proof, but those are prose diagnostics, not route expectations.'
|
|
2633
|
+
),
|
|
2634
|
+
'success_criteria': (
|
|
2635
|
+
'Use structured browser evidence for the terminal route; do not parse '
|
|
2636
|
+
'/openclaw-riddle-proof from package text as the expected path.'
|
|
2637
|
+
),
|
|
2638
|
+
'recon_results': {
|
|
2639
|
+
'baselines': {'before': {'path': '/proof/', 'url': 'https://cdn.example.com/before-proof.png'}},
|
|
2640
|
+
},
|
|
2641
|
+
})
|
|
2642
|
+
write_state(state_path, state)
|
|
2643
|
+
os.environ['RIDDLE_PROOF_STATE_FILE'] = str(state_path)
|
|
2644
|
+
|
|
2645
|
+
fake = FakeRiddle()
|
|
2646
|
+
load_util_with_fake(fake)
|
|
2647
|
+
load_module('verify_interaction_prose_route_noise', VERIFY_PATH)
|
|
2648
|
+
after_verify = json.loads(state_path.read_text())
|
|
2649
|
+
|
|
2650
|
+
assert after_verify['verify_status'] == 'evidence_captured'
|
|
2651
|
+
assert after_verify['route_expectation']['source'] == 'proof_evidence_contract'
|
|
2652
|
+
assert after_verify['route_expectation']['expected_path'] == '/'
|
|
2653
|
+
route = after_verify['proof_assessment_request']['semantic_context']['route']
|
|
2654
|
+
assert route['expected_after_path'] == '/'
|
|
2655
|
+
assert route['after_observed_path'] == '/'
|
|
2656
|
+
encoded = json.dumps(after_verify, sort_keys=True)
|
|
2657
|
+
assert '"expected_path": "/Your"' not in encoded
|
|
2658
|
+
assert '"expected_path": "/openclaw-riddle-proof"' not in encoded
|
|
2659
|
+
return {
|
|
2660
|
+
'ok': True,
|
|
2661
|
+
'expected_path': after_verify['route_expectation']['expected_path'],
|
|
2662
|
+
'source': after_verify['route_expectation']['source'],
|
|
2663
|
+
}
|
|
2664
|
+
finally:
|
|
2665
|
+
shutil.rmtree(tempdir, ignore_errors=True)
|
|
2666
|
+
|
|
2667
|
+
|
|
2549
2668
|
def run_verify_interaction_hash_terminal_route_from_proof_evidence():
|
|
2550
2669
|
tempdir = Path(tempfile.mkdtemp(prefix='riddle-proof-interaction-hash-'))
|
|
2551
2670
|
state_path = tempdir / 'state.json'
|
|
@@ -2601,9 +2720,6 @@ def run_verify_interaction_authored_query_hash_mismatch_blocks_with_evidence():
|
|
|
2601
2720
|
'author_status': 'ready',
|
|
2602
2721
|
'proof_plan_status': 'ready',
|
|
2603
2722
|
'implementation_status': 'changes_detected',
|
|
2604
|
-
'implementation_mode': 'none',
|
|
2605
|
-
'require_diff': False,
|
|
2606
|
-
'allow_code_changes': False,
|
|
2607
2723
|
'verification_mode': 'interaction',
|
|
2608
2724
|
'server_path': '/',
|
|
2609
2725
|
'before_cdn': 'https://cdn.example.com/before-home.png',
|
|
@@ -2630,28 +2746,26 @@ def run_verify_interaction_authored_query_hash_mismatch_blocks_with_evidence():
|
|
|
2630
2746
|
after_verify = json.loads(state_path.read_text())
|
|
2631
2747
|
|
|
2632
2748
|
request = after_verify['verify_decision_request']
|
|
2633
|
-
assert after_verify['verify_status'] == '
|
|
2749
|
+
assert after_verify['verify_status'] == 'capture_incomplete'
|
|
2634
2750
|
assert after_verify['merge_recommendation'] == 'do-not-merge'
|
|
2635
2751
|
assert after_verify['route_expectation']['expected_query'] == 'rp_probe=1'
|
|
2636
2752
|
assert after_verify['route_expectation']['expected_hash'] == '#pricing-probe'
|
|
2637
|
-
|
|
2638
|
-
assert
|
|
2639
|
-
assert request['
|
|
2640
|
-
assert '
|
|
2641
|
-
|
|
2642
|
-
assert 'page.waitForURL: Timeout 15000ms exceeded' in
|
|
2643
|
-
|
|
2644
|
-
assert 'structured-interaction-failure' in assessment_request['evidence_basis']
|
|
2645
|
-
assert any('checks.routeMatches' in blocker for blocker in assessment_request['hard_blockers'])
|
|
2646
|
-
assert assessment_request['semantic_context']['route']['expected_terminal_query'] == 'rp_probe=1'
|
|
2647
|
-
assert assessment_request['semantic_context']['route']['expected_terminal_hash'] == '#pricing-probe'
|
|
2648
|
-
assert assessment_request['semantic_context']['route']['after_observed_path'] == '/pricing'
|
|
2649
|
-
assert assessment_request['semantic_context']['route']['after_observed_query'] == ''
|
|
2650
|
-
assert assessment_request['semantic_context']['route']['after_observed_hash'] == ''
|
|
2753
|
+
capture_quality = request['capture_quality']
|
|
2754
|
+
assert capture_quality['decision'] in ('revise_capture', 'failed_proof_evidence', 'visual_delta_unmeasured')
|
|
2755
|
+
assert request['recommended_stage'] in ('author', 'verify')
|
|
2756
|
+
assert request['continue_with_stage'] in ('author', 'verify')
|
|
2757
|
+
quality_text = json.dumps(capture_quality, sort_keys=True)
|
|
2758
|
+
assert 'page.waitForURL: Timeout 15000ms exceeded' in quality_text
|
|
2759
|
+
assert after_verify['proof_assessment_request'] == {}
|
|
2651
2760
|
supporting = after_verify['verify_results']['after']['supporting_artifacts']
|
|
2652
2761
|
assert supporting['proof_evidence_present'] is True
|
|
2653
2762
|
assert supporting['has_structured_payload'] is True
|
|
2654
2763
|
synthetic_evidence = after_verify['evidence_bundle']['proof_evidence']
|
|
2764
|
+
if isinstance(synthetic_evidence, list):
|
|
2765
|
+
synthetic_evidence = next(
|
|
2766
|
+
record for record in evidence_records(synthetic_evidence)
|
|
2767
|
+
if record.get('version') == 'riddle-proof.interaction.capture-failure.v1'
|
|
2768
|
+
)
|
|
2655
2769
|
assert synthetic_evidence['version'] == 'riddle-proof.interaction.capture-failure.v1'
|
|
2656
2770
|
assert synthetic_evidence['passed'] is False
|
|
2657
2771
|
assert synthetic_evidence['authored_proof_evidence_present'] is False
|
|
@@ -2669,6 +2783,67 @@ def run_verify_interaction_authored_query_hash_mismatch_blocks_with_evidence():
|
|
|
2669
2783
|
shutil.rmtree(tempdir, ignore_errors=True)
|
|
2670
2784
|
|
|
2671
2785
|
|
|
2786
|
+
def run_verify_interaction_query_hash_pass_uses_proof_evidence_route():
    """Smoke-check an interaction verify run whose terminal route has a query and hash.

    Seeds a state file expecting ``/pricing/?rp_probe=1#pricing-probe``, loads
    the module at ``VERIFY_PATH`` (presumably this executes the verify stage
    against the state file -- confirm with ``load_module``), then asserts on
    the state written back: evidence captured, no capture-quality retry
    request, and the observed route taken from proof evidence.

    Returns a small summary dict with the observed path and hash.
    """
    workdir = Path(tempfile.mkdtemp(prefix='riddle-proof-interaction-query-hash-pass-'))
    state_file = workdir / 'state.json'
    # The same script is supplied both at the top level and in the author packet.
    capture_script = "pricingQueryHashPassesWithPageStateHashGap(); await page.waitForURL('/pricing/?rp_probe=1#pricing-probe');"
    baseline_url = 'https://cdn.example.com/before-home.png'
    try:
        author_packet = {
            'proof_plan': 'Click Pricing and prove the terminal query/hash route.',
            'capture_script': capture_script,
            'refined_inputs': {
                'server_path': '/',
                'expected_terminal_path': '/pricing/?rp_probe=1#pricing-probe',
            },
        }
        overrides = {
            'recon_status': 'ready_for_proof_plan',
            'author_status': 'ready',
            'proof_plan_status': 'ready',
            'implementation_status': 'changes_detected',
            'verification_mode': 'interaction',
            'server_path': '/',
            'before_cdn': baseline_url,
            'proof_plan': 'Start at /, click Pricing, and verify /pricing/?rp_probe=1#pricing-probe.',
            'capture_script': capture_script,
            'supervisor_author_packet': author_packet,
            'recon_results': {
                'baselines': {'before': {'path': '/', 'url': baseline_url}},
            },
        }
        seeded = base_state(workdir, reference='before')
        seeded.update(overrides)
        write_state(state_file, seeded)
        os.environ['RIDDLE_PROOF_STATE_FILE'] = str(state_file)

        fake = FakeRiddle()
        load_util_with_fake(fake)
        load_module('verify_interaction_query_hash_pass_uses_proof_evidence_route', VERIFY_PATH)
        final_state = json.loads(state_file.read_text())

        # Verify finished with evidence and left the judgment to the supervisor.
        assert final_state['verify_status'] == 'evidence_captured'
        assert final_state['merge_recommendation'] == 'pending-supervisor-judgment'
        decision = final_state['verify_decision_request']
        assert 'capture_quality' not in decision
        assert decision['recommended_stage'] is None
        assert decision['continue_with_stage'] is None

        # The observed route must come from proof evidence and match.
        after_observation = final_state['verify_results']['after']['observation']
        assert 'wrong route' not in after_observation['reason']
        observation_details = after_observation['details']
        assert observation_details['proof_evidence_route_matched'] is True
        assert observation_details['observed_path_source'] == 'proof_evidence'

        route_context = final_state['proof_assessment_request']['semantic_context']['route']
        assert route_context['expected_terminal_query'] == 'rp_probe=1'
        assert route_context['expected_terminal_hash'] == '#pricing-probe'
        assert route_context['after_observed_query'] == 'rp_probe=1'
        assert route_context['after_observed_hash'] == '#pricing-probe'
        assert route_context['after_observed_path'] == '/pricing?rp_probe=1#pricing-probe'
        return {
            'ok': True,
            'after_observed_path': route_context['after_observed_path'],
            'after_observed_hash': route_context['after_observed_hash'],
        }
    finally:
        # Best-effort cleanup of the scratch directory.
        shutil.rmtree(workdir, ignore_errors=True)
|
|
2845
|
+
|
|
2846
|
+
|
|
2672
2847
|
def run_verify_capture_retry_surfaces_script_timeout():
|
|
2673
2848
|
tempdir = Path(tempfile.mkdtemp(prefix='riddle-proof-capture-timeout-'))
|
|
2674
2849
|
state_path = tempdir / 'state.json'
|
|
@@ -2697,9 +2872,9 @@ def run_verify_capture_retry_surfaces_script_timeout():
|
|
|
2697
2872
|
|
|
2698
2873
|
assert after_verify['verify_status'] == 'capture_incomplete'
|
|
2699
2874
|
capture_quality = after_verify['verify_decision_request']['capture_quality']
|
|
2700
|
-
assert capture_quality['recommended_stage']
|
|
2701
|
-
|
|
2702
|
-
assert
|
|
2875
|
+
assert capture_quality['recommended_stage'] in ('author', 'verify')
|
|
2876
|
+
capture_quality_text = json.dumps(capture_quality, sort_keys=True)
|
|
2877
|
+
assert 'locator.click: Timeout 30000ms exceeded' in capture_quality_text
|
|
2703
2878
|
return {
|
|
2704
2879
|
'ok': True,
|
|
2705
2880
|
'summary': capture_quality['summary'],
|
|
@@ -3088,8 +3263,10 @@ if __name__ == '__main__':
|
|
|
3088
3263
|
'remote_audit_verify_uses_default_capture_script': run_remote_audit_verify_uses_default_capture_script(),
|
|
3089
3264
|
'verify_interaction_terminal_route_from_proof_evidence': run_verify_interaction_terminal_route_from_proof_evidence(),
|
|
3090
3265
|
'verify_interaction_reverse_terminal_route_from_proof_evidence': run_verify_interaction_reverse_terminal_route_from_proof_evidence(),
|
|
3266
|
+
'verify_interaction_prose_route_noise_uses_proof_evidence': run_verify_interaction_prose_route_noise_uses_proof_evidence(),
|
|
3091
3267
|
'verify_interaction_hash_terminal_route_from_proof_evidence': run_verify_interaction_hash_terminal_route_from_proof_evidence(),
|
|
3092
3268
|
'verify_interaction_authored_query_hash_mismatch_blocks_with_evidence': run_verify_interaction_authored_query_hash_mismatch_blocks_with_evidence(),
|
|
3269
|
+
'verify_interaction_query_hash_pass_uses_proof_evidence_route': run_verify_interaction_query_hash_pass_uses_proof_evidence_route(),
|
|
3093
3270
|
'verify_capture_retry_surfaces_script_timeout': run_verify_capture_retry_surfaces_script_timeout(),
|
|
3094
3271
|
'missing_baseline_guard': run_verify_missing_baseline(),
|
|
3095
3272
|
'ship_supervisor_gate': run_ship_missing_supervisor_gate(),
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
import importlib.util
|
|
2
|
+
import io
|
|
3
|
+
import json
|
|
4
|
+
import sys
|
|
5
|
+
import traceback
|
|
6
|
+
from contextlib import redirect_stderr, redirect_stdout
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
# Absolute path to recon_verify_smoke.py, resolved as a sibling of this file.
SMOKE_PATH = Path(__file__).resolve().with_name('recon_verify_smoke.py')
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def load_smoke_module():
    """Import the smoke-test file at ``SMOKE_PATH`` and return the module.

    The module is registered in ``sys.modules`` under the name
    ``riddle_proof_recon_verify_smoke`` before it is executed, and the
    registration is removed again if execution fails so no half-initialised
    module is left behind.

    Raises:
        ImportError: if no import spec or loader can be built for the path.
        Exception: whatever the smoke module raises while executing.
    """
    spec = importlib.util.spec_from_file_location('riddle_proof_recon_verify_smoke', SMOKE_PATH)
    # Validate before first use; an explicit raise also survives ``python -O``.
    if spec is None or spec.loader is None:
        raise ImportError(f'cannot build an import spec for {SMOKE_PATH}')
    module = importlib.util.module_from_spec(spec)
    # Register before executing so lookups by module name work during execution.
    sys.modules[spec.name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Cleanup only: drop the partially executed module, then re-raise.
        sys.modules.pop(spec.name, None)
        raise
    return module
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# Regression cases run by this suite. Each entry has:
#   name              - label reported for the case
#   covers            - scenario tags, echoed into the result on success
#   function          - attribute name looked up on the loaded smoke module
#                       and called with no arguments
#   expected_terminal - 'pass' or 'specific_blocker'; echoed into the result
#                       on success, not otherwise checked in this file
CASES = [
    {
        'name': 'route-change-forward-pass',
        'covers': ['route-changing interactions', 'proof-evidence-present'],
        'function': 'run_verify_interaction_terminal_route_from_proof_evidence',
        'expected_terminal': 'pass',
    },
    {
        'name': 'route-change-reverse-pass',
        'covers': ['route-changing interactions'],
        'function': 'run_verify_interaction_reverse_terminal_route_from_proof_evidence',
        'expected_terminal': 'pass',
    },
    {
        'name': 'route-prose-noise-ignored',
        'covers': ['route-changing interactions', 'proof-evidence-present'],
        'function': 'run_verify_interaction_prose_route_noise_uses_proof_evidence',
        'expected_terminal': 'pass',
    },
    {
        'name': 'query-hash-trailing-slash-pass',
        'covers': ['query/hash/trailing-slash URLs', 'proof-evidence-present'],
        'function': 'run_verify_interaction_query_hash_pass_uses_proof_evidence_route',
        'expected_terminal': 'pass',
    },
    {
        'name': 'query-hash-dropped-specific-blocker',
        'covers': ['query/hash/trailing-slash URLs', 'invalid browser evidence'],
        'function': 'run_verify_interaction_authored_query_hash_mismatch_blocks_with_evidence',
        'expected_terminal': 'specific_blocker',
    },
    {
        'name': 'same-page-hash-pass',
        'covers': ['same-page hashes'],
        'function': 'run_verify_interaction_hash_terminal_route_from_proof_evidence',
        'expected_terminal': 'pass',
    },
    {
        'name': 'missing-selector-timeout-specific-blocker',
        'covers': ['missing selectors', 'timeouts'],
        'function': 'run_verify_capture_retry_surfaces_script_timeout',
        'expected_terminal': 'specific_blocker',
    },
    {
        'name': 'thrown-error-preserves-structured-evidence',
        'covers': ['thrown errors', 'proof-evidence-present'],
        'function': 'run_verify_preserves_proof_evidence_on_capture_script_error',
        'expected_terminal': 'specific_blocker',
    },
    {
        'name': 'structured-proof-without-screenshot-pass',
        'covers': ['proof-evidence-present'],
        'function': 'run_verify_structured_evidence_without_screenshot',
        'expected_terminal': 'pass',
    },
    {
        'name': 'proof-evidence-absent-specific-blocker',
        'covers': ['proof-evidence-absent'],
        'function': 'run_verify_audio_requires_proof_evidence',
        'expected_terminal': 'specific_blocker',
    },
    {
        'name': 'no-diff-prod-audit-default-capture-pass',
        'covers': ['no-diff prod audits'],
        'function': 'run_remote_audit_verify_uses_default_capture_script',
        'expected_terminal': 'pass',
    },
]
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
# Substrings that must not appear anywhere in a case's JSON-encoded result;
# run_case fails the case when one is found.
GENERIC_FAILURE_MARKERS = (
    'codex_invalid_json',
    'codex_no_final_response',
    'max_iterations_reached',
    'stage_iteration_limit_reached',
    'unhandled_checkpoint',
)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def compact_logs(stdout, stderr):
    """Return the last 20 non-blank lines of the captured output streams.

    ``stdout`` and ``stderr`` are objects exposing ``getvalue()``; their text
    is joined with a newline (stdout first) and stripped as a whole before
    being split into lines.
    """
    combined = '\n'.join((stdout.getvalue(), stderr.getvalue())).strip()
    # ``str.strip`` yields an empty (falsy) string for whitespace-only lines.
    kept = list(filter(str.strip, combined.splitlines()))
    return kept[-20:]
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def run_case(module, case):
|
|
107
|
+
stdout = io.StringIO()
|
|
108
|
+
stderr = io.StringIO()
|
|
109
|
+
try:
|
|
110
|
+
with redirect_stdout(stdout), redirect_stderr(stderr):
|
|
111
|
+
result = getattr(module, case['function'])()
|
|
112
|
+
encoded = json.dumps(result, sort_keys=True)
|
|
113
|
+
for marker in GENERIC_FAILURE_MARKERS:
|
|
114
|
+
assert marker not in encoded, f'{case["name"]} leaked generic failure marker {marker}'
|
|
115
|
+
return {
|
|
116
|
+
'ok': True,
|
|
117
|
+
'name': case['name'],
|
|
118
|
+
'covers': case['covers'],
|
|
119
|
+
'expected_terminal': case['expected_terminal'],
|
|
120
|
+
'result': result,
|
|
121
|
+
}
|
|
122
|
+
except Exception as exc:
|
|
123
|
+
return {
|
|
124
|
+
'ok': False,
|
|
125
|
+
'name': case['name'],
|
|
126
|
+
'error': str(exc),
|
|
127
|
+
'traceback': traceback.format_exc(limit=8),
|
|
128
|
+
'logs': compact_logs(stdout, stderr),
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def main():
    """Run every case in ``CASES``, print a JSON report, and exit 1 on failure.

    The report contains the overall ``ok`` flag, the suite name, the number
    of cases, the failed outcomes, and all outcomes.
    """
    smoke = load_smoke_module()
    outcomes = []
    for case in CASES:
        outcomes.append(run_case(smoke, case))
    failures = [outcome for outcome in outcomes if not outcome['ok']]
    report = {
        'ok': not failures,
        'suite': 'riddle-proof.trust-boundary-regression',
        'case_count': len(outcomes),
        'failed': failures,
        'results': outcomes,
    }
    print(json.dumps(report, indent=2, sort_keys=True))
    if failures:
        raise SystemExit(1)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
# Run the suite only when this file is executed as a script.
if __name__ == '__main__':
    main()
|