voratiq 0.1.0-beta.2 → 0.1.0-beta.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +41 -29
- package/dist/agents/launch/chat.d.ts +23 -0
- package/dist/agents/launch/chat.js +44 -0
- package/dist/agents/launch/environment.d.ts +8 -0
- package/dist/{commands/run/agents/workspace-prep.js → agents/launch/environment.js} +5 -27
- package/dist/agents/launch/prompt.d.ts +6 -0
- package/dist/agents/launch/prompt.js +12 -0
- package/dist/agents/launch/provider-state.d.ts +39 -0
- package/dist/agents/launch/provider-state.js +103 -0
- package/dist/agents/runtime/auth.d.ts +27 -0
- package/dist/agents/runtime/auth.js +72 -0
- package/dist/agents/runtime/chat.d.ts +5 -0
- package/dist/agents/runtime/chat.js +7 -0
- package/dist/agents/runtime/errors.d.ts +27 -0
- package/dist/agents/runtime/errors.js +51 -0
- package/dist/{commands/run/agents → agents/runtime}/failures.d.ts +0 -1
- package/dist/agents/runtime/failures.js +136 -0
- package/dist/agents/runtime/harness.d.ts +2 -0
- package/dist/agents/runtime/harness.js +119 -0
- package/dist/{commands/run/agents/sandbox-launcher.d.ts → agents/runtime/launcher.d.ts} +18 -6
- package/dist/{commands/run/agents/sandbox-launcher.js → agents/runtime/launcher.js} +17 -39
- package/dist/{commands/run/agents/workspace-prep.d.ts → agents/runtime/manifest.d.ts} +6 -6
- package/dist/agents/runtime/manifest.js +34 -0
- package/dist/agents/runtime/policy.d.ts +32 -0
- package/dist/agents/runtime/policy.js +240 -0
- package/dist/agents/runtime/registry.d.ts +4 -0
- package/dist/agents/runtime/registry.js +54 -0
- package/dist/{commands/run → agents/runtime}/sandbox.d.ts +8 -2
- package/dist/{commands/run → agents/runtime}/sandbox.js +28 -67
- package/dist/agents/runtime/shim/run-agent-shim.d.ts +1 -0
- package/dist/agents/runtime/shim/run-agent-shim.js +276 -0
- package/dist/agents/runtime/types.d.ts +91 -0
- package/dist/{commands/run/agents → agents/runtime}/watchdog.d.ts +4 -3
- package/dist/{commands/run/agents → agents/runtime}/watchdog.js +155 -26
- package/dist/auth/providers/codex.js +7 -2
- package/dist/auth/providers/gemini.js +14 -6
- package/dist/auth/providers/types.d.ts +1 -0
- package/dist/auth/providers/utils.d.ts +0 -1
- package/dist/auth/providers/utils.js +1 -49
- package/dist/bin.js +369 -71
- package/dist/cli/apply.d.ts +4 -0
- package/dist/cli/apply.js +28 -9
- package/dist/cli/auto.d.ts +32 -0
- package/dist/cli/auto.js +232 -0
- package/dist/cli/contract.d.ts +328 -0
- package/dist/cli/contract.js +480 -0
- package/dist/cli/errors.d.ts +3 -0
- package/dist/cli/errors.js +21 -3
- package/dist/cli/init.d.ts +5 -0
- package/dist/cli/init.js +34 -6
- package/dist/cli/list.d.ts +6 -4
- package/dist/cli/list.js +39 -16
- package/dist/cli/mcp.d.ts +2 -0
- package/dist/cli/mcp.js +16 -0
- package/dist/cli/message.d.ts +28 -0
- package/dist/cli/message.js +147 -0
- package/dist/cli/operator-envelope.d.ts +180 -0
- package/dist/cli/operator-envelope.js +425 -0
- package/dist/cli/output.d.ts +15 -1
- package/dist/cli/output.js +153 -5
- package/dist/cli/prune.d.ts +7 -3
- package/dist/cli/prune.js +57 -12
- package/dist/cli/reduce.d.ts +29 -0
- package/dist/cli/reduce.js +211 -0
- package/dist/cli/root-launcher.d.ts +4 -0
- package/dist/cli/root-launcher.js +15 -0
- package/dist/cli/run.d.ts +27 -1
- package/dist/cli/run.js +108 -16
- package/dist/cli/spec.d.ts +31 -0
- package/dist/cli/spec.js +180 -0
- package/dist/cli/verify.d.ts +35 -0
- package/dist/cli/verify.js +297 -0
- package/dist/commands/apply/command.d.ts +2 -0
- package/dist/commands/apply/command.js +145 -6
- package/dist/commands/apply/errors.d.ts +43 -4
- package/dist/commands/apply/errors.js +100 -22
- package/dist/commands/apply/types.d.ts +2 -1
- package/dist/commands/auto/command.d.ts +145 -0
- package/dist/commands/auto/command.js +433 -0
- package/dist/commands/auto/errors.d.ts +19 -0
- package/dist/commands/auto/errors.js +19 -0
- package/dist/commands/auto/validation.d.ts +14 -0
- package/dist/commands/auto/validation.js +90 -0
- package/dist/commands/fetch.d.ts +2 -2
- package/dist/commands/fetch.js +4 -4
- package/dist/commands/init/agents.d.ts +2 -1
- package/dist/commands/init/agents.js +66 -63
- package/dist/commands/init/command.js +300 -16
- package/dist/commands/init/types.d.ts +18 -7
- package/dist/commands/interactive/lifecycle.d.ts +15 -0
- package/dist/commands/interactive/lifecycle.js +141 -0
- package/dist/commands/list/command.d.ts +10 -3
- package/dist/commands/list/command.js +597 -40
- package/dist/commands/message/command.d.ts +23 -0
- package/dist/commands/message/command.js +215 -0
- package/dist/commands/message/errors.d.ts +9 -0
- package/dist/commands/message/errors.js +20 -0
- package/dist/commands/message/lifecycle.d.ts +14 -0
- package/dist/commands/message/lifecycle.js +128 -0
- package/dist/commands/prune/command.d.ts +2 -1
- package/dist/commands/prune/command.js +61 -10
- package/dist/commands/prune/errors.d.ts +1 -1
- package/dist/commands/prune/errors.js +5 -5
- package/dist/commands/prune/types.d.ts +21 -0
- package/dist/commands/reduce/command.d.ts +26 -0
- package/dist/commands/reduce/command.js +145 -0
- package/dist/commands/reduce/errors.d.ts +17 -0
- package/dist/commands/reduce/errors.js +32 -0
- package/dist/commands/reduce/targets.d.ts +11 -0
- package/dist/commands/reduce/targets.js +271 -0
- package/dist/commands/root-launcher/command.d.ts +31 -0
- package/dist/commands/root-launcher/command.js +233 -0
- package/dist/commands/run/command.d.ts +6 -1
- package/dist/commands/run/command.js +70 -40
- package/dist/commands/run/lifecycle.d.ts +7 -5
- package/dist/commands/run/lifecycle.js +45 -23
- package/dist/commands/run/record-init.d.ts +4 -1
- package/dist/commands/run/record-init.js +5 -2
- package/dist/commands/run/shim/run-agent-shim.d.ts +2 -1
- package/dist/commands/run/shim/run-agent-shim.js +4 -219
- package/dist/commands/run/validation.d.ts +2 -3
- package/dist/commands/run/validation.js +54 -25
- package/dist/commands/shared/max-parallel.d.ts +5 -0
- package/dist/commands/shared/max-parallel.js +15 -0
- package/dist/commands/shared/preview.d.ts +10 -0
- package/dist/commands/shared/preview.js +60 -0
- package/dist/commands/shared/resolve-reduction-competitors.d.ts +15 -0
- package/dist/commands/shared/resolve-reduction-competitors.js +13 -0
- package/dist/commands/shared/resolve-stage-competitors.d.ts +19 -0
- package/dist/commands/shared/resolve-stage-competitors.js +171 -0
- package/dist/commands/shared/session-id.d.ts +1 -0
- package/dist/commands/shared/session-id.js +1 -0
- package/dist/commands/spec/command.d.ts +22 -0
- package/dist/commands/spec/command.js +330 -0
- package/dist/commands/spec/errors.d.ts +11 -0
- package/dist/commands/spec/errors.js +23 -0
- package/dist/commands/verify/agents.d.ts +8 -0
- package/dist/commands/verify/agents.js +29 -0
- package/dist/commands/verify/command.d.ts +23 -0
- package/dist/commands/verify/command.js +168 -0
- package/dist/commands/verify/lifecycle.d.ts +14 -0
- package/dist/commands/verify/lifecycle.js +229 -0
- package/dist/commands/verify/max-parallel.d.ts +7 -0
- package/dist/commands/verify/max-parallel.js +15 -0
- package/dist/commands/verify/targets.d.ts +18 -0
- package/dist/commands/verify/targets.js +420 -0
- package/dist/competition/command-adapter.d.ts +35 -0
- package/dist/competition/command-adapter.js +20 -0
- package/dist/competition/core.d.ts +41 -0
- package/dist/competition/core.js +181 -0
- package/dist/competition/shared/extra-context.d.ts +14 -0
- package/dist/competition/shared/extra-context.js +100 -0
- package/dist/competition/shared/preflight.d.ts +11 -0
- package/dist/competition/shared/preflight.js +39 -0
- package/dist/competition/shared/prompt-helpers.d.ts +16 -0
- package/dist/competition/shared/prompt-helpers.js +27 -0
- package/dist/competition/shared/prune.d.ts +1 -0
- package/dist/competition/shared/prune.js +4 -0
- package/dist/competition/shared/sandbox-policy.d.ts +9 -0
- package/dist/competition/shared/sandbox-policy.js +7 -0
- package/dist/competition/shared/teardown.d.ts +36 -0
- package/dist/competition/shared/teardown.js +101 -0
- package/dist/configs/agents/defaults.d.ts +31 -2
- package/dist/configs/agents/defaults.js +346 -30
- package/dist/configs/agents/errors.js +14 -11
- package/dist/configs/agents/loader.d.ts +11 -1
- package/dist/configs/agents/loader.js +71 -4
- package/dist/configs/agents/types.js +2 -2
- package/dist/configs/environment/detect.js +9 -4
- package/dist/configs/environment/errors.js +4 -4
- package/dist/configs/environment/loader.d.ts +1 -1
- package/dist/configs/environment/loader.js +3 -3
- package/dist/configs/orchestration/bootstrap.d.ts +16 -0
- package/dist/configs/orchestration/bootstrap.js +122 -0
- package/dist/configs/orchestration/errors.d.ts +15 -0
- package/dist/configs/orchestration/errors.js +28 -0
- package/dist/configs/orchestration/loader.d.ts +9 -0
- package/dist/configs/orchestration/loader.js +148 -0
- package/dist/configs/orchestration/types.d.ts +102 -0
- package/dist/configs/orchestration/types.js +65 -0
- package/dist/configs/sandbox/defaults.js +14 -4
- package/dist/configs/sandbox/errors.d.ts +1 -1
- package/dist/configs/sandbox/errors.js +1 -1
- package/dist/configs/sandbox/loader.js +6 -4
- package/dist/configs/sandbox/schemas.js +4 -2
- package/dist/configs/settings/loader.d.ts +7 -0
- package/dist/configs/settings/loader.js +81 -0
- package/dist/configs/settings/types.d.ts +47 -0
- package/dist/configs/settings/types.js +23 -0
- package/dist/configs/verification/errors.d.ts +11 -0
- package/dist/configs/verification/errors.js +21 -0
- package/dist/configs/verification/loader.d.ts +8 -0
- package/dist/configs/verification/loader.js +43 -0
- package/dist/configs/verification/methods.d.ts +35 -0
- package/dist/configs/verification/methods.js +41 -0
- package/dist/configs/verification/programmatic-defaults.d.ts +10 -0
- package/dist/configs/verification/programmatic-defaults.js +42 -0
- package/dist/configs/verification/programmatic-detect.d.ts +10 -0
- package/dist/configs/{evals/detect.js → verification/programmatic-detect.js} +22 -33
- package/dist/configs/verification/types.d.ts +49 -0
- package/dist/configs/verification/types.js +45 -0
- package/dist/contracts/list.d.ts +207 -0
- package/dist/contracts/list.js +154 -0
- package/dist/domain/interactive/model/types.d.ts +104 -0
- package/dist/domain/interactive/model/types.js +83 -0
- package/dist/domain/interactive/persistence/adapter.d.ts +39 -0
- package/dist/domain/interactive/persistence/adapter.js +144 -0
- package/dist/domain/interactive/prompt.d.ts +3 -0
- package/dist/domain/interactive/prompt.js +7 -0
- package/dist/domain/message/competition/adapter.d.ts +36 -0
- package/dist/domain/message/competition/adapter.js +197 -0
- package/dist/domain/message/competition/prompt.d.ts +8 -0
- package/dist/domain/message/competition/prompt.js +29 -0
- package/dist/domain/message/model/mutators.d.ts +17 -0
- package/dist/domain/message/model/mutators.js +107 -0
- package/dist/domain/message/model/types.d.ts +100 -0
- package/dist/domain/message/model/types.js +87 -0
- package/dist/domain/message/persistence/adapter.d.ts +43 -0
- package/dist/domain/message/persistence/adapter.js +124 -0
- package/dist/domain/reduce/competition/adapter.d.ts +42 -0
- package/dist/domain/reduce/competition/adapter.js +826 -0
- package/dist/domain/reduce/competition/output-validation.d.ts +4 -0
- package/dist/domain/reduce/competition/output-validation.js +18 -0
- package/dist/domain/reduce/competition/prompt.d.ts +10 -0
- package/dist/domain/reduce/competition/prompt.js +96 -0
- package/dist/domain/reduce/competition/reduction.d.ts +9 -0
- package/dist/domain/reduce/competition/reduction.js +32 -0
- package/dist/domain/reduce/model/types.d.ts +122 -0
- package/dist/domain/reduce/model/types.js +84 -0
- package/dist/domain/reduce/persistence/adapter.d.ts +43 -0
- package/dist/domain/reduce/persistence/adapter.js +126 -0
- package/dist/domain/run/competition/adapter.d.ts +30 -0
- package/dist/domain/run/competition/adapter.js +39 -0
- package/dist/domain/run/competition/agent-execution.d.ts +20 -0
- package/dist/domain/run/competition/agent-execution.js +45 -0
- package/dist/domain/run/competition/agent-preparation.d.ts +12 -0
- package/dist/domain/run/competition/agent-preparation.js +24 -0
- package/dist/domain/run/competition/agents/artifacts.d.ts +17 -0
- package/dist/domain/run/competition/agents/artifacts.js +173 -0
- package/dist/{commands/run → domain/run/competition}/agents/lifecycle.d.ts +3 -3
- package/dist/{commands/run → domain/run/competition}/agents/lifecycle.js +84 -64
- package/dist/domain/run/competition/agents/post-processing.d.ts +12 -0
- package/dist/domain/run/competition/agents/post-processing.js +4 -0
- package/dist/domain/run/competition/agents/preparation.js +64 -0
- package/dist/{commands/run → domain/run/competition}/agents/run-context.d.ts +9 -16
- package/dist/{commands/run → domain/run/competition}/agents/run-context.js +22 -70
- package/dist/{commands/run → domain/run/competition}/agents/types.d.ts +10 -13
- package/dist/domain/run/competition/agents/workspace.d.ts +21 -0
- package/dist/domain/run/competition/agents/workspace.js +47 -0
- package/dist/{commands/run → domain/run/competition}/errors.d.ts +8 -1
- package/dist/{commands/run → domain/run/competition}/errors.js +39 -9
- package/dist/{commands/run → domain/run/competition}/phases.d.ts +1 -2
- package/dist/domain/run/competition/phases.js +1 -0
- package/dist/domain/run/competition/prompt.d.ts +7 -0
- package/dist/domain/run/competition/prompt.js +27 -0
- package/dist/{commands/run → domain/run/competition}/reports.d.ts +5 -3
- package/dist/{commands/run → domain/run/competition}/reports.js +7 -19
- package/dist/domain/run/competition/termination-state.d.ts +4 -0
- package/dist/domain/run/competition/termination-state.js +12 -0
- package/dist/{records → domain/run/model}/enhanced.d.ts +6 -7
- package/dist/{records → domain/run/model}/enhanced.js +11 -11
- package/dist/{records → domain/run/model}/errors.d.ts +1 -1
- package/dist/{records → domain/run/model}/errors.js +5 -5
- package/dist/{records → domain/run/model}/mutators.d.ts +4 -3
- package/dist/{records → domain/run/model}/mutators.js +58 -36
- package/dist/domain/run/model/types.d.ts +376 -0
- package/dist/domain/run/model/types.js +192 -0
- package/dist/{records/persistence.d.ts → domain/run/persistence/adapter.d.ts} +9 -3
- package/dist/domain/run/persistence/adapter.js +340 -0
- package/dist/domain/run/persistence/error-mapping.d.ts +2 -0
- package/dist/domain/run/persistence/error-mapping.js +17 -0
- package/dist/domain/shared/lifecycle.d.ts +54 -0
- package/dist/domain/shared/lifecycle.js +165 -0
- package/dist/domain/shared/token-usage.d.ts +21 -0
- package/dist/domain/shared/token-usage.js +38 -0
- package/dist/domain/spec/competition/adapter.d.ts +31 -0
- package/dist/domain/spec/competition/adapter.js +196 -0
- package/dist/domain/spec/competition/prompt.d.ts +11 -0
- package/dist/domain/spec/competition/prompt.js +44 -0
- package/dist/domain/spec/model/output.d.ts +13 -0
- package/dist/domain/spec/model/output.js +36 -0
- package/dist/domain/spec/model/types.d.ts +98 -0
- package/dist/domain/spec/model/types.js +84 -0
- package/dist/domain/spec/persistence/adapter.d.ts +51 -0
- package/dist/domain/spec/persistence/adapter.js +140 -0
- package/dist/domain/verify/blinding/aliases.d.ts +7 -0
- package/dist/domain/verify/blinding/aliases.js +23 -0
- package/dist/domain/verify/competition/adapter.d.ts +54 -0
- package/dist/domain/verify/competition/adapter.js +444 -0
- package/dist/domain/verify/competition/artifacts.d.ts +6 -0
- package/dist/domain/verify/competition/artifacts.js +7 -0
- package/dist/domain/verify/competition/blinding.d.ts +24 -0
- package/dist/domain/verify/competition/blinding.js +109 -0
- package/dist/domain/verify/competition/finalize.d.ts +11 -0
- package/dist/domain/verify/competition/finalize.js +65 -0
- package/dist/domain/verify/competition/programmatic.d.ts +15 -0
- package/dist/domain/verify/competition/programmatic.js +352 -0
- package/dist/domain/verify/competition/prompt.d.ts +19 -0
- package/dist/domain/verify/competition/prompt.js +63 -0
- package/dist/domain/verify/competition/rubric.d.ts +23 -0
- package/dist/domain/verify/competition/rubric.js +77 -0
- package/dist/domain/verify/competition/shared-layout.d.ts +121 -0
- package/dist/domain/verify/competition/shared-layout.js +365 -0
- package/dist/domain/verify/competition/target.d.ts +47 -0
- package/dist/domain/verify/competition/target.js +1 -0
- package/dist/domain/verify/model/mutators.d.ts +16 -0
- package/dist/domain/verify/model/mutators.js +126 -0
- package/dist/domain/verify/model/types.d.ts +408 -0
- package/dist/domain/verify/model/types.js +289 -0
- package/dist/domain/verify/persistence/adapter.d.ts +43 -0
- package/dist/domain/verify/persistence/adapter.js +126 -0
- package/dist/domain/verify/programmatic/runner.d.ts +22 -0
- package/dist/domain/verify/programmatic/runner.js +209 -0
- package/dist/domain/verify/rubric-result.d.ts +28 -0
- package/dist/domain/verify/rubric-result.js +121 -0
- package/dist/extra-context/contract.d.ts +17 -0
- package/dist/extra-context/contract.js +60 -0
- package/dist/interactive/index.d.ts +2 -0
- package/dist/interactive/index.js +1 -0
- package/dist/interactive/providers/launch.d.ts +23 -0
- package/dist/interactive/providers/launch.js +203 -0
- package/dist/interactive/providers/mcp.d.ts +13 -0
- package/dist/interactive/providers/mcp.js +547 -0
- package/dist/interactive/providers/shared.d.ts +2 -0
- package/dist/interactive/providers/shared.js +1 -0
- package/dist/interactive/providers.d.ts +3 -0
- package/dist/interactive/providers.js +3 -0
- package/dist/interactive/records.d.ts +2 -0
- package/dist/interactive/records.js +1 -0
- package/dist/interactive/substrate.d.ts +21 -0
- package/dist/interactive/substrate.js +522 -0
- package/dist/interactive/types.d.ts +101 -0
- package/dist/interactive/types.js +1 -0
- package/dist/mcp/server.d.ts +88 -0
- package/dist/mcp/server.js +790 -0
- package/dist/persistence/error-mapping.d.ts +19 -0
- package/dist/persistence/error-mapping.js +44 -0
- package/dist/persistence/errors.d.ts +26 -0
- package/dist/persistence/errors.js +49 -0
- package/dist/persistence/extra-context.d.ts +9 -0
- package/dist/persistence/extra-context.js +60 -0
- package/dist/{records → persistence}/history-lock.js +2 -2
- package/dist/persistence/record-path-schema.d.ts +3 -0
- package/dist/persistence/record-path-schema.js +16 -0
- package/dist/persistence/session-store.d.ts +92 -0
- package/dist/persistence/session-store.js +412 -0
- package/dist/policy/auto.d.ts +13 -0
- package/dist/policy/auto.js +22 -0
- package/dist/policy/index.d.ts +5 -0
- package/dist/policy/index.js +5 -0
- package/dist/policy/resolution.d.ts +6 -0
- package/dist/policy/resolution.js +23 -0
- package/dist/policy/result.d.ts +53 -0
- package/dist/policy/result.js +15 -0
- package/dist/policy/selector.d.ts +11 -0
- package/dist/policy/selector.js +57 -0
- package/dist/policy/verification.d.ts +77 -0
- package/dist/policy/verification.js +365 -0
- package/dist/policy/verifier-selection.d.ts +13 -0
- package/dist/policy/verifier-selection.js +78 -0
- package/dist/preflight/branch.d.ts +9 -0
- package/dist/preflight/branch.js +48 -0
- package/dist/preflight/errors.d.ts +3 -0
- package/dist/preflight/errors.js +10 -3
- package/dist/preflight/index.d.ts +13 -0
- package/dist/preflight/index.js +43 -8
- package/dist/render/interactions/confirmation.js +4 -2
- package/dist/render/transcripts/apply.js +9 -10
- package/dist/render/transcripts/auto.d.ts +27 -0
- package/dist/render/transcripts/auto.js +21 -0
- package/dist/render/transcripts/init.d.ts +4 -15
- package/dist/render/transcripts/init.js +71 -72
- package/dist/render/transcripts/list.d.ts +10 -1
- package/dist/render/transcripts/list.js +121 -15
- package/dist/render/transcripts/message.d.ts +72 -0
- package/dist/render/transcripts/message.js +362 -0
- package/dist/render/transcripts/prune.d.ts +7 -2
- package/dist/render/transcripts/prune.js +64 -17
- package/dist/render/transcripts/reduce.d.ts +74 -0
- package/dist/render/transcripts/reduce.js +395 -0
- package/dist/render/transcripts/root-launcher.d.ts +19 -0
- package/dist/render/transcripts/root-launcher.js +40 -0
- package/dist/render/transcripts/run.d.ts +35 -6
- package/dist/render/transcripts/run.js +241 -165
- package/dist/render/transcripts/shared.d.ts +2 -0
- package/dist/render/transcripts/shared.js +11 -4
- package/dist/render/transcripts/spec.d.ts +74 -0
- package/dist/render/transcripts/spec.js +394 -0
- package/dist/render/transcripts/stage-progress.d.ts +22 -0
- package/dist/render/transcripts/stage-progress.js +6 -0
- package/dist/render/transcripts/update-check.d.ts +2 -0
- package/dist/render/transcripts/update-check.js +22 -0
- package/dist/render/transcripts/verify.d.ts +74 -0
- package/dist/render/transcripts/verify.js +409 -0
- package/dist/render/utils/agents.d.ts +10 -9
- package/dist/render/utils/agents.js +30 -82
- package/dist/render/utils/badges.d.ts +3 -20
- package/dist/render/utils/badges.js +3 -36
- package/dist/render/utils/duration.d.ts +12 -0
- package/dist/render/utils/duration.js +37 -0
- package/dist/render/utils/interactive-frame.d.ts +6 -0
- package/dist/render/utils/interactive-frame.js +38 -0
- package/dist/render/utils/records.js +4 -4
- package/dist/render/utils/runs.d.ts +3 -9
- package/dist/render/utils/runs.js +16 -48
- package/dist/render/utils/stage-output.d.ts +20 -0
- package/dist/render/utils/stage-output.js +44 -0
- package/dist/render/utils/timezone.d.ts +2 -0
- package/dist/render/utils/timezone.js +42 -0
- package/dist/render/utils/transcript-shell.d.ts +66 -0
- package/dist/render/utils/transcript-shell.js +155 -0
- package/dist/render/utils/transcript.d.ts +7 -1
- package/dist/render/utils/transcript.js +12 -2
- package/dist/render/utils/wrap.d.ts +1 -0
- package/dist/render/utils/wrap.js +20 -0
- package/dist/status/colors.d.ts +2 -3
- package/dist/status/colors.js +3 -3
- package/dist/status/index.d.ts +108 -8
- package/dist/status/index.js +164 -5
- package/dist/update-check/checker.d.ts +24 -0
- package/dist/update-check/checker.js +130 -0
- package/dist/update-check/prompt.d.ts +25 -0
- package/dist/update-check/prompt.js +62 -0
- package/dist/update-check/semver.d.ts +17 -0
- package/dist/update-check/semver.js +36 -0
- package/dist/update-check/state-path.d.ts +8 -0
- package/dist/update-check/state-path.js +18 -0
- package/dist/utils/binaries.js +14 -8
- package/dist/utils/errors.d.ts +3 -1
- package/dist/utils/errors.js +3 -1
- package/dist/utils/git.d.ts +10 -0
- package/dist/utils/git.js +15 -3
- package/dist/utils/output.d.ts +5 -1
- package/dist/utils/output.js +4 -2
- package/dist/utils/process.d.ts +2 -1
- package/dist/utils/process.js +7 -3
- package/dist/utils/session-id.d.ts +1 -0
- package/dist/utils/session-id.js +22 -0
- package/dist/utils/slug.d.ts +2 -0
- package/dist/utils/slug.js +15 -0
- package/dist/utils/voratiq-cli-target.d.ts +9 -0
- package/dist/utils/voratiq-cli-target.js +58 -0
- package/dist/workspace/agents.d.ts +13 -16
- package/dist/workspace/agents.js +22 -147
- package/dist/workspace/chat/artifacts.d.ts +9 -0
- package/dist/workspace/chat/artifacts.js +82 -12
- package/dist/workspace/chat/native-usage.d.ts +13 -0
- package/dist/workspace/chat/native-usage.js +60 -0
- package/dist/workspace/chat/sources.d.ts +9 -5
- package/dist/workspace/chat/sources.js +89 -23
- package/dist/workspace/chat/token-usage-result.d.ts +23 -0
- package/dist/workspace/chat/token-usage-result.js +7 -0
- package/dist/workspace/chat/usage-extractor.d.ts +30 -0
- package/dist/workspace/chat/usage-extractor.js +461 -0
- package/dist/workspace/chat/usage-mappings.d.ts +20 -0
- package/dist/workspace/chat/usage-mappings.js +136 -0
- package/dist/workspace/credential-guard.js +1 -1
- package/dist/workspace/dependencies.js +4 -4
- package/dist/workspace/errors.d.ts +5 -0
- package/dist/workspace/errors.js +13 -3
- package/dist/workspace/layout.d.ts +17 -6
- package/dist/workspace/layout.js +51 -32
- package/dist/workspace/promotion.d.ts +32 -0
- package/dist/workspace/promotion.js +34 -0
- package/dist/workspace/prune.d.ts +1 -1
- package/dist/workspace/run.d.ts +1 -3
- package/dist/workspace/run.js +6 -15
- package/dist/workspace/setup.d.ts +8 -0
- package/dist/workspace/setup.js +359 -56
- package/dist/workspace/shim.js +1 -1
- package/dist/workspace/structure.d.ts +91 -26
- package/dist/workspace/structure.js +227 -43
- package/dist/workspace/templates.d.ts +9 -3
- package/dist/workspace/templates.js +26 -15
- package/dist/workspace/verification-defaults.d.ts +12 -0
- package/dist/workspace/verification-defaults.js +1017 -0
- package/package.json +30 -24
- package/dist/cli/review.d.ts +0 -12
- package/dist/cli/review.js +0 -33
- package/dist/commands/errors.d.ts +0 -4
- package/dist/commands/errors.js +0 -7
- package/dist/commands/init/evals.d.ts +0 -4
- package/dist/commands/init/evals.js +0 -219
- package/dist/commands/review/command.d.ts +0 -10
- package/dist/commands/review/command.js +0 -26
- package/dist/commands/run/agent-execution.d.ts +0 -19
- package/dist/commands/run/agent-execution.js +0 -63
- package/dist/commands/run/agents/auth-stage.d.ts +0 -23
- package/dist/commands/run/agents/auth-stage.js +0 -108
- package/dist/commands/run/agents/chat-preserver.d.ts +0 -9
- package/dist/commands/run/agents/chat-preserver.js +0 -35
- package/dist/commands/run/agents/eval-runner.d.ts +0 -19
- package/dist/commands/run/agents/eval-runner.js +0 -27
- package/dist/commands/run/agents/failures.js +0 -32
- package/dist/commands/run/agents/preparation.js +0 -123
- package/dist/commands/run/agents.d.ts +0 -14
- package/dist/commands/run/agents.js +0 -47
- package/dist/commands/run/prompts.d.ts +0 -4
- package/dist/commands/run/prompts.js +0 -16
- package/dist/commands/run/sandbox-registry.d.ts +0 -4
- package/dist/commands/run/sandbox-registry.js +0 -54
- package/dist/configs/evals/defaults.d.ts +0 -8
- package/dist/configs/evals/defaults.js +0 -28
- package/dist/configs/evals/detect.d.ts +0 -10
- package/dist/configs/evals/errors.d.ts +0 -16
- package/dist/configs/evals/errors.js +0 -29
- package/dist/configs/evals/loader.d.ts +0 -9
- package/dist/configs/evals/loader.js +0 -46
- package/dist/configs/evals/types.d.ts +0 -42
- package/dist/configs/evals/types.js +0 -74
- package/dist/evals/runner.d.ts +0 -16
- package/dist/evals/runner.js +0 -132
- package/dist/records/persistence.js +0 -469
- package/dist/records/types.d.ts +0 -255
- package/dist/records/types.js +0 -160
- package/dist/render/transcripts/review.d.ts +0 -2
- package/dist/render/transcripts/review.js +0 -36
- /package/dist/{commands/run → agents/runtime}/shim/agent-manifest.d.ts +0 -0
- /package/dist/{commands/run → agents/runtime}/shim/agent-manifest.js +0 -0
- /package/dist/{commands/run → agents/runtime/shim}/argv.d.ts +0 -0
- /package/dist/{commands/run → agents/runtime/shim}/argv.js +0 -0
- /package/dist/{commands/run/agents → agents/runtime}/types.js +0 -0
- /package/dist/{commands/run → domain/run/competition}/agents/preparation.d.ts +0 -0
- /package/dist/{commands/run/phases.js → domain/run/competition/agents/types.js} +0 -0
- /package/dist/{commands/run → domain/run/model}/id.d.ts +0 -0
- /package/dist/{commands/run → domain/run/model}/id.js +0 -0
- /package/dist/{records → persistence}/history-lock.d.ts +0 -0
|
@@ -0,0 +1,1017 @@
|
|
|
1
|
+
import { normalizeProgrammaticCommand } from "../configs/verification/methods.js";
|
|
2
|
+
import { listDetectedProgrammaticDefaults } from "../configs/verification/programmatic-defaults.js";
|
|
3
|
+
import { detectProgrammaticSuggestions } from "../configs/verification/programmatic-detect.js";
|
|
4
|
+
// Default rubric lists, one per stage. Each holds a single `{ template }` entry.
// The spec/reduce/message lists are passed to appendRubricStage by
// buildDefaultVerificationConfigYaml; DEFAULT_RUN_RUBRIC is read by appendRunStage.
const DEFAULT_SPEC_RUBRIC = [{ template: "spec-verification" }];
|
|
5
|
+
const DEFAULT_RUN_RUBRIC = [{ template: "run-verification" }];
|
|
6
|
+
const DEFAULT_REDUCE_RUBRIC = [{ template: "reduce-verification" }];
|
|
7
|
+
const DEFAULT_MESSAGE_RUBRIC = [{ template: "message-verification" }];
|
|
8
|
+
/**
 * Builds the text of the default verification config in YAML form.
 *
 * Programmatic suggestions are detected from `params.root` and
 * `params.environment`; each detected default is kept only when its command
 * normalizes to a truthy value. The stages are written in the fixed order
 * spec, run, reduce, message, with one blank line between consecutive stages
 * and a single trailing newline at the end.
 *
 * @param {{ root: *, environment: * }} params - Values forwarded to
 *   `detectProgrammaticSuggestions`.
 * @returns {Promise<string>} The newline-terminated config text.
 */
export async function buildDefaultVerificationConfigYaml(params) {
    const detected = await detectProgrammaticSuggestions(params.root, params.environment);

    // Keep only the detected defaults whose command survives normalization.
    const runProgrammaticDefaults = [];
    listDetectedProgrammaticDefaults(detected).forEach((candidate) => {
        const normalized = normalizeProgrammaticCommand(candidate.command);
        if (normalized) {
            runProgrammaticDefaults.push({ slug: candidate.slug, command: normalized });
        }
    });

    // One writer per stage, in output order.
    const stageWriters = [
        (out) => appendRubricStage(out, "spec", DEFAULT_SPEC_RUBRIC),
        (out) => appendRunStage(out, runProgrammaticDefaults),
        (out) => appendRubricStage(out, "reduce", DEFAULT_REDUCE_RUBRIC),
        (out) => appendRubricStage(out, "message", DEFAULT_MESSAGE_RUBRIC),
    ];

    const lines = [];
    stageWriters.forEach((writeStage, position) => {
        if (position > 0) {
            // Blank separator line between consecutive stages.
            lines.push("");
        }
        writeStage(lines);
    });
    return lines.join("\n") + "\n";
}
|
|
24
|
+
/**
 * Appends the `run` stage of the default verification config to `lines`.
 *
 * When `runProgrammaticDefaults` is non-empty, a `programmatic:` mapping is
 * written first (one `slug: "command"` line per entry, the command encoded
 * with `JSON.stringify`), followed by a blank line. The `rubric:` list built
 * from `DEFAULT_RUN_RUBRIC` is always written.
 *
 * @param {string[]} lines - Output buffer; mutated in place.
 * @param {Array<{ slug: string, command: * }>} runProgrammaticDefaults -
 *   Programmatic checks to list under `programmatic:`.
 * @returns {void}
 */
function appendRunStage(lines, runProgrammaticDefaults) {
    lines.push("run:");
    if (runProgrammaticDefaults.length > 0) {
        lines.push(" programmatic:");
        for (const entry of runProgrammaticDefaults) {
            // Entries must be indented strictly deeper than the `programmatic:`
            // key; at the same column YAML reads them as sibling keys of `run:`
            // and leaves `programmatic` empty.
            lines.push(`  ${entry.slug}: ${JSON.stringify(entry.command)}`);
        }
        lines.push("");
    }
    lines.push(" rubric:");
    for (const entry of DEFAULT_RUN_RUBRIC) {
        lines.push(` - template: ${entry.template}`);
    }
}
|
|
38
|
+
/**
 * Appends a rubric-only stage to `lines`: the `<stage>:` header, a `rubric:`
 * key, then one `- template: <name>` item per rubric entry, in order.
 *
 * @param {string[]} lines - Output buffer; mutated in place.
 * @param {string} stage - Stage name used as the top-level key.
 * @param {Iterable<{ template: string }>} rubric - Entries to list.
 * @returns {void}
 */
function appendRubricStage(lines, stage, rubric) {
    // Header lines go in before the rubric is iterated.
    lines.push(`${stage}:`, " rubric:");
    for (const { template } of rubric) {
        lines.push(` - template: ${template}`);
    }
}
|
|
45
|
+
export const SHIPPED_VERIFICATION_TEMPLATES = [
|
|
46
|
+
{
|
|
47
|
+
name: "spec-verification",
|
|
48
|
+
prompt: `You are a blinded verifier agent reviewing multiple spec drafts for the same task and choosing the single best draft to execute.
|
|
49
|
+
|
|
50
|
+
Method boundary:
|
|
51
|
+
|
|
52
|
+
- produce per-draft assessments and a final ranking in one structured output
|
|
53
|
+
|
|
54
|
+
Inputs:
|
|
55
|
+
|
|
56
|
+
- the original task description
|
|
57
|
+
- the full blinded draft set
|
|
58
|
+
- any shared context needed to understand the intended outcome
|
|
59
|
+
|
|
60
|
+
Expected working style:
|
|
61
|
+
|
|
62
|
+
1. Read the original task description first.
|
|
63
|
+
2. Derive the key contract items the draft must preserve and use stable ids such as \`C1\`, \`C2\`, \`C3\`.
|
|
64
|
+
3. Inspect the draft set directly.
|
|
65
|
+
4. Assess each draft against the verification rubric.
|
|
66
|
+
5. Record per-draft contract coverage, draft readiness, recommendation posture, and bounded follow-up work.
|
|
67
|
+
6. Derive a strict best-to-worst ranking from those assessments.
|
|
68
|
+
7. Make the ranking strict, complete, and tie-free across the full draft set.
|
|
69
|
+
8. Set \`preferred\` equal to \`ranking[0]\`.
|
|
70
|
+
|
|
71
|
+
Judgment discipline:
|
|
72
|
+
|
|
73
|
+
- focus on whether the draft preserves the requested task and acceptance bar, not whether it sounds polished
|
|
74
|
+
- make claims only when you can point to concrete draft text or source-task evidence
|
|
75
|
+
- call out hidden assumptions, ambiguous boundaries, and missing execution contracts explicitly
|
|
76
|
+
- treat decomposition as a quality dimension, not a mandatory outcome; a draft can be strong if it stays atomic for the right task
|
|
77
|
+
- include lightweight \`evidence_refs\` for each draft assessment
|
|
78
|
+
- keep \`comparison\` focused on cross-draft tradeoffs such as task fit, decomposition quality, and execute-now readiness
|
|
79
|
+
- make \`comparison\` explain why \`ranking[0]\` beat \`ranking[1]\`, not just why lower-ranked drafts lost
|
|
80
|
+
- include \`next_actions\` only for the selected draft path
|
|
81
|
+
|
|
82
|
+
Expected output shape:
|
|
83
|
+
|
|
84
|
+
- \`assessments[]\` with one entry per draft
|
|
85
|
+
- top-level \`preferred\` naming the selected draft
|
|
86
|
+
- each assessment should include:
|
|
87
|
+
- \`draft\`
|
|
88
|
+
- \`completion_status\`
|
|
89
|
+
- \`recommendation_level\`
|
|
90
|
+
- \`quality\`
|
|
91
|
+
- \`evaluation\`
|
|
92
|
+
- \`contract_coverage\`
|
|
93
|
+
- \`implementation_notes\`
|
|
94
|
+
- \`follow_up\`
|
|
95
|
+
- \`evidence_refs\`
|
|
96
|
+
- top-level \`comparison\` should capture cross-draft tradeoffs
|
|
97
|
+
- top-level \`ranking\` must be strict, complete, and tie-free
|
|
98
|
+
- top-level \`rationale\` should explain why \`preferred\` / \`ranking[0]\` is the best execution contract
|
|
99
|
+
- top-level \`next_actions\` should stay short and operational
|
|
100
|
+
`,
|
|
101
|
+
rubric: `# Spec Review
|
|
102
|
+
|
|
103
|
+
Review the draft set by assessing each draft on:
|
|
104
|
+
|
|
105
|
+
- task fidelity
|
|
106
|
+
- boundary control
|
|
107
|
+
- acceptance contract
|
|
108
|
+
- decomposition
|
|
109
|
+
- execution readiness
|
|
110
|
+
- uncertainty handling
|
|
111
|
+
|
|
112
|
+
Then derive a final ranking from those assessments.
|
|
113
|
+
|
|
114
|
+
## Task Fidelity
|
|
115
|
+
|
|
116
|
+
Ask:
|
|
117
|
+
|
|
118
|
+
- Does the draft preserve the actual requested outcome?
|
|
119
|
+
- Does it stay aligned to the originating task rather than drifting into adjacent cleanup or architecture work?
|
|
120
|
+
- Are important terms and goals concrete enough to execute against?
|
|
121
|
+
|
|
122
|
+
Task fidelity should dominate stylistic polish.
|
|
123
|
+
|
|
124
|
+
Every draft assessment should include explicit \`contract_coverage\` entries so the ranking is traceable back to the originating task rather than inferred from vague quality labels.
|
|
125
|
+
|
|
126
|
+
## Boundary Control
|
|
127
|
+
|
|
128
|
+
Ask:
|
|
129
|
+
|
|
130
|
+
- Is in-scope versus out-of-scope legible?
|
|
131
|
+
- Does the draft constrain likely overreach paths?
|
|
132
|
+
- Are constraints and non-goals clear enough to keep downstream execution bounded?
|
|
133
|
+
|
|
134
|
+
## Acceptance Contract
|
|
135
|
+
|
|
136
|
+
Ask:
|
|
137
|
+
|
|
138
|
+
- Does the draft define what done looks like?
|
|
139
|
+
- Are success conditions checkable rather than aspirational?
|
|
140
|
+
- Does it identify the artifacts, behaviors, or tests that should prove completion?
|
|
141
|
+
|
|
142
|
+
## Decomposition
|
|
143
|
+
|
|
144
|
+
Ask:
|
|
145
|
+
|
|
146
|
+
- Does the draft break the work down only when decomposition helps execution?
|
|
147
|
+
- If decomposition is present, are the parts coherent, ordered, and complete enough to act on?
|
|
148
|
+
- If decomposition is absent, is the task still executable as one bounded unit?
|
|
149
|
+
|
|
150
|
+
Strong decomposition can mean either a good phased breakdown or a disciplined choice to keep the task atomic.
|
|
151
|
+
|
|
152
|
+
## Execution Readiness
|
|
153
|
+
|
|
154
|
+
Ask:
|
|
155
|
+
|
|
156
|
+
- Could a run agent plausibly execute this draft without major guesswork?
|
|
157
|
+
- Does the draft expose concrete contracts for CLI flags, persistence, artifacts, or user-facing behavior when they matter?
|
|
158
|
+
- Is the implementation path specific enough to reduce downstream ambiguity?
|
|
159
|
+
|
|
160
|
+
## Uncertainty Handling
|
|
161
|
+
|
|
162
|
+
Ask:
|
|
163
|
+
|
|
164
|
+
- Does the draft surface key assumptions and dependencies?
|
|
165
|
+
- Does it expose meaningful uncertainty instead of hiding it?
|
|
166
|
+
- Are unresolved questions bounded and explicit?
|
|
167
|
+
|
|
168
|
+
## Draft posture
|
|
169
|
+
|
|
170
|
+
Each draft assessment should also name:
|
|
171
|
+
|
|
172
|
+
- \`completion_status\`: \`ready\`, \`ready_with_gap\`, \`ready_with_gaps\`, \`incomplete\`, or \`not_verifiable\`
|
|
173
|
+
- \`recommendation_level\`: \`execute_now\`, \`strong_foundation\`, or \`not_recommended\`
|
|
174
|
+
- \`quality\`: \`high\`, \`medium\`, or \`low\`
|
|
175
|
+
|
|
176
|
+
These fields preserve the practical decision posture a spec selector needs:
|
|
177
|
+
|
|
178
|
+
- \`execute_now\` means the draft is fit to drive execution without reopening major contract questions
|
|
179
|
+
- \`strong_foundation\` means the draft is directionally strong but still needs bounded tightening
|
|
180
|
+
- \`not_recommended\` means the draft should not win the stage
|
|
181
|
+
|
|
182
|
+
Keep descriptive task typing out of this rubric. If you need normalized labels like \`intent\`, \`scope\`, \`stack\`, or \`difficulty\`, use the separate \`spec-type\` rubric.
|
|
183
|
+
|
|
184
|
+
## Ranking rule
|
|
185
|
+
|
|
186
|
+
The final ranking should follow from the draft assessments above.
|
|
187
|
+
|
|
188
|
+
It should answer:
|
|
189
|
+
|
|
190
|
+
- which draft should win?
|
|
191
|
+
- which ordering best reflects execution trustworthiness?
|
|
192
|
+
|
|
193
|
+
It should not ignore the structured per-draft assessments.
|
|
194
|
+
It must rank the full eligible draft set with no ties.
|
|
195
|
+
Set \`preferred\` equal to \`ranking[0]\`.
|
|
196
|
+
|
|
197
|
+
The verification artifact should also include:
|
|
198
|
+
|
|
199
|
+
- \`comparison\`: cross-draft tradeoffs, explicitly including why \`ranking[0]\` beat \`ranking[1]\`
|
|
200
|
+
- \`rationale\`: why \`preferred\` / \`ranking[0]\` is the best choice
|
|
201
|
+
- \`next_actions\`: short, operational follow-up for the selected draft path
|
|
202
|
+
`,
|
|
203
|
+
schema: `type: object
|
|
204
|
+
required:
|
|
205
|
+
- assessments
|
|
206
|
+
- preferred
|
|
207
|
+
- comparison
|
|
208
|
+
- ranking
|
|
209
|
+
- rationale
|
|
210
|
+
- next_actions
|
|
211
|
+
properties:
|
|
212
|
+
assessments:
|
|
213
|
+
type: array
|
|
214
|
+
items:
|
|
215
|
+
type: object
|
|
216
|
+
required:
|
|
217
|
+
- draft
|
|
218
|
+
- completion_status
|
|
219
|
+
- recommendation_level
|
|
220
|
+
- quality
|
|
221
|
+
- evaluation
|
|
222
|
+
- contract_coverage
|
|
223
|
+
- implementation_notes
|
|
224
|
+
- follow_up
|
|
225
|
+
- evidence_refs
|
|
226
|
+
properties:
|
|
227
|
+
draft:
|
|
228
|
+
type: string
|
|
229
|
+
completion_status:
|
|
230
|
+
type: string
|
|
231
|
+
enum: ["ready", "ready_with_gap", "ready_with_gaps", "incomplete", "not_verifiable"]
|
|
232
|
+
recommendation_level:
|
|
233
|
+
type: string
|
|
234
|
+
enum: ["execute_now", "strong_foundation", "not_recommended"]
|
|
235
|
+
quality:
|
|
236
|
+
type: string
|
|
237
|
+
enum: ["high", "medium", "low"]
|
|
238
|
+
evaluation:
|
|
239
|
+
type: object
|
|
240
|
+
required:
|
|
241
|
+
- task_fidelity
|
|
242
|
+
- boundary_control
|
|
243
|
+
- acceptance_contract
|
|
244
|
+
- decomposition
|
|
245
|
+
- execution_readiness
|
|
246
|
+
- uncertainty_handling
|
|
247
|
+
properties:
|
|
248
|
+
task_fidelity:
|
|
249
|
+
type: string
|
|
250
|
+
enum: ["strong", "acceptable", "weak", "not_verifiable"]
|
|
251
|
+
boundary_control:
|
|
252
|
+
type: string
|
|
253
|
+
enum: ["strong", "acceptable", "weak", "not_verifiable"]
|
|
254
|
+
acceptance_contract:
|
|
255
|
+
type: string
|
|
256
|
+
enum: ["strong", "acceptable", "weak", "not_verifiable"]
|
|
257
|
+
decomposition:
|
|
258
|
+
type: string
|
|
259
|
+
enum: ["strong", "acceptable", "weak", "not_verifiable"]
|
|
260
|
+
execution_readiness:
|
|
261
|
+
type: string
|
|
262
|
+
enum: ["strong", "acceptable", "weak", "not_verifiable"]
|
|
263
|
+
uncertainty_handling:
|
|
264
|
+
type: string
|
|
265
|
+
enum: ["strong", "acceptable", "weak", "not_verifiable"]
|
|
266
|
+
contract_coverage:
|
|
267
|
+
type: array
|
|
268
|
+
items:
|
|
269
|
+
type: object
|
|
270
|
+
required:
|
|
271
|
+
- contract_item
|
|
272
|
+
- status
|
|
273
|
+
- note
|
|
274
|
+
- evidence_refs
|
|
275
|
+
properties:
|
|
276
|
+
contract_item:
|
|
277
|
+
type: string
|
|
278
|
+
status:
|
|
279
|
+
type: string
|
|
280
|
+
enum: ["met", "partial", "not_met", "not_verifiable"]
|
|
281
|
+
note:
|
|
282
|
+
type: string
|
|
283
|
+
evidence_refs:
|
|
284
|
+
type: array
|
|
285
|
+
items:
|
|
286
|
+
type: string
|
|
287
|
+
implementation_notes:
|
|
288
|
+
type: string
|
|
289
|
+
follow_up:
|
|
290
|
+
type: array
|
|
291
|
+
items:
|
|
292
|
+
type: string
|
|
293
|
+
evidence_refs:
|
|
294
|
+
type: array
|
|
295
|
+
items:
|
|
296
|
+
type: string
|
|
297
|
+
comparison:
|
|
298
|
+
type: string
|
|
299
|
+
preferred:
|
|
300
|
+
type: string
|
|
301
|
+
ranking:
|
|
302
|
+
type: array
|
|
303
|
+
items:
|
|
304
|
+
type: string
|
|
305
|
+
rationale:
|
|
306
|
+
type: string
|
|
307
|
+
next_actions:
|
|
308
|
+
type: array
|
|
309
|
+
items:
|
|
310
|
+
type: string
|
|
311
|
+
`,
|
|
312
|
+
},
|
|
313
|
+
{
|
|
314
|
+
name: "run-verification",
|
|
315
|
+
prompt: `You are a blinded verifier agent reviewing multiple run candidates for the same selected spec and choosing the single best candidate to apply.
|
|
316
|
+
|
|
317
|
+
Inputs:
|
|
318
|
+
|
|
319
|
+
- the selected spec
|
|
320
|
+
- the full blinded candidate set
|
|
321
|
+
- candidate diffs and supporting artifacts
|
|
322
|
+
- any shared run artifacts needed to understand the task context
|
|
323
|
+
|
|
324
|
+
Expected working style:
|
|
325
|
+
|
|
326
|
+
1. Read the spec first.
|
|
327
|
+
2. Derive the key requirements the run had to satisfy and use stable ids such as \`R1\`, \`R2\`, \`R3\`.
|
|
328
|
+
3. Inspect the candidate set directly.
|
|
329
|
+
4. Assess each candidate against the verification rubric.
|
|
330
|
+
5. Record per-candidate requirement coverage, completion posture, recommendation posture, and bounded follow-up work.
|
|
331
|
+
6. Derive a strict best-to-worst ranking from those assessments.
|
|
332
|
+
7. Make the ranking strict, complete, and tie-free across the full eligible candidate set.
|
|
333
|
+
8. Set \`preferred\` equal to \`ranking[0]\`.
|
|
334
|
+
|
|
335
|
+
Judgment discipline:
|
|
336
|
+
|
|
337
|
+
- make claims only when you can point to evidence from candidate diffs or staged files
|
|
338
|
+
- if you cannot verify something, say so explicitly
|
|
339
|
+
- distinguish cleanup issues from correctness or apply-risk issues
|
|
340
|
+
- focus on whether the candidate actually solved the asked task, not whether it merely looks plausible
|
|
341
|
+
- focus on bounded, decision-relevant follow-up work
|
|
342
|
+
- include lightweight \`evidence_refs\` for each candidate assessment
|
|
343
|
+
- keep \`comparison\` focused on cross-candidate tradeoffs such as scope adherence, approach quality, and apply-now cleanliness
|
|
344
|
+
- make \`comparison\` explain why \`ranking[0]\` beat \`ranking[1]\`, not just why lower-ranked candidates lost
|
|
345
|
+
- include \`next_actions\` only for the selected path
|
|
346
|
+
|
|
347
|
+
Expected output shape:
|
|
348
|
+
|
|
349
|
+
- \`assessments[]\` with one entry per candidate
|
|
350
|
+
- top-level \`preferred\` naming the selected candidate
|
|
351
|
+
- each assessment should include:
|
|
352
|
+
- \`candidate\`
|
|
353
|
+
- \`completion_status\`
|
|
354
|
+
- \`recommendation_level\`
|
|
355
|
+
- \`quality\`
|
|
356
|
+
- \`evaluation\`
|
|
357
|
+
- \`requirements_coverage\`
|
|
358
|
+
- \`implementation_notes\`
|
|
359
|
+
- \`follow_up\`
|
|
360
|
+
- \`evidence_refs\`
|
|
361
|
+
- top-level \`comparison\` should capture cross-candidate tradeoffs
|
|
362
|
+
- top-level \`ranking\` must be strict, complete, and tie-free
|
|
363
|
+
- top-level \`rationale\` should explain why \`preferred\` / \`ranking[0]\` is the best apply choice
|
|
364
|
+
- top-level \`next_actions\` should stay short and operational
|
|
365
|
+
`,
|
|
366
|
+
rubric: `# Run Review
|
|
367
|
+
|
|
368
|
+
Review the candidate set by assessing each candidate on:
|
|
369
|
+
|
|
370
|
+
- spec adherence
|
|
371
|
+
- approach
|
|
372
|
+
- codebase fit
|
|
373
|
+
- apply risk
|
|
374
|
+
- evidence
|
|
375
|
+
|
|
376
|
+
Then derive a final ranking from those assessments.
|
|
377
|
+
|
|
378
|
+
## Spec Adherence
|
|
379
|
+
|
|
380
|
+
Ask:
|
|
381
|
+
|
|
382
|
+
- Does the candidate satisfy the selected spec?
|
|
383
|
+
- Are key requirements clearly met, partially met, not met, or not verifiable?
|
|
384
|
+
- Are there obvious mismatches between the changed artifacts and the intended outcome?
|
|
385
|
+
|
|
386
|
+
Spec adherence should dominate elegance or cleanup concerns.
|
|
387
|
+
|
|
388
|
+
Every candidate assessment should include explicit \`requirements_coverage\` entries so the ranking is traceable back to the asked task rather than inferred from generic quality labels.
|
|
389
|
+
|
|
390
|
+
## Approach
|
|
391
|
+
|
|
392
|
+
Ask:
|
|
393
|
+
|
|
394
|
+
- Does the candidate take the right approach to the task, not just produce a superficially acceptable output?
|
|
395
|
+
- Does it avoid scope drift, indirect fixes, or restructuring the task did not ask for?
|
|
396
|
+
- Does it create a strong enough foundation without introducing unnecessary complexity?
|
|
397
|
+
|
|
398
|
+
This is where verification should capture the gap between "passed checks" and "is actually the change we would keep."
|
|
399
|
+
|
|
400
|
+
## Codebase Fit
|
|
401
|
+
|
|
402
|
+
Ask:
|
|
403
|
+
|
|
404
|
+
- Does the implementation fit existing patterns, interfaces, and boundaries?
|
|
405
|
+
- Does it look like a coherent extension of the codebase rather than an alien insertion?
|
|
406
|
+
- Are migrations, rollbacks, or integration seams well-bounded?
|
|
407
|
+
|
|
408
|
+
## Apply Risk
|
|
409
|
+
|
|
410
|
+
Ask:
|
|
411
|
+
|
|
412
|
+
- What is the likely blast radius of applying this candidate?
|
|
413
|
+
- Are there hidden regressions, ambiguous behaviors, or fragile assumptions?
|
|
414
|
+
- Are any missing steps or follow-ups bounded and low-risk, or do they open up unbounded uncertainty?
|
|
415
|
+
|
|
416
|
+
## Evidence
|
|
417
|
+
|
|
418
|
+
Ask:
|
|
419
|
+
|
|
420
|
+
- Are important claims supported by concrete artifacts?
|
|
421
|
+
- Does the candidate leave meaningful uncertainty unresolved?
|
|
422
|
+
|
|
423
|
+
Evidence here means direct artifact evidence for the candidate itself:
|
|
424
|
+
|
|
425
|
+
- diffs
|
|
426
|
+
- changed files
|
|
427
|
+
- summaries when present
|
|
428
|
+
- cited files and line ranges
|
|
429
|
+
|
|
430
|
+
## Candidate posture
|
|
431
|
+
|
|
432
|
+
Each candidate assessment should also name:
|
|
433
|
+
|
|
434
|
+
- \`completion_status\`: \`complete\`, \`complete_with_gap\`, \`complete_with_gaps\`, \`incomplete\`, or \`not_verifiable\`
|
|
435
|
+
- \`recommendation_level\`: \`apply_now\`, \`strong_foundation\`, or \`not_recommended\`
|
|
436
|
+
- \`quality\`: \`high\`, \`medium\`, or \`low\`
|
|
437
|
+
|
|
438
|
+
These fields preserve the practical decision posture the current verification artifact captures:
|
|
439
|
+
|
|
440
|
+
- \`apply_now\` means the candidate is fit to apply without reopening major questions
|
|
441
|
+
- \`strong_foundation\` means the candidate is strong but still needs bounded follow-up before it is the cleanest apply choice
|
|
442
|
+
- \`not_recommended\` means the candidate should not win the run
|
|
443
|
+
|
|
444
|
+
## Ranking rule
|
|
445
|
+
|
|
446
|
+
The final ranking should follow from the candidate assessments above.
|
|
447
|
+
|
|
448
|
+
It should answer:
|
|
449
|
+
|
|
450
|
+
- which candidate should win?
|
|
451
|
+
- which ordering best reflects apply trustworthiness?
|
|
452
|
+
|
|
453
|
+
It should not ignore the structured per-candidate assessments.
|
|
454
|
+
It must rank the full eligible candidate set with no ties.
|
|
455
|
+
Set \`preferred\` equal to \`ranking[0]\`.
|
|
456
|
+
|
|
457
|
+
The verification artifact should also include:
|
|
458
|
+
|
|
459
|
+
- \`comparison\`: cross-candidate tradeoffs, explicitly including why \`ranking[0]\` beat \`ranking[1]\`
|
|
460
|
+
- \`rationale\`: why \`preferred\` / \`ranking[0]\` is the best choice
|
|
461
|
+
- \`next_actions\`: short, operational follow-up for the selected path
|
|
462
|
+
`,
|
|
463
|
+
schema: `type: object
|
|
464
|
+
required:
|
|
465
|
+
- assessments
|
|
466
|
+
- preferred
|
|
467
|
+
- comparison
|
|
468
|
+
- ranking
|
|
469
|
+
- rationale
|
|
470
|
+
- next_actions
|
|
471
|
+
properties:
|
|
472
|
+
assessments:
|
|
473
|
+
type: array
|
|
474
|
+
items:
|
|
475
|
+
type: object
|
|
476
|
+
required:
|
|
477
|
+
- candidate
|
|
478
|
+
- completion_status
|
|
479
|
+
- recommendation_level
|
|
480
|
+
- quality
|
|
481
|
+
- evaluation
|
|
482
|
+
- requirements_coverage
|
|
483
|
+
- implementation_notes
|
|
484
|
+
- follow_up
|
|
485
|
+
- evidence_refs
|
|
486
|
+
properties:
|
|
487
|
+
candidate:
|
|
488
|
+
type: string
|
|
489
|
+
completion_status:
|
|
490
|
+
type: string
|
|
491
|
+
enum: ["complete", "complete_with_gap", "complete_with_gaps", "incomplete", "not_verifiable"]
|
|
492
|
+
recommendation_level:
|
|
493
|
+
type: string
|
|
494
|
+
enum: ["apply_now", "strong_foundation", "not_recommended"]
|
|
495
|
+
quality:
|
|
496
|
+
type: string
|
|
497
|
+
enum: ["high", "medium", "low"]
|
|
498
|
+
evaluation:
|
|
499
|
+
type: object
|
|
500
|
+
required:
|
|
501
|
+
- spec_adherence
|
|
502
|
+
- approach
|
|
503
|
+
- codebase_fit
|
|
504
|
+
- apply_risk
|
|
505
|
+
- evidence
|
|
506
|
+
properties:
|
|
507
|
+
spec_adherence:
|
|
508
|
+
type: string
|
|
509
|
+
enum: ["strong", "acceptable", "weak", "not_verifiable"]
|
|
510
|
+
approach:
|
|
511
|
+
type: string
|
|
512
|
+
enum: ["strong", "acceptable", "weak", "not_verifiable"]
|
|
513
|
+
codebase_fit:
|
|
514
|
+
type: string
|
|
515
|
+
enum: ["strong", "acceptable", "weak", "not_verifiable"]
|
|
516
|
+
apply_risk:
|
|
517
|
+
type: string
|
|
518
|
+
enum: ["low", "medium", "high", "unknown"]
|
|
519
|
+
evidence:
|
|
520
|
+
type: string
|
|
521
|
+
enum: ["strong", "acceptable", "weak", "missing"]
|
|
522
|
+
requirements_coverage:
|
|
523
|
+
type: array
|
|
524
|
+
items:
|
|
525
|
+
type: object
|
|
526
|
+
required:
|
|
527
|
+
- requirement
|
|
528
|
+
- status
|
|
529
|
+
- note
|
|
530
|
+
- evidence_refs
|
|
531
|
+
properties:
|
|
532
|
+
requirement:
|
|
533
|
+
type: string
|
|
534
|
+
status:
|
|
535
|
+
type: string
|
|
536
|
+
enum: ["met", "partial", "not_met", "not_verifiable"]
|
|
537
|
+
note:
|
|
538
|
+
type: string
|
|
539
|
+
evidence_refs:
|
|
540
|
+
type: array
|
|
541
|
+
items:
|
|
542
|
+
type: string
|
|
543
|
+
implementation_notes:
|
|
544
|
+
type: string
|
|
545
|
+
follow_up:
|
|
546
|
+
type: array
|
|
547
|
+
items:
|
|
548
|
+
type: string
|
|
549
|
+
evidence_refs:
|
|
550
|
+
type: array
|
|
551
|
+
items:
|
|
552
|
+
type: string
|
|
553
|
+
comparison:
|
|
554
|
+
type: string
|
|
555
|
+
preferred:
|
|
556
|
+
type: string
|
|
557
|
+
ranking:
|
|
558
|
+
type: array
|
|
559
|
+
items:
|
|
560
|
+
type: string
|
|
561
|
+
rationale:
|
|
562
|
+
type: string
|
|
563
|
+
next_actions:
|
|
564
|
+
type: array
|
|
565
|
+
items:
|
|
566
|
+
type: string
|
|
567
|
+
`,
|
|
568
|
+
},
|
|
569
|
+
{
|
|
570
|
+
name: "reduce-verification",
|
|
571
|
+
prompt: `You are performing reduction verification over a blinded set of reduction candidates for one completed target session.
|
|
572
|
+
|
|
573
|
+
Your goal is to decide which reduction is the best carry-forward artifact for later use.
|
|
574
|
+
|
|
575
|
+
Read order:
|
|
576
|
+
|
|
577
|
+
1. Read the blinded reduction artifacts for all candidates.
|
|
578
|
+
2. Compare them on fidelity, usefulness, compression quality, and next-step utility.
|
|
579
|
+
3. Produce one structured result that includes per-candidate assessments, a strict full ranking, and an explicit preferred reduction.
|
|
580
|
+
|
|
581
|
+
What matters:
|
|
582
|
+
|
|
583
|
+
- preserve important facts from the source session
|
|
584
|
+
- remove noise without dropping durable signal
|
|
585
|
+
- surface unresolved uncertainty honestly
|
|
586
|
+
- preserve the decisions, caveats, and next-step guidance a later operator would actually need
|
|
587
|
+
- produce guidance that is actually useful for later \`spec\`, \`run\`, \`reduce\`, or \`verify\` work
|
|
588
|
+
|
|
589
|
+
What does not matter:
|
|
590
|
+
|
|
591
|
+
- prose flourish
|
|
592
|
+
- maximal detail for its own sake
|
|
593
|
+
- ranking a reduction highly just because it is long
|
|
594
|
+
- re-litigating the full session when the reduction should carry the durable outcome forward
|
|
595
|
+
|
|
596
|
+
Do not defer to any one reduction because of agent provenance. Candidates are blinded and should be judged on artifact quality alone.
|
|
597
|
+
|
|
598
|
+
Ranking requirements:
|
|
599
|
+
|
|
600
|
+
- rank the full eligible candidate set
|
|
601
|
+
- do not use ties
|
|
602
|
+
- set \`preferred\` equal to \`ranking[0]\`
|
|
603
|
+
- make \`comparison\` explain why \`ranking[0]\` beat \`ranking[1]\`
|
|
604
|
+
|
|
605
|
+
Expected output shape:
|
|
606
|
+
|
|
607
|
+
- \`assessments[]\` with one entry per candidate reduction
|
|
608
|
+
- each assessment should include:
|
|
609
|
+
- \`candidate\`
|
|
610
|
+
- \`recommendation_level\`
|
|
611
|
+
- \`quality\`
|
|
612
|
+
- \`evaluation\`
|
|
613
|
+
- \`strengths\`
|
|
614
|
+
- \`gaps\`
|
|
615
|
+
- \`evidence_refs\`
|
|
616
|
+
- top-level \`preferred\` naming the selected reduction
|
|
617
|
+
- top-level \`comparison\` explaining why \`ranking[0]\` beat \`ranking[1]\`
|
|
618
|
+
- top-level \`ranking\` must be strict, complete, and tie-free
|
|
619
|
+
- top-level \`rationale\` should explain why \`preferred\` is the best carry-forward artifact
|
|
620
|
+
- top-level \`next_actions\` should stay short and operational
|
|
621
|
+
`,
|
|
622
|
+
rubric: `# Reduce Review Rubric
|
|
623
|
+
|
|
624
|
+
This rubric answers one question: which reduction is the most useful durable carry-forward artifact?
|
|
625
|
+
|
|
626
|
+
Evaluate each candidate on:
|
|
627
|
+
|
|
628
|
+
- fidelity
|
|
629
|
+
- does it preserve the important facts and conclusions from the source session?
|
|
630
|
+
- compression
|
|
631
|
+
- does it remove noise without discarding durable signal?
|
|
632
|
+
- uncertainty
|
|
633
|
+
- does it preserve unresolved caveats instead of laundering them away?
|
|
634
|
+
- next_step_utility
|
|
635
|
+
- would this artifact actually help a later operator or human continue the work without reopening the whole source session?
|
|
636
|
+
- evidence
|
|
637
|
+
- are the claims grounded in visible source artifacts?
|
|
638
|
+
|
|
639
|
+
Recommendation posture:
|
|
640
|
+
|
|
641
|
+
- carry_forward_now
|
|
642
|
+
- strong enough to use as the preferred reduction artifact immediately
|
|
643
|
+
- usable_with_gap
|
|
644
|
+
- useful, but has clear omissions or weaknesses
|
|
645
|
+
- not_recommended
|
|
646
|
+
- too lossy, misleading, or weak to be the preferred carry-forward artifact
|
|
647
|
+
|
|
648
|
+
Comparison guidance:
|
|
649
|
+
|
|
650
|
+
- prefer durable synthesis over exhaustive recap
|
|
651
|
+
- prefer honest uncertainty over false confidence
|
|
652
|
+
- prefer actionable carry-forward guidance over generic summary language
|
|
653
|
+
- prefer reductions that preserve the session's decisions and caveats, not just its topic area
|
|
654
|
+
- do not reward verbosity by default
|
|
655
|
+
|
|
656
|
+
## Candidate posture
|
|
657
|
+
|
|
658
|
+
Each candidate assessment should also name:
|
|
659
|
+
|
|
660
|
+
- \`recommendation_level\`: \`carry_forward_now\`, \`usable_with_gap\`, or \`not_recommended\`
|
|
661
|
+
- \`quality\`: \`high\`, \`medium\`, or \`low\`
|
|
662
|
+
|
|
663
|
+
These fields preserve the practical decision posture a reduction selector needs:
|
|
664
|
+
|
|
665
|
+
- \`carry_forward_now\` means the reduction is strong enough to use immediately as the preferred carry-forward artifact
|
|
666
|
+
- \`usable_with_gap\` means the reduction is directionally useful but has bounded omissions or weaknesses
|
|
667
|
+
- \`not_recommended\` means the reduction should not win the stage
|
|
668
|
+
|
|
669
|
+
Use \`strengths\` and \`gaps\` for per-candidate observations only. Put cross-candidate tradeoffs in \`comparison\` and final winner justification in \`rationale\`.
|
|
670
|
+
|
|
671
|
+
Ranking rule:
|
|
672
|
+
|
|
673
|
+
- rank the full eligible candidate set with no ties
|
|
674
|
+
- set \`preferred\` equal to \`ranking[0]\`
|
|
675
|
+
- make \`comparison\` explain why \`ranking[0]\` beat \`ranking[1]\`
|
|
676
|
+
|
|
677
|
+
The verification artifact should also include:
|
|
678
|
+
|
|
679
|
+
- \`comparison\`: cross-candidate tradeoffs, explicitly including why \`ranking[0]\` beat \`ranking[1]\`
|
|
680
|
+
- \`rationale\`: why \`preferred\` / \`ranking[0]\` is the best carry-forward choice
|
|
681
|
+
- \`next_actions\`: short, operational follow-up for the selected path only
|
|
682
|
+
`,
|
|
683
|
+
schema: `type: object
|
|
684
|
+
required:
|
|
685
|
+
- assessments
|
|
686
|
+
- preferred
|
|
687
|
+
- comparison
|
|
688
|
+
- ranking
|
|
689
|
+
- rationale
|
|
690
|
+
- next_actions
|
|
691
|
+
properties:
|
|
692
|
+
assessments:
|
|
693
|
+
type: array
|
|
694
|
+
items:
|
|
695
|
+
type: object
|
|
696
|
+
required:
|
|
697
|
+
- candidate
|
|
698
|
+
- recommendation_level
|
|
699
|
+
- quality
|
|
700
|
+
- evaluation
|
|
701
|
+
- strengths
|
|
702
|
+
- gaps
|
|
703
|
+
- evidence_refs
|
|
704
|
+
properties:
|
|
705
|
+
candidate:
|
|
706
|
+
type: string
|
|
707
|
+
recommendation_level:
|
|
708
|
+
type: string
|
|
709
|
+
enum: ["carry_forward_now", "usable_with_gap", "not_recommended"]
|
|
710
|
+
quality:
|
|
711
|
+
type: string
|
|
712
|
+
enum: ["high", "medium", "low"]
|
|
713
|
+
evaluation:
|
|
714
|
+
type: object
|
|
715
|
+
required:
|
|
716
|
+
- fidelity
|
|
717
|
+
- compression
|
|
718
|
+
- uncertainty
|
|
719
|
+
- next_step_utility
|
|
720
|
+
- evidence
|
|
721
|
+
properties:
|
|
722
|
+
fidelity:
|
|
723
|
+
type: string
|
|
724
|
+
enum: ["strong", "acceptable", "weak", "not_verifiable"]
|
|
725
|
+
compression:
|
|
726
|
+
type: string
|
|
727
|
+
enum: ["strong", "acceptable", "weak", "not_verifiable"]
|
|
728
|
+
uncertainty:
|
|
729
|
+
type: string
|
|
730
|
+
enum: ["strong", "acceptable", "weak", "not_verifiable"]
|
|
731
|
+
next_step_utility:
|
|
732
|
+
type: string
|
|
733
|
+
enum: ["strong", "acceptable", "weak", "not_verifiable"]
|
|
734
|
+
evidence:
|
|
735
|
+
type: string
|
|
736
|
+
enum: ["strong", "acceptable", "weak", "missing"]
|
|
737
|
+
strengths:
|
|
738
|
+
type: array
|
|
739
|
+
items:
|
|
740
|
+
type: string
|
|
741
|
+
gaps:
|
|
742
|
+
type: array
|
|
743
|
+
items:
|
|
744
|
+
type: string
|
|
745
|
+
evidence_refs:
|
|
746
|
+
type: array
|
|
747
|
+
items:
|
|
748
|
+
type: string
|
|
749
|
+
comparison:
|
|
750
|
+
type: string
|
|
751
|
+
preferred:
|
|
752
|
+
type: string
|
|
753
|
+
ranking:
|
|
754
|
+
type: array
|
|
755
|
+
items:
|
|
756
|
+
type: string
|
|
757
|
+
rationale:
|
|
758
|
+
type: string
|
|
759
|
+
next_actions:
|
|
760
|
+
type: array
|
|
761
|
+
items:
|
|
762
|
+
type: string
|
|
763
|
+
`,
|
|
764
|
+
},
|
|
765
|
+
{
|
|
766
|
+
name: "message-verification",
|
|
767
|
+
prompt: `You are a blinded verifier agent reviewing multiple message responses to the same prompt and choosing the single best response artifact to carry forward.
|
|
768
|
+
|
|
769
|
+
Inputs:
|
|
770
|
+
|
|
771
|
+
- the original message prompt
|
|
772
|
+
- the full blinded response set
|
|
773
|
+
- any shared context needed to understand the prompt
|
|
774
|
+
|
|
775
|
+
Expected working style:
|
|
776
|
+
|
|
777
|
+
1. Read the original message prompt first.
|
|
778
|
+
2. Derive the key response requirements the prompt establishes and use stable ids such as \`R1\`, \`R2\`, \`R3\`.
|
|
779
|
+
3. Inspect the blinded response set directly.
|
|
780
|
+
4. Assess each response against the verification rubric.
|
|
781
|
+
5. Record per-response requirement coverage, completion posture, recommendation posture, and bounded follow-up work.
|
|
782
|
+
6. Derive a strict best-to-worst ranking from those assessments.
|
|
783
|
+
7. Make the ranking strict, complete, and tie-free across the full eligible response set.
|
|
784
|
+
8. Set \`preferred\` equal to \`ranking[0]\`.
|
|
785
|
+
|
|
786
|
+
Judgment discipline:
|
|
787
|
+
|
|
788
|
+
- make claims only when you can point to evidence from the prompt or the response artifacts
|
|
789
|
+
- if you cannot verify something, say so explicitly
|
|
790
|
+
- focus on whether the response actually answered the asked prompt, not whether it merely sounds polished
|
|
791
|
+
- distinguish bounded omissions from fundamental prompt misses
|
|
792
|
+
- focus on decision-relevant follow-up work
|
|
793
|
+
- include lightweight \`evidence_refs\` for each response assessment
|
|
794
|
+
- keep \`comparison\` focused on cross-response tradeoffs such as prompt adherence, response quality, and carry-forward usefulness
|
|
795
|
+
- make \`comparison\` explain why \`ranking[0]\` beat \`ranking[1]\`, not just why lower-ranked responses lost
|
|
796
|
+
- include \`next_actions\` only for the selected response path
|
|
797
|
+
|
|
798
|
+
Expected output shape:
|
|
799
|
+
|
|
800
|
+
- \`assessments[]\` with one entry per response candidate
|
|
801
|
+
- top-level \`preferred\` naming the selected candidate
|
|
802
|
+
- each assessment should include:
|
|
803
|
+
- \`candidate\`
|
|
804
|
+
- \`completion_status\`
|
|
805
|
+
- \`recommendation_level\`
|
|
806
|
+
- \`quality\`
|
|
807
|
+
- \`evaluation\`
|
|
808
|
+
- \`requirements_coverage\`
|
|
809
|
+
- \`implementation_notes\`
|
|
810
|
+
- \`follow_up\`
|
|
811
|
+
- \`evidence_refs\`
|
|
812
|
+
- top-level \`comparison\` should capture cross-candidate tradeoffs
|
|
813
|
+
- top-level \`ranking\` must be strict, complete, and tie-free
|
|
814
|
+
- top-level \`rationale\` should explain why \`preferred\` / \`ranking[0]\` is the best carry-forward response
|
|
815
|
+
- top-level \`next_actions\` should stay short and operational
|
|
816
|
+
`,
|
|
817
|
+
rubric: `# Message Review
|
|
818
|
+
|
|
819
|
+
Review the response set by assessing each candidate on:
|
|
820
|
+
|
|
821
|
+
- prompt adherence
|
|
822
|
+
- task fit
|
|
823
|
+
- response quality
|
|
824
|
+
- decision usefulness
|
|
825
|
+
- evidence
|
|
826
|
+
|
|
827
|
+
Then derive a final ranking from those assessments.
|
|
828
|
+
|
|
829
|
+
## Prompt Adherence
|
|
830
|
+
|
|
831
|
+
Ask:
|
|
832
|
+
|
|
833
|
+
- Does the response answer the actual prompt?
|
|
834
|
+
- Are key prompt requirements clearly met, partially met, not met, or not verifiable?
|
|
835
|
+
- Are there obvious mismatches between what the prompt asked for and what the response delivered?
|
|
836
|
+
|
|
837
|
+
Prompt adherence should dominate polish or stylistic preference.
|
|
838
|
+
|
|
839
|
+
Every candidate assessment should include explicit \`requirements_coverage\` entries so the ranking is traceable back to the original prompt rather than inferred from generic quality labels.
|
|
840
|
+
|
|
841
|
+
## Task Fit
|
|
842
|
+
|
|
843
|
+
Ask:
|
|
844
|
+
|
|
845
|
+
- Does the response take the right posture for the prompt, not just produce superficially plausible language?
|
|
846
|
+
- Does it stay within the asked scope instead of drifting into adjacent advice, cleanup, or speculation?
|
|
847
|
+
- Does it answer at the right level of abstraction for the task?
|
|
848
|
+
|
|
849
|
+
## Response Quality
|
|
850
|
+
|
|
851
|
+
Ask:
|
|
852
|
+
|
|
853
|
+
- Is the response coherent, direct, and internally consistent?
|
|
854
|
+
- Does it surface uncertainty honestly instead of bluffing?
|
|
855
|
+
- Does it preserve the important distinctions or caveats the prompt context requires?
|
|
856
|
+
|
|
857
|
+
## Decision Usefulness
|
|
858
|
+
|
|
859
|
+
Ask:
|
|
860
|
+
|
|
861
|
+
- Would this response be the best durable artifact to keep from the message session?
|
|
862
|
+
- Does it leave later operators or humans with a clear answer, recommendation, or next step?
|
|
863
|
+
- Are any missing follow-ups bounded and low-risk, or do they reopen major prompt questions?
|
|
864
|
+
|
|
865
|
+
## Evidence
|
|
866
|
+
|
|
867
|
+
Ask:
|
|
868
|
+
|
|
869
|
+
- Are important claims supported by concrete prompt or response evidence?
|
|
870
|
+
- Does the response leave meaningful uncertainty unresolved?
|
|
871
|
+
|
|
872
|
+
Evidence here means direct artifact evidence for the prompt/response pair itself:
|
|
873
|
+
|
|
874
|
+
- the staged prompt artifact
|
|
875
|
+
- the blinded response artifacts
|
|
876
|
+
- cited prompt or response excerpts
|
|
877
|
+
|
|
878
|
+
## Candidate posture
|
|
879
|
+
|
|
880
|
+
Each candidate assessment should also name:
|
|
881
|
+
|
|
882
|
+
- \`completion_status\`: \`complete\`, \`complete_with_gap\`, \`complete_with_gaps\`, \`incomplete\`, or \`not_verifiable\`
|
|
883
|
+
- \`recommendation_level\`: \`carry_forward_now\`, \`strong_foundation\`, or \`not_recommended\`
|
|
884
|
+
- \`quality\`: \`high\`, \`medium\`, or \`low\`
|
|
885
|
+
|
|
886
|
+
These fields preserve the practical decision posture a message selector needs:
|
|
887
|
+
|
|
888
|
+
- \`carry_forward_now\` means the response is fit to keep as the preferred durable message artifact immediately
|
|
889
|
+
- \`strong_foundation\` means the response is directionally strong but still needs bounded tightening or follow-up
|
|
890
|
+
- \`not_recommended\` means the response should not win the message verification
|
|
891
|
+
|
|
892
|
+
## Ranking rule
|
|
893
|
+
|
|
894
|
+
The final ranking should follow from the candidate assessments above.
|
|
895
|
+
|
|
896
|
+
It should answer:
|
|
897
|
+
|
|
898
|
+
- which response should win?
|
|
899
|
+
- which ordering best reflects carry-forward usefulness?
|
|
900
|
+
|
|
901
|
+
It should not ignore the structured per-candidate assessments.
|
|
902
|
+
It must rank the full eligible response set with no ties.
|
|
903
|
+
Set \`preferred\` equal to \`ranking[0]\`.
|
|
904
|
+
|
|
905
|
+
The verification artifact should also include:
|
|
906
|
+
|
|
907
|
+
- \`comparison\`: cross-candidate tradeoffs, explicitly including why \`ranking[0]\` beat \`ranking[1]\`
|
|
908
|
+
- \`rationale\`: why \`preferred\` / \`ranking[0]\` is the best choice
|
|
909
|
+
- \`next_actions\`: short, operational follow-up for the selected path
|
|
910
|
+
`,
|
|
911
|
+
schema: `type: object
|
|
912
|
+
required:
|
|
913
|
+
- assessments
|
|
914
|
+
- preferred
|
|
915
|
+
- comparison
|
|
916
|
+
- ranking
|
|
917
|
+
- rationale
|
|
918
|
+
- next_actions
|
|
919
|
+
properties:
|
|
920
|
+
assessments:
|
|
921
|
+
type: array
|
|
922
|
+
items:
|
|
923
|
+
type: object
|
|
924
|
+
required:
|
|
925
|
+
- candidate
|
|
926
|
+
- completion_status
|
|
927
|
+
- recommendation_level
|
|
928
|
+
- quality
|
|
929
|
+
- evaluation
|
|
930
|
+
- requirements_coverage
|
|
931
|
+
- implementation_notes
|
|
932
|
+
- follow_up
|
|
933
|
+
- evidence_refs
|
|
934
|
+
properties:
|
|
935
|
+
candidate:
|
|
936
|
+
type: string
|
|
937
|
+
completion_status:
|
|
938
|
+
type: string
|
|
939
|
+
enum: ["complete", "complete_with_gap", "complete_with_gaps", "incomplete", "not_verifiable"]
|
|
940
|
+
recommendation_level:
|
|
941
|
+
type: string
|
|
942
|
+
enum: ["carry_forward_now", "strong_foundation", "not_recommended"]
|
|
943
|
+
quality:
|
|
944
|
+
type: string
|
|
945
|
+
enum: ["high", "medium", "low"]
|
|
946
|
+
evaluation:
|
|
947
|
+
type: object
|
|
948
|
+
required:
|
|
949
|
+
- prompt_adherence
|
|
950
|
+
- task_fit
|
|
951
|
+
- response_quality
|
|
952
|
+
- decision_usefulness
|
|
953
|
+
- evidence
|
|
954
|
+
properties:
|
|
955
|
+
prompt_adherence:
|
|
956
|
+
type: string
|
|
957
|
+
enum: ["strong", "acceptable", "weak", "not_verifiable"]
|
|
958
|
+
task_fit:
|
|
959
|
+
type: string
|
|
960
|
+
enum: ["strong", "acceptable", "weak", "not_verifiable"]
|
|
961
|
+
response_quality:
|
|
962
|
+
type: string
|
|
963
|
+
enum: ["strong", "acceptable", "weak", "not_verifiable"]
|
|
964
|
+
decision_usefulness:
|
|
965
|
+
type: string
|
|
966
|
+
enum: ["strong", "acceptable", "weak", "not_verifiable"]
|
|
967
|
+
evidence:
|
|
968
|
+
type: string
|
|
969
|
+
enum: ["strong", "acceptable", "weak", "missing"]
|
|
970
|
+
requirements_coverage:
|
|
971
|
+
type: array
|
|
972
|
+
items:
|
|
973
|
+
type: object
|
|
974
|
+
required:
|
|
975
|
+
- requirement
|
|
976
|
+
- status
|
|
977
|
+
- note
|
|
978
|
+
- evidence_refs
|
|
979
|
+
properties:
|
|
980
|
+
requirement:
|
|
981
|
+
type: string
|
|
982
|
+
status:
|
|
983
|
+
type: string
|
|
984
|
+
enum: ["met", "partial", "not_met", "not_verifiable"]
|
|
985
|
+
note:
|
|
986
|
+
type: string
|
|
987
|
+
evidence_refs:
|
|
988
|
+
type: array
|
|
989
|
+
items:
|
|
990
|
+
type: string
|
|
991
|
+
implementation_notes:
|
|
992
|
+
type: string
|
|
993
|
+
follow_up:
|
|
994
|
+
type: array
|
|
995
|
+
items:
|
|
996
|
+
type: string
|
|
997
|
+
evidence_refs:
|
|
998
|
+
type: array
|
|
999
|
+
items:
|
|
1000
|
+
type: string
|
|
1001
|
+
comparison:
|
|
1002
|
+
type: string
|
|
1003
|
+
preferred:
|
|
1004
|
+
type: string
|
|
1005
|
+
ranking:
|
|
1006
|
+
type: array
|
|
1007
|
+
items:
|
|
1008
|
+
type: string
|
|
1009
|
+
rationale:
|
|
1010
|
+
type: string
|
|
1011
|
+
next_actions:
|
|
1012
|
+
type: array
|
|
1013
|
+
items:
|
|
1014
|
+
type: string
|
|
1015
|
+
`,
|
|
1016
|
+
},
|
|
1017
|
+
];
|