@exaudeus/workrail 3.11.0 → 3.11.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1026,8 +1026,8 @@
1026
1026
  "bytes": 5976
1027
1027
  },
1028
1028
  "mcp/tools.js": {
1029
- "sha256": "aa364f3e613fc61a39fc7336a723231da06852d92ec7840a95327aa370e42e1c",
1030
- "bytes": 8360
1029
+ "sha256": "bf9043c94b66e7692f70e9f5f36f408d40441bb69866bda4e7b0fd646eba6022",
1030
+ "bytes": 8635
1031
1031
  },
1032
1032
  "mcp/transports/http-entry.d.ts": {
1033
1033
  "sha256": "35d313b120dcf38643de9462559163581b89943fe432706986252e8b698b9507",
@@ -1122,8 +1122,8 @@
1122
1122
  "bytes": 7206
1123
1123
  },
1124
1124
  "mcp/v2/tools.js": {
1125
- "sha256": "9fb084ad964296bf49e94327aacc8c55857822382f1636278143f8074572bc40",
1126
- "bytes": 10661
1125
+ "sha256": "d2de769feb5bb8c6ddcd415594f7cf49380f13bae5ddc422445381adcef038a3",
1126
+ "bytes": 10771
1127
1127
  },
1128
1128
  "mcp/validation/bounded-json.d.ts": {
1129
1129
  "sha256": "82203ac6123d5c6989606c3b5405aaea99ab829c8958835f9ae3ba45b8bc8fd5",
package/dist/mcp/tools.js CHANGED
@@ -8,7 +8,7 @@ exports.WorkflowListInput = zod_1.z.object({});
8
8
  exports.WorkflowGetInput = zod_1.z.object({
9
9
  workflowId: zod_1.z
10
10
  .string()
11
- .regex(/^[A-Za-z0-9_-]+$/, 'Workflow ID must contain only letters, numbers, hyphens, and underscores')
11
+ .regex(/^([a-z0-9_-]+|[a-z][a-z0-9_-]+\.[a-z][a-z0-9_-]+)$/, 'Workflow ID must be a valid legacy ID (e.g. my-workflow) or namespaced ID (e.g. wr.discovery)')
12
12
  .describe('The unique identifier of the workflow to retrieve'),
13
13
  mode: zod_1.z
14
14
  .enum(['metadata', 'preview'])
@@ -18,7 +18,7 @@ exports.WorkflowGetInput = zod_1.z.object({
18
18
  exports.WorkflowNextInput = zod_1.z.object({
19
19
  workflowId: zod_1.z
20
20
  .string()
21
- .regex(/^[A-Za-z0-9_-]+$/, 'Workflow ID must contain only letters, numbers, hyphens, and underscores')
21
+ .regex(/^([a-z0-9_-]+|[a-z][a-z0-9_-]+\.[a-z][a-z0-9_-]+)$/, 'Workflow ID must be a valid legacy ID (e.g. my-workflow) or namespaced ID (e.g. wr.discovery)')
22
22
  .describe('The unique identifier of the workflow'),
23
23
  state: state_js_1.ExecutionStateSchema.describe('Serializable workflow execution state (authoritative). ' +
24
24
  'For the first call, use: { kind: "init" }. ' +
@@ -109,7 +109,7 @@ exports.WORKFLOW_TOOL_TITLES = {
109
109
  exports.CreateSessionInput = zod_1.z.object({
110
110
  workflowId: zod_1.z
111
111
  .string()
112
- .regex(/^[A-Za-z0-9_-]+$/, 'Workflow ID must contain only letters, numbers, hyphens, and underscores')
112
+ .regex(/^([a-z0-9_-]+|[a-z][a-z0-9_-]+\.[a-z][a-z0-9_-]+)$/, 'Workflow ID must be a valid legacy ID (e.g. my-workflow) or namespaced ID (e.g. wr.discovery)')
113
113
  .describe('Workflow identifier (e.g., "bug-investigation", "mr-review")'),
114
114
  sessionId: zod_1.z
115
115
  .string()
@@ -120,14 +120,14 @@ exports.CreateSessionInput = zod_1.z.object({
120
120
  .describe('Initial session data. Can include dashboard, phases, etc.'),
121
121
  });
122
122
  exports.UpdateSessionInput = zod_1.z.object({
123
- workflowId: zod_1.z.string().regex(/^[A-Za-z0-9_-]+$/, 'Workflow ID must contain only letters, numbers, hyphens, and underscores').describe('Workflow identifier'),
123
+ workflowId: zod_1.z.string().regex(/^([a-z0-9_-]+|[a-z][a-z0-9_-]+\.[a-z][a-z0-9_-]+)$/, 'Workflow ID must be a valid legacy ID (e.g. my-workflow) or namespaced ID (e.g. wr.discovery)').describe('Workflow identifier'),
124
124
  sessionId: zod_1.z.string().describe('Session identifier'),
125
125
  updates: zod_1.z
126
126
  .record(zod_1.z.unknown())
127
127
  .describe('Data to merge into session. Supports nested updates via dot notation.'),
128
128
  });
129
129
  exports.ReadSessionInput = zod_1.z.object({
130
- workflowId: zod_1.z.string().regex(/^[A-Za-z0-9_-]+$/, 'Workflow ID must contain only letters, numbers, hyphens, and underscores').describe('Workflow identifier'),
130
+ workflowId: zod_1.z.string().regex(/^([a-z0-9_-]+|[a-z][a-z0-9_-]+\.[a-z][a-z0-9_-]+)$/, 'Workflow ID must be a valid legacy ID (e.g. my-workflow) or namespaced ID (e.g. wr.discovery)').describe('Workflow identifier'),
131
131
  sessionId: zod_1.z.string().describe('Session identifier'),
132
132
  path: zod_1.z
133
133
  .string()
package/dist/mcp/v2/tools.js CHANGED
@@ -18,12 +18,12 @@ exports.V2ListWorkflowsInput = zod_1.z.object({
18
18
  workspacePath: workspacePathField.describe('Required. Absolute path to your current workspace directory (e.g. the "Workspace:" value from your system parameters). WorkRail uses this to resolve project-scoped workflow variants against the correct workspace for discovery-sensitive workflow listing. Shared MCP servers cannot infer this safely.'),
19
19
  });
20
20
  exports.V2InspectWorkflowInput = zod_1.z.object({
21
- workflowId: zod_1.z.string().min(1).regex(/^[A-Za-z0-9_-]+$/, 'Workflow ID must contain only letters, numbers, hyphens, and underscores').describe('The workflow ID to inspect'),
21
+ workflowId: zod_1.z.string().min(1).regex(/^([a-z0-9_-]+|[a-z][a-z0-9_-]+\.[a-z][a-z0-9_-]+)$/, 'Workflow ID must be a valid legacy ID (e.g. my-workflow) or namespaced ID (e.g. wr.discovery)').describe('The workflow ID to inspect'),
22
22
  mode: zod_1.z.enum(['metadata', 'preview']).default('preview').describe('Detail level: metadata (name and description only) or preview (full step-by-step breakdown, default)'),
23
23
  workspacePath: workspacePathField.describe('Required. Absolute path to your current workspace directory (e.g. the "Workspace:" value from your system parameters). WorkRail uses this to resolve the correct project-scoped workflow variant for discovery-sensitive workflow inspection. Shared MCP servers cannot infer this safely.'),
24
24
  });
25
25
  exports.V2StartWorkflowInput = zod_1.z.object({
26
- workflowId: zod_1.z.string().min(1).regex(/^[A-Za-z0-9_-]+$/, 'Workflow ID must contain only letters, numbers, hyphens, and underscores').describe('The workflow ID to start'),
26
+ workflowId: zod_1.z.string().min(1).regex(/^([a-z0-9_-]+|[a-z][a-z0-9_-]+\.[a-z][a-z0-9_-]+)$/, 'Workflow ID must be a valid legacy ID (e.g. my-workflow) or namespaced ID (e.g. wr.discovery)').describe('The workflow ID to start'),
27
27
  workspacePath: workspacePathField.describe('Required. Absolute path to your current workspace directory (e.g. the "Workspace:" value from your system parameters). WorkRail uses this to resolve the correct project-scoped workflow variant and to anchor the session to the correct repo for future resume_session discovery. Shared MCP servers cannot infer this safely.'),
28
28
  });
29
29
  exports.V2ContinueWorkflowInputShape = zod_1.z.object({
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@exaudeus/workrail",
3
- "version": "3.11.0",
3
+ "version": "3.11.2",
4
4
  "description": "Step-by-step workflow enforcement for AI agents via MCP",
5
5
  "license": "MIT",
6
6
  "repository": {
@@ -578,8 +578,8 @@
578
578
  "prompt.composition",
579
579
  "workflow.definition"
580
580
  ],
581
- "rule": "Use extension points when a workflow wants stable customization slots rather than hardcoding routine or binding references inline.",
582
- "why": "Extension points make customization explicit, inspectable, and project-overridable.",
581
+ "rule": "Use extension points when a workflow wants stable project-overridable delegation seams rather than hardcoding bound routine or workflow names inline.",
582
+ "why": "Extension points make delegated customization explicit, inspectable, and project-overridable without conflating rebinding with routine injection.",
583
583
  "enforcement": [
584
584
  "advisory"
585
585
  ],
@@ -602,11 +602,15 @@
602
602
  ],
603
603
  "checks": [
604
604
  "Declare extension points at the workflow level when bindings are part of the contract.",
605
- "Avoid hidden or undocumented binding slots in prompts."
605
+ "Avoid hidden or undocumented binding slots in prompts.",
606
+ "Prefer `templateCall` when the real goal is reusable inline routine structure, visible injected steps, or parent-step confirmation behavior.",
607
+ "Use extension points only when the seam is intentionally delegated and may need project-level rebinding."
606
608
  ],
607
609
  "antiPatterns": [
608
610
  "Hardcoding team-customizable routine names in prompt text without an extension-point declaration",
609
- "Using `{{wr.bindings.*}}` tokens in a workflow that declares no extension points"
611
+ "Using `{{wr.bindings.*}}` tokens in a workflow that declares no extension points",
612
+ "Using extension points where `templateCall` would better represent the parent workflow's real structure",
613
+ "Expecting `{{wr.bindings.*}}` to change which routine gets injected inline"
610
614
  ]
611
615
  }
612
616
  ]
@@ -682,6 +686,36 @@
682
686
  }
683
687
  ]
684
688
  },
689
+ {
690
+ "id": "references",
691
+ "title": "Workflow references",
692
+ "rules": [
693
+ {
694
+ "id": "references-are-for-runtime-companion-material",
695
+ "status": "active",
696
+ "level": "recommended",
697
+ "scope": [
698
+ "workflow.references",
699
+ "workflow.definition"
700
+ ],
701
+ "rule": "Declare references only for documents the running workflow may genuinely need while executing its task.",
702
+ "why": "References are surfaced to the agent at workflow start and become part of the workflow hash. Maintainer-only or authoring-only references add cognitive load and hash churn without improving runtime execution.",
703
+ "enforcement": [
704
+ "advisory"
705
+ ],
706
+ "checks": [
707
+ "Keep references that materially help the running workflow perform its task.",
708
+ "Prefer rubrics, target-system specs, policies, or playbooks that constrain runtime judgment.",
709
+ "If removing a reference would not make the running workflow materially worse at execution, remove it."
710
+ ],
711
+ "antiPatterns": [
712
+ "Adding workflow-schema references to ordinary execution workflows that are not authoring or validation workflows",
713
+ "Adding authoring-spec or provenance references to workflows whose runtime task is unrelated to workflow authoring",
714
+ "Using references to justify a workflow's design to maintainers instead of helping the running agent do the task"
715
+ ]
716
+ }
717
+ ]
718
+ },
685
719
  {
686
720
  "id": "response-supplements",
687
721
  "title": "Response supplements and delivery-owned guidance",
@@ -0,0 +1,43 @@
1
+ ## Production Readiness Audit Rubric
2
+
3
+ Use this rubric when running the bundled `production-readiness-audit` workflow.
4
+
5
+ ### Coverage domains
6
+
7
+ - Debugging and correctness
8
+ - Runtime readiness
9
+ - Technical debt and maintainability
10
+ - Philosophy and repo-pattern alignment
11
+ - Tests and observability
12
+ - Security and performance when the audited scope materially touches them
13
+
14
+ ### Finding classes
15
+
16
+ - **Confirmed**: supported by primary evidence such as code, tests, build output, runtime traces, or a directly checked artifact
17
+ - **Plausible**: directionally concerning, but not yet strong enough to drive the verdict alone
18
+ - **Rejected**: weakened or disproved by fuller context or direct evidence
19
+
20
+ ### Verdicts
21
+
22
+ - **ready**: no material blockers, no major unresolved gaps, and confidence is strong enough for the audited scope
23
+ - **ready_with_conditions**: broadly shippable, but bounded conditions or follow-up work still matter
24
+ - **not_ready**: blockers or major risks make shipping irresponsible right now
25
+ - **inconclusive**: the scope or evidence is too weak for a clean readiness call
26
+
27
+ ### Confidence bands
28
+
29
+ - **High**: coverage is materially adequate and serious claims are backed by primary evidence
30
+ - **Medium**: most important areas are covered, but some uncertainty or weaker proof remains
31
+ - **Low**: major gaps, contradictions, or thin evidence still cap the verdict
32
+
33
+ ### Severity discipline
34
+
35
+ - Do not upgrade a claim to blocker status just because multiple subagents agree
36
+ - Do not flatten real contradictions into a single confident story without adjudication
37
+ - Do not call a scope production-ready when a material coverage gap still weakens the verdict
38
+
39
+ ### Synthesis discipline
40
+
41
+ - Treat delegated output as evidence, not final truth
42
+ - Say what changed your mind, what you rejected, and why
43
+ - Keep the final handoff decision-focused rather than implementation-focused
@@ -0,0 +1,354 @@
1
+ {
2
+ "id": "production-readiness-audit",
3
+ "name": "Production Readiness Audit (v2 • Evidence-Driven Readiness Review)",
4
+ "version": "0.1.0",
5
+ "description": "Audit a bounded codebase scope for debugging risk, runtime readiness, stale or misleading implementation surfaces, technical debt, and anything else that would keep it from being honestly production-ready.",
6
+ "recommendedPreferences": {
7
+ "recommendedAutonomy": "guided",
8
+ "recommendedRiskPolicy": "conservative"
9
+ },
10
+ "features": [
11
+ "wr.features.subagent_guidance"
12
+ ],
13
+ "references": [
14
+ {
15
+ "id": "audit-rubric",
16
+ "title": "Production Readiness Audit Rubric",
17
+ "source": "./spec/production-readiness-audit-rubric.md",
18
+ "purpose": "Canonical coverage, evidence, confidence, and verdict rubric for this workflow.",
19
+ "authoritative": true,
20
+ "resolveFrom": "package"
21
+ }
22
+ ],
23
+ "preconditions": [
24
+ "The user provides a target scope or the agent can infer a bounded scope from the request.",
25
+ "The agent can inspect the code, surrounding context, and deterministic evidence needed to assess readiness honestly.",
26
+ "A human will consume the final verdict, findings, or remediation order."
27
+ ],
28
+ "metaGuidance": [
29
+ "DEFAULT BEHAVIOR: self-execute with tools. Ask only for true scope decisions, missing external artifacts, or permissions you cannot resolve yourself.",
30
+ "V2 DURABILITY: keep workflow truth in output.notesMarkdown and explicit context fields. Human-facing markdown artifacts are optional companions only.",
31
+ "OWNERSHIP: the main agent owns the fact packet, synthesis, severity calibration, verdict, and remediation order. Delegated work is evidence, not authority.",
32
+ "SUBAGENT DISCIPLINE: use a few explicit fan-out and fan-in checkpoints rather than scattered optional subagent calls.",
33
+ "READINESS MODEL: first understand and bound the scope, then state a readiness hypothesis, then freeze a neutral readiness fact packet, then let reviewer families challenge it in parallel, then reconcile contradictions explicitly.",
34
+ "COVERAGE LEDGER: track audit domains as `checked`, `uncertain`, `not_applicable`, `contradicted`, or `needs_followup`. Do not finalize with unresolved material gaps unless you name them clearly.",
35
+ "VERDICTS: allow `ready`, `ready_with_conditions`, `not_ready`, and `inconclusive`. Do not force a cleaner answer than the evidence supports.",
36
+ "BOUNDARY: this workflow audits and prioritizes. It must not drift into implementation planning or patch sequencing unless the user explicitly asks."
37
+ ],
38
+ "steps": [
39
+ {
40
+ "id": "phase-0-understand-and-classify",
41
+ "title": "Phase 0: Understand and Classify",
42
+ "promptBlocks": {
43
+ "goal": "Build the minimum complete understanding needed to audit readiness honestly.",
44
+ "constraints": [
45
+ [
46
+ { "kind": "ref", "refId": "wr.refs.notes_first_durability" }
47
+ ],
48
+ "Use tools first. Ask only for true scope or permission gaps you cannot resolve yourself.",
49
+ "Separate in-scope code from adjacent noise before you classify rigor or risk."
50
+ ],
51
+ "procedure": [
52
+ "Locate the real target surface, likely entry points, critical paths, public contracts, invariants, data or runtime surfaces, and affected consumers that matter.",
53
+ "Find the repo patterns and philosophy sources that should shape the audit, and state what production-ready should mean for this scope instead of assuming a generic bar.",
54
+ "Classify `scopeShape`, `riskLevel`, `rigorMode`, `criticalSurfaceTouched`, and `needsSimulation` after exploration, not before.",
55
+ "Run a context-clarity check with concrete scores for boundary clarity, production-bar clarity, philosophy clarity, and verification clarity.",
56
+ "If rigor and uncertainty justify it, spawn TWO WorkRail Executors in parallel running `routine-context-gathering` with complementary completeness/depth focus, then synthesize what changed."
57
+ ],
58
+ "outputRequired": {
59
+ "notesMarkdown": "Audit scope, production bar, classification, clarity scores, and what is still unknown.",
60
+ "context": "Capture scopeShape, riskLevel, rigorMode, contextSummary, candidateFiles, criticalPaths, productionBar, contextUnknownCount, criticalSurfaceTouched, needsSimulation, and openQuestions."
61
+ },
62
+ "verify": [
63
+ "The classification is driven by evidence, not vibes.",
64
+ "Open questions are real human-decision gaps only.",
65
+ "If scope is whole-codebase or risk is High, treat confirmation as a real review barrier."
66
+ ]
67
+ },
68
+ "requireConfirmation": {
69
+ "or": [
70
+ { "var": "scopeShape", "equals": "whole_codebase" },
71
+ { "var": "riskLevel", "equals": "High" }
72
+ ]
73
+ }
74
+ },
75
+ {
76
+ "id": "phase-1-state-readiness-hypothesis",
77
+ "title": "Phase 1: State Readiness Hypothesis",
78
+ "promptBlocks": {
79
+ "goal": "State your current readiness hypothesis before the reviewer families challenge it.",
80
+ "constraints": [
81
+ "Keep this short and falsifiable.",
82
+ "This is a reference point, not a position to defend."
83
+ ],
84
+ "procedure": [
85
+ "Write your current best guess about the likely readiness verdict direction.",
86
+ "Name the issue category or failure mode you are most worried about right now.",
87
+ "Say what would most likely make your current view wrong."
88
+ ],
89
+ "outputRequired": {
90
+ "notesMarkdown": "Current readiness hypothesis and the strongest reason it might be wrong.",
91
+ "context": "Capture readinessHypothesis."
92
+ },
93
+ "verify": [
94
+ "The hypothesis is concrete enough that later synthesis can say what changed your mind."
95
+ ]
96
+ },
97
+ "requireConfirmation": false
98
+ },
99
+ {
100
+ "id": "phase-2-freeze-fact-packet-and-select-reviewers",
101
+ "title": "Phase 2: Freeze Fact Packet and Select Reviewer Families",
102
+ "promptBlocks": {
103
+ "goal": "Freeze a neutral readiness fact packet and decide how much reviewer-family parallelism is warranted.",
104
+ "constraints": [
105
+ [
106
+ { "kind": "ref", "refId": "wr.refs.notes_first_durability" }
107
+ ],
108
+ "The fact packet is the primary truth for downstream reviewer families.",
109
+ "Keep `readinessHypothesis` as a hypothesis to challenge, not a frame to defend.",
110
+ "Keep any live audit artifact optional. Workflow truth lives in notes and context."
111
+ ],
112
+ "procedure": [
113
+ "Create a neutral `readinessFactPacket` containing: scope purpose and expected behavior, key entry points and runtime surfaces, critical invariants and failure costs, data and deployment assumptions, changed or risky seams, test/observability posture, repo patterns and philosophy constraints, and explicit open unknowns.",
114
+ "Include realism signals directly in the fact packet: likely dead code paths, fixture or fake-data dependence, placeholder behavior, stale comments or docs, and any seams that look misleadingly complete.",
115
+ "Initialize `coverageLedger` for these domains: `debugging_correctness`, `runtime_operability`, `artifact_realism`, `maintainability_debt`, `tests_observability`, `philosophy_patterns`, `security_performance`.",
116
+ "Perform a preliminary self-audit from the fact packet before choosing reviewer families.",
117
+ "Reviewer family options: `correctness_debugging`, `runtime_operability`, `artifact_realism`, `maintainability_debt`, `tests_observability`, `philosophy_patterns`, `security_performance`, `false_positive_skeptic`, `missed_issue_hunter`.",
118
+ "Selection guidance: QUICK = no bundle by default unless ambiguity still feels material; STANDARD = 4 or 5 families by default; THOROUGH = 6 or 7 families by default.",
119
+ "Always include `correctness_debugging`, `runtime_operability`, and `artifact_realism` unless clearly not applicable. Include `security_performance` when the scope touches auth, permissions, input trust boundaries, secrets, network surfaces, data exposure, resource intensity, latency-sensitive flows, or unbounded work. Include `tests_observability` in STANDARD and THOROUGH unless clearly not applicable. Include `philosophy_patterns` when the repo or user philosophy is strong enough to judge honestly. Include `missed_issue_hunter` in THOROUGH. Include `false_positive_skeptic` when blocker or major-grade findings already look plausible or severity inflation risk is non-trivial.",
120
+ "Set `needsReviewerBundle` explicitly. Set `coverageUncertainCount` as the number of coverage domains not yet safely closed: `uncertain` + `contradicted` + `needs_followup`. Initialize `contradictionCount`, `blindSpotCount`, and `falsePositiveRiskCount` to `0` if no reviewer-family bundle will run."
121
+ ],
122
+ "outputRequired": {
123
+ "notesMarkdown": "Neutral readiness fact packet, preliminary self-audit, selected reviewer families, and why the bundle is sized the way it is.",
124
+ "context": "Capture readinessFactPacket, coverageLedger, selectedReviewerFamilies, needsReviewerBundle, coverageUncertainCount, contradictionCount, blindSpotCount, falsePositiveRiskCount, needsSimulation."
125
+ },
126
+ "verify": [
127
+ "The fact packet is concrete enough that downstream reviewer families can use it without regathering broad context.",
128
+ "The workflow has a clear reason for whether `needsReviewerBundle` is true or false."
129
+ ]
130
+ },
131
+ "promptFragments": [
132
+ {
133
+ "id": "phase-2-quick",
134
+ "when": { "var": "rigorMode", "equals": "QUICK" },
135
+ "text": "Keep the fact packet compact. QUICK should not manufacture a giant ceremony layer."
136
+ },
137
+ {
138
+ "id": "phase-2-thorough",
139
+ "when": { "var": "rigorMode", "equals": "THOROUGH" },
140
+ "text": "For THOROUGH rigor, make the hidden-risk surfaces explicit: blind spots, fake confidence vectors, and production assumptions that would hurt if wrong."
141
+ }
142
+ ],
143
+ "requireConfirmation": false
144
+ },
145
+ {
146
+ "id": "phase-3-reviewer-family-bundle",
147
+ "title": "Phase 3: Parallel Reviewer Family Bundle",
148
+ "runCondition": {
149
+ "var": "needsReviewerBundle",
150
+ "equals": true
151
+ },
152
+ "promptBlocks": {
153
+ "goal": "Run the selected reviewer families in parallel from the same readiness fact packet, then synthesize their output as evidence rather than conclusions.",
154
+ "constraints": [
155
+ [
156
+ { "kind": "ref", "refId": "wr.refs.notes_first_durability" }
157
+ ],
158
+ [
159
+ { "kind": "ref", "refId": "wr.refs.synthesis_under_disagreement" }
160
+ ],
161
+ "Each reviewer family must use `readinessFactPacket` as primary truth.",
162
+ "Use `readinessHypothesis` only as comparison context.",
163
+ "Reviewer-family outputs are raw evidence, not canonical audit state."
164
+ ],
165
+ "procedure": [
166
+ "Before delegating, restate the current `readinessHypothesis` and say which reviewer family is most likely to challenge it.",
167
+ "Each reviewer family must return: top findings, strongest evidence, biggest uncertainty, likely false-confidence vector, and what would most likely falsify its current conclusion.",
168
+ "Family missions: `correctness_debugging` = logic defects, contradictory state, failure paths, unsafe assumptions, and strongest debugging leads; `runtime_operability` = production behavior, concurrency/state flow, deployment assumptions, resilience, rollback pain, and observability under failure; `artifact_realism` = stale code, dead seams, placeholder behavior, misleading comments/docs, fake-data dependence, and surfaces that look complete but are not; `maintainability_debt` = complexity, duplication, brittle seams, drift, and future-change cost; `tests_observability` = test adequacy, verification blind spots, logging/monitoring gaps, hidden failure modes, and rollout confidence; `philosophy_patterns` = architectural consistency, repo-pattern drift, and principle tension; `security_performance` = trust boundaries, auth/permission mistakes, secrets handling, unsafe inputs, data exposure, expensive paths, unbounded work, and performance cliffs likely to matter in production; `false_positive_skeptic` = challenge overreach, weak evidence, or severity inflation; `missed_issue_hunter` = search for an important issue family the others may miss.",
169
+ "Mode-adaptive parallelism: STANDARD = spawn FOUR WorkRail Executors simultaneously for the selected families; THOROUGH = spawn SIX WorkRail Executors simultaneously for the selected families.",
170
+ "After receiving outputs, explicitly synthesize: what reviewer families confirmed, what was genuinely new, what appeared weak or overreached, and what changed your mind or did not.",
171
+ "Build a compact `familyEvidenceLedger` for each selected family covering its strongest concern, strongest evidence, biggest uncertainty, and what could make it wrong."
172
+ ],
173
+ "outputRequired": {
174
+ "notesMarkdown": "Reviewer-family synthesis, contradictions, blind spots, false-positive challenges, and the family evidence ledger.",
175
+ "context": "Capture familyEvidenceLedger, familyFindingsSummary, contradictionCount, blindSpotCount, falsePositiveRiskCount, coverageUncertainCount, and needsSimulation."
176
+ },
177
+ "verify": [
178
+ "The same fact packet was used as primary truth across reviewer families.",
179
+ "Reviewer-family output is not treated as self-finalizing.",
180
+ "Contradictions, blind spots, and false-positive risks are reflected structurally in context."
181
+ ]
182
+ },
183
+ "requireConfirmation": false
184
+ },
185
+ {
186
+ "id": "phase-4-evidence-and-contradiction-loop",
187
+ "type": "loop",
188
+ "title": "Phase 4: Evidence and Contradiction Loop",
189
+ "loop": {
190
+ "type": "while",
191
+ "conditionSource": {
192
+ "kind": "artifact_contract",
193
+ "contractRef": "wr.contracts.loop_control",
194
+ "loopId": "readiness_synthesis_loop"
195
+ },
196
+ "maxIterations": 4
197
+ },
198
+ "body": [
199
+ {
200
+ "id": "phase-4a-targeted-follow-up",
201
+ "title": "Targeted Follow-Up Bundle",
202
+ "promptBlocks": {
203
+ "goal": "If contradictions, blind spots, or important coverage gaps remain, run only the smallest targeted follow-up needed.",
204
+ "constraints": [
205
+ [
206
+ { "kind": "ref", "refId": "wr.refs.parallelize_cognition_serialize_synthesis" }
207
+ ],
208
+ "Prefer one compact targeted bundle over repeated broad delegation moments.",
209
+ "Do not regather broad context unless a contradiction proves the original fact packet is insufficient.",
210
+ "Targeted follow-up output is evidence only and must still be synthesized by the main agent."
211
+ ],
212
+ "procedure": [
213
+ "Before delegating, state the current likely readiness verdict, the strongest unresolved concern, and what result would change your mind.",
214
+ "If `contradictionCount > 0`, run targeted challenge or validation aimed at the specific disagreement.",
215
+ "If `coverageUncertainCount > 0` or `blindSpotCount > 0`, run the smallest reviewer-family or context follow-up needed to close the gap.",
216
+ "If `needsSimulation = true`, include `routine-execution-simulation`.",
217
+ "If `falsePositiveRiskCount > 0`, include `routine-hypothesis-challenge`.",
218
+ "If philosophy tension is materially affecting severity or verdict quality, include `routine-philosophy-alignment`.",
219
+ "If no trigger fires, do not delegate this step."
220
+ ],
221
+ "outputRequired": {
222
+ "notesMarkdown": "What targeted follow-up ran, why it was needed, and what it resolved or failed to resolve."
223
+ },
224
+ "verify": [
225
+ "Only the smallest targeted bundle needed was run.",
226
+ "No broad context regather happened without an explicit contradiction-driven reason."
227
+ ]
228
+ },
229
+ "requireConfirmation": false
230
+ },
231
+ {
232
+ "id": "phase-4b-canonical-synthesis",
233
+ "title": "Canonical Synthesis and Coverage Update",
234
+ "promptBlocks": {
235
+ "goal": "Turn reviewer-family evidence and follow-up work into one canonical readiness state.",
236
+ "constraints": [
237
+ "If a blocker-grade or major-grade finding is still only plausible, say so plainly instead of silently upgrading it.",
238
+ "If a domain remains uncertain, carry that uncertainty into the final verdict."
239
+ ],
240
+ "procedure": [
241
+ "Revisit `readinessHypothesis`: say what the evidence confirmed, what it challenged, what changed your mind, what held firm, and what you explicitly reject.",
242
+ "Apply this decision table: if multiple reviewer families independently surface the same serious issue with compatible evidence, treat it as strongly supported; if severities disagree, default upward only when the lower-severity position lacks concrete counter-evidence; if one family says false positive and another says valid issue, explicitly adjudicate the disagreement in notes before finalization; if simulation reveals a new operational risk, add a new finding and re-evaluate verdict confidence.",
243
+ "Update the findings ledger and classify each material finding as Confirmed, Plausible, or Rejected.",
244
+ "Update the coverage ledger honestly: move a domain to `checked` only when evidence is materially adequate; keep it `uncertain` if disagreement or missing evidence still affects verdict quality; use `not_applicable` only when the scope truly does not engage that area; clear `contradicted` only when the contradiction is explicitly resolved.",
245
+ "Cap `finalConfidenceBand` downward when unresolved blind spots still cover materially risky space, and prefer `inconclusive` later if those blind spots remain decision-relevant without a cheap next check."
246
+ ],
247
+ "outputRequired": {
248
+ "notesMarkdown": "Canonical findings ledger update, readiness-hypothesis comparison, coverage update, and confidence update.",
249
+ "context": "Capture findingsLedger, confirmedFindingsCount, plausibleFindingsCount, rejectedFindingsCount, blockerCount, majorGapCount, coverageLedger, coverageUncertainCount, contradictionCount, blindSpotCount, falsePositiveRiskCount, finalConfidenceBand, needsEvidenceRefinement."
250
+ },
251
+ "verify": [
252
+ "Decision-driving findings are explicitly classified.",
253
+ "Coverage status matches the actual evidence quality."
254
+ ]
255
+ },
256
+ "requireConfirmation": false
257
+ },
258
+ {
259
+ "id": "phase-4c-loop-decision",
260
+ "title": "Synthesis Loop Decision",
261
+ "promptBlocks": {
262
+ "goal": "Decide whether the evidence-and-contradiction loop should continue.",
263
+ "constraints": [
264
+ "Use the trigger rules, not vibes."
265
+ ],
266
+ "procedure": [
267
+ "Continue if `contradictionCount > 0`.",
268
+ "Otherwise continue if `coverageUncertainCount > 0` and the uncertainty materially affects the verdict.",
269
+ "Otherwise continue if `falsePositiveRiskCount > 0` for a serious finding, or if `blindSpotCount > 0` for uncovered materially risky space.",
270
+ "Otherwise continue if `needsEvidenceRefinement = true`.",
271
+ "Otherwise stop."
272
+ ],
273
+ "outputRequired": {
274
+ "artifact": "Emit a `wr.loop_control` artifact for `readiness_synthesis_loop` with `decision` set to `continue` or `stop`."
275
+ },
276
+ "verify": [
277
+ "The output preserves the loop-control contract without forcing one decision in the example."
278
+ ]
279
+ },
280
+ "outputContract": {
281
+ "contractRef": "wr.contracts.loop_control"
282
+ },
283
+ "requireConfirmation": false
284
+ }
285
+ ]
286
+ },
287
+ {
288
+ "id": "phase-5-final-validation",
289
+ "title": "Phase 5: Final Validation",
290
+ "promptBlocks": {
291
+ "goal": "Stress-test the current readiness verdict before final handoff.",
292
+ "constraints": [
293
+ [
294
+ { "kind": "ref", "refId": "wr.refs.adversarial_challenge_rules" }
295
+ ],
296
+ [
297
+ { "kind": "ref", "refId": "wr.refs.synthesis_under_disagreement" }
298
+ ],
299
+ "Validation output is evidence to synthesize, not automatic authority."
300
+ ],
301
+ "procedure": [
302
+ "Run final validation if any of these are true: `criticalSurfaceTouched = true`, `needsSimulation = true`, `falsePositiveRiskCount > 0`, `blindSpotCount > 0`, `coverageUncertainCount > 0`, or `finalConfidenceBand != High`.",
303
+ "Before delegating, state: what is your current verdict, where are you least confident, and what finding would most likely change your mind now.",
304
+ "Set the current readiness verdict before running any validation: `ready`, `ready_with_conditions`, `not_ready`, or `inconclusive`.",
305
+ "Use `inconclusive` deliberately when material coverage uncertainty or unresolved blind spots remain and there is no bounded next check that would resolve them cheaply.",
306
+ "Mode-adaptive validation: QUICK = self-validate and optionally spawn ONE WorkRail Executor running `routine-hypothesis-challenge` if a serious uncertainty remains; STANDARD = if validation is required and delegation is available, spawn TWO WorkRail Executors simultaneously running `routine-hypothesis-challenge` and either `routine-execution-simulation` or `routine-final-verification`; THOROUGH = if validation is required and delegation is available, spawn up to THREE WorkRail Executors simultaneously running `routine-hypothesis-challenge`, `routine-execution-simulation` when needed, and `routine-final-verification`.",
307
+ "After receiving validator output, explicitly synthesize what was confirmed, what was new, what appears weak, and whether your verdict changed.",
308
+ "State explicitly whether the verdict is being limited by unresolved contradictions, unresolved false-positive risk, blind spots, or coverage uncertainty."
309
+ ],
310
+ "outputRequired": {
311
+ "notesMarkdown": "Validation synthesis, verdict stress test, and any conditions the verdict still depends on.",
312
+ "context": "Capture finalVerdict, finalConfidenceBand, validationSummary, followUpCount, and verdictConditions."
313
+ },
314
+ "verify": [
315
+ "If multiple validators still raise serious concerns, confidence is downgraded and synthesis is reopened.",
316
+ "If exactly one validator raises a concern, it is adjudicated before finalization.",
317
+ "If no validator can materially break the current verdict and the evidence is internally consistent, proceed to handoff."
318
+ ]
319
+ },
320
+ "requireConfirmation": {
321
+ "or": [
322
+ { "var": "finalConfidenceBand", "equals": "Low" },
323
+ { "var": "finalVerdict", "equals": "inconclusive" }
324
+ ]
325
+ }
326
+ },
327
+ {
328
+ "id": "phase-6-final-handoff",
329
+ "title": "Phase 6: Final Handoff",
330
+ "promptBlocks": {
331
+ "goal": "Deliver the final production-readiness handoff for a human decision-maker.",
332
+ "constraints": [
333
+ "This workflow informs a decision. It does not approve a release or make code changes by itself.",
334
+ "Do not drift into implementation planning, patch sequencing, or PR execution unless the user explicitly asks."
335
+ ],
336
+ "procedure": [
337
+ "Summarize the target scope, audit intent, final verdict, and confidence band.",
338
+ "List blocker-grade findings, major gaps, strongest remaining uncertainties, top confirmed findings, and plausible but unresolved findings that still matter.",
339
+ "Call out the strongest debugging leads, runtime or operational risks, artifact-realism concerns such as stale code or fake completeness, and the most important technical-debt themes.",
340
+ "Summarize the coverage ledger, especially any domains still uncertain or needing follow-up.",
341
+ "Give a remediation order and verification or monitoring follow-ups. Mention human-facing companion artifacts only if you actually created them."
342
+ ],
343
+ "outputRequired": {
344
+ "notesMarkdown": "Decision-ready final handoff covering verdict, confidence, findings, coverage gaps, and recommended remediation order."
345
+ },
346
+ "verify": [
347
+ "The handoff is verdict-first and evidence-aware.",
348
+ "Open uncertainty is disclosed rather than hidden."
349
+ ]
350
+ },
351
+ "requireConfirmation": false
352
+ }
353
+ ]
354
+ }