npm - @exaudeus/workrail - Versions diffs - 3.9.2 → 3.11.0 - Mend

@exaudeus/workrail 3.9.2 → 3.11.0

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (64) hide show

package/spec/workflow.schema.json CHANGED Viewed

@@ -135,6 +135,14 @@
       },
       "uniqueItems": true
     },
+    "assessments": {
+      "type": "array",
+      "description": "Workflow-declared assessment definitions. Steps reference these by ID via assessmentRefs instead of embedding assessment structure inline repeatedly.",
+      "items": {
+        "$ref": "#/$defs/assessmentDefinition"
+      },
+      "uniqueItems": true
+    },
     "extensionPoints": {
       "type": "array",
       "description": "Bounded cognitive slots that users can customize via .workrail/bindings.json. Each slot is referenced in step prompts via {{wr.bindings.slotId}} and resolved at compile time. Use extension points for bounded cognitive units, not for core orchestration or final synthesis ownership.",
@@ -293,6 +301,21 @@
         "runCondition": { "$ref": "#/$defs/condition" },
         "validationCriteria": { "oneOf": [ { "type": "array", "items": { "$ref": "#/$defs/validationRule" } }, { "$ref": "#/$defs/validationComposition" } ] },
         "outputContract": { "$ref": "#/$defs/outputContract" },
+        "assessmentRefs": {
+          "type": "array",
+          "description": "References to workflow-level assessment definitions expected for this step. V1 supports exactly one assessmentRef per step.",
+          "items": { "type": "string", "minLength": 1, "maxLength": 64 },
+          "minItems": 1,
+          "maxItems": 1,
+          "uniqueItems": true
+        },
+        "assessmentConsequences": {
+          "type": "array",
+          "description": "Step-local assessment consequence declarations. V1 supports at most one exact-match follow-up consequence.",
+          "items": { "$ref": "#/$defs/assessmentConsequence" },
+          "minItems": 1,
+          "maxItems": 1
+        },
         "notesOptional": { "type": "boolean", "description": "When true, output.notesMarkdown is not required for this step. Steps with outputContract are automatically exempt. Use sparingly for mechanical steps with no substantive work to document." },
         "functionDefinitions": { "type": "array", "items": { "$ref": "#/$defs/functionDefinition" } },
         "functionCalls": { "type": "array", "items": { "$ref": "#/$defs/functionCall" } },
@@ -727,6 +750,93 @@
       "required": ["contractRef"],
       "additionalProperties": false
     },
+    "assessmentDimension": {
+      "type": "object",
+      "description": "A bounded dimension within a workflow-declared assessment definition.",
+      "properties": {
+        "id": {
+          "type": "string",
+          "minLength": 1,
+          "maxLength": 64
+        },
+        "purpose": {
+          "type": "string",
+          "minLength": 1,
+          "maxLength": 256
+        },
+        "levels": {
+          "type": "array",
+          "items": {
+            "type": "string",
+            "minLength": 1,
+            "maxLength": 64
+          },
+          "minItems": 2,
+          "uniqueItems": true
+        },
+        "required": {
+          "type": "boolean",
+          "default": true
+        }
+      },
+      "required": ["id", "purpose", "levels"],
+      "additionalProperties": false
+    },
+    "assessmentDefinition": {
+      "type": "object",
+      "description": "Workflow-declared assessment definition referenced by steps.",
+      "properties": {
+        "id": {
+          "type": "string",
+          "minLength": 1,
+          "maxLength": 64
+        },
+        "purpose": {
+          "type": "string",
+          "minLength": 1,
+          "maxLength": 256
+        },
+        "dimensions": {
+          "type": "array",
+          "items": {
+            "$ref": "#/$defs/assessmentDimension"
+          },
+          "minItems": 1
+        }
+      },
+      "required": ["id", "purpose", "dimensions"],
+      "additionalProperties": false
+    },
+    "assessmentConsequenceTrigger": {
+      "type": "object",
+      "description": "Exact-match trigger on one declared assessment dimension and one declared canonical level.",
+      "properties": {
+        "dimensionId": { "type": "string", "minLength": 1, "maxLength": 64 },
+        "equalsLevel": { "type": "string", "minLength": 1, "maxLength": 64 }
+      },
+      "required": ["dimensionId", "equalsLevel"],
+      "additionalProperties": false
+    },
+    "assessmentConsequenceEffect": {
+      "type": "object",
+      "description": "V1 assessment consequence effect. Keeps the same step pending and requires follow-up before retry.",
+      "properties": {
+        "kind": { "const": "require_followup" },
+        "guidance": { "type": "string", "minLength": 1, "maxLength": 512 }
+      },
+      "required": ["kind", "guidance"],
+      "additionalProperties": false
+    },
+    "assessmentConsequence": {
+      "type": "object",
+      "description": "Step-local assessment consequence declaration. V1 supports one exact-match follow-up consequence only.",
+      "properties": {
+        "when": { "$ref": "#/$defs/assessmentConsequenceTrigger" },
+        "effect": { "$ref": "#/$defs/assessmentConsequenceEffect" }
+      },
+      "required": ["when", "effect"],
+      "additionalProperties": false
+    },
     "functionCall": {
       "type": "object",
       "properties": {

package/workflows/bug-investigation.agentic.v2.json CHANGED Viewed

@@ -31,6 +31,19 @@
     "TRIGGERS: WorkRail can only react to explicit outputs. Use structural fields like `contextUnknownCount`, `hypothesesConsideredCount`, `hasStrongAlternative`, `contradictionCount`, `unresolvedEvidenceGapCount`, and `diagnosisConfidenceBand`.",
     "BOUNDARY: this workflow investigates and proves root cause. It may describe high-level fix direction and likely files, but must not create implementation plans, patch sequencing, PR plans, or code-writing momentum."
   ],
+  "assessments": [
+    {
+      "id": "diagnosis_readiness_gate",
+      "purpose": "Assess whether the diagnosis is ready to hand off after validation.",
+      "dimensions": [
+        {
+          "id": "confidence",
+          "purpose": "How confident the agent is that the diagnosis is ready for final handoff.",
+          "levels": ["low", "high"]
+        }
+      ]
+    }
+  ],
   "steps": [
     {
       "id": "phase-0-triage-and-intake",
@@ -116,7 +129,20 @@
     {
       "id": "phase-5-diagnosis-validation",
       "title": "Phase 5: Diagnosis Validation Bundle",
-      "prompt": "Stress-test the current diagnosis before handoff.\n\nSet `diagnosisConfidenceBand` using these rules:\n- High = all symptoms explained, no material contradictions, no unresolved evidence gaps\n- Medium = likely diagnosis, but one bounded uncertainty remains\n- Low = multiple viable explanations remain or contradictions are unresolved\n\nMode-adaptive validation:\n- QUICK: self-challenge; if `diagnosisConfidenceBand != High` or contradictions remain, optionally spawn ONE WorkRail Executor running `routine-hypothesis-challenge`\n- STANDARD: if delegation is available, spawn TWO WorkRail Executors SIMULTANEOUSLY running `routine-hypothesis-challenge` and `routine-execution-simulation`\n- THOROUGH: if delegation is available, spawn THREE WorkRail Executors SIMULTANEOUSLY running `routine-hypothesis-challenge`, `routine-execution-simulation`, and an additional `routine-hypothesis-challenge` pass focused on breaking the current diagnosis from a different angle\n\nParallel-output synthesis rules:\n- if 2+ validators raise serious concerns, reopen evidence or shortlist work\n- if exactly one validator raises a concern, investigate it before escalating\n- if no validator can materially break the diagnosis and `contradictionCount = 0`, proceed to handoff\n\nSet context variables:\n- `diagnosisConfidenceBand`\n- `validationFindingsCountBySeverity`\n- `validationSummary`\n\nBoundary rule:\n- allowed: high-level fix direction, likely files involved, verification recommendations\n- not allowed: implementation plan, patch sequencing, PR plan, or code-writing momentum",
+      "prompt": "Stress-test the current diagnosis before handoff.\n\nSet `diagnosisConfidenceBand` using these rules:\n- High = all symptoms explained, no material contradictions, no unresolved evidence gaps\n- Medium = likely diagnosis, but one bounded uncertainty remains\n- Low = multiple viable explanations remain or contradictions are unresolved\n\nMode-adaptive validation:\n- QUICK: self-challenge; if `diagnosisConfidenceBand != High` or contradictions remain, optionally spawn ONE WorkRail Executor running `routine-hypothesis-challenge`\n- STANDARD: if delegation is available, spawn TWO WorkRail Executors SIMULTANEOUSLY running `routine-hypothesis-challenge` and `routine-execution-simulation`\n- THOROUGH: if delegation is available, spawn THREE WorkRail Executors SIMULTANEOUSLY running `routine-hypothesis-challenge`, `routine-execution-simulation`, and an additional `routine-hypothesis-challenge` pass focused on breaking the current diagnosis from a different angle\n\nParallel-output synthesis rules:\n- if 2+ validators raise serious concerns, reopen evidence or shortlist work\n- if exactly one validator raises a concern, investigate it before escalating\n- if no validator can materially break the diagnosis and `contradictionCount = 0`, proceed to handoff\n\nAfter synthesizing the validation result, assess whether the diagnosis is ready for final handoff.\n\nSet context variables:\n- `diagnosisConfidenceBand`\n- `validationFindingsCountBySeverity`\n- `validationSummary`\n\nBoundary rule:\n- allowed: high-level fix direction, likely files involved, verification recommendations\n- not allowed: implementation plan, patch sequencing, PR plan, or code-writing momentum",
+      "assessmentRefs": ["diagnosis_readiness_gate"],
+      "assessmentConsequences": [
+        {
+          "when": {
+            "dimensionId": "confidence",
+            "equalsLevel": "low"
+          },
+          "effect": {
+            "kind": "require_followup",
+            "guidance": "Resolve the remaining diagnosis uncertainty, tighten the evidence summary, and retry this validation step before handing off."
+          }
+        }
+      ],
       "requireConfirmation": {
         "or": [
           { "var": "diagnosisConfidenceBand", "equals": "Low" },
@@ -131,4 +157,4 @@
       "requireConfirmation": true
     }
   ]
-}
+}

package/workflows/mr-review-workflow.agentic.v2.json CHANGED Viewed

@@ -19,19 +19,22 @@
     "DEFAULT BEHAVIOR: self-execute with tools. Only ask for missing external artifacts, permissions, or business context you cannot resolve yourself.",
     "V2 DURABILITY: use output.notesMarkdown and explicit `continue_workflow` context keys as durable workflow state. Do NOT rely on the review document as required workflow memory.",
     "ARTIFACT STRATEGY: `reviewDocPath` is an optional human-facing artifact only. Create or update it only when it materially improves handoff or readability. Workflow truth lives in notes and explicit context fields.",
+    "NOTES QUALITY: notes should work for both a human reader and a future agent resuming later. For important phases, make clear what was learned, what was decided, what remains uncertain, and what should happen next.",
     "OWNERSHIP & DELEGATION: the main agent owns truth, synthesis, severity calibration, recommendation, and final handoff. Delegate only bounded reviewer or validation work through the WorkRail Executor.",
     "SUBAGENT SYNTHESIS: treat reviewer-family and validator output as evidence, not conclusions. State your current hypothesis before delegation, then say what was confirmed, what was new, what you reject, and what changed your mind.",
     "PARALLELISM: parallelize independent cognition; serialize canonical synthesis, coverage-ledger updates, recommendation decisions, and document writes.",
     "REVIEW MODEL: first build shared understanding, then freeze a neutral fact packet, then let reviewer families challenge it in parallel, then reconcile contradictions explicitly.",
     "COVERAGE LEDGER: explicitly track review domains as `checked`, `uncertain`, `not_applicable`, `contradicted`, or `needs_followup`. Do not finalize with unresolved material gaps unless you name them clearly.",
     "TRIGGERS: WorkRail can only react to explicit fields. Use structural fields such as `contextUnknownCount`, `criticalSurfaceTouched`, `coverageUncertainCount`, `contradictionCount`, `falsePositiveRiskCount`, `blindSpotCount`, and `needsSimulation`.",
+    "BOUNDARY DISCIPLINE: attempt to determine the real review target and the likely ancestor-relative review surface. If that confidence remains weak, continue with downgraded confidence and disclose the limitation clearly instead of pretending certainty.",
+    "SOURCE DISCOVERY: opportunistically recover PR context, ticket/docs context, and repo/user policy context from the strongest available sources. Missing sources should usually lower confidence and be disclosed, not block the workflow.",
     "BOUNDARY: do not post comments, approve, reject, or merge unless the user explicitly asks. Produce findings, recommendation, and handoff material only."
   ],
   "steps": [
     {
       "id": "phase-0-understand-and-classify",
-      "title": "Phase 0: Understand & Classify",
-      "prompt": "Build understanding and classify the review in one pass.\n\nStep 1 — Early exit / minimum inputs:\nBefore exploring, verify that the review target is real and inspectable. If the diff, changed files, or equivalent review material are completely absent and cannot be inferred with tools, ask for the minimum missing artifact and stop. Do NOT ask questions you can resolve with tools.\n\nStep 2 — Explore:\nUse tools to build the minimum complete understanding needed to review accurately. Read independent files in parallel when possible.\n\nGather:\n- MR title and purpose, if discoverable\n- ticket or acceptance-criteria context when available\n- changed files overview and changed-file count\n- module roots, call chain highlights, public contracts, impacted consumers, and repo patterns that matter\n- explicit unknowns, likely blind spots, and whether author intent remains unclear\n- whether any critical surface is touched\n\nStep 3 — Classify after exploration:\nSet:\n- `reviewMode`: QUICK / STANDARD / THOROUGH\n- `riskLevel`: Low / Medium / High\n- `maxParallelism`: 0 / 3 / 5\n- `criticalSurfaceTouched`: true / false\n- `needsSimulation`: true / false\n\nDecision guidance:\n- QUICK: very small, isolated, low-risk changes with little ambiguity\n- STANDARD: typical feature or bug-fix reviews with moderate ambiguity or moderate risk\n- THOROUGH: critical surfaces, architectural novelty, high risk, broad change sets, or strong need for independent reviewer perspectives\n\nStep 4 — Optional deeper context:\nIf `reviewMode` is STANDARD or THOROUGH and understanding still feels incomplete, and delegation is available, spawn TWO WorkRail Executors SIMULTANEOUSLY running `routine-context-gathering` with focus=COMPLETENESS and focus=DEPTH. Synthesize both outputs before finishing this step.\n\nStep 5 — Human-facing artifact:\nChoose `reviewDocPath` only if a live artifact will materially improve human readability. 
Default suggestion: `mr-review.md` at the project root. This artifact is optional and never canonical workflow state.\n\nSet these keys in the next `continue_workflow` call's `context` object:\n- `mrTitle`\n- `mrPurpose`\n- `ticketContext`\n- `focusAreas`\n- `changedFileCount`\n- `criticalSurfaceTouched`\n- `reviewMode`\n- `riskLevel`\n- `maxParallelism`\n- `reviewDocPath`\n- `contextSummary`\n- `candidateFiles`\n- `moduleRoots`\n- `contextUnknownCount`\n- `coverageGapCount`\n- `authorIntentUnclear`\n- `needsSimulation`\n- `openQuestions`\n\nRules:\n- answer your own questions with tools whenever possible\n- only keep true human-decision questions in `openQuestions`\n- keep `openQuestions` bounded to the minimum necessary\n- if the review target is missing entirely, ask only for that missing artifact\n- classify AFTER exploring, not before",
+      "title": "Phase 0: Locate, Bound, Enrich & Classify",
+      "prompt": "Build the review foundation in one pass.\n\nStep 1 — Early exit / minimum inputs:\nBefore exploring, verify that the review target is real and inspectable. If the diff, changed files, or equivalent review material are completely absent and cannot be inferred with tools, ask for the minimum missing artifact and stop. Do NOT ask questions you can resolve with tools.\n\nStep 2 — Locate and bound the review target:\nAttempt to determine the strongest available review target and boundary.\n\nAttempt to establish:\n- `reviewTargetKind` from the strongest available source such as PR/MR, branch, patch, diff, or local working tree changes\n- `reviewTargetSource` describing where the target came from\n- likely PR/MR identity when available (`prUrl`, `prNumber`)\n- likely base / ancestor reference (`baseCandidate`, `mergeBaseRef`) when available\n- whether the branch may include inherited or out-of-scope changes\n- `boundaryConfidence`: High / Medium / Low\n\nDo not over-prescribe your own investigation path. Use the strongest available evidence and record uncertainty honestly.\n\nStep 3 — Enrich with context:\nRecover the strongest available intent and policy context from whatever sources are actually available.\n\nAttempt to recover:\n- MR title and purpose\n- ticket / issue / acceptance context (`ticketRefs`, `ticketContext`)\n- supporting docs / specs / rollout context (`supportingDocsFound`)\n- repo or user policy/convention context when it is likely to affect review judgment (`policySourcesFound`)\n- `contextConfidence`: High / Medium / Low\n\nStep 4 — Review-surface hygiene:\nClassify the visible change into a minimal review surface.\n\nSet:\n- `coreReviewSurface`\n- `likelyNoiseOrMechanicalChurn`\n- `likelyInheritedOrOutOfScopeChanges`\n- `reviewSurfaceSummary`\n- `reviewScopeWarnings`\n\nThe goal is not a giant ledger. 
The goal is to avoid treating every visible changed file as equally worthy of deep review by default.\n\nStep 5 — Classify the review:\nAfter exploration, classify the work.\n\nSet:\n- `reviewMode`: QUICK / STANDARD / THOROUGH\n- `riskLevel`: Low / Medium / High\n- `shapeProfile`: choose the best primary label from `isolated_change`, `crosscutting_change`, `mechanically_noisy_change`, or `ambiguous_boundary`\n- `changeTypeProfile`: choose the best primary label from `general_code_change`, `api_contract_change`, `data_model_or_migration`, `security_sensitive`, or `test_only`\n- `maxParallelism`: 0 / 3 / 5\n- `criticalSurfaceTouched`: true / false\n- `needsSimulation`: true / false\n- `needsBoundaryFollowup`: true / false\n- `needsContextFollowup`: true / false\n- `needsReviewerBundle`: true / false\n\nDecision guidance:\n- QUICK: very small, isolated, low-risk changes with little ambiguity\n- STANDARD: typical feature or bug-fix reviews with moderate ambiguity or moderate risk\n- THOROUGH: critical surfaces, architectural novelty, high risk, broad change sets, or strong need for independent reviewer perspectives\n\nMinimal routing guidance:\n- if `boundaryConfidence = Low`, bias toward boundary/context follow-up before strong recommendation confidence\n- if `changeTypeProfile = api_contract_change`, bias toward contract/consumer/backward-compatibility scrutiny\n- if `changeTypeProfile = data_model_or_migration`, bias toward rollout / compatibility / simulation scrutiny\n- if `changeTypeProfile = security_sensitive`, bias toward adversarial/runtime-risk scrutiny and lower tolerance for weak evidence\n- if `changeTypeProfile = test_only`, bias toward stronger false-positive suppression\n- if `shapeProfile = mechanically_noisy_change`, bias toward stronger noise filtering and lower appetite for style-only findings\n\nStep 6 — Optional deeper context:\nIf `reviewMode` is STANDARD or THOROUGH and context remains incomplete, and delegation is available, spawn TWO WorkRail 
Executors SIMULTANEOUSLY running `routine-context-gathering` with focus=COMPLETENESS and focus=DEPTH. Synthesize both outputs before finishing this step.\n\nStep 7 — Human-facing artifact:\nChoose `reviewDocPath` only if a live artifact will materially improve human readability. Default suggestion: `mr-review.md` at the project root. This artifact is optional and never canonical workflow state.\n\nFallback behavior:\n- if PR/MR is not found but a branch/diff is inspectable, continue with downgraded context confidence and disclose missing PR context later\n- if the branch is inspectable but merge-base / ancestor remains ambiguous, continue with downgraded boundary confidence, set `needsBoundaryFollowup = true`, and disclose the uncertainty later\n- if ticket or supporting docs are missing, continue with downgraded context confidence and avoid overclaiming intent-sensitive findings\n- if only a patch/diff is available, continue if it is inspectable, but keep lower confidence on intent/boundary-dependent conclusions\n- if the review target itself is missing, ask only for that missing artifact and stop\n\nSet these keys in the next `continue_workflow` call's `context` object:\n- `reviewTargetKind`\n- `reviewTargetSource`\n- `prUrl`\n- `prNumber`\n- `baseCandidate`\n- `mergeBaseRef`\n- `boundaryConfidence`\n- `contextConfidence`\n- `mrTitle`\n- `mrPurpose`\n- `ticketRefs`\n- `ticketContext`\n- `supportingDocsFound`\n- `policySourcesFound`\n- `accessibleContextSources`\n- `missingContextSources`\n- `focusAreas`\n- `changedFileCount`\n- `criticalSurfaceTouched`\n- `reviewMode`\n- `riskLevel`\n- `shapeProfile`\n- `changeTypeProfile`\n- `maxParallelism`\n- `reviewDocPath`\n- `contextSummary`\n- `candidateFiles`\n- `moduleRoots`\n- `contextUnknownCount`\n- `coverageGapCount`\n- `authorIntentUnclear`\n- `needsSimulation`\n- `needsBoundaryFollowup`\n- `needsContextFollowup`\n- `needsReviewerBundle`\n- `coreReviewSurface`\n- `likelyNoiseOrMechanicalChurn`\n- 
`likelyInheritedOrOutOfScopeChanges`\n- `reviewSurfaceSummary`\n- `reviewScopeWarnings`\n- `openQuestions`\n\nRules:\n- answer your own questions with tools whenever possible\n- only keep true human-decision questions in `openQuestions`\n- keep `openQuestions` bounded to the minimum necessary\n- classify AFTER exploring, not before\n- before leaving this phase, either establish the likely review boundary or explicitly record why you could not",
       "requireConfirmation": {
         "or": [
           { "var": "reviewMode", "equals": "THOROUGH" },
@@ -58,12 +61,13 @@
           "Keep `recommendationHypothesis` as a secondary hypothesis to challenge, not a frame to defend."
         ],
         "procedure": [
-          "Create a neutral `reviewFactPacket` containing: MR purpose and expected behavior change, changed files and module roots, key contracts / invariants / affected consumers, call-chain highlights, relevant repo patterns and exemplars, tests/docs expectations, and explicit open unknowns.",
+          "Create a neutral `reviewFactPacket` containing: MR purpose and expected behavior change, review target and review-surface summary, changed files and module roots, key contracts / invariants / affected consumers, call-chain highlights, relevant repo patterns and exemplars, tests/docs expectations, discovered ticket/doc/policy context, accessible and missing context sources, and explicit open unknowns.",
           "Initialize `coverageLedger` for these domains: `correctness_logic`, `contracts_invariants`, `patterns_architecture`, `runtime_production_risk`, `tests_docs_rollout`, `security_performance`.",
           "Perform a preliminary self-review from the fact packet before choosing reviewer families.",
           "Reviewer family options: `correctness_invariants`, `patterns_architecture`, `runtime_production_risk`, `test_docs_rollout`, `false_positive_skeptic`, `missed_issue_hunter`.",
           "Selection guidance: QUICK = no bundle by default unless ambiguity still feels material; STANDARD = 3 families by default; THOROUGH = 5 families by default.",
           "Always include `correctness_invariants` unless clearly not applicable. Include `test_docs_rollout` in STANDARD and THOROUGH unless clearly not applicable. Include `runtime_production_risk` when `criticalSurfaceTouched = true` or `needsSimulation = true`. Include `missed_issue_hunter` in THOROUGH. Include `false_positive_skeptic` when Major/Critical findings seem plausible or severity inflation risk is non-trivial.",
+          "Routing guidance: for `api_contract_change`, bias toward contract / consumer / backward-compatibility scrutiny; for `data_model_or_migration`, bias toward rollout / compatibility / simulation scrutiny; for `security_sensitive`, bias toward runtime-risk scrutiny and lower tolerance for weak evidence; for `test_only`, bias toward stronger false-positive suppression; for `mechanically_noisy_change`, bias toward stronger noise filtering and lower appetite for style-only findings.",
           "Set `coverageUncertainCount` as the number of coverage domains not yet safely closed: `uncertain` + `contradicted` + `needs_followup`.",
           "Initialize `contradictionCount`, `blindSpotCount`, and `falsePositiveRiskCount` to `0` if no reviewer-family bundle will run."
         ],
@@ -191,8 +195,9 @@
           "Before delegating, state: what is your current recommendation, where are you least confident, and what finding would most likely change your mind now?",
           "Mode-adaptive validation: QUICK = self-validate and optionally spawn ONE WorkRail Executor running `routine-hypothesis-challenge` if a serious uncertainty remains; STANDARD = if validation is required and delegation is available, spawn TWO WorkRail Executors SIMULTANEOUSLY running `routine-hypothesis-challenge` and either `routine-execution-simulation` or `routine-plan-analysis`; THOROUGH = if validation is required and delegation is available, spawn THREE WorkRail Executors SIMULTANEOUSLY running `routine-hypothesis-challenge`, `routine-execution-simulation` when needed, and `routine-plan-analysis`.",
           "After receiving validator output, explicitly synthesize what was confirmed, what was new, what appears weak, and whether your recommendation changed.",
+          "Perform a compact confidence assessment using these dimensions: `boundaryConfidence`, `intentConfidence`, `evidenceConfidence`, `coverageConfidence`, and `consensusConfidence`. Rate each as High / Medium / Low, explain each in one sentence, and then derive final recommendation confidence with these rules: if boundary is Low, final confidence is Low; else if evidence is Low, final confidence is Low; else if 2 or more dimensions are Medium, final confidence is Medium; else if all key dimensions are High, final confidence is High. Unresolved disagreement can only lower confidence, never raise it.",
           "Compute `docCompletenessConcernCount` by counting one concern for each material packaging gap: missing rationale for any Critical or Major finding, missing ready-to-post MR comment for any Critical or Major finding, recommendation mismatch with canonical findings, still-uncertain / contradicted / needs-followup coverage domains not summarized clearly, or any missing required final section needed for actionability.",
-          "Set these keys in the next `continue_workflow` call's `context` object: `validatorConsensusLevel`, `validationSummary`, `recommendationConfidenceBand`, `docCompletenessConcernCount`."
+          "Set these keys in the next `continue_workflow` call's `context` object: `intentConfidence`, `evidenceConfidence`, `coverageConfidence`, `consensusConfidence`, `confidenceAssessmentSummary`, `validatorConsensusLevel`, `validationSummary`, `recommendationConfidenceBand`, `docCompletenessConcernCount`."
         ],
         "verify": [
           "If 2+ validators still raise serious concerns, confidence is downgraded and synthesis is reopened.",
@@ -210,7 +215,7 @@
     {
       "id": "phase-6-final-handoff",
       "title": "Phase 6: Final Handoff",
-      "prompt": "Provide the final MR review handoff.\n\nInclude:\n- MR title and purpose\n- review mode used\n- final recommendation and confidence band\n- counts of Critical / Major / Minor / Nit findings\n- top findings with rationale\n- strongest remaining areas of uncertainty, if any\n- summary of the coverage ledger, especially any still-uncertain domains\n- ready-to-post MR comments summary\n- any validation outcomes a human reviewer should see\n- path to the full human-facing review artifact (`reviewDocPath`) only if one was created\n\nRules:\n- the final recommendation assists a human reviewer; it does not replace them\n- if `reviewDocPath` exists, treat it as a human-facing companion artifact only\n- do not post comments, approve, reject, or merge unless the user explicitly asks",
+      "prompt": "Provide the final MR review handoff.\n\nInclude:\n- MR title and purpose\n- review mode used\n- final recommendation and confidence band\n- confidence assessment summary, including the most important reason confidence was capped if it was not High\n- counts of Critical / Major / Minor / Nit findings\n- top findings with rationale\n- strongest remaining areas of uncertainty, if any\n- summary of the coverage ledger, especially any still-uncertain domains\n- ready-to-post MR comments summary\n- any validation outcomes a human reviewer should see\n- review environment status:\n  - what review target/context sources were successfully used\n  - what important sources were missing or ambiguous\n  - boundary confidence and context confidence\n  - how those limits affected the review\n- path to the full human-facing review artifact (`reviewDocPath`) only if one was created\n\nRules:\n- the final recommendation assists a human reviewer; it does not replace them\n- if `reviewDocPath` exists, treat it as a human-facing companion artifact only\n- be explicit when missing PR/ticket/doc/boundary context limited confidence\n- do not post comments, approve, reject, or merge unless the user explicitly asks",
       "requireConfirmation": true
     }
   ]

package/workflows/test-artifact-loop-control.json CHANGED Viewed

@@ -53,8 +53,34 @@
     {
       "id": "complete",
       "title": "Complete",
-      "prompt": "The iteration loop has completed. Summarize what was accomplished.",
-      "requireConfirmation": false
+      "prompt": "The iteration loop has completed. Assess whether the loop result is ready to publish.\n\nProvide an assessment artifact for readiness.",
+      "requireConfirmation": false,
+      "assessmentRefs": ["readiness_gate"],
+      "assessmentConsequences": [
+        {
+          "when": {
+            "dimensionId": "confidence",
+            "equalsLevel": "low"
+          },
+          "effect": {
+            "kind": "require_followup",
+            "guidance": "Review the loop result one more time and confirm the outcome before completing this step."
+          }
+        }
+      ]
+    }
+  ],
+  "assessments": [
+    {
+      "id": "readiness_gate",
+      "purpose": "Assess whether the loop result is ready to publish.",
+      "dimensions": [
+        {
+          "id": "confidence",
+          "purpose": "How confident the agent is that the loop result is complete and correct.",
+          "levels": ["low", "high"]
+        }
+      ]
     }
   ]
 }