npm - workflow-supervisor - Versions diffs - 0.1.4 → 0.2.0 - Mend

workflow-supervisor 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

package/CHANGELOG.md +30 -0
package/README.md +69 -5
package/bin/workflow-skills.mjs +201 -1
package/docs/artifacts.md +4 -0
package/docs/cli.md +3 -1
package/docs/portable-delegation.md +14 -1
package/docs/skill-reference.md +11 -1
package/docs/troubleshooting.md +24 -0
package/package.json +1 -1
package/schemas/dossier-v1.schema.json +38 -0
package/schemas/worker-report-v1.schema.json +120 -12
package/skills/acceptance-matrix/SKILL.md +114 -2
package/skills/acceptance-matrix/agents/openai.yaml +1 -1
package/skills/dossier-builder/SKILL.md +28 -0
package/skills/work-unit/SKILL.md +46 -6
package/skills/workflow-docs/references/workflow-control.md +58 -6
package/skills/workflow-supervisor/SKILL.md +51 -3
package/skills/workflow-supervisor/agents/openai.yaml +1 -1

package/schemas/worker-report-v1.schema.json CHANGED Viewed

@@ -49,27 +49,19 @@
     },
     "evidence": {
       "type": "array",
-      "items": {
-        "type": "string"
-      }
+      "items": { "$ref": "#/$defs/evidenceEntry" }
     },
     "checks_run": {
       "type": "array",
-      "items": {
-        "type": "string"
-      }
+      "items": { "$ref": "#/$defs/evidenceEntry" }
     },
     "skipped_checks": {
       "type": "array",
-      "items": {
-        "type": "string"
-      }
+      "items": { "$ref": "#/$defs/evidenceEntry" }
     },
     "findings": {
       "type": "array",
-      "items": {
-        "type": "string"
-      }
+      "items": { "$ref": "#/$defs/evidenceEntry" }
     },
     "blocking_question": {
       "type": ["string", "null"]
@@ -77,6 +69,13 @@
     "next_action": {
       "type": "string"
     },
+    "verification_environment": {
+      "$ref": "#/$defs/verificationEnvironment"
+    },
+    "outcome_evaluations": {
+      "type": "array",
+      "items": { "$ref": "#/$defs/outcomeEvaluation" }
+    },
     "adapter": {
       "type": ["object", "null"],
       "additionalProperties": false,
@@ -115,5 +114,114 @@
     "reason": {
       "type": ["string", "null"]
     }
+  },
+  "$defs": {
+    "evidenceEntry": {
+      "anyOf": [
+        { "type": "string" },
+        {
+          "type": "object",
+          "additionalProperties": true
+        }
+      ]
+    },
+    "verificationCapability": {
+      "type": "string",
+      "enum": [
+        "static_diff_inspection",
+        "diff_inspection",
+        "shell_command",
+        "unit_test",
+        "integration_test",
+        "contract_test",
+        "data_contract_test",
+        "jsdom_render",
+        "api_probe",
+        "file_snapshot",
+        "generated_html_snapshot",
+        "component_tree_snapshot",
+        "accessibility_tree_snapshot",
+        "state_machine_test",
+        "browser_snapshot",
+        "human_required",
+        "manual_review"
+      ]
+    },
+    "capabilityList": {
+      "type": "array",
+      "items": { "$ref": "#/$defs/verificationCapability" }
+    },
+    "verificationEnvironment": {
+      "type": "object",
+      "additionalProperties": false,
+      "properties": {
+        "shell": { "type": "boolean" },
+        "filesystem": { "type": "boolean" },
+        "git_diff": { "type": "boolean" },
+        "browser": { "type": "boolean" },
+        "playwright_mcp": { "type": "boolean" },
+        "network": { "type": "boolean" },
+        "capabilities": { "$ref": "#/$defs/capabilityList" },
+        "limitations": {
+          "type": "array",
+          "items": { "type": "string" }
+        }
+      }
+    },
+    "evidenceStrength": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": ["strongest_possible", "strongest_available"],
+      "properties": {
+        "strongest_possible": { "$ref": "#/$defs/capabilityList" },
+        "strongest_available": { "$ref": "#/$defs/capabilityList" },
+        "limitation": { "type": "string" }
+      }
+    },
+    "outcomeEvaluation": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": [
+        "id",
+        "source_requirement",
+        "expected_outcome",
+        "preferred_verification",
+        "available_verification",
+        "evidence_strength",
+        "evidence",
+        "invalid_pass_conditions",
+        "verdict"
+      ],
+      "properties": {
+        "id": { "type": "string", "minLength": 1 },
+        "source_requirement": { "type": "string", "minLength": 1 },
+        "expected_outcome": { "type": "string", "minLength": 1 },
+        "preferred_verification": { "$ref": "#/$defs/capabilityList" },
+        "available_verification": { "$ref": "#/$defs/capabilityList" },
+        "evidence_strength": { "$ref": "#/$defs/evidenceStrength" },
+        "evidence": {
+          "type": "array",
+          "items": { "$ref": "#/$defs/evidenceEntry" }
+        },
+        "invalid_pass_conditions": {
+          "type": "array",
+          "items": { "type": "string" }
+        },
+        "verdict": {
+          "type": "string",
+          "enum": ["PASS", "FAIL", "BLOCKED", "CONDITIONAL_PASS"]
+        },
+        "limitation": { "type": "string" },
+        "capability_limitations": {
+          "type": "array",
+          "items": { "type": "string" }
+        },
+        "required_external_check": {
+          "type": "array",
+          "items": { "type": "string" }
+        },
+        "finding": { "type": "string" }
+      }
+    }
   }
 }

package/skills/acceptance-matrix/SKILL.md CHANGED Viewed

@@ -22,6 +22,10 @@ This skill owns evidence rows and supervisor verdict mapping. `$work-unit` may d
 - BLOCKED applies when evidence cannot be obtained or sources conflict.
 - Residual risks must not be hidden inside PASS.
 - If residual risks, skipped checks, future work, or next recommended actions contain an unimplemented material source requirement, the matrix status is FAIL or BLOCKED, not PASS.
+- Bug fixes and risky behavior changes require a red-capable feedback loop, or an explicit waiver explaining why no correct loop exists.
+- Treat implementer output as a claim. Verification must map source requirement -> acceptance row -> outcome evidence -> verifier verdict -> supervisor audit.
+- Tests, typecheck, lint, and build are evidence types, not automatic proof. They can satisfy a row only when the row is explicitly technical or the command observes the expected outcome.
+- Outcome rows may use `CONDITIONAL_PASS` only as a row-level verdict for behavior that is strongly inferred but not fully observable in the current environment. A final supervisor PASS still requires material rows to be fully observed as PASS or explicitly waived.
 ## Source Fidelity Rules
@@ -46,13 +50,110 @@ If a requirement cannot be verified in the current environment, mark it BLOCKED
 ## Row Shape
-| ID | Source Ref | Requirement | Evidence Required | Verification Method | Adversarial Check | Status | Evidence |
-|---|---|---|---|---|---|---|---|
+| ID | Source Ref | Requirement | Expected Outcome | Evidence Required | Preferred Verification | Available Verification | Evidence Strength | Verification Method | Feedback Loop | Evidence Classification | Adversarial Check | Invalid PASS Conditions | Status | Evidence |
+|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
 Use statuses: Pending, PASS, FAIL, BLOCKED, Waived.
+For outcome evaluation, each material row should also be expressible as:
+```yaml
+outcome_evaluation:
+  id:
+  source_requirement:
+  expected_outcome:
+  preferred_verification:
+    - browser_snapshot
+    - jsdom_render
+    - integration_test
+    - api_probe
+    - static_diff_inspection
+  available_verification:
+    - integration_test
+    - api_probe
+    - static_diff_inspection
+  evidence_strength:
+    strongest_possible:
+      - browser_snapshot
+    strongest_available:
+      - integration_test
+      - api_probe
+      - static_diff_inspection
+    limitation:
+  invalid_pass_conditions:
+    - tests only
+    - typecheck only
+    - mocked behavior only
+    - hardcoded fixture
+    - requirement moved to future work
+    - verifier did not inspect diff
+  verdict: PASS | FAIL | BLOCKED | CONDITIONAL_PASS
+  evidence:
+    - exact command, artifact, file, trace, UI state, or inspection result
+  limitation:
+  required_external_check:
+    - manual browser review
+  finding:
+```
+`CONDITIONAL_PASS` is not a final workflow status. It means the behavior is strongly inferred through the strongest available substitute evidence, while a stronger material capability remains unavailable. If that unavailable capability is required to prove the source requirement, the supervisor must mark the material row or workflow BLOCKED unless the user explicitly accepts a waiver or narrower scope.
+## Capability Manifest
+Before judging outcome rows, record the verification environment when capability limits are material:
+```yaml
+verification_environment:
+  shell: true | false
+  filesystem: true | false
+  git_diff: true | false
+  browser: true | false
+  playwright_mcp: true | false
+  network: true | false
+  capabilities:
+    - static_diff_inspection
+    - shell_command
+    - unit_test
+    - integration_test
+    - contract_test
+    - data_contract_test
+    - jsdom_render
+    - api_probe
+    - file_snapshot
+    - browser_snapshot
+    - human_required
+  limitations:
+    - "Responsive visual layout not verified because browser capability is unavailable"
+```
+Do not require browser snapshots as the core verifier. Use the strongest available observable predicate. If the source requirement truly depends on unavailable browser, visual, service, credential, network, or human-review capability, mark the row BLOCKED or `CONDITIONAL_PASS` with the limitation and required external check. Do not mark the row PASS.
 For documentation and review workflows, also record a domain-specific review state when useful: Needs Revision, Approved With Caveats, Ready To Publish, SME Review Needed, Legal Review Needed, Stale, or Deferred. Map it back to PASS/FAIL/BLOCKED for supervisor decisions.
+## Red-Capable Feedback Loops
+For bug fixes and risky behavior changes, each material acceptance row must name a feedback loop:
+```yaml
+feedback_loop:
+  command_or_evidence:
+  red_capable: yes | no | not_applicable
+  exact_symptom_or_behavior:
+  deterministic: yes | no
+  expected_runtime:
+  agent_runnable: yes | no
+```
+`red_capable: yes` means the loop would have failed, or visibly shown the wrong behavior, before the fix. A related check is not red-capable unless it catches the exact symptom or behavior under review.
+Classify every row's evidence as one of:
+- `behavior_was_tested`: a red-capable command, test, UI state, artifact check, or reviewer action exercised the exact behavior.
+- `related_check_ran`: a nearby test, build, lint, static check, or inspection ran but does not catch the exact behavior by itself.
+- `substitute_evidence_accepted`: the correct loop is unavailable and the user or governing source accepted substitute evidence.
+For bug fixes and risky behavior changes, PASS requires either `behavior_was_tested`, or `substitute_evidence_accepted` backed by waiver evidence. If no correct test surface exists, record that as an architecture or verification finding. Do not turn it into a quiet skipped check.
 ## Adversarial Checks
 Consider:
@@ -81,6 +182,7 @@ Consider:
 status: PASS|FAIL|BLOCKED
 verified_work_unit:
 verified_worker:
+verification_environment:
 matrix:
   - id:
     requirement:
@@ -88,6 +190,14 @@ matrix:
     evidence:
     verification_method:
     finding:
+outcome_evaluations:
+  - id:
+    source_requirement:
+    expected_outcome:
+    verdict:
+    evidence_strength:
+    evidence:
+    limitation:
 findings:
 residual_risks:
 skipped_checks:
@@ -102,3 +212,5 @@ After repairs, verification must rerun against the affected rows and any regress
 ## Rubber-Stamp Guard
 Reject verification that says only "looks good", "tests pass", or "implemented" without row-by-row evidence. Ask for exact evidence or mark BLOCKED.
+Reject PASS when the evidence is only tests/typecheck/build unless the row is explicitly scoped as a purely technical requirement or the command observes the expected user/system-visible outcome.

package/skills/acceptance-matrix/agents/openai.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 interface:
   display_name: "Acceptance Matrix"
   short_description: "Turn goals into verifiable criteria"
-  default_prompt: "Use $acceptance-matrix to define evidence-backed pass, fail, and blocked criteria."
+  default_prompt: "Use $acceptance-matrix to define evidence-backed pass, fail, blocked, and row-level conditional outcome criteria with expected outcomes, verification capabilities, evidence strength, invalid PASS conditions, and waiver handling."
 policy:
   allow_implicit_invocation: false

package/skills/dossier-builder/SKILL.md CHANGED Viewed

@@ -22,10 +22,13 @@ The dossier does not own acceptance design. It references or embeds acceptance r
 - known allowed and forbidden surfaces or artifacts
 - acceptance criteria or acceptance draft
 - required checks or evidence
+- expected outcomes, capability limits, and invalid PASS conditions for outcome-bearing work
 - worker role and report expectations
 If these inputs are missing, create a discovery dossier or return BLOCKED.
+For bug-fix dossiers and risky behavior-change dossiers, include a red-capable feedback loop or explain why no correct loop exists. The `feedback_loop` field is optional in `DossierV1` during the compatibility phase, but `validate-dossier` emits warnings when risky work omits it.
 Before delegation, validate the dossier with:
 ```bash
@@ -66,6 +69,28 @@ reviewers:
 acceptance_matrix:
 adversarial_checks:
 required_commands_or_evidence:
+verification_environment:
+  shell: true | false
+  filesystem: true | false
+  git_diff: true | false
+  browser: true | false
+  playwright_mcp: true | false
+  network: true | false
+outcome_evaluations:
+  - id:
+    source_requirement:
+    expected_outcome:
+    preferred_verification:
+    available_verification:
+    evidence_strength:
+    invalid_pass_conditions:
+feedback_loop:
+  command_or_evidence:
+  red_capable: yes | no | not_applicable
+  exact_symptom_or_behavior:
+  deterministic: yes | no
+  expected_runtime:
+  agent_runnable: yes | no
 worker_role:
 worker_prompt:
 supervisor_checkpoints:
@@ -85,6 +110,9 @@ The machine gate requires concrete strings or arrays for the core fields. Use `o
 - Include forbidden surfaces even when the worker seems trustworthy.
 - Convert unknowns into open questions, not hidden assumptions.
 - Include adversarial checks for malformed input, stale state, authorization, schema drift, replay, no-op implementation, and untrusted sources when relevant.
+- For outcome-bearing work, require workers to report row-mapped outcome evidence. The worker must not treat tests/typecheck/build as sufficient unless the row is explicitly technical or those commands observe the expected outcome.
+- Include capability limitations and required external checks when an expected outcome depends on browser, visual, live-service, credential, network, or human-review capability that may be unavailable.
+- For bug fixes and risky behavior changes, require a feedback loop that would catch the exact symptom or behavior. A related build, lint, or broad test run is not enough unless waiver evidence accepts it as substitute evidence.
 - Require workers to report skipped checks and assumptions.
 - For non-code work, use evidence such as citations, before/after excerpts, review rubrics, examples, artifact diffs, or explicit user decisions instead of commands.
 - Require repair tickets to cite the verification finding or acceptance row they repair.

package/skills/work-unit/SKILL.md CHANGED Viewed

@@ -11,12 +11,43 @@ Use this skill to make work small enough that another agent can complete and ver
 Work units can be bounded by code package, document section, source set, stakeholder decision, research question, design screen, workflow step, data slice, risk class, or output artifact. Do not force repository terminology onto non-code work.
+## Product And Integration Slices
+When work describes user-facing behavior or integration behavior, prefer tracer-bullet work units. A tracer-bullet unit cuts through the smallest useful set of layers needed to make one behavior observable, demonstrable, and verifiable.
+Use `slice_type: tracer_bullet` for product implementation that can expose behavior to a user, API caller, integration partner, workflow operator, evaluator, or verifier.
+Horizontal units are valid only for prefactoring, migration safety, infrastructure, documentation, research, risk-boundary work, or a dependency that cannot yet be verified as behavior. Use one of these non-product slice types when a tracer bullet is not the right shape:
+- `prefactor`
+- `migration`
+- `research`
+- `document`
+- `risk_boundary`
+Every product or integration implementation unit must name:
+```yaml
+slice_type: tracer_bullet | prefactor | migration | research | document | risk_boundary
+observable_behavior:
+expected_outcome:
+demo_or_verification:
+layers_touched:
+horizontal_slice_justification:
+```
+For `tracer_bullet`, `observable_behavior`, `expected_outcome`, and `demo_or_verification` are required and `layers_touched` should name the smallest layers needed for that behavior. For horizontal or non-product slice types, set `observable_behavior` and `expected_outcome` to `not_applicable` only when the unit names a concrete `horizontal_slice_justification`.
+Reject vague horizontal feature phases such as "backend foundation", "frontend pass", "data model work", or "integration prep" unless the unit has a valid non-product `slice_type`, a concrete dependency it unlocks, and a verification method for that slice.
 ## Unit Quality Bar
 A good work unit has:
 - one objective
 - a stable unit ID suitable for dossier and worker naming
+- a `slice_type` that matches the work shape
+- observable behavior, expected outcome, and demo or verification for product or integration behavior
 - named dependencies
 - explicit in-scope and out-of-scope surfaces
 - known sources or source gaps
@@ -34,12 +65,13 @@ Work-unit drafts coarse done criteria only. Use `$acceptance-matrix` when those
 1. Restate the parent objective.
 2. Identify natural boundaries: user workflow, package, document, API contract, risk class, or dependency layer.
-3. Split into units that can be verified independently.
-4. Mark dependencies and ordering constraints.
-5. Mark which units can run in parallel only when they do not mutate the same surfaces.
-6. Define readiness and done criteria for each unit.
-7. If sources are absent, create a discovery/intake unit before production work.
-8. Identify the first unit that is safe to dossier.
+3. For product or integration behavior, split into tracer-bullet units before horizontal layers.
+4. Split remaining work into units that can be verified independently.
+5. Mark dependencies and ordering constraints.
+6. Mark which units can run in parallel only when they do not mutate the same surfaces.
+7. Define readiness and done criteria for each unit.
+8. If sources are absent, create a discovery/intake unit before production work.
+9. Identify the first unit that is safe to dossier.
 For over-broad one-pass requests, produce a sequencing recommendation and invoke or mirror `$loop-policy` fields for mode, parallel safety, approval gates, and repair limits.
@@ -69,6 +101,12 @@ units:
     worker_slug:
     title:
     objective:
+    slice_type:
+    observable_behavior:
+    expected_outcome:
+    demo_or_verification:
+    layers_touched:
+    horizontal_slice_justification:
     in_scope:
     out_of_scope:
     dependencies:
@@ -89,3 +127,5 @@ first_recommended_unit:
 ## Stop Gates
 Stop when a unit cannot name a done criterion, required source, or boundary. Ask for a decision or return a smaller discovery unit.
+Stop when a product or integration implementation unit lacks `observable_behavior`, `expected_outcome`, or `demo_or_verification`. Return a tracer-bullet split instead of a horizontal phase unless the unit has a valid non-product `slice_type` and `horizontal_slice_justification`.

package/skills/workflow-docs/references/workflow-control.md CHANGED Viewed

@@ -80,8 +80,8 @@ Escalation Triggers:
 ## Units
-| ID | Source Ref | Scope | Done Signal | Check | Status | Touched Surfaces | Evidence | Blocker Or Next Action |
-|---|---|---|---|---|---|---|---|---|
+| ID | Source Ref | Slice Type | Scope | Observable Behavior | Done Signal | Check | Status | Touched Surfaces | Evidence | Blocker Or Next Action |
+|---|---|---|---|---|---|---|---|---|---|---|
 ## Batch Checkpoints
@@ -192,8 +192,20 @@ Notes:
 ```md
 # Work Units
-| ID | Worker Slug | Title | Objective | Dependencies | Status | Verification |
-|---|---|---|---|---|---|---|
+| ID | Worker Slug | Title | Slice Type | Observable Behavior | Expected Outcome | Demo Or Verification | Dependencies | Status | Verification |
+|---|---|---|---|---|---|---|---|---|---|
+## Unit Slice Details
+For each unit, record:
+    id:
+    slice_type: tracer_bullet | prefactor | migration | research | document | risk_boundary
+    observable_behavior:
+    expected_outcome:
+    demo_or_verification:
+    layers_touched:
+    horizontal_slice_justification:
 ## Sequencing
@@ -239,6 +251,16 @@ Notes:
 ## Quality Or Risk Checks
+## Feedback Loop
+    feedback_loop:
+      command_or_evidence:
+      red_capable: yes | no | not_applicable
+      exact_symptom_or_behavior:
+      deterministic: yes | no
+      expected_runtime:
+      agent_runnable: yes | no
 ## Required Checks Or Evidence
 ## Owner Or Contributor Role
@@ -276,12 +298,32 @@ Closed means the terminal report has been consumed and any native thread or suba
 ```md
 # Acceptance Matrix
-| ID | Requirement | Evidence Required | Verification Method | Adversarial Check | Status | Evidence |
-|---|---|---|---|---|---|---|
+## Verification Environment
+| Capability | Available | Notes |
+|---|---|---|
+| shell |  |  |
+| filesystem |  |  |
+| git_diff |  |  |
+| browser |  |  |
+| playwright_mcp |  |  |
+| network |  |  |
+## Outcome Evaluation Matrix
+| ID | Source Requirement | Expected Outcome | Preferred Verification | Available Verification | Evidence Strength | Invalid PASS Conditions | Verdict | Evidence | Limitation |
+|---|---|---|---|---|---|---|---|---|---|
+## Acceptance Rows
+| ID | Requirement | Evidence Required | Verification Method | Feedback Loop | Evidence Classification | Adversarial Check | Status | Evidence |
+|---|---|---|---|---|---|---|---|---|
 ## Residual Risks
 ## Waivers
+## Verification Findings
 ```
 ## VERIFICATION-REPORT.md
@@ -304,6 +346,16 @@ Verified Worker:
 | Method | Result | Evidence |
 |---|---|---|
+## Verification Environment
+| Capability | Available | Notes |
+|---|---|---|
+## Outcome Evaluations
+| Row | Source Requirement | Expected Outcome | Verdict | Evidence Strength | Evidence | Limitation | Required External Check |
+|---|---|---|---|---|---|---|---|
 ## Acceptance Mapping
 | Requirement | Verdict | Evidence |