workflow-supervisor 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +139 -0
- package/README.md +125 -28
- package/bin/workflow-skills.mjs +201 -1
- package/docs/artifacts.md +9 -0
- package/docs/cli.md +3 -1
- package/docs/portable-delegation.md +19 -1
- package/docs/skill-reference.md +12 -2
- package/docs/troubleshooting.md +34 -0
- package/package.json +8 -2
- package/schemas/dossier-v1.schema.json +38 -0
- package/schemas/worker-report-v1.schema.json +120 -12
- package/skills/acceptance-matrix/SKILL.md +114 -2
- package/skills/acceptance-matrix/agents/openai.yaml +1 -1
- package/skills/dossier-builder/SKILL.md +28 -0
- package/skills/loop-policy/SKILL.md +29 -6
- package/skills/work-unit/SKILL.md +46 -6
- package/skills/workflow-docs/SKILL.md +2 -1
- package/skills/workflow-docs/references/workflow-control.md +93 -6
- package/skills/workflow-supervisor/SKILL.md +195 -46
- package/skills/workflow-supervisor/agents/openai.yaml +2 -2
package/bin/workflow-skills.mjs
CHANGED
|
@@ -19,6 +19,26 @@ const AGENTS = new Set([...INSTALLABLE_AGENTS, "generic"]);
|
|
|
19
19
|
const DELEGATE_AGENTS = new Set(["codex", "claude-code"]);
|
|
20
20
|
const WORKER_ROLES = new Set(["implementer", "verifier", "repair", "documenter"]);
|
|
21
21
|
const REPORT_STATUSES = new Set(["PASS", "FAIL", "BLOCKED"]);
|
|
22
|
+
const OUTCOME_VERDICTS = new Set(["PASS", "FAIL", "BLOCKED", "CONDITIONAL_PASS"]);
|
|
23
|
+
const VERIFICATION_CAPABILITIES = new Set([
|
|
24
|
+
"static_diff_inspection",
|
|
25
|
+
"diff_inspection",
|
|
26
|
+
"shell_command",
|
|
27
|
+
"unit_test",
|
|
28
|
+
"integration_test",
|
|
29
|
+
"contract_test",
|
|
30
|
+
"data_contract_test",
|
|
31
|
+
"jsdom_render",
|
|
32
|
+
"api_probe",
|
|
33
|
+
"file_snapshot",
|
|
34
|
+
"generated_html_snapshot",
|
|
35
|
+
"component_tree_snapshot",
|
|
36
|
+
"accessibility_tree_snapshot",
|
|
37
|
+
"state_machine_test",
|
|
38
|
+
"browser_snapshot",
|
|
39
|
+
"human_required",
|
|
40
|
+
"manual_review",
|
|
41
|
+
]);
|
|
22
42
|
const WORKFLOW_STATE_IGNORE_ENTRY = ".workflow/";
|
|
23
43
|
|
|
24
44
|
function usage() {
|
|
@@ -483,6 +503,7 @@ function parseSimpleYaml(text) {
|
|
|
483
503
|
}
|
|
484
504
|
|
|
485
505
|
const items = [];
|
|
506
|
+
const object = {};
|
|
486
507
|
for (i += 1; i < lines.length; i += 1) {
|
|
487
508
|
const next = lines[i];
|
|
488
509
|
if (!next.trim() || next.trimStart().startsWith("#")) continue;
|
|
@@ -492,8 +513,10 @@ function parseSimpleYaml(text) {
|
|
|
492
513
|
}
|
|
493
514
|
const item = next.match(/^\s*-\s*(.*)$/);
|
|
494
515
|
if (item) items.push(unquoteScalar(item[1]));
|
|
516
|
+
const property = next.match(/^\s+([A-Za-z_][A-Za-z0-9_-]*):(?:\s*(.*))?$/);
|
|
517
|
+
if (property) object[property[1]] = parseDossierScalar(property[2] || "");
|
|
495
518
|
}
|
|
496
|
-
result[key] = items.length > 0 ? items : "";
|
|
519
|
+
result[key] = items.length > 0 ? items : Object.keys(object).length > 0 ? object : "";
|
|
497
520
|
}
|
|
498
521
|
return result;
|
|
499
522
|
}
|
|
@@ -556,6 +579,14 @@ const DOSSIER_CORE_ARRAY_FIELDS = [
|
|
|
556
579
|
];
|
|
557
580
|
|
|
558
581
|
const DOSSIER_EXPLICIT_ARRAY_FIELDS = ["assumptions", "open_questions"];
|
|
582
|
+
const FEEDBACK_LOOP_FIELDS = [
|
|
583
|
+
"command_or_evidence",
|
|
584
|
+
"red_capable",
|
|
585
|
+
"exact_symptom_or_behavior",
|
|
586
|
+
"deterministic",
|
|
587
|
+
"expected_runtime",
|
|
588
|
+
"agent_runnable",
|
|
589
|
+
];
|
|
559
590
|
|
|
560
591
|
function isPlaceholder(value, { allowNone = false } = {}) {
|
|
561
592
|
const normalized = String(value || "").trim().toLowerCase().replace(/[.!]+$/, "");
|
|
@@ -584,6 +615,61 @@ function validateConcreteArray(data, field, errors, options = {}) {
|
|
|
584
615
|
return values;
|
|
585
616
|
}
|
|
586
617
|
|
|
618
|
+
function dossierSearchText(data) {
|
|
619
|
+
return [
|
|
620
|
+
data.workflow,
|
|
621
|
+
data.work_unit,
|
|
622
|
+
data.title,
|
|
623
|
+
data.objective,
|
|
624
|
+
...fieldArray(data.work_points),
|
|
625
|
+
...fieldArray(data.acceptance_matrix),
|
|
626
|
+
...fieldArray(data.adversarial_checks),
|
|
627
|
+
...fieldArray(data.required_commands_or_evidence),
|
|
628
|
+
...fieldArray(data.stop_gates),
|
|
629
|
+
]
|
|
630
|
+
.join(" ")
|
|
631
|
+
.toLowerCase();
|
|
632
|
+
}
|
|
633
|
+
|
|
634
|
+
function dossierNeedsFeedbackLoop(data) {
|
|
635
|
+
return /\b(bug|fix|regression|defect|broken|crash|error|failure|failing|risky behavior|behavior change|behaviour change|change behavior|change behaviour)\b/.test(
|
|
636
|
+
dossierSearchText(data),
|
|
637
|
+
);
|
|
638
|
+
}
|
|
639
|
+
|
|
640
|
+
function validateFeedbackLoop(data, warnings) {
|
|
641
|
+
const loop = data.feedback_loop;
|
|
642
|
+
const needsLoop = dossierNeedsFeedbackLoop(data);
|
|
643
|
+
if (!loop) {
|
|
644
|
+
if (needsLoop) {
|
|
645
|
+
warnings.push("feedback_loop is recommended for bug-fix or risky behavior-change dossiers");
|
|
646
|
+
}
|
|
647
|
+
return;
|
|
648
|
+
}
|
|
649
|
+
|
|
650
|
+
if (typeof loop !== "object" || Array.isArray(loop)) {
|
|
651
|
+
warnings.push("feedback_loop should be an object with command_or_evidence, red_capable, exact_symptom_or_behavior, deterministic, expected_runtime, and agent_runnable");
|
|
652
|
+
return;
|
|
653
|
+
}
|
|
654
|
+
|
|
655
|
+
for (const field of FEEDBACK_LOOP_FIELDS) {
|
|
656
|
+
if (isPlaceholder(loop[field])) warnings.push(`feedback_loop.${field} should be concrete`);
|
|
657
|
+
}
|
|
658
|
+
|
|
659
|
+
if (loop.red_capable && !["yes", "no", "not_applicable"].includes(String(loop.red_capable))) {
|
|
660
|
+
warnings.push("feedback_loop.red_capable should be yes, no, or not_applicable");
|
|
661
|
+
}
|
|
662
|
+
if (loop.deterministic && !["yes", "no"].includes(String(loop.deterministic))) {
|
|
663
|
+
warnings.push("feedback_loop.deterministic should be yes or no");
|
|
664
|
+
}
|
|
665
|
+
if (loop.agent_runnable && !["yes", "no"].includes(String(loop.agent_runnable))) {
|
|
666
|
+
warnings.push("feedback_loop.agent_runnable should be yes or no");
|
|
667
|
+
}
|
|
668
|
+
if (needsLoop && String(loop.red_capable) !== "yes") {
|
|
669
|
+
warnings.push("bug-fix or risky behavior-change dossiers should name a red-capable feedback loop or explicit waiver");
|
|
670
|
+
}
|
|
671
|
+
}
|
|
672
|
+
|
|
587
673
|
function validateDossierData(data, { role, unitId } = {}) {
|
|
588
674
|
const errors = [];
|
|
589
675
|
const warnings = [];
|
|
@@ -649,6 +735,8 @@ function validateDossierData(data, { role, unitId } = {}) {
|
|
|
649
735
|
if (!/\b[A-Z]+[0-9]+\b/.test(row)) warnings.push(`acceptance_matrix[${index}] should include a stable row ID`);
|
|
650
736
|
});
|
|
651
737
|
|
|
738
|
+
validateFeedbackLoop(data, warnings);
|
|
739
|
+
|
|
652
740
|
const unresolved = fieldArray(data.open_questions).filter((item) => !/^(none|no open questions|empty)$/i.test(item));
|
|
653
741
|
if (unresolved.length > 0) {
|
|
654
742
|
errors.push("open_questions must be explicitly none before delegation; create a discovery dossier or stop as BLOCKED");
|
|
@@ -877,6 +965,8 @@ function buildWorkerPrompt({ role, unitId, dossierText }) {
|
|
|
877
965
|
findings: [],
|
|
878
966
|
blocking_question: null,
|
|
879
967
|
next_action: "",
|
|
968
|
+
verification_environment: null,
|
|
969
|
+
outcome_evaluations: [],
|
|
880
970
|
adapter: null,
|
|
881
971
|
guard: null,
|
|
882
972
|
reason: null,
|
|
@@ -1044,6 +1134,109 @@ function ensureArray(value) {
|
|
|
1044
1134
|
return Array.isArray(value) ? value : [value];
|
|
1045
1135
|
}
|
|
1046
1136
|
|
|
1137
|
+
function isPlainObject(value) {
|
|
1138
|
+
return Boolean(value && typeof value === "object" && !Array.isArray(value));
|
|
1139
|
+
}
|
|
1140
|
+
|
|
1141
|
+
function validateCapabilityList(value, field, errors) {
|
|
1142
|
+
if (!Array.isArray(value)) {
|
|
1143
|
+
errors.push(`${field} must be an array`);
|
|
1144
|
+
return;
|
|
1145
|
+
}
|
|
1146
|
+
for (const capability of value) {
|
|
1147
|
+
if (!VERIFICATION_CAPABILITIES.has(capability)) {
|
|
1148
|
+
errors.push(`${field} contains unsupported capability: ${capability}`);
|
|
1149
|
+
}
|
|
1150
|
+
}
|
|
1151
|
+
}
|
|
1152
|
+
|
|
1153
|
+
function validateStringArray(value, field, errors) {
|
|
1154
|
+
if (!Array.isArray(value)) {
|
|
1155
|
+
errors.push(`${field} must be an array`);
|
|
1156
|
+
return;
|
|
1157
|
+
}
|
|
1158
|
+
if (value.some((item) => typeof item !== "string")) {
|
|
1159
|
+
errors.push(`${field} must contain only strings`);
|
|
1160
|
+
}
|
|
1161
|
+
}
|
|
1162
|
+
|
|
1163
|
+
function validateVerificationEnvironment(environment, errors) {
|
|
1164
|
+
if (environment == null) return;
|
|
1165
|
+
if (!isPlainObject(environment)) {
|
|
1166
|
+
errors.push("verification_environment must be an object or null");
|
|
1167
|
+
return;
|
|
1168
|
+
}
|
|
1169
|
+
for (const field of ["shell", "filesystem", "git_diff", "browser", "playwright_mcp", "network"]) {
|
|
1170
|
+
if (environment[field] != null && typeof environment[field] !== "boolean") {
|
|
1171
|
+
errors.push(`verification_environment.${field} must be boolean`);
|
|
1172
|
+
}
|
|
1173
|
+
}
|
|
1174
|
+
if (environment.capabilities != null) {
|
|
1175
|
+
validateCapabilityList(environment.capabilities, "verification_environment.capabilities", errors);
|
|
1176
|
+
}
|
|
1177
|
+
if (environment.limitations != null) {
|
|
1178
|
+
validateStringArray(environment.limitations, "verification_environment.limitations", errors);
|
|
1179
|
+
}
|
|
1180
|
+
}
|
|
1181
|
+
|
|
1182
|
+
function validateOutcomeEvaluations(report, errors) {
|
|
1183
|
+
const rows = report?.outcome_evaluations;
|
|
1184
|
+
if (!Array.isArray(rows)) {
|
|
1185
|
+
errors.push("outcome_evaluations must be an array");
|
|
1186
|
+
return;
|
|
1187
|
+
}
|
|
1188
|
+
if (report.status === "PASS" && rows.some((row) => row?.verdict !== "PASS")) {
|
|
1189
|
+
errors.push("top-level PASS requires every outcome_evaluations row verdict to be PASS");
|
|
1190
|
+
}
|
|
1191
|
+
rows.forEach((row, index) => {
|
|
1192
|
+
const prefix = `outcome_evaluations[${index}]`;
|
|
1193
|
+
if (!isPlainObject(row)) {
|
|
1194
|
+
errors.push(`${prefix} must be an object`);
|
|
1195
|
+
return;
|
|
1196
|
+
}
|
|
1197
|
+
for (const field of ["id", "source_requirement", "expected_outcome", "verdict"]) {
|
|
1198
|
+
if (typeof row[field] !== "string" || row[field].trim() === "") {
|
|
1199
|
+
errors.push(`${prefix}.${field} must be a non-empty string`);
|
|
1200
|
+
}
|
|
1201
|
+
}
|
|
1202
|
+
if (!OUTCOME_VERDICTS.has(row.verdict)) {
|
|
1203
|
+
errors.push(`${prefix}.verdict must be PASS, FAIL, BLOCKED, or CONDITIONAL_PASS`);
|
|
1204
|
+
}
|
|
1205
|
+
validateCapabilityList(row.preferred_verification, `${prefix}.preferred_verification`, errors);
|
|
1206
|
+
validateCapabilityList(row.available_verification, `${prefix}.available_verification`, errors);
|
|
1207
|
+
if (!isPlainObject(row.evidence_strength)) {
|
|
1208
|
+
errors.push(`${prefix}.evidence_strength must be an object`);
|
|
1209
|
+
} else {
|
|
1210
|
+
validateCapabilityList(row.evidence_strength.strongest_possible, `${prefix}.evidence_strength.strongest_possible`, errors);
|
|
1211
|
+
validateCapabilityList(row.evidence_strength.strongest_available, `${prefix}.evidence_strength.strongest_available`, errors);
|
|
1212
|
+
if (row.evidence_strength.limitation != null && typeof row.evidence_strength.limitation !== "string") {
|
|
1213
|
+
errors.push(`${prefix}.evidence_strength.limitation must be a string`);
|
|
1214
|
+
}
|
|
1215
|
+
}
|
|
1216
|
+
if (!Array.isArray(row.evidence)) errors.push(`${prefix}.evidence must be an array`);
|
|
1217
|
+
validateStringArray(row.invalid_pass_conditions, `${prefix}.invalid_pass_conditions`, errors);
|
|
1218
|
+
if (row.verdict === "PASS" && Array.isArray(row.evidence) && row.evidence.length === 0) {
|
|
1219
|
+
errors.push(`${prefix}.PASS requires row evidence`);
|
|
1220
|
+
}
|
|
1221
|
+
if (row.verdict === "CONDITIONAL_PASS") {
|
|
1222
|
+
const hasLimitation = typeof row.limitation === "string" && row.limitation.trim() !== "";
|
|
1223
|
+
const hasCapabilityLimitation = Array.isArray(row.capability_limitations) && row.capability_limitations.length > 0;
|
|
1224
|
+
if (!hasLimitation && !hasCapabilityLimitation) {
|
|
1225
|
+
errors.push(`${prefix}.CONDITIONAL_PASS requires limitation or capability_limitations`);
|
|
1226
|
+
}
|
|
1227
|
+
}
|
|
1228
|
+
if (row.capability_limitations != null) {
|
|
1229
|
+
validateStringArray(row.capability_limitations, `${prefix}.capability_limitations`, errors);
|
|
1230
|
+
}
|
|
1231
|
+
if (row.required_external_check != null) {
|
|
1232
|
+
validateStringArray(row.required_external_check, `${prefix}.required_external_check`, errors);
|
|
1233
|
+
}
|
|
1234
|
+
if (row.finding != null && typeof row.finding !== "string") {
|
|
1235
|
+
errors.push(`${prefix}.finding must be a string`);
|
|
1236
|
+
}
|
|
1237
|
+
});
|
|
1238
|
+
}
|
|
1239
|
+
|
|
1047
1240
|
function reportAdapterMeta(adapter, result = {}) {
|
|
1048
1241
|
return {
|
|
1049
1242
|
agent: adapter?.agent || null,
|
|
@@ -1069,6 +1262,8 @@ function blockedReport({ role, unitId, reason, summary, adapter, guard, stdout,
|
|
|
1069
1262
|
findings: reason ? [{ id: reason, severity: "blocking", summary }] : [],
|
|
1070
1263
|
blocking_question: null,
|
|
1071
1264
|
next_action: "supervisor_review",
|
|
1265
|
+
verification_environment: null,
|
|
1266
|
+
outcome_evaluations: [],
|
|
1072
1267
|
adapter: adapter || null,
|
|
1073
1268
|
guard: guard || { allowed_surface_violations: [], role_violations: [], warnings: [] },
|
|
1074
1269
|
reason,
|
|
@@ -1092,8 +1287,11 @@ function normalizeReport(report, { role, unitId, adapter, guard }) {
|
|
|
1092
1287
|
findings: ensureArray(report.findings),
|
|
1093
1288
|
blocking_question: report.blocking_question ?? null,
|
|
1094
1289
|
next_action: report.next_action || "",
|
|
1290
|
+
verification_environment: report.verification_environment ?? null,
|
|
1291
|
+
outcome_evaluations: ensureArray(report.outcome_evaluations),
|
|
1095
1292
|
adapter,
|
|
1096
1293
|
guard,
|
|
1294
|
+
reason: report.reason ?? null,
|
|
1097
1295
|
};
|
|
1098
1296
|
}
|
|
1099
1297
|
|
|
@@ -1112,6 +1310,8 @@ function validateWorkerReport(report, { role, unitId }) {
|
|
|
1112
1310
|
errors.push("blocking_question requires BLOCKED status");
|
|
1113
1311
|
}
|
|
1114
1312
|
if (role === "verifier" && report?.changed_surfaces?.length > 0) errors.push("verifier must not report changed surfaces");
|
|
1313
|
+
validateVerificationEnvironment(report?.verification_environment, errors);
|
|
1314
|
+
validateOutcomeEvaluations(report, errors);
|
|
1115
1315
|
return errors;
|
|
1116
1316
|
}
|
|
1117
1317
|
|
package/docs/artifacts.md
CHANGED
|
@@ -8,6 +8,7 @@ In Git-backed codebases, `.workflow/` is local working state. Ensure `<workspace
|
|
|
8
8
|
|
|
9
9
|
## Workflow Control
|
|
10
10
|
|
|
11
|
+
- `.workflow/LEDGER.md`
|
|
11
12
|
- `.workflow/WORKFLOW.md`
|
|
12
13
|
- `.workflow/SOURCE-CORPUS.md`
|
|
13
14
|
- `.workflow/WORK-UNITS.md`
|
|
@@ -40,3 +41,11 @@ In Git-backed codebases, `.workflow/` is local working state. Ensure `<workspace
|
|
|
40
41
|
## State Medium
|
|
41
42
|
|
|
42
43
|
Markdown is the default, but state may also be an inline brief, spreadsheet tab, ticket set, design annotation, CRM note, runbook, decision log, slide appendix, whiteboard note, or chat continuation note.
|
|
44
|
+
|
|
45
|
+
For `lean_work_unit_runner`, prefer one compact ledger over multiple workflow documents. Each executable row should carry `id`, `source_ref`, `scope`, `done`, `check`, `status`, touched surfaces, and blockers. Escalated units may link to strict-mode SPEC, dossier, or verification artifacts only when needed.
|
|
46
|
+
|
|
47
|
+
For product or integration implementation, `WORK-UNITS.md` and lean ledger rows should also carry `slice_type`, `observable_behavior`, `expected_outcome`, `demo_or_verification`, `layers_touched`, and `horizontal_slice_justification` where useful. Prefer `tracer_bullet` units for behavior work. Use horizontal slices only for prefactoring, migration safety, infrastructure, documentation, research, or risk-boundary work with a concrete justification.
|
|
48
|
+
|
|
49
|
+
For outcome-bearing verification, `ACCEPTANCE-MATRIX.md` and `VERIFICATION-REPORT.md` should include a verification environment, outcome evaluation rows, preferred and available verification capabilities, evidence strength, invalid PASS conditions, and any required external checks. Row-level `CONDITIONAL_PASS` means strongly inferred but not fully observable; it must not be treated as final green status without explicit waiver evidence.
|
|
50
|
+
|
|
51
|
+
For native thread or subagent delegation, `WORKER-MAP.md` must record the native resource id, terminal report, close action, and close result. Do not mark a native worker closed until the resource close is recorded.
|
package/docs/cli.md
CHANGED
|
@@ -122,10 +122,12 @@ Options:
|
|
|
122
122
|
|
|
123
123
|
### `delegate`
|
|
124
124
|
|
|
125
|
-
Run one role-scoped worker through an installed Codex or Claude Code CLI and print exactly one normalized `WorkerReportV1` JSON object. Missing or invalid `DossierV1` contracts, missing CLIs, invalid worker output, timeouts, non-zero PASS results, PASS without evidence, forbidden-surface changes, and verifier mutations become `BLOCKED` reports instead of unstructured prose.
|
|
125
|
+
Run one role-scoped worker through an installed Codex or Claude Code CLI and print exactly one normalized `WorkerReportV1` JSON object. Missing or invalid `DossierV1` contracts, missing CLIs, invalid worker output, timeouts, non-zero PASS results, PASS without evidence, top-level `CONDITIONAL_PASS`, PASS with conditional outcome rows, forbidden-surface changes, and verifier mutations become `BLOCKED` reports instead of unstructured prose.
|
|
126
126
|
|
|
127
127
|
The report schema lives at `schemas/worker-report-v1.schema.json`. The Codex adapter passes it via `--output-schema`; the Claude Code adapter passes it via `--json-schema`; every adapter is still wrapper-validated after the run.
|
|
128
128
|
|
|
129
|
+
`WorkerReportV1.status` remains `PASS`, `FAIL`, or `BLOCKED`. Outcome-bearing verifier reports may include `verification_environment` and `outcome_evaluations`; `CONDITIONAL_PASS` is allowed only as an outcome row verdict to record strongly inferred but not fully observable behavior.
|
|
130
|
+
|
|
129
131
|
`--dossier` is a hard preflight gate. It must parse as `DossierV1` and pass concrete-field checks before the worker process starts. The delegate command uses `allowed_surfaces` and `forbidden_surfaces` from the dossier as surface guards unless explicit CLI surface flags are provided.
|
|
130
132
|
|
|
131
133
|
```bash
|
|
package/docs/portable-delegation.md
CHANGED
|
@@ -18,6 +18,10 @@ complete intake
|
|
|
18
18
|
-> final supervisor report
|
|
19
19
|
```
|
|
20
20
|
|
|
21
|
+
This document describes strict or explicitly delegated execution. `lean_work_unit_runner` normally stays in same-session phased execution with a compact ledger and targeted checks. It should enter portable delegation only when the user authorizes workers for a batch or a unit hits a strict-mode escalation trigger.
|
|
22
|
+
|
|
23
|
+
Prefer portable delegation over native threads or subagents when it satisfies the work. Portable delegation is one-shot, so the worker process exits after the report. Native thread or subagent transports are allowed only when the supervisor can record the native resource id and call the matching close operation after terminal report, timeout, blocker, cancellation, or invalid output.
|
|
24
|
+
|
|
21
25
|
The supervisor remains the only coordinator. Workers do not ask the human questions, choose final disposition, expand scope, approve plans, or talk to each other. If a worker needs a decision, it returns `BLOCKED` with a `blocking_question`; only the supervisor asks the user.
|
|
22
26
|
|
|
23
27
|
## Non-Goals
|
|
@@ -76,6 +80,17 @@ Every adapter must normalize into this shape:
|
|
|
76
80
|
"findings": [],
|
|
77
81
|
"blocking_question": null,
|
|
78
82
|
"next_action": "",
|
|
83
|
+
"verification_environment": {
|
|
84
|
+
"shell": true,
|
|
85
|
+
"filesystem": true,
|
|
86
|
+
"git_diff": true,
|
|
87
|
+
"browser": false,
|
|
88
|
+
"playwright_mcp": false,
|
|
89
|
+
"network": false,
|
|
90
|
+
"capabilities": ["shell_command", "api_probe", "static_diff_inspection"],
|
|
91
|
+
"limitations": []
|
|
92
|
+
},
|
|
93
|
+
"outcome_evaluations": [],
|
|
79
94
|
"adapter": {
|
|
80
95
|
"agent": "codex",
|
|
81
96
|
"command": "codex exec",
|
|
@@ -89,7 +104,7 @@ Every adapter must normalize into this shape:
|
|
|
89
104
|
}
|
|
90
105
|
```
|
|
91
106
|
|
|
92
|
-
`PASS`, `FAIL`, and `BLOCKED` mean the same thing on both platforms. A worker report without evidence for material acceptance rows is invalid. Invalid output is converted into a deterministic normalized `BLOCKED` report by default. The package does not make a second live worker call to repair formatting, because a second call can mutate state, consume budget, or produce another non-portable transcript.
|
|
107
|
+
`PASS`, `FAIL`, and `BLOCKED` mean the same thing on both platforms. `CONDITIONAL_PASS` is valid only as a row-level `outcome_evaluations[].verdict`, not as top-level `WorkerReportV1.status`. A worker report without evidence for material acceptance rows is invalid. A top-level PASS with failed, blocked, or conditional outcome rows is invalid. Invalid output is converted into a deterministic normalized `BLOCKED` report by default. The package does not make a second live worker call to repair formatting, because a second call can mutate state, consume budget, or produce another non-portable transcript.
|
|
93
108
|
|
|
94
109
|
The schema is a package artifact at `schemas/worker-report-v1.schema.json`. Codex receives it through `--output-schema`; Claude Code receives it through `--json-schema`; both adapters are still wrapper-validated after the run.
|
|
95
110
|
|
|
@@ -159,10 +174,13 @@ For git workspaces, the surface guard compares pre/post git status. Mutable role
|
|
|
159
174
|
| Worker hangs | Timeout returns normalized `BLOCKED` with adapter timing evidence. |
|
|
160
175
|
| Worker exits non-zero but printed useful text | Do not trust it as PASS. Normalize as `BLOCKED` unless a valid report and clean guards prove otherwise. |
|
|
161
176
|
| Worker returns PASS without evidence | Invalid report. Return normalized `BLOCKED` with `reason: report_validation_failed`. |
|
|
177
|
+
| Worker returns top-level `CONDITIONAL_PASS` | Invalid report. Use `BLOCKED` or `FAIL` top-level status and record `CONDITIONAL_PASS` only on the affected outcome row. |
|
|
178
|
+
| Worker hides conditional outcome proof inside PASS | Invalid report. Top-level PASS requires every material outcome row verdict to be PASS. |
|
|
162
179
|
| Tests cannot run | Verifier returns `BLOCKED` or `PASS` only with substitute evidence accepted by the acceptance matrix. |
|
|
163
180
|
| Repair expands scope | Reject unless the repair dossier explicitly allowed the new surfaces and criteria. |
|
|
164
181
|
| Units touch same surfaces | Run sequentially. Parallel delegation requires proven disjoint mutable surfaces. |
|
|
165
182
|
| Platform has no native subagents | Fine. Each role is a fresh one-shot CLI process. |
|
|
183
|
+
| Native subagent close is unavailable | Do not spawn it. Return `worker_resource_close_unavailable` and use portable delegation or same-session phased work only if intake allowed it. |
|
|
166
184
|
| Platform output differs | Platform output is not the contract. `WorkerReportV1` is the only supervisor input. |
|
|
167
185
|
| Platform cannot support a role safely | Adapter role is unsupported. Supervisor chooses another certified adapter or blocks. |
|
|
168
186
|
| Full support is claimed but one CLI is absent | `delegate-doctor --agent all --probe --require-pass` exits nonzero and names the missing adapter. |
|
package/docs/skill-reference.md
CHANGED
|
@@ -2,7 +2,17 @@
|
|
|
2
2
|
|
|
3
3
|
## `workflow-supervisor`
|
|
4
4
|
|
|
5
|
-
Coordinate explicit supervised or agent-loop workflows. It
|
|
5
|
+
Coordinate explicit supervised or agent-loop workflows with profile-based overhead. It starts by selecting `lean_work_unit_runner`, `strict_full_workflow`, or `planning_only`, then completes the intake needed for that profile before implementation, goal binding, worker delegation, or final disposition. The user must answer required intake items; the supervisor must not infer path, mode, delegation, final disposition, or boundaries from vague keywords. Lean mode is for large already-bounded work-unit backlogs: it keeps a compact ledger with unit id, source reference, scope, done signal, check, status, touched surfaces, and blockers, then executes one ready unit at a time with targeted checks and escalation gates. Strict mode creates a source-requirement coverage ledger and SPEC review gate before work units so controlling-source deliverables, roadmap phases, and exit criteria are either implemented, explicitly deferred, blocked, or marked non-material. In human-in-loop mode, the human can ask questions, request revisions, block, defer, or approve before execution. In autonomous goal mode, human clarification pauses resume from recorded workflow state after the answer updates only affected downstream artifacts. Strict mode can orchestrate named workers from dossiers through the portable delegate command or an approved native adapter. Native threads and subagents require a recorded native resource id plus a close result, such as `close_agent` for Codex subagents, before a worker is `closed`. Loading the skill itself does not spawn workers. It binds Codex goals only after complete intake and when the user or environment authorizes goal-oriented work, checks active goal state first, avoids unrelated active-goal collisions, and treats terminal blocked goals as history when resuming through workflow docs.
|
|
6
|
+
|
|
7
|
+
Route first before profile selection. If Workflow Supervisor was not explicitly invoked and the task is a small, clear edit with obvious files and acceptance, do not use Workflow Supervisor; execute directly. If the user explicitly invokes `workflow-supervisor`, `$workflow-supervisor`, or says to use the skill, select the proportional profile instead of silently skipping the supervisor.
|
|
8
|
+
|
|
9
|
+
| Situation | Route |
|
|
10
|
+
|---|---|
|
|
11
|
+
| Small, clear edit with obvious files and acceptance | Do not use Workflow Supervisor. Execute directly. |
|
|
12
|
+
| Large bounded backlog with clear unit done signals | `lean_work_unit_runner`. |
|
|
13
|
+
| Broad, ambiguous, source-of-truth, delegated, security-sensitive, dirty-state, release, resume, or externally published work | `strict_full_workflow`. |
|
|
14
|
+
| Sequencing, risk review, or backlog shaping only | `planning_only`. |
|
|
15
|
+
| Runnable uncertainty before implementation | Create a discovery or prototype unit first. |
|
|
6
16
|
|
|
7
17
|
## `source-corpus`
|
|
8
18
|
|
|
@@ -22,7 +32,7 @@ Define role contracts and solo-mode phase separation. It prevents role bleed: ve
|
|
|
22
32
|
|
|
23
33
|
## `acceptance-matrix`
|
|
24
34
|
|
|
25
|
-
Create formal evidence-mapped acceptance rows for high-risk, supervised, ambiguous, resumable, or delegated workflows. Rows must preserve source requirement strength, including named systems, quantities, live integration language, and exit criteria; weaker proxy checks require explicit user waiver or scope narrowing.
|
|
35
|
+
Create formal evidence-mapped acceptance rows for high-risk, supervised, ambiguous, resumable, or delegated workflows. Rows must preserve source requirement strength, including named systems, quantities, live integration language, and exit criteria; weaker proxy checks require explicit user waiver or scope narrowing. Outcome-bearing rows also name expected outcomes, preferred and available verification capabilities, evidence strength, invalid PASS conditions, and capability limitations. `CONDITIONAL_PASS` is row-level only and must not be treated as final green status without explicit waiver evidence.
|
|
26
36
|
|
|
27
37
|
## `loop-policy`
|
|
28
38
|
|
package/docs/troubleshooting.md
CHANGED
|
@@ -4,6 +4,12 @@
|
|
|
4
4
|
|
|
5
5
|
Keep `policy.allow_implicit_invocation: false`. Use explicit `$skill-name` invocation until live routing tests prove trigger precision.
|
|
6
6
|
|
|
7
|
+
## Workflow Supervisor is used for a tiny edit
|
|
8
|
+
|
|
9
|
+
If Workflow Supervisor was not explicitly invoked and the task has obvious files, obvious acceptance, and no hard supervisor trigger, do not invoke the skill. Execute directly and run the relevant check.
|
|
10
|
+
|
|
11
|
+
If the user explicitly invoked `workflow-supervisor`, `$workflow-supervisor`, or said to use the skill, do not silently skip it. Select the lightest valid profile, usually `lean_work_unit_runner` for bounded unit work or `planning_only` when the user only needs sequencing, and explain that direct execution would normally fit a tiny edit.
|
|
12
|
+
|
|
7
13
|
## The agent cannot find the skills
|
|
8
14
|
|
|
9
15
|
Run:
|
|
@@ -23,10 +29,38 @@ Use `.workflow/GOAL-STATE.md` or a workflow continuation document. The superviso
|
|
|
23
29
|
|
|
24
30
|
Use `$workflow-docs` with a minimal artifact request. The skill must reject "create every document just in case."
|
|
25
31
|
|
|
32
|
+
## Large backlogs run slowly or exhaust memory
|
|
33
|
+
|
|
34
|
+
Use `lean_work_unit_runner` instead of `strict_full_workflow` when the source already contains clear work units and the user's priority is throughput. Keep one compact ledger with `id`, `source_ref`, `scope`, `done`, `check`, `status`, touched surfaces, and blockers. Run one unit at a time by default, avoid subagents unless explicitly authorized, avoid broad scans unless required for the current unit, and checkpoint by batch rather than rewriting full workflow docs after every unit.
|
|
35
|
+
|
|
36
|
+
Do not remove work units to make the process lean. If a unit cannot name its boundary, done signal, or targeted check, mark it `blocked` or escalate that unit to strict mode.
|
|
37
|
+
|
|
38
|
+
## Native subagents remain open after completion
|
|
39
|
+
|
|
40
|
+
Treat this as a lifecycle bug, not a cosmetic cleanup task. A terminal report or completed notification does not close a native Codex subagent. Record every native worker id in `WORKER-MAP.md`, call the native close action such as `close_agent` after the terminal report or blocker is captured, and block the final outcome if any native worker lacks a close result. Prefer one-shot portable delegation when it satisfies the work.
|
|
41
|
+
|
|
42
|
+
## Unsupported gauntlet summaries are used as proof
|
|
43
|
+
|
|
44
|
+
Unsupported external gauntlet summaries are not validation evidence. Treat them as raw leads only unless they preserve per-scenario reports, commands, artifacts, and expected outcomes that another maintainer can inspect. Use repo-native tests, fixtures, `npm run validate`, and live adapter probes such as `workflow-supervisor delegate-doctor --agent all --probe --require-pass` for real confidence.
|
|
45
|
+
|
|
26
46
|
## Verification rubber-stamps the result
|
|
27
47
|
|
|
28
48
|
Use `$acceptance-matrix` for formal evidence rows. A PASS requires row-by-row evidence or explicit waiver evidence.
|
|
29
49
|
|
|
50
|
+
## Outcome evidence is only inferred
|
|
51
|
+
|
|
52
|
+
Use row-level `CONDITIONAL_PASS` only when the strongest available checks strongly infer the expected outcome but cannot fully observe it. Record the missing capability, limitation, and required external check. Do not roll that row into a final PASS unless the user explicitly accepts the limitation as a waiver or narrowed scope.
|
|
53
|
+
|
|
54
|
+
## Browser snapshots are unavailable
|
|
55
|
+
|
|
56
|
+
Browser snapshots are a verifier adapter, not the core verification model. If browser, screenshot, Playwright, Storybook, visual diff, or manual-review capability is unavailable, use the strongest available lower-level observable contract such as jsdom render, API probe, state-machine test, file snapshot, route manifest, or static semantic diff inspection. If the source requirement truly depends on browser or visual proof, mark the row BLOCKED or `CONDITIONAL_PASS` with the limitation.
|
|
57
|
+
|
|
58
|
+
## Bug fix passes with only related checks
|
|
59
|
+
|
|
60
|
+
A related build, lint, broad test run, or inspection is not enough for a bug fix or risky behavior change unless it would catch the exact symptom. Add a red-capable feedback loop with the command, artifact, UI state, or manual check that would fail before the fix and pass after it.
|
|
61
|
+
|
|
62
|
+
If no correct test surface exists, record an architecture or verification finding and either block the row or get explicit substitute-evidence waiver from the user. Do not hide this as a skipped check in a PASS report.
|
|
63
|
+
|
|
30
64
|
## A broad roadmap becomes one giant work unit
|
|
31
65
|
|
|
32
66
|
Use the source-requirement coverage gate before work-unit finalization. Every material roadmap item, exit criterion, named integration, and numeric target should be mapped to a unit and acceptance row, explicitly deferred by the user, blocked for a decision, or marked non-material with a reason. Do not accept "future work" or residual risk notes as a substitute for work units.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "workflow-supervisor",
|
|
3
|
-
"version": "0.1.3",
|
|
3
|
+
"version": "0.2.0",
|
|
4
4
|
"description": "Portable workflow supervision skills for Codex, Claude Code, and generic agent workspaces.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"repository": {
|
|
@@ -19,9 +19,15 @@
|
|
|
19
19
|
"skills",
|
|
20
20
|
"adapters",
|
|
21
21
|
"schemas",
|
|
22
|
-
"docs",
|
|
22
|
+
"docs/artifacts.md",
|
|
23
|
+
"docs/cli.md",
|
|
24
|
+
"docs/compatibility.md",
|
|
25
|
+
"docs/portable-delegation.md",
|
|
26
|
+
"docs/skill-reference.md",
|
|
27
|
+
"docs/troubleshooting.md",
|
|
23
28
|
"assets",
|
|
24
29
|
"bin",
|
|
30
|
+
"CHANGELOG.md",
|
|
25
31
|
"README.md",
|
|
26
32
|
"LICENSE"
|
|
27
33
|
],
|
|
@@ -114,6 +114,44 @@
|
|
|
114
114
|
"required_commands_or_evidence": {
|
|
115
115
|
"$ref": "#/$defs/stringList"
|
|
116
116
|
},
|
|
117
|
+
"feedback_loop": {
|
|
118
|
+
"type": "object",
|
|
119
|
+
"required": [
|
|
120
|
+
"command_or_evidence",
|
|
121
|
+
"red_capable",
|
|
122
|
+
"exact_symptom_or_behavior",
|
|
123
|
+
"deterministic",
|
|
124
|
+
"expected_runtime",
|
|
125
|
+
"agent_runnable"
|
|
126
|
+
],
|
|
127
|
+
"additionalProperties": true,
|
|
128
|
+
"properties": {
|
|
129
|
+
"command_or_evidence": {
|
|
130
|
+
"type": "string",
|
|
131
|
+
"minLength": 1
|
|
132
|
+
},
|
|
133
|
+
"red_capable": {
|
|
134
|
+
"type": "string",
|
|
135
|
+
"enum": ["yes", "no", "not_applicable"]
|
|
136
|
+
},
|
|
137
|
+
"exact_symptom_or_behavior": {
|
|
138
|
+
"type": "string",
|
|
139
|
+
"minLength": 1
|
|
140
|
+
},
|
|
141
|
+
"deterministic": {
|
|
142
|
+
"type": "string",
|
|
143
|
+
"enum": ["yes", "no"]
|
|
144
|
+
},
|
|
145
|
+
"expected_runtime": {
|
|
146
|
+
"type": "string",
|
|
147
|
+
"minLength": 1
|
|
148
|
+
},
|
|
149
|
+
"agent_runnable": {
|
|
150
|
+
"type": "string",
|
|
151
|
+
"enum": ["yes", "no"]
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
},
|
|
117
155
|
"supervisor_checkpoints": {
|
|
118
156
|
"$ref": "#/$defs/stringList"
|
|
119
157
|
},
|