@desplega.ai/agent-swarm 1.92.0 → 1.92.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/README.md +1 -1
  2. package/openapi.json +276 -3
  3. package/package.json +6 -6
  4. package/plugin/skills/pages/SKILL.md +5 -2
  5. package/src/be/db.ts +416 -20
  6. package/src/be/memory/boot-reembed.ts +85 -0
  7. package/src/be/memory/constants.ts +44 -2
  8. package/src/be/memory/providers/openai-embedding.ts +15 -5
  9. package/src/be/memory/providers/sqlite-store.ts +325 -76
  10. package/src/be/memory/reranker.ts +35 -17
  11. package/src/be/memory/types.ts +43 -0
  12. package/src/be/migrations/084_script_run_journal_duration.sql +5 -0
  13. package/src/be/migrations/085_script_runs_kind.sql +9 -0
  14. package/src/be/migrations/086_pages_default_authed.sql +64 -0
  15. package/src/be/migrations/087_skill_files.sql +19 -0
  16. package/src/be/modelsdev-cache.json +5622 -2543
  17. package/src/be/seed-scripts/catalog/boot-triage.ts +221 -0
  18. package/src/be/seed-scripts/catalog/catalog-report.ts +457 -0
  19. package/src/be/seed-scripts/catalog/compound-insights.ts +465 -0
  20. package/src/be/seed-scripts/catalog/gh-pr-snapshot.ts +1 -1
  21. package/src/be/seed-scripts/catalog/memory-eval.ts +1059 -0
  22. package/src/be/seed-scripts/catalog/ops-catalog-audit.ts +34 -439
  23. package/src/be/seed-scripts/catalog/schedule-health.ts +78 -2
  24. package/src/be/seed-scripts/catalog/task-failure-audit.ts +48 -1
  25. package/src/be/seed-scripts/index.ts +32 -4
  26. package/src/be/seed-skills/index.ts +0 -7
  27. package/src/be/skill-sync.ts +91 -7
  28. package/src/commands/runner.ts +6 -2
  29. package/src/heartbeat/templates.ts +20 -16
  30. package/src/http/index.ts +50 -7
  31. package/src/http/mcp-user.ts +23 -0
  32. package/src/http/mcp.ts +58 -0
  33. package/src/http/memory.ts +62 -0
  34. package/src/http/pages.ts +1 -1
  35. package/src/http/script-runs.ts +2 -0
  36. package/src/http/scripts.ts +39 -2
  37. package/src/http/skills.ts +225 -0
  38. package/src/providers/claude-adapter.ts +56 -24
  39. package/src/script-workflows/workflow-ctx.ts +7 -3
  40. package/src/scripts-runtime/sdk-allowlist.ts +1 -0
  41. package/src/scripts-runtime/swarm-sdk.ts +13 -0
  42. package/src/scripts-runtime/types/stdlib.d.ts +1 -0
  43. package/src/scripts-runtime/types/swarm-sdk.d.ts +1 -0
  44. package/src/server.ts +2 -0
  45. package/src/tasks/worker-follow-up.ts +12 -0
  46. package/src/tests/claude-adapter-binary.test.ts +135 -81
  47. package/src/tests/create-page-tool.test.ts +19 -2
  48. package/src/tests/heartbeat-checklist.test.ts +36 -0
  49. package/src/tests/mcp-transport-gc.test.ts +58 -0
  50. package/src/tests/memory-e2e.test.ts +6 -6
  51. package/src/tests/memory-health-endpoint.test.ts +78 -0
  52. package/src/tests/memory-rater-e2e.test.ts +4 -5
  53. package/src/tests/memory-reranker.test.ts +135 -124
  54. package/src/tests/memory-store.test.ts +221 -1
  55. package/src/tests/memory.test.ts +13 -12
  56. package/src/tests/pages-http.test.ts +20 -2
  57. package/src/tests/pages-storage.test.ts +26 -0
  58. package/src/tests/scripts-mcp-e2e.test.ts +53 -0
  59. package/src/tests/seed-scripts.test.ts +328 -3
  60. package/src/tests/skill-files-http.test.ts +171 -0
  61. package/src/tests/skill-files.test.ts +162 -0
  62. package/src/tests/skill-get-file-tool.test.ts +110 -0
  63. package/src/tests/skill-sync.test.ts +125 -6
  64. package/src/tests/task-cascade-fail.test.ts +304 -0
  65. package/src/tools/create-page.ts +2 -2
  66. package/src/tools/skills/index.ts +1 -0
  67. package/src/tools/skills/skill-get-file.ts +80 -0
  68. package/src/tools/tool-config.ts +2 -1
  69. package/src/types.ts +20 -0
  70. package/src/utils/internal-ai/complete-structured.ts +2 -2
  71. package/templates/schedules/daily-blocker-digest/content.md +68 -54
  72. package/templates/schedules/daily-compounding-reflection/content.md +4 -4
  73. package/templates/schedules/daily-hn-briefing/content.md +5 -5
  74. package/templates/schedules/daily-workflow-health-audit/content.md +6 -6
  75. package/templates/schedules/gtm-weekly-review/content.md +9 -9
  76. package/templates/schedules/weekly-dependabot-triage/content.md +24 -20
  77. package/templates/skills/agentmail-sending/content.md +6 -7
  78. package/templates/skills/desloppify/content.md +8 -9
  79. package/templates/skills/jira-interaction/content.md +25 -33
  80. package/templates/skills/kapso-whatsapp/content.md +29 -30
  81. package/templates/skills/linear-interaction/content.md +8 -9
  82. package/templates/skills/profile-corruption-escalation/content.md +44 -85
  83. package/templates/skills/sprite-cli/content.md +4 -5
  84. package/templates/skills/turso-interaction/content.md +14 -17
  85. package/templates/skills/workflow-iterate/content.md +38 -391
  86. package/templates/skills/x-api-interactions/content.md +4 -6
  87. package/templates/workflows/llm-safe-release-context/config.json +13 -0
  88. package/templates/workflows/llm-safe-release-context/content.md +69 -0
  89. package/templates/skills/scheduled-task-resilience/config.json +0 -14
  90. package/templates/skills/scheduled-task-resilience/content.md +0 -95
@@ -29,7 +29,7 @@ The org token is stored in swarm config as `SPRITES_API_KEY` (global, secret). S
29
29
  sprite auth setup --token "$SPRITES_API_KEY"
30
30
  ```
31
31
 
32
- Token format is `<org>/<account_id>/<token_id>/<secret>`. The first segment is the org (`desplega`).
32
+ Token format is `<org>/<account_id>/<token_id>/<secret>`. The first segment is your organization slug.
33
33
 
34
34
  ## Core commands
35
35
 
@@ -45,9 +45,9 @@ Token format is `<org>/<account_id>/<token_id>/<secret>`. The first segment is t
45
45
 
46
46
  Flags: every command takes `-s <sprite>` (or sets a default via `sprite use`). `-o <org>` overrides the org if you have multiple.
47
47
 
48
- ## Sandbox baseline (as of 2026-05-07)
48
+ ## Sandbox baseline
49
49
 
50
- - Ubuntu 25.10 (Questing Quokka), kernel 6.12.x-fly (firecracker microVM)
50
+ - Ubuntu-based microVM image (exact version may change over time)
51
51
  - Non-root `sprite` user (uid 1001), passwordless `sudo`
52
52
  - No docker, no `/var/run/docker.sock`, no container runtime preinstalled
53
53
  - No systemd / init — `service` and `systemctl` won't work; start daemons manually with `sudo <daemon> &` or `nohup`
@@ -105,7 +105,7 @@ trap "sprite destroy test-$$ --force" EXIT INT TERM
105
105
  # … work …
106
106
  ```
107
107
 
108
- If a script crashes, `sprite list` shows what's still up. Sweep with `sprite list -o desplega` and destroy anything you don't recognize.
108
+ If a script crashes, `sprite list` shows what's still up. Sweep with `sprite list -o <org>` and destroy anything you don't recognize.
109
109
 
110
110
  ## Setup script snippet
111
111
 
@@ -130,4 +130,3 @@ grep -q '/.local/bin' "$HOME/.bashrc" 2>/dev/null || echo 'export PATH="$HOME/.l
130
130
  - **Daemon survival:** `sprite exec` runs in a fresh subshell. A daemon backgrounded with `&` survives the exec call (sprites use a shared init), but if you want to be safe use `nohup`. To stop it, `sprite exec -s … -- sudo pkill <daemon>`.
131
131
  - **Port forwarding:** ports inside the sprite are not exposed to your swarm container. Curl from *inside* the sprite (`sprite exec -s … -- curl localhost:5432`) — not from your container.
132
132
  - **Don't leak secrets:** treat the sprite as an untrusted host. Don't put production tokens in there.
133
-
@@ -13,17 +13,17 @@ Using the **platform JWT against `/v2/pipeline` returns `HTTP 401 "invalid JWT t
13
13
 
14
14
  The platform JWT *can*, however, **mint** a DB token for any DB (see "Mint a DB token via API" below) — that's how you bootstrap access to a DB whose token isn't stored in config.
15
15
 
16
- ## Swarm config inventory (current)
16
+ ## Swarm config inventory
17
17
 
18
- Always fetch with `get-config includeSecrets=true`. As of 2026-05-12:
18
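+ Always fetch with `get-config includeSecrets=true` and adapt the key list to your deployment. The shell examples in this skill also assume `TURSO_ORG` is set to your Turso organization slug (it is not one of the keys below — export it yourself before running them):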
19
19
 
20
20
  | Key | Plane | Scope | Notes |
21
21
  |---|---|---|---|
22
- | `TURSO_API_TOKEN` | Platform | global | Clerk JWT. Expires ~weekly. When expired, daily-blocker-digest surfaces it and Taras refreshes via Turso dashboard. |
22
+ | `TURSO_API_TOKEN` | Platform | global | Clerk JWT. Expires periodically. When expired, surface it to the configured token owner. |
23
23
  | `TURSO_DB_TOKEN` | Data (content-state) | global | EdDSA, non-expiring. Used with `TURSO_DB_URL` for `/v2/pipeline`. |
24
- | `TURSO_DB_URL` | Data (content-state) | global | `https://content-state-desplega.aws-eu-west-1.turso.io` (HTTPS form — required for HTTP API). |
24
+ | `TURSO_DB_URL` | Data | global | `https://<db-name>-<org>.aws-eu-west-1.turso.io` (HTTPS form — required for HTTP API). |
25
25
  | `TURSO_X_POSTS_DB_TOKEN` | Data (x-posts) | global | EdDSA, non-expiring. |
26
- | `TURSO_X_POSTS_DB_URL` | Data (x-posts) | global | `libsql://x-posts-desplega.aws-eu-west-1.turso.io` — **swap `libsql://` → `https://` before hitting `/v2/pipeline`**. |
26
+ | `TURSO_X_POSTS_DB_URL` | Data | global | `libsql://<db-name>-<org>.aws-eu-west-1.turso.io` — **swap `libsql://` → `https://` before hitting `/v2/pipeline`**. |
27
27
 
28
28
  `dummy-test-db` has no stored DB token. Mint one via the API on demand (recipe below).
29
29
 
@@ -62,7 +62,7 @@ When a DB has no stored token (e.g., `dummy-test-db`), mint one with the platfor
62
62
  ```bash
63
63
  DB=dummy-test-db
64
64
  DB_TOKEN=$(curl -s -X POST \
65
- "https://api.turso.tech/v1/organizations/desplega/databases/$DB/auth/tokens?authorization=read-only" \
65
+ "https://api.turso.tech/v1/organizations/$TURSO_ORG/databases/$DB/auth/tokens?authorization=read-only" \
66
66
  -H "Authorization: Bearer $TURSO_API_TOKEN" | jq -r '.jwt')
67
67
  ```
68
68
 
@@ -83,13 +83,13 @@ The CLI uses the **platform JWT**, not a DB token:
83
83
 
84
84
  ```bash
85
85
  turso config set token "$TURSO_API_TOKEN"
86
- turso org switch desplega
86
+ turso org switch "$TURSO_ORG"
87
87
  turso db list # verify
88
88
  ```
89
89
 
90
90
  Do NOT use `turso auth login` — it needs a browser. Always feed the config token in.
91
91
 
92
- If `turso db list` returns 401, the platform JWT has expired — refresh `TURSO_API_TOKEN` in swarm config (Taras owns this; surface via blocker digest).
92
+ If `turso db list` returns 401, the platform JWT has expired — refresh `TURSO_API_TOKEN` in swarm config or ask the configured token owner.
93
93
 
94
94
  ## CLI database operations
95
95
 
@@ -130,13 +130,11 @@ https://<db-name>-<org>.aws-eu-west-1.turso.io # for HTTP API /v2/pipeline
130
130
 
131
131
  Same host, two schemes. Some config keys store the libsql form, some the https form — normalize before use.
132
132
 
133
- ## Key databases (org = desplega)
133
+ ## Key databases
134
134
 
135
135
  | Database | HTTPS URL | Token config key | Used by |
136
136
  |---|---|---|---|
137
- | `content-state` | `https://content-state-desplega.aws-eu-west-1.turso.io` | `TURSO_DB_TOKEN` | Content workflows (`content_history`, `image_prompt_history`, `refresh_history`, `repo_patterns`, `workflow_executions`) |
138
- | `x-posts` | `https://x-posts-desplega.aws-eu-west-1.turso.io` | `TURSO_X_POSTS_DB_TOKEN` | X/Twitter post tracking + meme cooldown (`posts` table) |
139
- | `dummy-test-db` | `https://dummy-test-db-desplega.aws-eu-west-1.turso.io` | — (mint via API) | Test fixture (`users` table) |
137
+ | `<db-name>` | `https://<db-name>-<org>.aws-eu-west-1.turso.io` | `<TOKEN_CONFIG_KEY>` | `<workflows / tables that use this DB>` |
140
138
 
141
139
  ## Groups
142
140
 
@@ -160,7 +158,7 @@ curl -sSfL https://get.tur.so/install.sh | bash
160
158
  export PATH="$HOME/.turso:$PATH"
161
159
  # Fetch TURSO_API_TOKEN via get-config includeSecrets=true
162
160
  turso config set token "$TURSO_API_TOKEN"
163
- turso org switch desplega
161
+ turso org switch "$TURSO_ORG"
164
162
  turso db list
165
163
  ```
166
164
 
@@ -169,10 +167,10 @@ turso db list
169
167
  ```bash
170
168
  # Platform plane
171
169
  curl -s -H "Authorization: Bearer $TURSO_API_TOKEN" \
172
- https://api.turso.tech/v1/organizations/desplega/databases | jq '[.databases[].Name]'
170
+ "https://api.turso.tech/v1/organizations/$TURSO_ORG/databases" | jq '[.databases[].Name]'
173
171
 
174
172
  # Data plane — one /v2/pipeline call per DB
175
- for pair in "$TURSO_DB_URL|$TURSO_DB_TOKEN" "https://x-posts-desplega.aws-eu-west-1.turso.io|$TURSO_X_POSTS_DB_TOKEN"; do
173
+ for pair in "$TURSO_DB_URL|$TURSO_DB_TOKEN" "${TURSO_X_POSTS_DB_URL/#libsql:/https:}|$TURSO_X_POSTS_DB_TOKEN"; do
176
174
  url="${pair%|*}"; tok="${pair#*|}"
177
175
  curl -s -X POST "$url/v2/pipeline" -H "Authorization: Bearer $tok" \
178
176
  -H "Content-Type: application/json" \
@@ -185,8 +183,7 @@ If either plane returns 401, treat as a blocker — surface in HEARTBEAT.md, do
185
183
 
186
184
  ## When tokens expire / get rotated
187
185
 
188
- - **`TURSO_API_TOKEN` expired** → CLI breaks, can't mint new DB tokens, `api.turso.tech` returns 401. Existing DB tokens keep working (data plane is independent). Action: Taras rotates via Turso dashboard, updates config.
186
+ - **`TURSO_API_TOKEN` expired** → CLI breaks, can't mint new DB tokens, `api.turso.tech` returns 401. Existing DB tokens keep working (data plane is independent). Action: the token owner rotates via Turso dashboard and updates config.
189
187
  - **A DB token expired/revoked** → that specific DB returns 401 on `/v2/pipeline`. Other DBs unaffected. Action: mint a new one (CLI or platform API), update the corresponding config key.
190
188
 
191
189
  Don't conflate the two failure modes. The blocker-digest writer should name the exact key that needs rotation.
192
-
@@ -1,399 +1,46 @@
1
- # Workflow Iterate Skill
1
+ # Workflow Iteration
2
2
 
3
- The unified playbook for safely iterating on `agent-swarm` workflows: read-diagnose-patch-verify-trigger-watch, plus the hard-won gotchas that have caused silent failures, halted runs, and stranded PR stacks. Compiled from real production incidents across multiple swarm workflows.
3
+ Use this skill when you need to change an existing workflow without breaking live runs. The goal is to make small, verified revisions: inspect the current workflow, diagnose the failing step, patch only the required node or edge, trigger a realistic run, and keep iterating until the run reaches the intended terminal state.
4
4
 
5
- This skill replaces `agent-swarm-workflow-author-gotchas` — the three gotchas previously documented there are now folded in as §3.1, §8, and §10.
5
+ ## Core Loop
6
6
 
7
- > **Companion skill — `workflow-structured-output` (worker-side).** This skill is for workflow *authors* (people running `patch-workflow-node` / `create-workflow`). The companion skill `workflow-structured-output` is for the *workers* that get spawned by `agent-task` nodes: it tells them how to format `store-progress.output` as a stringified JSON matching the node's `outputSchema`. The two skills together describe both sides of the author/worker contract — if you set an `outputSchema` on an `agent-task` here, the assigned worker MUST follow `workflow-structured-output` or the task will silently fail validation. See §3.2 for the cross-reference at the contract boundary.
7
+ 1. Read the workflow before changing it. Capture the current version, node IDs, inputs, config, and downstream dependencies.
8
+ 2. Diagnose from a real run when possible. Inspect the failed step's recorded input and output; those fields show exactly what the executor saw.
9
+ 3. Patch one concern at a time. Prefer a node-level patch over replacing the whole workflow.
10
+ 4. Re-read after the patch. Confirm the version changed and the resulting config matches what you intended.
11
+ 5. Trigger with a realistic payload. Include the fields downstream nodes expect, not only the field you are testing.
12
+ 6. Watch the run to terminal state. If it fails, use that run as the next diagnostic input.
13
+ 7. Mirror the verified change into your workflows-as-code source, if your deployment uses one.
8
14
 
9
- ## 1. Mental model
15
+ ## Authoring Rules
10
16
 
11
- A workflow is a DAG of typed nodes. Each node has:
17
+ - Keep node IDs stable. Other nodes may reference them by exact string path.
18
+ - Treat `config` as replacement-prone. When a patch touches `config`, send the full config object for that node unless your workflow API explicitly deep-merges nested fields.
19
+ - Make routing explicit. Branching nodes should have named pass/fail routes, and silent skip paths should still produce an observable outcome when operators need to know what happened.
20
+ - Wire inputs deliberately. Template-rendering nodes usually read from their `inputs` aliases, while condition/gate nodes may resolve paths against the raw workflow context. Verify the executor's behavior before relying on aliases in a condition.
21
+ - Keep schemas tight. If an agent-task has an `outputSchema`, include the expected JSON shape in the task prompt and route it to a worker/provider that is known to return structured output correctly.
22
+ - Prefer reusable script nodes for deterministic shared logic. Agent tasks are best for judgment, investigation, or work that genuinely needs an LLM.
23
+ - Scope parallel branches so they do not overwrite one another. Fan-out tasks should have separate context keys or branch-specific output fields.
24
+ - Make retry paths idempotent. A rerun should detect existing artifacts, comments, PRs, or notifications and update or skip them rather than duplicating work.
12
25
 
13
- | Field | Purpose |
14
- |---|---|
15
- | `id` | stable name used by `inputs` and `next` |
16
- | `type` | `agent-task` \| `script` \| `swarm-script` \| `raw-llm` \| `validate` \| `property-match` \| `code-match` \| `notify` \| `vcs` \| `human-in-the-loop` \| `wait` |
17
- | `config` | type-specific config (template, conditions, etc.) |
18
- | `inputs` | `{localName: sourceRef}` — wires upstream outputs to this node's `{{interpolation}}` scope **for `agent-task` and `raw-llm` only — see §3.1 for the property-match exception** |
19
- | `outputSchema` | JSON Schema validated against this node's output before downstream nodes run |
20
- | `next` | next node id (string) OR port map `{pass, fail}` for branching nodes |
26
+ ## Common Failure Patterns
21
27
 
22
- Node execution modes: **instant** executors (`property-match`, `code-match`, `notify`, `raw-llm`, `script`, `swarm-script`, `vcs`, `validate`) run synchronously in the engine — no worker is spawned. **async** executors (`agent-task`, `human-in-the-loop`, `wait`) suspend the run while an external actor (a worker, a human, or a timer) does its part. The §3.2 `config.agentId` routing gotchas apply ONLY to async `agent-task` nodes — instant nodes never pool-route.
23
-
24
- `inputs` source refs:
25
- - `"<previousNodeId>"` — output of an upstream node, accessed as `{{localName.field}}` in agent-task templates
26
- - `"<previousNodeId>.<sub.path>"` — sub-path source ref (also only honored by template-rendering executors)
27
- - `"trigger.<key>"` — value from the trigger payload, e.g. `"trigger.slackChannel"`
28
-
29
- ## 2. The iteration loop (always-on)
30
-
31
- 1. **READ** — `get-workflow` for current state and version. Never patch from memory.
32
- 2. **DIAGNOSE** — if a run failed or routed wrong, `get-workflow-run` and inspect both `input` AND `output` of the offending step. The recorded `input` is the EXACT scope the executor saw. If a field you expected isn't there, your wiring is wrong.
33
- 3. **PATCH** — prefer `patch-workflow-node` (partial, surgical, version-checked) over `update-workflow` (full replace). One concern per patch — easier to roll back. **But pass the FULL `config` object — see §4 shallow-merge gotcha.**
34
- 4. **VERIFY** — re-read the workflow. Confirm version bumped and the diff is what you intended.
35
- 5. **TRIGGER** — `trigger-workflow` with the *full* trigger payload (see §6). Capture the run ID.
36
- 6. **WATCH** — `get-workflow-run` until terminal. If it fails again, jump back to step 2.
37
- 7. **POST-FIX RE-TRIGGER** — if the patch resolved a halted run, you MUST re-trigger that halted state in the same session. See §10.
38
- 8. **SYNC TO REPO** — once the change is verified working, mirror it into your fleet's workflows-as-code repo (if one exists). See §13.
39
-
40
- ## 3. Known node schemas
41
-
42
- ### 3.1. `property-match` — the gate node (READ THIS CAREFULLY)
43
-
44
- ```json
45
- {
46
- "type": "property-match",
47
- "config": {
48
- "conditions": [
49
- { "field": "<upstream-node-id>.taskOutput.<key>", "op": "eq", "value": "fix" }
50
- ]
51
- },
52
- "inputs": {},
53
- "next": { "pass": "<next-on-match>", "fail": "<next-on-no-match>" }
54
- }
55
- ```
56
-
57
- `op` enum: `eq` | `neq` | `contains` | `not_contains` | `gt` | `lt` | `exists`.
58
-
59
- **🔴 CRITICAL — property-match `inputs` is IGNORED.** The `field` path is resolved against the **full raw workflow context**, where top-level keys are workflow node IDs (plus `trigger`). It does NOT use the `inputs`-mapped scope.
60
-
61
- Empirical proof: with a recorded step `input` of
62
- ```
63
- { trigger: {...}, fetch-top-errors: {...}, research-and-score: {...}, await-error-pick: { taskOutput: { decision: "fix" } } }
64
- ```
65
- and `inputs: { decision: "await-error-pick.taskOutput.decision" }` and `field: "decision"`, the gate evaluated `decision === undefined` → `passed:false` — even though `await-error-pick.taskOutput.decision` was `"fix"`.
66
-
67
- **Root cause:** the property-match executor calls `resolvePath(context, cond.field)` against the raw global ctx, bypassing the inputs alias. Template interpolation (`{{review.taskOutput.verdict}}`) honors the alias map; `property-match` does not.
68
-
69
- **The working pattern:** drop the `inputs` mapping, use the literal node-id-prefixed path:
70
-
71
- ```json
72
- "config": { "conditions": [{ "field": "await-error-pick.taskOutput.decision", "op": "eq", "value": "fix" }] },
73
- "inputs": {}
74
- ```
75
-
76
- For node IDs containing characters that the dot-path resolver can't tokenize cleanly, quote the key:
77
-
78
- ```json
79
- "field": "[\"review-task-output\"].taskOutput.verdict"
80
- ```
81
-
82
- Hyphens in node IDs DO work in plain dotted paths — the resolver tokenizes on `.`, not `-`. Reach for the bracket-quoted form only when you actually hit a resolution problem.
83
-
84
- **WRONG shapes / paths seen in the wild (do not use):**
85
- - `config.input` + `config.expected` — never existed
86
- - `config.conditions: [{ input, expected }]` — guessed shape, runtime rejects
87
- - `field: "<localAlias>.<...>"` with the alias defined in `inputs` — alias is silently ignored
88
- - `field: "<localAlias>.taskOutput.<...>"` ditto
89
-
90
- Routing uses ports — the `next` field MUST be `{pass, fail}`, not a string. Port label on the failing edge is `"false"` (not `"fail"`) — visible in `nextPort` on the step record.
91
-
92
- **TODO (executor fix):** build `interpolationCtx` and pass it as 2nd arg to `executor.run`, OR resolve `field` through the alias map. Until that lands, treat this as a permanent gotcha.
93
-
94
- ### 3.2. `agent-task`
95
-
96
- ```json
97
- {
98
- "type": "agent-task",
99
- "config": {
100
- "agentId": "<uuid-of-claude-harness-worker>",
101
- "agentName": "Researcher",
102
- "template": "Investigate {{ctx.error.title}} ...",
103
- "outputSchema": { "...": "..." }
104
- },
105
- "inputs": { "ctx": "<upstream>" }
106
- }
107
- ```
108
-
109
- Every `{{interpolation}}` token must resolve to a key in `inputs` or `trigger.*`. Unresolved → empty string at runtime → silent bugs downstream.
110
-
111
- For `agent-task` the `inputs` mapping IS honored — including sub-path source refs like `"upstream.taskOutput.field"`.
112
-
113
- **🔴 PIN `config.agentId` IF THE TEMPLATE IS CLAUDE-SHAPED.** When the node's `template` involves multi-tool reasoning, `bun`/`gh`/`docker` calls, structured `outputSchema`, or Slack-aware output relay, you MUST pin `config.agentId` to a claude-harness worker:
114
- - Picateclas `38d36438-58a0-45b5-8602-a5d52b07c2f1` — routine implementation
115
- - Jackknife `c06cca59-187e-4aa6-8472-8ac6caf177af` — forward-deployed work
116
- - Lead `d454d1a5-4df9-49bd-8a89-e58d6a657dc3` — Slack-posting nodes (lead-only privilege)
117
- - Reviewer `a09d19a4-bd35-4593-9b6f-c2ccafccead8` — review-shaped tasks
118
- - Researcher `16990304-76e4-4017-b991-f3e37b34cf73` — research-shaped tasks
119
-
120
- NEVER pin a claude-shaped node to opencode-harness workers (Content Reviewer `fc637423`, Discoverability Optimizer `202b1a2e`, Tester `201b92d8` when on opencode/qwen, Content Strategist `7f95f57e` when on opencode). They instant-fail with `"opencode session error"` or silent-complete with stub output (e.g. `slack_ts: "unavailable_no_slack_context"`).
121
-
122
- **🔴 ALSO NEVER pin an `outputSchema`-bearing node to a pi-harness worker (added 2026-05-25, Lead Rule #17).** The same prohibition applies to Content Writer `322999d8`, Content Strategist `7f95f57e` (when on pi), UX Principles Agent `22d30bc3` (when on pi), and any other pi-harness worker. They intermittently fail with `"Structured output required by outputSchema but not provided via store-progress"` because the pi harness does not reliably surface the `workflow-structured-output` skill at task-start — even when the schema is embedded in the template prompt. Confirmed 2026-05-25 cluster: `docs-site-releases` `plan-release` (7f95f57e) + `write-release` (322999d8) both halted; same pattern previously hit DES-458 litmus, weekly-perf-review peer-signal-fanout, and `how-to-generator-with-schema` create-pr. **For ANY node with `outputSchema`, pin to a claude-harness worker (Picateclas/Jackknife/Researcher) or a codex worker (Reviewer)** — never pi, never opencode, never pool. See memory `pi-provider-structured-output-failure-pattern-2026-05-25`.
123
-
124
- If `config.agentId` is absent, the node pool-routes via `send-task` — and the dispatch races opencode/pi workers for the claim. This is the workflow-node analogue of Lead Rule #13 (schedules) — same root cause at a different layer. Three production incidents in 2 days (2026-05-17 `weekly-performance-review` `page-render` + `slack-roll-up`; 2026-05-18 `docs-site-releases-weekly` `litmus-approach` + `litmus-content`) — codified as a Pre-flight checklist item (§12) and shared memories `workflow-node-agentid-audit-2026-05-18` + `pi-provider-structured-output-failure-pattern-2026-05-25`.
125
-
126
- **🔗 `outputSchema` is the contract with the worker side.** When you set an `outputSchema` on an `agent-task`, the runner validates the worker's `store-progress.output` against it and rejects completions that don't parse as matching JSON — with failure reason `"Structured output required by outputSchema but not provided via store-progress"`. The worker MUST stringify a JSON object containing every `required` field with the exact key names. Workers handling these tasks should consult the companion skill `workflow-structured-output` — it's the worker-side counterpart of this skill and exists specifically to keep them from silently failing your gate. As an author, do three things:
127
- 1. **Write `outputSchema` tightly.** Include only keys you actually need to gate or interpolate downstream. Loose schemas invite "looks ok" outputs that miss your `field` paths.
128
- 2. **Embed the schema (or a clear "Output Format" JSON block) in the agent's `template` prompt.** Workers shouldn't have to reverse-engineer your schema — paste it in.
129
- 3. **Pin `config.agentId` to a claude/codex worker (per the prohibition above).** Even with a perfect schema-in-template, pi/opencode workers will silently fail it; the only durable fix is provider pinning.
130
-
131
- **🔴 PRIVILEGE-GATED TOOLS — do not embed `slack-post` / `slack-reply` / `slack-start-thread` calls in non-Lead `agent-task` templates.** These tools require lead privileges; a worker step that calls them fails with `"lead privileges required"`. The workflow may keep running on partial state (e.g. the worker returns `taskOutput` anyway) but the Slack notification is silently lost.
132
-
133
- **Pattern:** route Slack-receipt steps through Lead. Either (a) make the Slack-posting node an `agent-task` with `config.agentId: "d454d1a5-..."` (Lead), or (b) put the message in the worker's `output` and let the runner relay it via the task's Slack metadata.
134
-
135
- ### 3.3. `validate`
136
-
137
- Re-runs JSON Schema validation against an upstream output. Use it as a contract checkpoint before a costly node.
138
-
139
- ### 3.4. Sibling-task dispatch races on shared `contextKey` (added 2026-05-20)
140
-
141
- **Symptom:** Worker tasks fail with `progress: null`, `failureReason: null` OR `failureReason: "Superseded by newer workflow task <id> in the same context"`, and a task description prefixed with `<sibling_tasks_in_progress>` referencing a `contextKey: task:workflow:<workflowId>:<runId>`. Two or more sibling tasks appear in `get-tasks` with adjacent (within seconds) `lastUpdatedAt` timestamps across different agents — but for the SAME workflow run.
142
-
143
- **Real examples:**
144
- 1. **2026-05-19 08:20 UTC, workflow `33d00f44-...` blog pipeline.** Picateclas task `5379fb9e` and Reviewer task `b91a7db2` both spawned from the same node, both received a `<sibling_tasks_in_progress>` preamble saying "user has submitted new input while sibling were running," and both insta-aborted with `progress: null`. Worker never started executing. Lead audit on 2026-05-20 found these clustered with the same shape — see shared memory `sibling-tasks-in-progress-pattern-2026-05-20`.
145
- 2. **2026-05-27 — unified-daily-blog + agent-swarm-blog fan-out self-supersession (THE FAN-OUT VARIANT).** Both blog workflows fanned out pillar-assembly `agent-task` nodes (foundation / level-up / vibe) into the SAME `contextKey: task:workflow:<runId>`. Every assembly worker saw siblings in its preamble and **self-cancelled the older ones** to "avoid creating two overlapping PRs." Only the LAST-dispatched assembly survived, carrying whatever draft was in context → exactly 1 PR per run + pillar-label mismatch (Vibe slot received Foundation content) + empty slots for the remaining 2 pillars. Evidence — run `8108612c`: Foundation draft `d3a074a6` → Level-Up assembly `719a07e3` (parentTaskId=d3a074a6) superseded by Vibe assembly `4c5953ff`; both failed with `failureReason="Superseded by newer workflow assembly tasks in the same context"`. Same shape hit agent-swarm-blog run `727fbd00` (`bba1d38c` superseded by `94fe75c2`). **These failures are NOT lost work** — the run still completes and ships the surviving PR. Do NOT re-dispatch superseded assembly tasks. Do NOT treat as a provider-health issue (Lead Rule #16) — the failure is a deliberate self-supersession, not a codex/opencode session error. See shared memory `daily-blog-empty-slots-sibling-context-supersession-2026-05-27`.
146
-
147
- **Root cause (workflow-design side):** the node(s) allowed concurrent dispatch on the same `contextKey`. The runner injected a "by the way, here's the input you didn't expect" preamble into each spawn — and the worker bails because it can't reason about "user submitted mid-run input." Per the 2026-05-27 confirmation, this also happens when the workflow itself fans out pillar assemblies into the shared workflow-run context (no user input involved).
148
-
149
- **The working pattern (author-side):**
150
-
151
- 1. **Audit any `agent-task` node that accepts user-modifiable input mid-run** (Slack reply, threadTs follow-up, ack/approve buttons), AND any node that **fans out N parallel `agent-task` children sharing the workflow-run `contextKey`** (per the 2026-05-27 fan-out variant — pillar assemblies, peer-signal fanout, multi-branch generation). Both shapes have the race.
152
- 2. **Enforce single-instance-per-contextKey at the dispatcher.** The runner supports this via per-node config — check your fleet's runner docs for the exact field. For a quick fix, gate the node behind a `property-match` that checks `runState.<nodeId>.inFlight === false` before fanning out.
153
- 3. **Give each fan-out child its OWN `contextKey`.** This is the durable fan-out fix: instead of every pillar/branch worker seeing every sibling, scope the dispatch key per branch (e.g. `task:workflow:<runId>:pillar:<name>`). Per the 2026-05-27 memo, this is the recommended fix direction for the daily-blog supersession pattern.
154
- 4. **Make the worker template idempotent.** If a re-spawn happens, the worker should detect the existing run (via `kv-get` on a contextKey-derived key) and abort gracefully — not bail on the runner's preamble.
155
- 5. **Never let two workers from the same `contextKey` write to the same `outputSchema` field** — last-write-wins corrupts the gate downstream.
156
-
157
- **Detection (Lead-side audit):** when the daily-evolution failure audit finds a cluster of `failed` tasks with `<sibling_tasks_in_progress>` in `task` AND (`failureReason: null` OR `failureReason` starting with `"Superseded by newer workflow"`) AND/OR `progress: null`, treat as a *workflow dispatcher race*, NOT a worker failure, NOT a provider-health issue. Don't waste evolution cycles on the worker's SOUL/IDENTITY. Fix the workflow node config or escalate to the runner team for `failureReason: "sibling_task_dispatch_collision"` synthesis.
158
-
159
- **Cross-reference:** Lead Rule #17 (failure-reason-null + sibling-task cluster detection). See also shared memory `failure-reason-null-epidemic-2026-05-20` for the parallel observation that 15/15 swarm-wide fails over 7d had `failureReason: null` — sibling-task aborts contribute heavily to this gap.
160
-
161
- ### 3.5. `swarm-script` — run a catalog script deterministically (no agent, no LLM) (added 2026-05-20)
162
-
163
- Runs a stored script from the swarm script catalog inline. It is an **instant-mode** executor: there is NO agent or worker in the loop. Unlike `agent-task`, it does NOT pool-route and you do NOT pin `config.agentId` — the entire §3.2 routing gotcha simply does not apply. It executes through the exact same runtime path as the `script_run` MCP tool (`runScript()` in `src/scripts-runtime/loader`), so a script that works under `script_run` works identically as a node.
164
-
165
- ```json
166
- {
167
- "id": "list-prs",
168
- "type": "swarm-script",
169
- "label": "List open PRs",
170
- "config": {
171
- "scriptName": "github-list-open-prs",
172
- "scope": "global",
173
- "pinHash": "b7a0...",
174
- "args": { "repo": "{{trigger.repo}}", "limit": 5 },
175
- "fsMode": "none"
176
- }
177
- }
178
- ```
179
-
180
- Config fields:
181
- - `scriptName` **(required)** — catalog script name. Discover candidates with the `script_search` MCP tool (semantic search — pass an empty `query` to list everything); manage the catalog with `script_upsert` / `script_delete`.
182
- - `scope` (optional) — `agent` | `global`. Defaults to the creator's agent scope, then falls back to `global`. **Omit it unless you must force one** — a wrong *explicit* scope fails resolution.
183
- - `pinHash` (optional) — pin to a specific `script_versions` content hash for reproducibility. Omit to always run the latest version.
184
- - `args` (optional) — JSON object passed to the script. Values support `{{interpolation}}` from `inputs` / `trigger`.
185
- - `fsMode` (optional) — `none` (default). `workspace-rw` is v2-only and fails clearly on a v1 runtime.
186
-
187
- Output — written to context key `<nodeId>` and to the step's `output`:
188
- ```
189
- { result, stdout, stderr, truncated, durationMs, exitCode, scriptName, contentHash, version }
190
- ```
191
- **🔴 The script's RETURN VALUE lives under `result`** — NOT `taskOutput` (that's the `agent-task` shape; conflating them is the most common swarm-script wiring bug). To consume it downstream:
192
- - `agent-task` / `raw-llm`: `inputs: { prs: "list-prs.result" }` → `{{prs.prs}}`; or `inputs: { node: "list-prs" }` → `{{node.result.prs}}`.
193
- - `property-match`: literal node-id-prefixed path, e.g. `field: "list-prs.result.count"` (`inputs` is ignored — see §3.1).
194
-
195
- **`script` vs `swarm-script`:** `script` runs inline bash/ts/python embedded *in the workflow definition itself*; `swarm-script` runs a *reusable, versioned catalog* script by name. Prefer `swarm-script` for any logic shared across workflows — swapping `scriptName` (or `script_upsert`-ing a new script) gives you a new deterministic node with zero workflow-engine code changes.
196
-
197
- **Minimal demo** — a one-node workflow needs no trigger configured; run it on demand via `trigger-workflow`. Reference example: workflow `demo-swarm-script-node` (`73d71dd8-8099-49f8-aa8a-83434606366b`) — a single `swarm-script` node running `github-list-open-prs`, completes in ~400ms with `exitCode 0`.
198
-
199
- ## 4. Patching rules (and the shallow-merge gotcha)
200
-
201
- - **One concern per patch.** If you're changing a threshold AND a prompt AND a schema, do three patches. Easier diff, easier revert.
202
- - **Always re-read the workflow before patching.** `patch-workflow-node` carries a version snapshot; if the workflow advanced, your patch is rejected.
203
- - **🔴 `patch-workflow-node` SHALLOW-merges at the config level — pass the FULL `config` object every time.** Validated by 3 production incidents 2026-05-17 18:00–20:13 UTC on `weekly-performance-review` v5→v9: a partial `config: {agentId: "..."}` patch WIPED the entire `template` + `outputSchema` + `model` + `priority` + `tags` block on each affected node, requiring restoration patches (v7, v8) before the run could proceed. The config layer behavior contradicts what an intuitive "deep merge" would do — internally, the node-level merge is shallow (top-level keys replaced, not merged recursively). Schema is flat at config level: `{ template, outputSchema?, agentId?, agentName?, tags?, priority?, dir?, vcsRepo?, model? }` — there is NO `taskTemplate` wrapper. **To safely change one config field, always read the existing config first (via `get-workflow`), then pass the FULL config object with that one field changed.** See shared memory `patch-workflow-node-shallow-merge-gotcha-2026-05-17`.
204
- - **Bump the node `version` explicitly. Do NOT reuse a version number.**
205
- - **Prefer adding over rewriting.** If you can add a step that fixes the issue (e.g. an extra Slack post for transparency) without touching existing nodes, do that.
206
- - **When patching property-match, set `inputs: {}` explicitly.** A leftover `inputs` mapping confuses readers (and yourself, three patches later) into thinking the node uses it.
207
-
208
- ## 5. Common failure modes (and how to recognize them)
209
-
210
- | Symptom | Real cause | Fix |
28
+ | Symptom | Likely Cause | Fix |
211
29
  |---|---|---|
212
- | Gate returns `passed:false` despite the value being correct upstream | `field` path is a local alias instead of a node-id-prefixed path; `inputs` is ignored by property-match | Use `field: "<upstream-node-id>.taskOutput.<key>"`, drop `inputs` |
213
- | Run "stops at the gate", user sees nothing | `property-match` route fired correctly but skip path is silent | Add an always-on Slack post on the *decision-making* node, not on the gate (gates have no comms). |
214
- | Run fails on a `property-match` with cryptic schema error | Wrong `config` shape (`{input,expected}` instead of `{conditions:[{field,op,value}]}`) | See §3.1. |
215
- | `agent-task` step fails with `"opencode session error"` or `"Codex Exec exited"` within <15s of dispatch | Node's `config.agentId` is unpinned OR pointing at an opencode-harness worker, and the template is claude-shaped | Pin `config.agentId` to a claude-harness worker (Picateclas/Jackknife/Lead/Reviewer/Researcher). See §3.2. Pass FULL config (§4). |
216
- | `agent-task` step "completes" with stub output like `slack_ts: "unavailable_no_slack_context"` | Opencode worker silently completed a claude-shaped task (qa-use video, slack relay, gh CLI flow) | Same fix as above — pin to a claude-harness worker. |
217
- | Two adjacent tasks fail with identical `<sibling_tasks_in_progress>` preamble, `progress:null`, `failureReason:null`, same `contextKey` | Concurrent dispatch race on the same node — the workflow allowed a sibling spawn before the previous one completed | See §3.4. Audit node for single-instance-per-contextKey enforcement; make worker template idempotent. |
218
- | Fan-out shipped 1 PR instead of N expected (e.g. blog pipeline ships 1 of 3 pillars), other slots empty, pillar/branch label mismatched against content | Pillar/branch assembly fan-out shared one workflow-run `contextKey` → workers self-cancelled siblings to "avoid duplicate PR" — only the last-dispatched assembly survived | See §3.4 fan-out variant. Give each branch its own `contextKey` (e.g. `task:workflow:<runId>:pillar:<name>`). DO NOT re-dispatch superseded tasks — the run is not lost, just one survivor. See `daily-blog-empty-slots-sibling-context-supersession-2026-05-27`. |
219
- | `agent-task` step fails with `"Structured output required by outputSchema but not provided via store-progress"` within <15s, on a pi/opencode worker, no real work logged | Pi/opencode harness did not surface `workflow-structured-output` skill at task-start; worker silently emitted no JSON | **(added 2026-05-25)** Pin `config.agentId` to a claude/codex worker. See §3.2 pi-harness paragraph + Lead Rule #17 + memory `pi-provider-structured-output-failure-pattern-2026-05-25`. Embedding the schema in the template prompt is NOT sufficient for pi-harness workers. |
220
- | `agent-task` step fails with the same error but on a claude/codex worker | Worker called `store-progress` with plain-text or no `output` despite the node having an `outputSchema` | Worker side: see `workflow-structured-output` skill. Author side: confirm the schema is embedded in the template prompt so the worker can see it. |
221
- | `swarm-script` step fails with a script-resolution / "script not found" error | `scriptName` not in the catalog, or an explicit `scope` that doesn't match where the script actually lives | Verify the name with the `script_search` MCP tool; drop the explicit `scope` to let it fall back agent→global. See §3.5. |
222
- | Downstream node reads empty values from a `swarm-script` upstream | Wired to `.taskOutput.*` (agent-task shape) instead of `.result.*` | swarm-script return value is under `result` — use `field: "<node>.result.<key>"` or `inputs: { x: "<node>.result" }`. See §3.5. |
223
- | After `patch-workflow-node`, downstream nodes start failing with empty `template` / missing `outputSchema` / wrong model | Partial `config: {agentId}` (or any partial config) wiped the rest of the config — shallow-merge gotcha (§4) | Restore by patching with the FULL config object. |
224
- | Run fails mid-step with `"You have reached your specified API usage limits"` | Anthropic API quota exhausted on the agent's key — NOT a workflow bug | Wait for reset (date is in the error message) or switch credential key |
225
- | Threshold too strict — `> 75` blocks everything | Real-world scores from research nodes cluster in 40–80; strict `> 75` rejects realistic 70–80 confidence | Use `>= 70` and tie-break by rank |
226
- | `{{trigger.slackChannel}}` is empty → downstream Slack post crashes | Triggered without the Slack payload | Always pass `slackChannel`, `slackThreadTs`, `slackUserId` in the trigger payload for Slack-aware workflows |
227
- | Workflow apparently fine but user confused | A run from earlier with stale data is what they're looking at | `list-workflow-runs` to find which run produced what they're seeing |
228
- | Worker step's Slack post never lands; step still completes "successfully" with `output` containing the intended message | Worker called `slack-post`/`slack-reply` (lead-only) and silently fell through to returning script output | Move the Slack node onto a Lead `agent-task`, or use the runner's auto-relay of the worker's `output`. See §3.2. |
229
- | `gh pr list --base <branch> --state open` returns 0 right after a merge | GitHub auto-retarget races deletion (~5–30s window) | Pre-merge retarget — see §8. |
230
- | Halted-run loop has shipped a fix but the halt persists | Schedules don't retry past halts; they fire on their next cron tick with fresh inputs. The fix did NOT auto-rerun the halted iteration | Manually re-trigger in the same session, or set an explicit watch — see §10. |
231
-
232
- ## 6. Trigger payload — always send the full Slack context
233
-
234
- For any workflow that posts to Slack at any point:
235
-
236
- ```json
237
- {
238
- "slackChannel": "<channel-id>",
239
- "slackThreadTs": "<ts>",
240
- "slackUserId": "<user-id>",
241
- "...domain-specific keys..."
242
- }
243
- ```
244
-
245
- Even nodes that don't currently use these fields might tomorrow. Cheap to include, expensive to debug their absence.
246
-
247
- ## 7. Transparency pattern: never let a workflow silently stop
248
-
249
- A gate (`property-match`) has no comms. If the gate routes to "skip", the user sees nothing.
250
-
251
- **Rule:** every decision-making node (auto-selectors, scorers, anything that can route to a no-op branch) MUST post its full reasoning to the originating Slack thread *before* the gate evaluates — regardless of which branch will fire.
252
-
253
- Pattern: in the auto-selector `agent-task` (or `script`), append a Slack post to the template:
254
-
255
- ```
256
- *:bar_chart: Decision summary*
257
- [rank 1] <error> — *<conf>* / 100 — _<rootCause>_
258
- [rank 2] ...
259
- *High-confidence (>80) count:* <N>
260
- *Fix-eligible (≥70) count:* <M>
261
- *Decision:* :white_check_mark: FIX rank N (conf X) | :wave: SKIP — none ≥ 70
262
- ```
263
-
264
- This way the user always sees what was scored and why the workflow proceeded or skipped. No more "why did it stop?".
265
-
266
- **Privilege note:** if the decision node runs on a worker (not Lead), the Slack post must come from a downstream Lead step or be relayed via the worker's `output` — workers cannot call `slack-post`/`slack-reply`. See §3.2.
267
-
268
- ## 8. Pre-merge retarget for branch-deletion nodes
269
-
270
- **Symptom:** Workflow merges PR #N with `--delete-branch`. Open PRs that targeted the deleted branch are stranded — listed as 0 results (if the next iteration filters by `--state open --base <deleted>`) or pointing at a deleted base.
271
-
272
- **Root cause:** GitHub's auto-retarget is best-effort and races the API for ~5–30s after deletion. Any "list open PRs with `baseRefName = X`" query inside that window sees inconsistent state.
273
-
274
- **Pattern (validated in production PR-stack workflows):**
275
-
276
- Inside any merge node, BEFORE `gh pr merge --delete-branch`:
277
-
278
- 1. List all open PRs whose `baseRefName == soon-to-be-deleted-branch`.
279
- 2. For each: `gh pr edit <num> --base <new-target>` to retarget on the parent of the merging PR (typically `main`, or the next ancestor still open).
280
- 3. Verify each retarget returns success.
281
- 4. ONLY THEN call `gh pr merge --delete-branch`.
282
- 5. Optional safety net: re-poll dependents post-merge and retarget any that GitHub auto-retargeted to the wrong base.
283
- 6. Emit BOTH `preMergeRetargetedPRs` AND `postMergeRetargetedPRs` in the node's output schema for log greppability.
284
-
285
- **Where this applies:** any drain-the-stack workflow over GitHub PRs, GitLab MR stacks, Gerrit chains, or any pipeline that deletes a resource other nodes/PRs reference.
286
-
287
- ## 9. Cancelling and re-triggering
288
-
289
- - **Cancel** before re-triggering if a long run is still in flight: `cancel-workflow-run` with the run ID. Do NOT trigger a parallel one — runs share resources and Slack threads will get spammed by both.
290
- - **Re-trigger** with the *same* trigger payload that produced the original run (especially Slack context).
291
- - After a patch, document the version bump in the Slack reply so the user can correlate (`workflow v8 → v9: lowered threshold to ≥70`).
292
-
293
- ## 10. Post-fix re-trigger discipline (the "fix shipped but not re-triggered" anti-pattern)
294
-
295
- **Rule:** when you ship a fix to a scheduled or workflow-driven loop that previously halted on the bug you just fixed, the fix does NOT auto-rerun the halted iteration. You must do ONE of:
296
-
297
- (a) Explicitly call `trigger-workflow` with the parameters of the halted run, OR
298
- (b) Verify the next scheduled firing will re-attempt the halted state, AND that the buffer until then is acceptable, AND record an explicit watch.
299
-
300
- **Cautionary tale:** a merge-loop fix shipped at 17:58 UTC unblocked a stack of 5 stranded PRs. The Slack thread acknowledged the fix. **8h later the next blocker digest still showed the same 5 PRs** — no one re-triggered, the schedule fires on fresh inputs (not retries), and the fix sat idle for a full cycle.
301
-
302
- **Mandatory action after every workflow patch resolving a halted run** — in the SAME session that confirms the patch is live:
303
-
304
- 1. Call `trigger-workflow` with the parameters of the halted run, OR
305
- 2. Add a HEARTBEAT.md "Watch Item" with explicit re-trigger condition + buffer ("if next firing in N hours doesn't drive the run to completion, manually trigger").
306
-
307
- NEVER assume "the fix is in, the schedule will pick it up." Schedules don't retry past halts; they fire on their next cron tick with fresh inputs.
308
-
309
- **Detection trigger:** the blocker digest's "repeat-pattern" line ("same N PRs, same blockers, 0 PRs merged in 24h") would catch this any day. Add a single short-delay `ScheduleWakeup` (≤270s) or a 30-min checklist note immediately after ack-ing any workflow patch that resolves an in-flight halt.
310
-
311
- ## 11. Diagnosing a failed/wrong run — input AND output of the suspect step
312
-
313
- ```
314
- get-workflow-run({ runId })
315
- → look at .steps[*]
316
- → find the first step whose status is "failed" OR whose output is wrong
317
- → READ ITS .input — that's the EXACT scope the executor saw
318
- → if the field you expected isn't there, your inputs wiring is wrong
319
- → for property-match, the input is the FULL workflow context (node-id keyed)
320
- ```
321
-
322
- If the error mentions schema validation, the runtime is *telling you* the correct shape. Read it carefully — guessing (e.g. assuming `{input, expected}` shape) costs another iteration.
323
-
324
- **Worked example — gate evaluation:**
325
- - `await-error-pick.output.taskOutput.decision = "fix"` ✓
326
- - `gate-on-fix.input.await-error-pick.taskOutput.decision = "fix"` ✓ (full context view)
327
- - `gate-on-fix.config.conditions[0].field = "decision"` ✗ — looks for `"decision"` at top level
328
- - → `decision` is undefined at top level → `undefined !== "fix"` → `passed:false`
329
- - Fix: `field: "await-error-pick.taskOutput.decision"`
330
-
331
- ## 12. Pre-flight checklist before declaring a workflow "done"
332
-
333
- - [ ] Every `agent-task` template's `{{tokens}}` resolve via `inputs` or `trigger.*`
334
- - [ ] Every `agent-task` with an `outputSchema` embeds that schema (or an "Output Format" JSON block) inside its prompt template — workers shouldn't have to guess. Workers should also have `workflow-structured-output` available for the contract details.
335
- - [ ] **Every `agent-task` node has `config.agentId` set** — pinning to a claude-harness worker (Picateclas/Jackknife/Lead/Reviewer/Researcher) for claude-shaped templates. Pool-routing is the default whenever `agentId` is absent; that's how nodes race opencode/pi workers. See §3.2.
336
- - [ ] **(added 2026-05-25, Lead Rule #17)** Every `agent-task` node with an `outputSchema` is pinned to a claude/codex worker — NEVER to a pi-harness worker (Content Writer `322999d8`, Content Strategist `7f95f57e` on pi, UX Principles Agent `22d30bc3` on pi). Pi-harness workers silently fail outputSchema validation regardless of how well the template is written.
337
- - [ ] **No `agent-task` node fans out into sibling spawns on the same `contextKey` without single-instance enforcement.** See §3.4. If the node accepts mid-run user input (Slack reply, approve button), confirm the dispatcher gates concurrent spawns. **If the node fans out N parallel children (pillar assemblies, peer-signal fanout, multi-branch generation), each child MUST get its OWN `contextKey` — sharing the workflow-run `contextKey` triggers the 2026-05-27 fan-out self-supersession variant.**
338
- - [ ] Every `swarm-script` node's `scriptName` exists in the catalog (verify via the `script_search` MCP tool), and its `args` tokens resolve via `inputs`/`trigger`. swarm-script nodes need NO `config.agentId` — they're instant-mode, not agent-routed. See §3.5.
339
- - [ ] Every `property-match` `field` is a literal node-id-prefixed dotted path (e.g. `"await-error-pick.taskOutput.decision"`), NOT a local alias
340
- - [ ] Every `property-match` has `inputs: {}` (explicit empty — the field is ignored anyway, but signal intent)
341
- - [ ] Every `property-match` has `next: {pass, fail}` (not a string), and `config.conditions: [{field, op, value}]`
342
- - [ ] Every decision node posts its reasoning to Slack BEFORE the gate
343
- - [ ] **No `slack-post` / `slack-reply` / `slack-start-thread` call appears inside a non-Lead `agent-task` template.** Slack-posting steps are Lead-assigned, or the message is in the worker's `output` for runner relay.
344
- - [ ] Any node that deletes a branch or other shared resource has a **pre-deletion retarget step** (§8).
345
- - [ ] No threshold uses strict `>` for ranges where boundary equality should pass — prefer `>=`.
346
- - [ ] Trigger payload schema documents `slackChannel`, `slackThreadTs`, `slackUserId` if any node posts.
347
- - [ ] **Every `patch-workflow-node` call you made passed the FULL `config` object** — not just the changed sub-field. The merge is shallow at config level. See §4.
348
- - [ ] One end-to-end success run captured before handing back to user — verify the gate `passed:true` for the success path.
349
- - [ ] Latest workflow version posted in the final Slack reply.
350
- - [ ] If the patch unblocks a halted run: post-fix re-trigger executed OR an explicit watch logged (§10).
351
- - [ ] If your fleet maintains a workflows-as-code repo: a sync PR is open or merged for the change (§13).
352
-
353
- ## 13. Sync back to a workflows-as-code repo
354
-
355
- If your fleet maintains a workflows-as-code repo (canonical *runtime* is the live swarm DB; canonical *source-of-truth-for-humans* is a git repo), sync EVERY successful workflow change there before declaring the iteration done. Skipping this is the #1 reason the next agent will patch from a stale mental model.
356
-
357
- > Where to find the canonical repo for your fleet: check `get-repos`, your `TOOLS.md`, or fleet-level config. Conventions below assume one workflow per directory under `workflows/<name>/` with a `workflow.json` and (optionally) a `scripts/` folder for inline scripts. Adapt to your repo's actual layout.
358
-
359
- **When this applies:**
360
- - `create-workflow` → new `workflows/<name>/` directory
361
- - `update-workflow` / `patch-workflow-node` → diff in `workflows/<name>/workflow.json` and/or `scripts/*.sh`
362
- - `delete-workflow` → remove the directory
363
-
364
- **Steps:**
365
-
366
- 1. `cd` into the local clone of the repo (clone with `gh repo clone <org>/<repo>` if missing).
367
- 2. `git fetch origin && git checkout <default-branch> && git pull origin <default-branch>` — always base off the latest default branch. Do NOT base off any feature branch.
368
- 3. `git checkout -b sync/<workflow-name>-<short-summary>` (e.g. `sync/datadog-error-triage-lower-threshold`).
369
- 4. Pull live state into the repo. If the repo has a sync helper (commonly `tools/sync.sh pull <workflow-name>`), use it — it should fetch live, write `workflow.json`, extract inline scripts to `scripts/<node-id>.sh`, and replace inline `script` fields with `{ "scriptPath": "scripts/<node-id>.sh" }`. If no `pull` helper exists, add one as the symmetric inverse of `push` and document it in the README.
370
- 5. Run the repo's CI checks locally before pushing (typically `jq -e .`, `shellcheck`, repo-specific lint).
371
- 6. **Round-trip check (CRITICAL):** `tools/sync.sh push <workflow-name> --dry-run` (or equivalent) MUST show an empty diff. If the diff is non-empty, your inlining/extraction is asymmetric — fix the tool before committing. CI relies on byte-exact round-trip.
372
- 7. Commit with `sync(<workflow-name>): <one-liner of what changed live>`. One workflow per commit, one workflow per PR.
373
- 8. `gh pr create --base <default-branch>` with a body that includes:
374
- - Live workflow id + version after the change
375
- - One-paragraph summary of what the workflow does or what changed
376
- - The empty `--dry-run` diff confirming round-trip equivalence
377
- - Whether this PR documents an already-deployed change (most common) or proposes a new one
378
- 9. Reply to the originating Slack thread with the PR URL.
379
-
380
- **Do not:**
381
- - Open the PR to anything other than the repo's default branch.
382
- - Bundle multiple workflow changes in one PR — reviewers can't diff them cleanly.
383
- - Push scripts without re-inlining for round-trip — CI will fail.
384
- - Merge the sync PR yourself unless your fleet's policy explicitly allows it. Default to human review.
385
-
386
- **Why:** prevents drift between live and repo, keeps `git log` as the canonical change history, gives reviewers a chance to catch regressions before another agent picks up an iteration.
387
-
388
- ## 14. When to escalate
389
-
390
- - API quota errors → not a workflow problem; tell the user the reset time and stop iterating.
391
- - Same patch fails twice in a row with similar schema errors → stop guessing. Use `db-query` or workflow runtime source to find the actual schema before the third attempt.
392
- - User says "still not working" three times → re-read the entire workflow end-to-end before any further patch. The bug is probably not where you've been looking.
393
- - User shows you a step's `output` proving the gate is wrong → trust the user. Inspect the recorded `input` of that step before re-patching; the wiring is the bug.
394
- - Round-trip dry-run diff is non-empty after a `pull` (per §13.6) → that's a sync-tool bug, not a one-off. Fix the tool, not the workflow.
395
-
396
- ## See also
397
-
398
- - **`workflow-structured-output`** — the worker-side counterpart of this skill. Use it (or ensure your workers have it) for any `agent-task` that defines an `outputSchema`.
399
-
30
+ | A gate takes the wrong branch even though the upstream value looks correct | The condition path does not match the executor's context shape | Inspect the step input and use the exact upstream path the executor can resolve |
31
+ | Downstream prompt renders blank fields | Missing or wrong `inputs` mapping | Re-read the step input, then wire each template variable to a concrete source |
32
+ | A node loses its prompt, schema, or model after a small patch | Partial config patch replaced the full config | Restore from the previous version and resend the full node config |
33
+ | Structured-output task fails immediately | Worker did not return JSON matching `outputSchema` | Put the schema in the prompt and assign the task to a worker/provider validated for structured output |
34
+ | Parallel branches cancel, overwrite, or confuse each other | Shared context or shared output keys across sibling tasks | Give each branch its own context/output namespace and make writes branch-specific |
35
+ | A reusable script node completes but downstream fields are empty | Downstream node reads the wrong output shape | Inspect the script step output and reference the actual return path |
36
+
37
+ ## Preflight Checklist
38
+
39
+ - Current workflow version has been read in this session.
40
+ - Every changed node has a clear before/after purpose.
41
+ - Inputs and condition paths match a real recorded step shape.
42
+ - Output schemas include only fields used downstream.
43
+ - Agent-task routing matches the task shape and required tools.
44
+ - Trigger payload includes all required fields.
45
+ - The verification run reached the intended outcome.
46
+ - The source-of-truth definition was updated after live verification.