oh-my-workflow 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/skill/SKILL.md ADDED
@@ -0,0 +1,491 @@
1
+ ---
2
+ name: oh-my-workflow
3
+ description: Use when a task decomposes into multiple coding-agent CLI calls (claude -p / codex exec) that should run as one structured, schema-gated, journaled workflow — fan-out search, verify-vote, pipeline, or loop-until-dry. Teaches you to author a plain-JS omw script, run it with `omw run`, read the JSONL journal, and repair your own script from structured failures.
4
+ ---
5
+
6
+ # oh-my-workflow (omw)
7
+
8
+ You write a **plain-JS orchestration script**. Its nodes are whole coding-agent
9
+ CLIs you already pay for (`claude -p`, `codex exec`). omw is the thin glue: it
10
+ runs your script, schema-gates each node's output, and journals every step — so
11
+ you can read your own failure and fix your own script. (What's "deterministic" is
12
+ scoped below — the engine's guarantees and `--agent fake`, not your script.)
13
+
14
+ The runtime gives your script exactly **five hooks** (`agent` / `pipeline` /
15
+ `parallel` / `phase` / `log`). That is the entire surface. There is no DSL to
16
+ learn; everything else is ordinary JavaScript control flow.
17
+
18
+ ## When to use this
19
+
20
+ Reach for omw when a task is a **multi-step pipeline over agent calls** that
21
+ benefits from structure you'd otherwise hand-roll:
22
+
23
+ - **Fan-out**: run N independent agent calls concurrently, collect results.
24
+ - **Verify / vote**: produce a finding, then have K independent agents judge it.
25
+ - **Pipeline**: each item flows scope → search → verify → synthesize independently.
26
+ - **Loop-until-dry**: keep spawning finders until a round returns nothing new.
27
+
28
+ You want: bounded concurrency, schema-validated node output with automatic
29
+ node-level retry, a replayable journal, and a `null`-on-failure contract so one
30
+ bad node never crashes the run.
31
+
32
+ **Don't** use omw for a single agent call, or for work that needs a sandbox
33
+ (omw deliberately has none — your script is trusted code), or where a node is a
34
+ single raw LLM API call (that's LangGraph/Mastra territory; an omw node is a
35
+ *whole coding agent*).
36
+
37
+ ## The 30-second free demo (no API key)
38
+
39
+ ```sh
40
+ git clone <repo-url> && cd oh-my-workflow # repo not yet public; fill in the URL
41
+ bun install
42
+ bun src/cli/omw.ts run examples/deep-research --agent fake
43
+ # → {"confirmed":[…],"summary":{…}} exit 0 · no key · no cost · `--agent fake` is deterministic
44
+ ```
45
+
46
+ `--agent fake` is a built-in deterministic adapter: it runs the full spine
47
+ (fan-out + pipeline + a scripted schema-fail→self-repair + a scripted
48
+ timeout→drop) and prints one result JSON. Add `--pretty` to see the phase/fan-out
49
+ tree on stderr. Swap `--agent claude` once you've run `claude login`.
50
+
51
+ > Once published this is `bunx oh-my-workflow run …`; today run the bin directly
52
+ > from a clone as shown above.
53
+
54
+ ---
55
+
56
+ ## The 5 hooks (the entire API)
57
+
58
+ Your script is a module that **default-exports** `async (rt, args) => result`.
59
+ `rt` is the runtime; `args` is whatever `--args '{…}'` passed (parsed JSON).
60
+ The returned value is serialized to stdout as the run's single result JSON.
61
+
62
+ ```ts
63
+ export default async function (rt, args) {
64
+ // rt.agent / rt.pipeline / rt.parallel / rt.phase / rt.log
65
+ return { /* whatever you want on stdout */ };
66
+ }
67
+ ```
68
+
69
+ ### `rt.agent(prompt, opts?) => Promise<result | null>`
70
+
71
+ Runs one coding-agent CLI node. **Never throws.** A terminal failure resolves to
72
+ `null` (and is journaled with a failure `kind`). This is the load-bearing
73
+ **null-contract** — build on it with `filter(Boolean)` and abstain quorums.
74
+
75
+ ```ts
76
+ const out = await rt.agent("SCOPE the question into topics", {
77
+ schema: { type: "object", required: ["topics"], properties: { topics: { type: "array" } } },
78
+ label: "scope", // shows in the journal / --pretty tree
79
+ phase: "Scope", // overrides the ambient phase() for this call
80
+ model: "smart", // tier alias or raw model string, passed to the adapter
81
+ timeoutMs: 120_000, // kill the subprocess after this; failure kind = "timeout"
82
+ cwd: "/path/to/repo", // run the agent in this directory
83
+ maxRetries: 2, // schema-gate retries (default 2 → up to 3 attempts)
84
+ });
85
+ ```
86
+
87
+ - **With `schema`**: omw extracts JSON from the node's text, validates it with
88
+ ajv, and on a mismatch **re-prompts the same node** with the validation errors
89
+ (in-session via the adapter's resume if available, else fresh + error appended)
90
+ up to `maxRetries` times. On success `agent()` returns the **validated object**.
91
+ On exhaustion it returns `null`. **You never see schema noise** — only the
92
+ structured outcome. The schema is plain JSON Schema.
93
+ - **Without `schema`**: one shot; returns the raw text string, or `null` on
94
+ adapter failure.
95
+
96
+ ### `rt.parallel(thunks) => Promise<any[]>` — barrier
97
+
98
+ Runs thunks concurrently, awaits **all** of them. A thunk that throws (or whose
99
+ agent fails) becomes `null` in the result array — the call itself never rejects.
100
+ **`.filter(Boolean)` before using results.** Use only when you need every result
101
+ together (dedup, count, cross-comparison).
102
+
103
+ ```ts
104
+ const results = (await rt.parallel(
105
+ topics.map((t) => () => rt.agent(`SEARCH ${t}`, { schema: S, label: `search:${t}` })),
106
+ )).filter(Boolean);
107
+ ```
108
+
109
+ ### `rt.pipeline(items, ...stages) => Promise<any[]>` — no barrier (default)
110
+
111
+ Runs each item through **all** stages independently. Item A can be in stage 3
112
+ while item B is still in stage 1 — wall-clock is the slowest single chain, not
113
+ the sum of slowest-per-stage. Each stage receives `(prev, item, index)`. A stage
114
+ that throws drops that item to `null` (skips its remaining stages). This is the
115
+ default for multi-stage work; only use `parallel` as a barrier when a stage
116
+ genuinely needs the whole previous result set at once.
117
+
118
+ ```ts
119
+ const verified = (await rt.pipeline(
120
+ found,
121
+ async (f) => {
122
+ const v = await rt.agent(`VERIFY ${JSON.stringify(f)}`, { schema: V });
123
+ return v ? { ...f, ...v } : null; // null → dropped by the filter below
124
+ },
125
+ )).filter(Boolean);
126
+ ```
127
+
128
+ ### `rt.phase(title)` and `rt.log(msg)`
129
+
130
+ `phase` groups subsequent `agent()` calls under a heading in the journal and the
131
+ `--pretty` tree. `log` emits a narration line. Both are side-channel only — they
132
+ never touch stdout.
133
+
134
+ ### Concurrency
135
+
136
+ The runtime bounds concurrency **at the `agent()` boundary** (default 4, set with
137
+ `--concurrency N`). `parallel`/`pipeline` themselves don't take a slot, so you can
138
+ pass hundreds of items — only ~N agent subprocesses run at once; the rest queue.
139
+
140
+ ---
141
+
142
+ ## Pattern templates (copy-paste, then adapt)
143
+
144
+ ### Fan-out (barrier)
145
+
146
+ ```ts
147
+ export default async function (rt, args) {
148
+ rt.phase("Search");
149
+ const hits = (await rt.parallel(
150
+ args.queries.map((q) => () => rt.agent(`SEARCH: ${q}`, { schema: HIT, label: `q:${q}` })),
151
+ )).filter(Boolean);
152
+ return { hits, count: hits.length };
153
+ }
154
+ ```
155
+
156
+ ### Verify-vote with an **abstain quorum**
157
+
158
+ A node that fails returns `null`, i.e. it **abstains** — it does not vote "no".
159
+ Count only real verdicts, and require a quorum of *cast* votes so an all-abstain
160
+ finding doesn't silently survive.
161
+
162
+ ```ts
163
+ async function survives(rt, claim) {
164
+ const votes = (await rt.parallel(
165
+ [1, 2, 3].map(() => () =>
166
+ rt.agent(`Try to REFUTE this claim. Default to refuted=true if unsure: ${claim}`, {
167
+ schema: { type: "object", required: ["refuted"], properties: { refuted: { type: "boolean" } } },
168
+ })),
169
+ )).filter(Boolean); // drop abstainers (null)
170
+ if (votes.length < 2) return false; // quorum: need ≥2 cast votes
171
+ return votes.filter((v) => !v.refuted).length >= 2;
172
+ }
173
+ ```
174
+
175
+ **Fresh context is the point — not self-critique.** Each `rt.agent()` call is a
176
+ brand-new `claude -p` subprocess with no memory of the producer's turn, so a
177
+ verify-vote node judges the claim cold. That is the structural form of Anthropic's
178
+ own guidance for its most capable model: *"Separate, fresh-context verifier
179
+ subagents tend to outperform self-critique"* ([Fable 5 prompting
180
+ guide](https://platform.claude.com/docs/en/build-with-claude/prompt-engineering/prompting-claude-fable-5)).
181
+ omw gets it for free — **as long as you keep verification a separate `agent()`
182
+ call.** Do **not** verify by feeding the result back into the producer's own
183
+ session: the schema-gate's in-session self-repair (the `--resume` / `followUp`
184
+ path) deliberately *reuses* the producer's context to fix output **format**, which
185
+ is the exact opposite of fresh-context verification. Use self-repair to make a
186
+ node's JSON valid; use a new `agent()` to judge whether the *content* is true.
187
+ (A **cross-CLI** verifier — a different agent CLI than the producer, so a shared
188
+ memorized shortcut can't survive both — is a natural extension but **not a feature
189
+ today**: omw binds one adapter per run, so per-node verifier selection is future
190
+ work. Verify with fresh same-CLI nodes for now.)
191
+
192
+ ### Gate on evidence, not intent
193
+
194
+ The schema gate can only check the *shape* of what a node returns — so for an
195
+ **action-producing** node (one that builds, runs, edits, or fetches), make the
196
+ schema **`required` the evidence the action leaves behind** (an exit code, a test
197
+ count, a file path, the observed output), not just a plan or a `rationale`
198
+ string. A frontier model can fabricate status ("I'll now run the tests…") with no
199
+ tool call behind it; if "I did X" prose satisfies your gate, the gate verifies
200
+ nothing. Requiring the artifact makes an intent string fail validation — the node
201
+ must actually produce the proof:
202
+
203
+ ```ts
204
+ // ✗ intent-only — "I ran the build" text passes this gate
205
+ const weak = { type: "object", required: ["summary"], properties: { summary: { type: "string" } } };
206
+
207
+ // ✓ evidence-bearing — the node must surface what the action actually produced
208
+ const strong = {
209
+ type: "object",
210
+ required: ["command", "exitCode", "testsPassed", "output"],
211
+ properties: {
212
+ command: { type: "string" },
213
+ exitCode: { type: "number" },
214
+ testsPassed: { type: "number" },
215
+ output: { type: "string" }, // the observed tail, not a claim about it
216
+ },
217
+ };
218
+ const built = await rt.agent("Run the build and the test suite. Report the command, its exit code, the number of passing tests, and the tail of the output.", { schema: strong });
219
+ ```
220
+
221
+ **Executable-evidence verify node** — combine this with fresh-context verification:
222
+ a separate node *runs what the producer built and observes the result* before the
223
+ finding is accepted, rather than judging the producer's description of it.
224
+
225
+ ```ts
226
+ const verified = (await rt.pipeline(
227
+ artifacts,
228
+ async (a) => {
229
+ // a.path was written by an upstream node; this fresh node runs it and reports facts.
230
+ const v = await rt.agent(
231
+ `Run \`${a.runCmd}\` in ${a.path}. Report exitCode and the output tail. Do not fix anything — only observe and report.`,
232
+ { schema: { type: "object", required: ["exitCode", "output"], properties: { exitCode: { type: "number" }, output: { type: "string" } } } },
233
+ );
234
+ return v && (v as { exitCode: number }).exitCode === 0 ? { ...a, ...(v as object) } : null;
235
+ },
236
+ )).filter(Boolean); // anything that didn't run clean abstains
237
+ ```
238
+
239
+ ### Pipeline (no barrier)
240
+
241
+ ```ts
242
+ const out = (await rt.pipeline(
243
+ items,
244
+ (item) => rt.agent(`ANALYZE ${item.id}`, { schema: A, label: `analyze:${item.id}` }),
245
+ (analysis, item) => (analysis ? rt.agent(`SUMMARIZE ${item.id}: ${JSON.stringify(analysis)}`, { schema: S }) : null),
246
+ )).filter(Boolean);
247
+ ```
248
+
249
+ ### Loop-until-dry
250
+
251
+ For unknown-size discovery: keep going until K consecutive rounds find nothing new.
252
+
253
+ ```ts
254
+ const seen = new Set(); const found = []; let dry = 0;
255
+ while (dry < 2) {
256
+ const round = (await rt.parallel(
257
+ FINDERS.map((f) => () => rt.agent(f.prompt, { schema: BUG })),
258
+ )).filter(Boolean);
259
+ const fresh = round.filter((b) => !seen.has(b.key));
260
+ if (fresh.length === 0) { dry++; continue; }
261
+ dry = 0; fresh.forEach((b) => seen.add(b.key)); found.push(...fresh);
262
+ }
263
+ ```
264
+
265
+ ---
266
+
267
+ ## The run → journal → fix loop (this is the UX)
268
+
269
+ ```sh
270
+ bun src/cli/omw.ts run my-workflow.ts --agent claude --args '{"q":"…"}'
271
+ ```
272
+
273
+ - **stdout** = the result JSON, one blob. Pipe it, parse it.
274
+ - **journal** = `.omw/<runId>.jsonl`, one event per line. This is where you read
275
+ *why* a node failed and repair your script.
276
+ - **`--pretty`** = a phase/fan-out tree on **stderr** (never stdout).
277
+
278
+ ### Exit codes
279
+
280
+ | code | meaning | where the detail is |
281
+ |---|---|---|
282
+ | `0` | run completed (node failures are absorbed by the null-contract) | stdout = result JSON |
283
+ | `1` | **script error** — your JS threw, or syntax/load failure | stderr: `{"error":"script_error"\|"load_failed",…}` |
284
+ | `2` | usage error (bad flags) | stderr: usage line |
285
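+ | `3` | adapter CLI not on PATH | stderr: `{"error":"adapter_missing","install_hint":…}` |
+ | `4` | run completed, but a node hit `internal_error` (author bug) | stdout = partial result JSON · stderr: `{"error":"internal_error_nodes","calls":[…]}` |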
286
+
287
+ Exit `1` means **your script** threw (an `agent()` returning `null` does *not*
288
+ throw — only your own code does). Exit `0` with fewer results than expected means
289
+ nodes failed and were filtered — read the journal.
290
+
291
+ ### Reading a journal
292
+
293
+ The events are `run_start · phase · agent_start · attempt · agent_end · log ·
294
+ run_end`. A self-repair looks like this (excerpt from a real fake run, `ts`
295
+ fields elided — `search:a` returns invalid JSON, gets re-prompted, recovers):
296
+
297
+ ```jsonl
298
+ {"ev":"agent_start","call":2,"label":"search:a","phase":"Search","adapter":"fake","promptHash":"sha256:…","optsHash":"sha256:…"}
299
+ {"ev":"attempt","call":2,"n":1,"kind":"schema_violation","errors":["/ must have required property 'topic'","/ must have required property 'hits'"],"rawText":"{\"oops\":1}"}
300
+ {"ev":"attempt","call":2,"n":2,"kind":"ok"}
301
+ {"ev":"agent_end","call":2,"ok":true,"result":{"topic":"a","hits":3},"durationMs":0}
302
+ ```
303
+
304
+ A terminal failure carries the **kind** so you know what to fix:
305
+
306
+ ```jsonl
307
+ {"ev":"agent_end","call":3,"ok":false,"kind":"timeout","durationMs":0}
308
+ ```
309
+
310
+ Failure `kind`s on `agent_end`:
311
+
312
+ - **adapter**: `timeout` · `nonzero_exit` · `spawn_failure` (the CLI itself failed)
313
+ - **`refusal`**: the model **declined** the task (a safety/decline outcome — HTTP
314
+ 200, `stop_reason:"refusal"` — not a crash). Detected by the `claude` adapter;
315
+ N/A for `codex` (no distinct refusal signal → it stays `nonzero_exit`). Kept
316
+ separate so an abstain-quorum can treat **declined ≠ failed**: a node that
317
+ *can't* answer and a node that *won't* are different signals, and the journaled
318
+ kind makes *why* a null happened auditable. It still abstains (resolves to
319
+ `null`, dropped by `filter(Boolean)`) — `refusal` is a journaled outcome, never
320
+ a thrown error or a silent pass.
321
+ - **gate**: `no_json` · `schema_violation` (the node never produced valid JSON in
322
+ `maxRetries+1` attempts — `rawText` is journaled so you can see what it said)
323
+ - **`internal_error`**: a bug in omw or your schema (e.g. an invalid JSON Schema
324
+ that won't compile) — distinct from a flaky node, so you don't misdiagnose.
325
+
326
+ `omw replay .omw/<runId>.jsonl [--json]` reconstructs the tree / a stats summary
327
+ from a journal — a read-only **fixture replay** (reading back what a run
328
+ recorded). For *live* resume (re-running nodes whose key changed, reusing the
329
+ cached ones), use `omw run <wf> --resume <journal>` — see Honest scope below.
330
+
331
+ `omw validate <wf> [--json]` is a pre-flight that loads the module and lints a
332
+ `fake` fixture for the silent-degradation traps (top-level `responses`, a string
333
+ `match`, no rules+default) **without spawning agents** — exit 0 clean, 1 on a
334
+ load/fixture problem. And a node that throws an `internal_error` (e.g. a JSON
335
+ Schema that won't compile) no longer hides behind the null-contract: the run
336
+ escalates to **exit 4** (the partial result still prints to stdout, and a
337
+ `{"error":"internal_error_nodes","calls":[…]}` line goes to stderr), so an author
338
+ bug reads differently from a flaky node abstaining.
339
+
340
+ ---
341
+
342
+ ## Conventions (follow these)
343
+
344
+ 1. **Build on the null-contract.** `agent()` returns `null`, never throws.
345
+ `.filter(Boolean)` after every `parallel`/`pipeline`. For votes, require a
346
+ quorum of *cast* (non-null) results so all-abstain can't pass.
347
+ 2. **Always pass a `schema` when you need structured data.** The gate's
348
+ self-repair is the one genuine differentiator — use it instead of parsing
349
+ prose yourself. Keep schemas tight (`required` + types).
350
+ 3. **Stay deterministic.** Don't branch the *shape* of the run on `Date.now()` /
351
+ `Math.random()` / wall-clock. The resume key is `(callIndex, promptHash,
352
+ optsHash)` (the journaled field is `call`); if a re-run's `agent()` call order shifts, every key shifts and
353
+ resume breaks. Vary content by index, not by randomness. (omw can't *enforce*
354
+ this — no sandbox — so it's a convention you keep; enforcement is v2.)
355
+ 4. **stdout is for the machine.** Return your result; use `rt.log` / `--pretty`
356
+ for humans. Never `console.log` to stdout from a workflow.
357
+ 5. **Ship a `fake` fixture for your example.** Export `const fake` alongside your
358
+ default export so `--agent fake` runs deterministically with no key. The shape:
359
+
360
+ ```ts
361
+ export const fake = {
362
+ // Each rule's `match` is a PREDICATE FUNCTION over the prompt (not a string/regex).
363
+ // `responses` is a cursor that advances per invocation and sticks on the last —
364
+ // so [invalidJSON, validJSON] models a schema self-repair, and a single
365
+ // { fail } models a hard failure. A FakeResponse is { text } (a raw JSON
366
+ // STRING the gate then extracts + validates) or { fail, stderr }.
367
+ rules: [
368
+ { match: (p) => p.includes("SCOPE"), responses: [{ text: '{"topics":["a","b"]}' }] },
369
+ { match: (p) => p.includes("SEARCH a"),
370
+ responses: [{ text: '{"oops":1}', sessionId: "sa" }, { text: '{"topic":"a","hits":3}' }] }, // self-repair
371
+ { match: (p) => p.includes("SEARCH b"), responses: [{ fail: "timeout" }] }, // dropped
372
+ ],
373
+ default: { text: "{}" }, // returned when no rule matches — keep it valid so unmatched nodes don't crash
374
+ };
375
+ ```
376
+
377
+ Common mistake: a top-level `responses` array (instead of `rules`) or a string
378
+ `match` is silently ignored — every node then returns `default` and the demo
379
+ degenerates to an empty result. See `examples/deep-research/workflow.ts` for a
380
+ full working fixture.
381
+
382
+ ---
383
+
384
+ ## Adapters
385
+
386
+ A node is a coding agent driven through its **headless prompt→result CLI**. Only
387
+ agents that expose such a CLI can be nodes.
388
+
389
+ | adapter | status | invoke | structured out | in-session follow-up |
390
+ |---|---|---|---|---|
391
+ | **fake** | built-in, free, deterministic | in-process fixtures | as scripted | yes (fixture) |
392
+ | **claude** | **full** (live-verified, claude 2.1.177) | `claude -p <p> --output-format json` | parse `.result` | `--resume` |
393
+ | **codex** | **experimental** (live-verified, codex 0.137.0) | `codex exec --json -s workspace-write` | last `agent_message` from JSONL | `exec resume` |
394
+ | **pi** | planned | `pi --print` | stdout | — |
395
+ | **kiro** | **not a fit** | — | — | — |
396
+
397
+ > The "in-session follow-up" column is the adapter flag the **schema gate** uses to
398
+ > re-prompt a node in the same session — *not* run-level resume. Run-level resume
399
+ > (skipping unchanged nodes across separate runs) is `omw run --resume <journal>`; see Honest scope below.
400
+
401
+ - **claude** renames its envelope onto omw's contract (`session_id→sessionId`,
402
+ `total_cost_usd→costUsd`, `duration_ms→durationMs`; `is_error`/non-success
403
+ `subtype` → `ok:false`).
404
+ - **codex** is experimental: it has **no cost field** (tokens only, so `costUsd`
405
+ stays undefined), and its JSONL can include malformed lines under MCP
406
+ (openai/codex#15451) — omw tolerates them line-by-line and fails *actionably*
407
+ (surfacing the reason) rather than returning empty. Default sandbox is
408
+ `workspace-write`.
409
+ - **pi** isn't wired yet (not installed locally → `--agent pi` returns exit 3
410
+ with an install hint). It's a planned experimental adapter.
411
+ - **kiro is excluded on purpose**: its CLI is a VS-Code-based IDE launcher (open
412
+ files, diffs, extensions), with no headless prompt→result interface — so it
413
+ can't be an omw node. The bar for an adapter is a real headless execution CLI.
414
+
415
+ Missing CLI → exit 3 with `install_hint`. Run `--agent fake` any time for the
416
+ free path.
417
+
418
+ ---
419
+
420
+ ## Honest scope — what omw resembles, and what it doesn't
421
+
422
+ omw externalizes a pattern Claude Code uses internally for dynamic workflows
423
+ ("the model authors a deterministic orchestration script on the fly"). It is a
424
+ **faithful reconstruction of that pattern as OSS**, not a decompiled copy and not
425
+ a first/best/moat claim. Where it lands honestly:
426
+
427
+ | | who writes the script | where it runs | a node is | agent-agnostic |
428
+ |---|---|---|---|---|
429
+ | Bernstein, pi-builder | a human, ahead of time | external | varies | — |
430
+ | sub-agents-skills | per-turn routing (no standing script) | in-harness | a subagent | no |
431
+ | Claude Code Workflow | the model, on the fly | sealed sandbox | a subagent (one in-harness agent) | no (Claude only) |
432
+ | **oh-my-workflow** | **the model, on the fly (taught by this skill)** | **external** | **a whole coding-agent CLI** | **yes (claude/codex/…)** |
433
+
434
+ No single shipped project does all three of *(a) host-agent-authored on the fly +
435
+ (b) executed externally via reusable agent CLIs + (c) agent-agnostic*. omw is the
436
+ reference implementation of that **3-of-3 intersection** — plus the schema-gate
437
+ self-repair loop, which is the one piece a "subprocess + for-loop" doesn't have.
438
+
439
+ ### Resemblance ledger (vs the CC dynamic-workflow surface)
440
+
441
+ **✅ Genuinely the same idea** — model-authored plain-JS orchestration; the
442
+ 5-hook shape (`agent`/`pipeline`/`parallel`/`phase`/`log`); `null`-resolution +
443
+ `filter(Boolean)`; schema-forced structured output; a step-by-step journal;
444
+ resume key `(callIndex, promptHash, optsHash)` (frozen and **proven byte-stable**
445
+ across re-runs); **live resume** via `omw run --resume <journal>` — a **per-node
446
+ key match** (cached nodes skip the adapter, `agent_end{cached:true}`; nodes whose
447
+ key changed re-run; verified end-to-end on `--agent fake`).
448
+
449
+ > One honest altitude difference even here: a CC Workflow node is a single
450
+ > in-harness subagent; an **omw node is a whole external coding-agent CLI**
451
+ > subprocess. Same orchestration shape, heavier nodes.
452
+
453
+ **🟡 Designed-but-scoped** —
454
+ - *Determinism enforcement*: CC throws on `Date.now`/`Math.random`; omw treats it
455
+ as a **convention** (no sandbox), so live resume holds **only for workflows that
456
+ keep it**. A guard that *enforces* it in resume mode is v2.
457
+ - *Resume is per-node, not dependency-aware*: it matches `(callIndex, promptHash,
458
+ optsHash)`, so an upstream edit invalidates a downstream node **only if** that
459
+ output is threaded into the downstream prompt/opts. This is deliberate — it
460
+ preserves **parallel/pipeline sibling cache** (independent fan-out nodes aren't
461
+ forced live just because an earlier sibling changed). **The trap**: an omw node
462
+ is a whole coding-agent CLI that works on the **filesystem**, so "node 1 writes
463
+ files, node 2 reads them" is the *normal* coding-agent idiom — not an exotic
464
+ anti-pattern — and that channel is invisible to the key. Edit node 1 → on resume
465
+ it re-runs and writes different files, but node 2 **hits its cache and serves a
466
+ summary of the old files** (silently stale). Remedies: (a) re-run fresh (drop
467
+ `--resume`) when an upstream's filesystem effects changed, or (b) thread a
468
+ content digest of the changed files into the downstream prompt so its hash moves.
469
+ An opt-in `--strict-resume` (prefix truncation: force every node after the first
470
+ key MISS live — correct cascade for *linear* workflows, but over-invalidates
471
+ *parallel* siblings) and a dependency-aware cascade are both **v2** candidates;
472
+ per-node stays the default precisely because it keeps the parallel cache.
473
+
474
+ **❌ Not implemented (CC Workflow has these; omw v1 does not)** — `budget`
475
+ (token-target loops), nested `workflow()` (running another workflow inline), a
476
+ `meta`/`phases` declaration block, `opts.agentType` (custom subagent types),
477
+ `opts.effort`, `run_in_background`, and `isolation: 'worktree'`. Don't write
478
+ scripts that assume these.
479
+
480
+ ---
481
+
482
+ ## Quick reference
483
+
484
+ - Module: `export default async (rt, args) => result` · optional `export const fake`.
485
+ - Path resolves a directory to `workflow.ts` / `workflow.js` / `index.ts` / `index.js`.
486
+ - `omw run <wf> --agent <fake|claude|codex|pi> [--args JSON] [--concurrency N] [--resume <journal.jsonl>] [--pretty]`
487
+ - `omw replay <journal.jsonl> [--json]`
488
+ - `omw validate <wf> [--json]` — pre-flight: load + fake-fixture lint, no agents spawned.
489
+ - exit codes: `0` ok · `1` script/load error · `2` usage · `3` adapter missing · `4` completed but a node hit `internal_error` (author bug; result still on stdout).
490
+ - stdout = result JSON · journal = `.omw/<runId>.jsonl` · `--pretty` tree = stderr.
491
+ - `agent()` never throws → `filter(Boolean)`; quorum of cast votes for verify-vote.
@@ -0,0 +1,146 @@
1
+ // The claude adapter: a node is a whole `claude -p` run, not a single LLM call.
2
+ // It shells out to `claude -p <prompt> --output-format json`, parses the single
3
+ // JSON result object, and renames claude's snake_case fields onto our AgentResult
4
+ // contract (session_id→sessionId, total_cost_usd→costUsd, duration_ms→durationMs;
5
+ // is_error/subtype collapse to ok:false). followUp uses `--resume <sessionId>` to
6
+ // continue the same session for schema-gate self-repair.
7
+ //
8
+ // Spawn is injected so the parse/argv logic is tested without a subprocess or an
9
+ // API call; the default spawn uses Bun.spawn and is exercised live under OMW_LIVE.
10
+
11
+ import type { AgentPort, AgentResult, InvokeRequest } from "./types";
12
+
13
/** Human-readable text for a caught value: an Error's `message`, otherwise `String(value)`. */
const errMsg = (e: unknown): string => {
  if (e instanceof Error) {
    return e.message;
  }
  return String(e);
};
14
+
15
/**
 * Map a parsed `claude -p --output-format json` payload onto AgentResult.
 *
 * Outcomes, checked in this order:
 *  1. `stop_reason` or `subtype` equal to `"refusal"` → `{ ok: false, kind: "refusal" }`.
 *  2. Not an object, `type !== "result"`, `is_error === true`, or
 *     `subtype !== "success"` → `{ ok: false, kind: "nonzero_exit" }` with the
 *     subtype (or type) and the model's message surfaced in `stderr`.
 *  3. Otherwise → `{ ok: true, text, meta }`.
 *
 * The payload is untrusted JSON, so every field is narrowed at runtime instead
 * of being cast: a non-string `session_id` or non-finite `total_cost_usd`
 * becomes `undefined`, and a structured `result` is JSON-serialized rather than
 * collapsing to "[object Object]".
 *
 * @param raw The value produced by `JSON.parse` on the CLI's stdout.
 * @returns The AgentResult for this payload; never throws for JSON-derived input.
 */
export function parseClaudeResult(raw: unknown): AgentResult {
  const j = typeof raw === "object" && raw !== null ? (raw as Record<string, unknown>) : null;
  const asString = (v: unknown): string | undefined => (typeof v === "string" ? v : undefined);

  // Non-numeric or missing duration_ms falls back to 0.
  const durationMs = Number(j?.duration_ms) || 0;
  // The model's message is only surfaced in failure text when it is a string.
  const detail = asString(j?.result) ?? "";

  // A safety/decline refusal is classified FIRST, before the is_error/subtype
  // envelope checks, so a decline that arrives as subtype:"success" isn't
  // mistaken for an empty success. stop_reason is the documented carrier;
  // subtype is matched defensively. The decline category is kept so the reason
  // stays auditable. Not yet verified against a live CLI refusal.
  if (j?.stop_reason === "refusal" || j?.subtype === "refusal") {
    const sd = j?.stop_details;
    const category =
      typeof sd === "object" && sd !== null
        ? asString((sd as { category?: unknown }).category) ?? ""
        : "";
    return {
      ok: false,
      kind: "refusal",
      stderr: `refusal${category ? `(${category})` : ""}: ${detail}`.trim(),
      meta: { durationMs },
    };
  }

  if (!j || j.type !== "result" || j.is_error === true || j.subtype !== "success") {
    // Prefer the subtype as the failure label, then the type, then a fixed fallback.
    const label = asString(j?.subtype) ?? asString(j?.type) ?? "unknown";
    return { ok: false, kind: "nonzero_exit", stderr: `${label}: ${detail}`.trim(), meta: { durationMs } };
  }

  const result = j.result;
  let text: string;
  if (typeof result === "string") {
    text = result;
  } else if (result === null || result === undefined) {
    text = "";
  } else if (typeof result === "object") {
    // Keep structured payloads readable downstream instead of "[object Object]".
    text = JSON.stringify(result);
  } else {
    text = String(result);
  }

  const cost = j.total_cost_usd;
  return {
    ok: true,
    text,
    meta: {
      durationMs,
      sessionId: asString(j.session_id),
      costUsd: typeof cost === "number" && Number.isFinite(cost) ? cost : undefined,
    },
  };
}
56
+
57
/** What a spawn implementation reports once the child process has finished. */
export type ClaudeSpawnResult = {
  /** Process exit code. */
  code: number;
  /** Everything the child wrote to stdout. */
  stdout: string;
  /** Everything the child wrote to stderr. */
  stderr: string;
  /** Set when the spawn implementation killed the child for exceeding its timeout. */
  timedOut?: boolean;
};

/** Runs the CLI with `args` (binary excluded) and resolves with its captured output. */
export type ClaudeSpawn = (
  args: string[],
  opts?: {
    cwd?: string;
    timeoutMs?: number;
  },
) => Promise<ClaudeSpawnResult>;

/** Injectable dependencies for the claude adapter. */
export type ClaudeAdapterDeps = {
  /** Replacement spawn, so parse/argv logic can be tested without a subprocess. */
  spawn?: ClaudeSpawn;
  /** Binary name/path; defaults to "claude" on PATH. */
  bin?: string;
};
68
+
69
/**
 * Default spawn over Bun.spawn.
 *
 * Starts `bin` with the given args, captures stdout and stderr as text, and
 * waits for the process to exit. When `timeoutMs` is a positive number the child
 * is killed after that many milliseconds and the result is flagged `timedOut`,
 * so the caller can map it to a `timeout` kind rather than a generic nonzero exit.
 *
 * @param bin Binary name or path to execute.
 * @returns A ClaudeSpawn bound to that binary.
 */
function defaultSpawn(bin: string): ClaudeSpawn {
  return async (args, opts) => {
    const proc = Bun.spawn([bin, ...args], {
      cwd: opts?.cwd,
      stdout: "pipe",
      stderr: "pipe",
    });

    let timedOut = false;
    let timer: ReturnType<typeof setTimeout> | undefined;
    if (opts?.timeoutMs && opts.timeoutMs > 0) {
      timer = setTimeout(() => {
        timedOut = true;
        proc.kill();
      }, opts.timeoutMs);
    }

    try {
      // Drain both pipes concurrently, then wait for the exit code.
      const [stdout, stderr] = await Promise.all([
        new Response(proc.stdout).text(),
        new Response(proc.stderr).text(),
      ]);
      const code = await proc.exited;
      return { code, stdout, stderr, timedOut };
    } finally {
      // Always disarm the kill timer, including when reading output rejects,
      // so a pending timer never outlives this call.
      if (timer !== undefined) clearTimeout(timer);
    }
  };
}
95
+
96
/**
 * Build the `claude` AgentPort.
 *
 * `invoke` runs `claude -p <prompt> --output-format json` (plus `--model` when
 * the request names one). `followUp` continues an earlier session with
 * `--resume <sessionId>`.
 *
 * `followUp` only receives a session id and a prompt, so the adapter remembers
 * the `cwd`, `timeoutMs` and `model` of the call that produced each session and
 * reapplies them on the follow-up. Without that, a follow-up would run in the
 * adapter process's own directory, with no timeout, and without the model flag.
 * NOTE(review): Claude Code appears to scope stored sessions to the working
 * directory, in which case `--resume` from another cwd cannot find the session —
 * confirm against the CLI docs.
 *
 * @param deps Optional injected spawn and binary name; defaults to Bun.spawn on "claude".
 * @returns An AgentPort named "claude". Its methods resolve with an AgentResult
 *   for every outcome handled here, including a throwing spawn.
 */
export function makeClaudeAdapter(deps: ClaudeAdapterDeps = {}): AgentPort {
  const spawn = deps.spawn ?? defaultSpawn(deps.bin ?? "claude");

  type SessionContext = { cwd?: string; timeoutMs?: number; model?: string };
  // Per-session call options, keyed by the session id the CLI reported. One
  // small entry per session for the adapter's lifetime.
  const sessions = new Map<string, SessionContext>();

  /** Spawn the CLI once and translate every outcome into an AgentResult. */
  async function run(args: string[], cwd?: string, timeoutMs?: number): Promise<AgentResult> {
    let res: ClaudeSpawnResult;
    try {
      res = await spawn(args, { cwd, timeoutMs });
    } catch (e) {
      // A throw at the spawn boundary IS an adapter failure (e.g. ENOENT).
      return { ok: false, kind: "spawn_failure", stderr: errMsg(e), meta: { durationMs: 0 } };
    }

    if (res.timedOut) {
      return { ok: false, kind: "timeout", stderr: res.stderr || `timed out after ${timeoutMs}ms`, meta: { durationMs: 0 } };
    }
    if (res.code !== 0) {
      return {
        ok: false,
        kind: "nonzero_exit",
        stderr: res.stderr || res.stdout || `claude exited ${res.code}`,
        meta: { durationMs: 0 },
      };
    }

    let json: unknown;
    try {
      json = JSON.parse(res.stdout);
    } catch {
      // Exit 0 but stdout was not JSON: report a bounded excerpt for the journal.
      return {
        ok: false,
        kind: "nonzero_exit",
        stderr: `unparseable claude output: ${res.stdout.slice(0, 200)}`,
        meta: { durationMs: 0 },
      };
    }
    return parseClaudeResult(json);
  }

  /** Run the CLI, then record the call options under whatever session id came back. */
  async function runInContext(args: string[], ctx: SessionContext): Promise<AgentResult> {
    const result = await run(args, ctx.cwd, ctx.timeoutMs);
    const sessionId = result.ok ? result.meta?.sessionId : undefined;
    if (typeof sessionId === "string" && sessionId !== "") {
      sessions.set(sessionId, ctx);
    }
    return result;
  }

  return {
    name: "claude",
    invoke(req: InvokeRequest): Promise<AgentResult> {
      const args = ["-p", req.prompt, "--output-format", "json"];
      if (req.model) args.push("--model", req.model);
      return runInContext(args, { cwd: req.cwd, timeoutMs: req.timeoutMs, model: req.model });
    },
    followUp(sessionId: string, prompt: string): Promise<AgentResult> {
      // Unknown session ids (not produced by this adapter instance) fall back
      // to the bare invocation: no cwd, no timeout, no model flag.
      const ctx = sessions.get(sessionId) ?? {};
      const args = ["-p", prompt, "--resume", sessionId, "--output-format", "json"];
      if (ctx.model) args.push("--model", ctx.model);
      return runInContext(args, ctx);
    },
  };
}