pi-prompt-template-model 0.6.8 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +56 -0
- package/README.md +115 -3
- package/args.ts +306 -6
- package/examples/best-of-n.md +28 -0
- package/index.ts +533 -18
- package/loop-utils.ts +6 -0
- package/model-selection.ts +12 -4
- package/package.json +6 -5
- package/prompt-execution.ts +2 -3
- package/prompt-loader.ts +550 -8
- package/subagent-runtime.ts +3 -0
- package/subagent-step.ts +52 -27
- package/subagent-widget.ts +158 -68
- package/tool-manager.ts +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,61 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.7.0] - 2026-04-01
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
- Added a first-class `bestOfN:` prompt-template authoring surface for compare workflows, with configurable `workers`, `reviewers`, optional `finalApplier`, and nested `bestOfN.worktree` support.
|
|
7
|
+
- Added configurable compare runtime overrides via `--workers`, `--reviewers`, `--workers-append`, `--reviewers-append`, and `--final-applier`.
|
|
8
|
+
- Added `count: N` compare-slot shorthand so one worker or reviewer slot can fan out into repeated identical runs without manual duplication.
|
|
9
|
+
- Added a shipped `/best-of-n` example prompt that demonstrates mixed worker counts, mixed reviewer counts, thinking-level model suffixes, `taskSuffix`, and final real-branch application.
|
|
10
|
+
|
|
11
|
+
### Changed
|
|
12
|
+
- Compare prompts now run as worker fan-out, reviewer fan-in, then an optional final apply step that edits the real branch instead of stopping at recommendation prose.
|
|
13
|
+
- Reviewer defaults now stay findings-only, while `finalApplier` handles winner-picking or synthesis plus best-effort verification on the current branch.
|
|
14
|
+
- Legacy top-level compare frontmatter in templates (`workers`, `reviewers`, `finalApplier`, top-level compare `worktree`) is now rejected in favor of `bestOfN:`.
|
|
15
|
+
- Delegated parallel live rendering now shows richer per-task output, including task index, model, recent tools, and bounded rolling output lines.
|
|
16
|
+
- README and bundled examples were refreshed to match the shipped compare/runtime surface, including `bestOfN`, compare-wide vs slot-level `cwd`, and chain `parallel(...)` `cwd` rules under `worktree: true`.
|
|
17
|
+
|
|
18
|
+
### Fixed
|
|
19
|
+
- Compare execution now allows partial success by phase: worker and reviewer phases continue as long as at least one slot succeeds, and `finalApplier` can still fall back to worker-only evidence when every reviewer fails.
|
|
20
|
+
- Preserved successful worker `=== Worktree Changes ===` summaries when building reviewer and final-apply inputs, so downstream compare stages do not lose worktree evidence.
|
|
21
|
+
- Invalid `bestOfN` blocks and legacy compare templates no longer degrade into ordinary runnable prompts after diagnostics; they are skipped entirely.
|
|
22
|
+
- `finalApplier` validation now correctly rejects unsupported `cwd` and `count` fields in both frontmatter and runtime overrides.
|
|
23
|
+
- Corrected remaining docs/tooling drift, including the `run-prompt` unlimited-loop cap text and compare/chain `cwd` wording.
|
|
24
|
+
|
|
25
|
+
## [0.6.10] - 2026-03-30
|
|
26
|
+
|
|
27
|
+
### Added
|
|
28
|
+
- Added lineup-based compare frontmatter for prompt templates: `workers` and `reviewers` slot lists (`agent` or `subagent`, optional `model`, optional `task`, optional `taskSuffix`, optional `cwd`, optional `count`), plus optional `finalApplier` for one final apply step.
|
|
29
|
+
- Added a two-phase compare execution flow for lineup templates: parallel worker phase first, then reviewer phase fed by aggregated worker output.
|
|
30
|
+
- Added runtime lineup override flags for compare templates: `--workers`, `--reviewers`, `--workers-append`, `--reviewers-append`, and `--final-applier`.
|
|
31
|
+
- Added compare lineup `count: N` shorthand so one worker or reviewer slot can expand into repeated identical runs without manually duplicating entries.
|
|
32
|
+
- Added an example compare prompt template under `examples/`: `best-of-n`, intended for manual installation into `~/.pi/agent/prompts/`.
|
|
33
|
+
- Added compare-lineup `subagent` shorthand so prompt authors can use `subagent: true` for default worker/reviewer slots instead of spelling the internal `delegate` / `reviewer` agent names directly.
|
|
34
|
+
|
|
35
|
+
### Changed
|
|
36
|
+
- Prompt-template compare authoring now uses nested `bestOfN:` frontmatter; the loader lowers `bestOfN.workers`, `bestOfN.reviewers`, `bestOfN.finalApplier`, and `bestOfN.worktree` into the existing runtime compare fields and rejects legacy top-level compare fields in templates.
|
|
37
|
+
- Removed fixed three-worker compare assumptions from docs and prompt guidance; compare lineups are now caller-defined and duplicate slots are preserved.
|
|
38
|
+
- The shipped `best-of-n` example now shows mixed workers, mixed reviewers, and an optional final apply phase, while the runtime fallback still stays at one worker when `workers` is omitted.
|
|
39
|
+
- Reviewers now default to findings-only output, and the optional final phase now applies the real-branch patch instead of ending at recommendation prose.
|
|
40
|
+
- Parallel delegated task contract now supports per-task `cwd` passthrough end-to-end across the prompt-template bridge.
|
|
41
|
+
- README now explains same-model vs multi-model best-of-N configuration explicitly.
|
|
42
|
+
|
|
43
|
+
### Fixed
|
|
44
|
+
- Compare execution now uses partial-success-by-phase behavior: worker and reviewer phases continue as long as at least one slot succeeds, and `finalApplier` can fall back to worker-only synthesis if reviewer slots all fail.
|
|
45
|
+
- Compare lineup slots now support `taskSuffix` so shared worker/reviewer instructions can stay in the prompt body while slots add small per-model suffixes such as output-file paths.
|
|
46
|
+
- Documented the current compare worktree constraint explicitly: when `worktree: true` is enabled, all worker slots must resolve to the same `cwd`.
|
|
47
|
+
|
|
48
|
+
## [0.6.9] - 2026-03-28
|
|
49
|
+
|
|
50
|
+
### Added
|
|
51
|
+
- Added `worktree: true` frontmatter and `--worktree` runtime flag for chain templates with parallel steps. When enabled, each parallel subagent runs in its own git worktree to avoid file conflicts during concurrent execution. Requires a chain with at least one `parallel()` step.
|
|
52
|
+
- Added `parallel: N` frontmatter for delegated prompts. This expands one delegated prompt into `N` parallel `pi-subagents` tasks targeting the same agent, with automatic slot headers like `[Parallel subagent 2/3]` prepended to each task.
|
|
53
|
+
|
|
54
|
+
### Changed
|
|
55
|
+
- Bumped `@mariozechner/pi-agent-core`, `@mariozechner/pi-ai`, `@mariozechner/pi-coding-agent`, and `@mariozechner/pi-tui` to `^0.64.0`.
|
|
56
|
+
- Delegated prompt execution now forwards `ctx.signal` into subagent runs so turn cancellation can stop in-flight delegated work for both single delegated prompts and delegated parallel chain steps.
|
|
57
|
+
- `worktree: true` now also works on delegated prompts that use `parallel: N`, not just chain templates with `parallel()` steps.
|
|
58
|
+
|
|
3
59
|
## [0.6.8] - 2026-03-28
|
|
4
60
|
|
|
5
61
|
### Added
|
package/README.md
CHANGED
|
@@ -73,6 +73,7 @@ All fields are optional. Templates that don't use any extension features (no `mo
|
|
|
73
73
|
| `rotate` | `false` | When `true` and looping, cycle through models in the `model` list instead of using fallback semantics. Thinking levels can also be comma-separated to pair with each model. |
|
|
74
74
|
| `fresh` | `false` | When looping, collapse the conversation between iterations to a brief summary instead of carrying the full context forward. Saves tokens on long loops. |
|
|
75
75
|
| `converge` | `true` | When looping, stop early if an iteration makes no file changes. Set `false` to always run every iteration. |
|
|
76
|
+
| `worktree` | `false` | When `true`, parallel delegated work runs in separate git worktrees. Valid on chain templates with `parallel()` steps, on delegated prompts with `parallel: N`, and on compare templates via `bestOfN.worktree`. |
|
|
76
77
|
|
|
77
78
|
### Delegation
|
|
78
79
|
|
|
@@ -80,7 +81,12 @@ All fields are optional. Templates that don't use any extension features (no `mo
|
|
|
80
81
|
|-------|---------|--------------|
|
|
81
82
|
| `subagent` | — | Delegate execution to a subagent instead of running in the current session. `true` uses the default `delegate` agent; a string value like `reviewer` targets that specific agent. Requires [pi-subagents](https://github.com/nicobailon/pi-subagents/). |
|
|
82
83
|
| `inheritContext` | `false` | Only meaningful with `subagent`. When `true`, the subagent receives a fork of the current conversation context instead of starting fresh. |
|
|
83
|
-
| `
|
|
84
|
+
| `parallel` | — | Delegated prompts only. Repeats the same subagent in parallel `N` times. Each copy gets a slot header like `[Parallel subagent 2/3]` prepended to the task. Must be an integer greater than or equal to 2. |
|
|
85
|
+
| `bestOfN` | — | Compare templates only. Nested compare authoring block with `workers`, `reviewers`, optional `finalApplier`, and optional `worktree`. Top-level compare fields are not supported in templates. |
|
|
86
|
+
| `bestOfN.workers` | — | Ordered worker lineup used for the worker phase. Each slot object supports optional `agent`/`subagent`, optional `model`, optional `task`, optional `taskSuffix`, optional `cwd`, and optional `count`. If both `agent` and `subagent` are omitted, the default agent is `delegate`. |
|
|
87
|
+
| `bestOfN.reviewers` | — | Ordered reviewer lineup used after worker aggregation. Slot shape matches workers. If both `agent` and `subagent` are omitted, the default agent is `reviewer`. |
|
|
88
|
+
| `bestOfN.finalApplier` | — | Optional single-slot final apply phase that edits the real branch after reviewers. Supports optional `agent`/`subagent`, optional `model`, optional `task`, and optional `taskSuffix`. If both `agent` and `subagent` are omitted, the default agent is `delegate`. `count` and `cwd` are not supported. Requires `bestOfN.worktree: true` at runtime. |
|
|
89
|
+
| `cwd` | — | Working directory for delegated subagent subprocesses. Must be an absolute path (`~/...` is expanded). Valid with `subagent`, on chain templates as the default cwd for delegated steps, and on compare prompts as the default repo cwd. Worker/reviewer slots can also set their own `cwd` inside `bestOfN.workers` / `bestOfN.reviewers`. |
|
|
84
90
|
|
|
85
91
|
## Model Format
|
|
86
92
|
|
|
@@ -238,6 +244,21 @@ Use url in the prompt to take screenshot: $@
|
|
|
238
244
|
|
|
239
245
|
The subagent process runs with `/tmp/screenshots` as its working directory. Paths must be absolute (`~/...` is expanded). The directory is validated at execution time.
|
|
240
246
|
|
|
247
|
+
To fan the same delegated prompt out to multiple copies in parallel, add `parallel: N`:
|
|
248
|
+
|
|
249
|
+
```markdown
|
|
250
|
+
---
|
|
251
|
+
model: anthropic/claude-sonnet-4-20250514
|
|
252
|
+
subagent: simplifier
|
|
253
|
+
inheritContext: true
|
|
254
|
+
parallel: 3
|
|
255
|
+
worktree: true
|
|
256
|
+
---
|
|
257
|
+
Review changed code and fix any issues found.
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
This expands to three parallel `pi-subagents` tasks targeting the same agent. Each one receives the same rendered prompt plus an automatic slot header like `[Parallel subagent 1/3]`, `[Parallel subagent 2/3]`, and `[Parallel subagent 3/3]` so the body can assign different roles to each copy. `worktree: true` is optional here and gives each parallel run its own git worktree.
|
|
261
|
+
|
|
241
262
|
During execution, a live progress widget appears above the editor showing elapsed time, tool count, token usage, and the current tool. When the run finishes, it's replaced by a completion card with the task preview, tool call history, output, and usage stats.
|
|
242
263
|
|
|
243
264
|
You can override delegation at runtime per invocation with `--subagent`, `--subagent=<name>`, `--subagent:<name>`, or `--cwd=<path>`. `--cwd=<path>` must be absolute after optional `~/...` expansion. Runtime flags take precedence for that invocation only.
|
|
@@ -253,6 +274,85 @@ Two additional runtime flags work for any prompt (not just delegated ones):
|
|
|
253
274
|
/deslop --model=openai/gpt-5.4 --loop 3
|
|
254
275
|
```
|
|
255
276
|
|
|
277
|
+
Compare templates also accept runtime lineup overrides:
|
|
278
|
+
|
|
279
|
+
Prompt-template frontmatter authoring uses `bestOfN:`. Runtime overrides stay on the low-level flags below.
|
|
280
|
+
|
|
281
|
+
- `--workers=<json-array>` / `--reviewers=<json-array>` replace the corresponding frontmatter lineup.
|
|
282
|
+
- `--workers-append=<json-array>` / `--reviewers-append=<json-array>` append to the corresponding lineup.
|
|
283
|
+
- `--final-applier=<json-object-or-one-element-array>` replaces the optional final apply slot.
|
|
284
|
+
|
|
285
|
+
Each worker/reviewer JSON array entry must be an object with either `subagent` or `agent`, plus optional `model`, `task`, `taskSuffix`, `cwd`, and `count`. In worker slots, `"subagent": true` maps to `delegate`. In reviewer slots, `"subagent": true` maps to `reviewer`. `--final-applier=` accepts one slot object (or a one-element array) with `subagent`/`agent`, optional `model`, optional `task`, and optional `taskSuffix`; for this final slot, `"subagent": true` maps to `delegate`, and neither `count` nor `cwd` is supported.
|
|
286
|
+
|
|
287
|
+
## Best-of-N Compare Prompt
|
|
288
|
+
|
|
289
|
+
This repo ships one example compare prompt under `examples/`:
|
|
290
|
+
|
|
291
|
+
- `examples/best-of-n.md` installs as `/best-of-n`, runs in the current repo, and shows mixed workers, mixed reviewers, and an optional final apply phase.
|
|
292
|
+
- Smoke test: `/best-of-n smoke test`.
|
|
293
|
+
|
|
294
|
+
Install it manually from this repo checkout (or from the installed package directory):
|
|
295
|
+
|
|
296
|
+
```bash
|
|
297
|
+
PTM_DIR=/path/to/pi-prompt-template-model
|
|
298
|
+
mkdir -p ~/.pi/agent/prompts
|
|
299
|
+
cp "$PTM_DIR/examples/best-of-n.md" ~/.pi/agent/prompts/best-of-n.md
|
|
300
|
+
```
|
|
301
|
+
|
|
302
|
+
After copying the file, restart `pi` if it is already running. The prompt then runs an explicit compare flow:
|
|
303
|
+
|
|
304
|
+
Compare prompt templates are authored under `bestOfN:`. Top-level `workers`, `reviewers`, and `finalApplier` frontmatter fields are rejected with migration diagnostics.
|
|
305
|
+
|
|
306
|
+
1. Worker phase: run the worker lineup in parallel (`context: fork`) so workers generate candidate implementations in temporary worktrees.
|
|
307
|
+
2. Continue as long as at least one worker succeeds. Reviewer slots receive successful worker outputs plus worker/worktree summaries and produce findings only.
|
|
308
|
+
3. Optional final apply phase: if `finalApplier` is configured, run one delegated apply step on the real compare repo (`compareCwd`) to pick a winner or synthesize/cherry-pick and apply the final patch.
|
|
309
|
+
4. If all reviewers fail but `finalApplier` exists, the final apply step still runs with fallback context from workers plus reviewer failure summaries.
|
|
310
|
+
|
|
311
|
+
Worker/reviewer lineups are fully configurable from `bestOfN` frontmatter or runtime overrides, so there is no fixed three-model worker assumption. If a compare prompt omits `bestOfN.workers`, it falls back to one `delegate` worker using the current/main model. If it omits `bestOfN.reviewers`, it falls back to one `reviewer` slot. `bestOfN.finalApplier` is optional, and compare runs reject an effective final applier unless `bestOfN.worktree: true` is set.
|
|
312
|
+
|
|
313
|
+
For same-model best-of-N, use `count: N` on one worker slot:
|
|
314
|
+
|
|
315
|
+
```yaml
|
|
316
|
+
bestOfN:
|
|
317
|
+
workers:
|
|
318
|
+
- model: openai-codex/gpt-5.4:low
|
|
319
|
+
count: 4
|
|
320
|
+
```
|
|
321
|
+
|
|
322
|
+
You can also mix models and give each slot its own count:
|
|
323
|
+
|
|
324
|
+
```yaml
|
|
325
|
+
bestOfN:
|
|
326
|
+
workers:
|
|
327
|
+
- model: openai-codex/gpt-5.4:low
|
|
328
|
+
count: 3
|
|
329
|
+
- model: google/gemini-2.5-pro:medium
|
|
330
|
+
count: 2
|
|
331
|
+
- model: anthropic/claude-sonnet-4-20250514:high
|
|
332
|
+
```
|
|
333
|
+
|
|
334
|
+
Reviewer slots support the same lineup shape, and `bestOfN.finalApplier` is one optional single-slot final apply step:
|
|
335
|
+
|
|
336
|
+
```yaml
|
|
337
|
+
bestOfN:
|
|
338
|
+
reviewers:
|
|
339
|
+
- model: openai-codex/gpt-5.4:low
|
|
340
|
+
count: 2
|
|
341
|
+
- model: google/gemini-2.5-pro:medium
|
|
342
|
+
taskSuffix: Focus on regression risk.
|
|
343
|
+
finalApplier:
|
|
344
|
+
model: anthropic/claude-sonnet-4-20250514:high
|
|
345
|
+
taskSuffix: Apply the final patch on the current branch and report verification.
|
|
346
|
+
```
|
|
347
|
+
|
|
348
|
+
Within compare lineups, omitting both `agent` and `subagent` uses phase defaults: `delegate` in workers, `reviewer` in reviewers, and `delegate` in finalApplier. You can still set explicit `agent` or `subagent` when needed.
|
|
349
|
+
|
|
350
|
+
Explicitly repeating the same slot still works, but `count: N` is the cleaner shorthand when the slot is identical.
|
|
351
|
+
|
|
352
|
+
Within a compare lineup, use `task` for a full per-slot override and `taskSuffix` for a small per-slot append. `taskSuffix` is added after the shared worker task (or after the slot's `task` if you set one), which makes it the better fit for things like per-model output file names.
|
|
353
|
+
|
|
354
|
+
When a compare prompt uses `bestOfN.worktree: true`, all worker slots must resolve to the same `cwd`. Mixed worker `cwd` values are only allowed when worktree isolation is off. Worktree isolation is for the worker phase only; `bestOfN.finalApplier` always applies on the real branch (`compareCwd`).
|
|
355
|
+
|
|
256
356
|
## Loop Execution
|
|
257
357
|
|
|
258
358
|
Run a template multiple times with `--loop`:
|
|
@@ -387,11 +487,22 @@ chain: parallel(scan-frontend, scan-backend) -> consolidate
|
|
|
387
487
|
---
|
|
388
488
|
```
|
|
389
489
|
|
|
390
|
-
Each entry inside `parallel(...)` runs as a delegated subagent task concurrently. Parallel entries can include per-step args (for example `parallel(scan-frontend, scan-backend "auth")`), but per-step `--loop` is not supported inside parallel groups. Nested `parallel(...)` is rejected. Parallel entries must be delegated templates (`subagent: ...` or runtime `--subagent` override)
|
|
490
|
+
Each entry inside `parallel(...)` runs as a delegated subagent task concurrently. Parallel entries can include per-step args (for example `parallel(scan-frontend, scan-backend "auth")`), but per-step `--loop` is not supported inside parallel groups. Nested `parallel(...)` is rejected. Parallel entries must be delegated templates (`subagent: ...` or runtime `--subagent` override). All entries in the same parallel group must resolve to the same `inheritContext` mode. Mixed `cwd` values are allowed normally, but when `worktree: true` is enabled they must all resolve to the same `cwd`.
|
|
491
|
+
|
|
492
|
+
Add `worktree: true` (or `--worktree` at runtime) so each parallel subagent runs in its own git worktree, avoiding file conflicts when agents edit concurrently:
|
|
493
|
+
|
|
494
|
+
```markdown
|
|
495
|
+
---
|
|
496
|
+
chain: parallel(scan-frontend, scan-backend) -> consolidate
|
|
497
|
+
worktree: true
|
|
498
|
+
---
|
|
499
|
+
```
|
|
500
|
+
|
|
501
|
+
On chain templates, `worktree` requires at least one `parallel()` step. The flag is passed to pi-subagents, which handles worktree creation and cleanup.
|
|
391
502
|
|
|
392
503
|
Steps with a `model` field use their own model. Steps without one inherit a snapshot of whatever model was active when the chain started — not the previous step's model. This keeps behavior deterministic regardless of what earlier steps do.
|
|
393
504
|
|
|
394
|
-
Chain templates support `loop`, `fresh`, `converge`, `restore`, and `cwd` in their frontmatter for controlling the overall execution:
|
|
505
|
+
Chain templates support `loop`, `fresh`, `converge`, `restore`, `worktree`, and `cwd` in their frontmatter for controlling the overall execution:
|
|
395
506
|
|
|
396
507
|
```markdown
|
|
397
508
|
---
|
|
@@ -439,6 +550,7 @@ Parallel groups work in `/chain-prompts` too:
|
|
|
439
550
|
|
|
440
551
|
```
|
|
441
552
|
/chain-prompts parallel(scan-fe, scan-be) -> review
|
|
553
|
+
/chain-prompts parallel(scan-fe, scan-be) -> review --worktree
|
|
442
554
|
```
|
|
443
555
|
|
|
444
556
|
Looping applies to the entire chain:
|
package/args.ts
CHANGED
|
@@ -24,6 +24,27 @@ export interface SubagentOverrideExtraction {
|
|
|
24
24
|
fork?: boolean;
|
|
25
25
|
}
|
|
26
26
|
|
|
27
|
+
/**
 * One validated slot produced from a runtime lineup override flag
 * (`--workers`, `--reviewers`, their `-append` variants, or `--final-applier`).
 */
export interface LineupOverrideSlot {
	/** Concrete agent name; a raw `"subagent": true` has already been resolved to a default name. */
	agent: string;
	/** Trimmed model identifier, omitted when the raw slot had none. */
	model?: string;
	/** Full per-slot task override. */
	task?: string;
	/** Small per-slot text appended after the task. */
	taskSuffix?: string;
	/** Per-slot working directory (rejected by the parser for the final-applier slot). */
	cwd?: string;
	/** Number of identical runs for this slot; an integer >= 1 (rejected for the final-applier slot). */
	count?: number;
}

/** A single parsed override flag: which lineup it targets and how it is applied. */
export interface LineupOverrideAction {
	/** Lineup the flag addresses. */
	target: "workers" | "reviewers" | "finalApplier";
	/** `replace` swaps the lineup, `append` extends it. */
	mode: "replace" | "append";
	/** Validated slots carried by the flag, in the order they were written. */
	slots: LineupOverrideSlot[];
}

/** Result of scanning an argument string for lineup override flags. */
export interface LineupOverrideExtraction {
	/** Remaining argument string (presumably with the override flags removed — confirm against `extractLineupOverrides`). */
	args: string;
	/** Parsed override actions. */
	actions: LineupOverrideAction[];
	/** Human-readable validation errors collected while parsing. */
	errors: string[];
}
|
|
47
|
+
|
|
27
48
|
export function extractLoopCount(argsString: string): LoopExtraction | null {
|
|
28
49
|
let loopCount: number | null = null;
|
|
29
50
|
let loopFound = false;
|
|
@@ -165,8 +186,8 @@ export function extractLoopFlags(argsString: string): LoopFlags {
|
|
|
165
186
|
return { args: cleaned.trim(), fresh, converge: !noConverge };
|
|
166
187
|
}
|
|
167
188
|
|
|
168
|
-
|
|
169
|
-
let
|
|
189
|
+
function extractBooleanFlag(argsString: string, flag: string): { args: string; found: boolean } {
|
|
190
|
+
let found = false;
|
|
170
191
|
const tokensToRemove: Array<{ start: number; end: number }> = [];
|
|
171
192
|
|
|
172
193
|
let i = 0;
|
|
@@ -190,14 +211,14 @@ export function extractChainContextFlag(argsString: string): { args: string; cha
|
|
|
190
211
|
while (i < argsString.length && !/\s/.test(argsString[i])) i++;
|
|
191
212
|
const token = argsString.slice(tokenStart, i);
|
|
192
213
|
|
|
193
|
-
if (token ===
|
|
194
|
-
|
|
214
|
+
if (token === flag) {
|
|
215
|
+
found = true;
|
|
195
216
|
tokensToRemove.push({ start: tokenStart, end: i });
|
|
196
217
|
}
|
|
197
218
|
}
|
|
198
219
|
|
|
199
220
|
if (tokensToRemove.length === 0) {
|
|
200
|
-
return { args: argsString.trim(),
|
|
221
|
+
return { args: argsString.trim(), found: false };
|
|
201
222
|
}
|
|
202
223
|
|
|
203
224
|
tokensToRemove.sort((a, b) => b.start - a.start);
|
|
@@ -206,7 +227,17 @@ export function extractChainContextFlag(argsString: string): { args: string; cha
|
|
|
206
227
|
cleaned = cleaned.slice(0, start) + cleaned.slice(end);
|
|
207
228
|
}
|
|
208
229
|
|
|
209
|
-
return { args: cleaned.trim(),
|
|
230
|
+
return { args: cleaned.trim(), found };
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
/**
 * Looks for the `--chain-context` flag in an argument string.
 *
 * Delegates the scan to `extractBooleanFlag` and re-labels its result.
 *
 * @param argsString Raw argument string.
 * @returns The argument string returned by `extractBooleanFlag` and whether the flag was found.
 */
export function extractChainContextFlag(argsString: string): { args: string; chainContext: boolean } {
	const result = extractBooleanFlag(argsString, "--chain-context");
	return { args: result.args, chainContext: result.found };
}
|
|
237
|
+
|
|
238
|
+
/**
 * Looks for the `--worktree` flag in an argument string.
 *
 * Delegates the scan to `extractBooleanFlag` and re-labels its result.
 *
 * @param argsString Raw argument string.
 * @returns The argument string returned by `extractBooleanFlag` and whether the flag was found.
 */
export function extractWorktreeFlag(argsString: string): { args: string; worktree: boolean } {
	const result = extractBooleanFlag(argsString, "--worktree");
	return { args: result.args, worktree: result.found };
}
|
|
211
242
|
|
|
212
243
|
export function extractSubagentOverride(argsString: string): SubagentOverrideExtraction {
|
|
@@ -290,6 +321,275 @@ export function extractSubagentOverride(argsString: string): SubagentOverrideExt
|
|
|
290
321
|
};
|
|
291
322
|
}
|
|
292
323
|
|
|
324
|
+
/**
 * Parses and validates the raw JSON value of one lineup override flag.
 *
 * Worker/reviewer flags require a non-empty JSON array of slot objects. The
 * final-applier flag accepts a single slot object or a one-element array and
 * rejects `cwd` and `count`. Parsing stops at the first problem: one message
 * is pushed onto `errors` and `undefined` is returned.
 *
 * @param raw Raw flag value (JSON text).
 * @param target Lineup the flag addresses.
 * @param mode Whether the flag replaces or appends to the lineup.
 * @param errors Sink that receives the validation message on failure.
 * @returns The parsed action, or `undefined` when validation failed.
 */
function parseLineupOverrideSlots(
	raw: string,
	target: "workers" | "reviewers" | "finalApplier",
	mode: "replace" | "append",
	errors: string[],
): LineupOverrideAction | undefined {
	const isFinalApplier = target === "finalApplier";

	// Flag name used in every message; the final applier has no "-append" form.
	let flagName: string = target;
	if (isFinalApplier) {
		flagName = "final-applier";
	} else if (mode === "append") {
		flagName = `${target}-append`;
	}
	const label = `--${flagName}`;

	// Records one validation message and yields the failure result.
	const reject = (detail: string): undefined => {
		errors.push(`Invalid ${label}: ${detail}`);
		return undefined;
	};
	// Trimmed string when the value is a non-blank string, otherwise undefined.
	const nonBlank = (value: unknown): string | undefined => {
		if (typeof value !== "string") return undefined;
		const trimmed = value.trim();
		return trimmed ? trimmed : undefined;
	};

	if (!raw) {
		return reject(
			`expected ${isFinalApplier ? "a slot object or a one-element JSON array" : "a JSON array of slot objects"}.`,
		);
	}

	let parsed: unknown;
	try {
		parsed = JSON.parse(raw);
	} catch (error) {
		const reason = error instanceof Error ? error.message : String(error);
		return reject(`expected valid JSON (${reason}).`);
	}

	// Normalize to a list of candidate slots, or null when the shape is wrong.
	let entries: unknown[] | null;
	if (Array.isArray(parsed)) {
		const lengthOk = isFinalApplier ? parsed.length === 1 : parsed.length > 0;
		entries = lengthOk ? parsed : null;
	} else {
		entries = isFinalApplier ? [parsed] : null;
	}
	if (entries === null) {
		return reject(
			isFinalApplier
				? "expected a slot object or a one-element JSON array."
				: "expected a non-empty JSON array.",
		);
	}

	const slots: LineupOverrideSlot[] = [];
	for (const [index, entry] of entries.entries()) {
		const position = index + 1;
		if (typeof entry !== "object" || entry === null || Array.isArray(entry)) {
			return reject(`slot ${position} must be an object.`);
		}
		const slot = entry as Record<string, unknown>;
		if (slot.agent !== undefined && slot.subagent !== undefined) {
			return reject(`slot ${position} cannot combine "agent" and "subagent".`);
		}

		let agent = nonBlank(slot.agent);
		if (agent === undefined && slot.agent !== undefined) {
			return reject(`slot ${position} requires a non-empty string "agent".`);
		}
		if (agent === undefined && slot.subagent !== undefined) {
			if (slot.subagent === true) {
				// `true` picks the phase default agent.
				agent = target === "reviewers" ? "reviewer" : "delegate";
			} else {
				const named = nonBlank(slot.subagent);
				if (named === undefined) {
					return reject(`slot ${position} requires "subagent" to be true or a non-empty string.`);
				}
				agent = named;
			}
		}
		if (agent === undefined) {
			return reject(`slot ${position} requires "agent" or "subagent".`);
		}

		if (isFinalApplier && slot.cwd !== undefined) {
			return reject(`slot ${position} "cwd" is not supported.`);
		}

		let count: number | undefined;
		const requested = slot.count;
		if (requested !== undefined) {
			if (isFinalApplier) {
				return reject(`slot ${position} "count" is not supported.`);
			}
			if (!(typeof requested === "number" && Number.isInteger(requested) && requested >= 1)) {
				return reject(`slot ${position} "count" must be an integer greater than or equal to 1.`);
			}
			count = requested;
		}

		// Optional fields are only attached when they carry a value.
		const built: LineupOverrideSlot = { agent };
		const model = nonBlank(slot.model);
		if (model !== undefined) built.model = model;
		const task = nonBlank(slot.task);
		if (task !== undefined) built.task = task;
		const taskSuffix = nonBlank(slot.taskSuffix);
		if (taskSuffix !== undefined) built.taskSuffix = taskSuffix;
		const cwd = nonBlank(slot.cwd);
		if (cwd !== undefined) built.cwd = cwd;
		if (count !== undefined) built.count = count;
		slots.push(built);
	}

	return { target, mode, slots };
}
|
|
429
|
+
|
|
430
|
+
/** Maps one `--flag=` prefix to the lineup it targets and how it is applied. */
interface LineupOverrideFlagSpec {
	/** Flag prefix including the trailing `=`. */
	flag: string;
	/** Lineup addressed by the flag. */
	target: "workers" | "reviewers" | "finalApplier";
	/** Whether the flag replaces or appends to the lineup. */
	mode: "replace" | "append";
}

/** Recognized lineup override flags, in the order they are tried. */
const LINEUP_OVERRIDE_FLAGS: LineupOverrideFlagSpec[] = (
	[
		["--workers-append=", "workers", "append"],
		["--reviewers-append=", "reviewers", "append"],
		["--workers=", "workers", "replace"],
		["--reviewers=", "reviewers", "replace"],
		["--final-applier=", "finalApplier", "replace"],
	] as const
).map(([flag, target, mode]) => ({ flag, target, mode }));
|
|
443
|
+
|
|
444
|
+
/**
 * Reads a single- or double-quoted value starting at `start`.
 *
 * A backslash escapes the following character, so an escaped quote does not
 * terminate the value. When the escaped character is the delimiting quote the
 * backslash is dropped from the returned value, so `"[{\"agent\":\"x\"}]"`
 * yields parseable JSON. Every other escape sequence (for example `\n` or
 * `\\`) is preserved verbatim so escapes inside the payload keep their meaning.
 *
 * @param input Full argument string.
 * @param start Index of the opening quote.
 * @returns The unquoted value and the index just past the closing quote, or
 *   `undefined` when `start` is not a quote or the quote is never closed.
 */
function readQuotedValue(input: string, start: number): { value: string; end: number } | undefined {
	const quote = input[start];
	if (quote !== `"` && quote !== `'`) return undefined;

	let value = "";
	let i = start + 1;
	while (i < input.length) {
		const char = input[i];
		if (char === "\\") {
			// Only the delimiter escape is unwrapped; other escapes pass through unchanged.
			value += input[i + 1] === quote ? quote : input.slice(i, i + 2);
			i += 2;
			continue;
		}
		if (char === quote) {
			return { value, end: i + 1 };
		}
		value += char;
		i++;
	}

	// Ran off the end without seeing the closing quote.
	return undefined;
}
|
|
466
|
+
|
|
467
|
+
/**
 * Reads a bracket-balanced value (for example a JSON array or object) that
 * begins at `start`.
 *
 * Delimiters inside single- or double-quoted runs are ignored, and a backslash
 * inside a quoted run skips the following character.
 *
 * @param input Full argument string.
 * @param start Index expected to hold `open`.
 * @param open Opening delimiter.
 * @param close Closing delimiter.
 * @returns The delimited text (delimiters included) and the index just past
 *   it, or `undefined` when `start` is not `open` or the value never closes.
 */
function readBalancedValue(
	input: string,
	start: number,
	open: string,
	close: string,
): { value: string; end: number } | undefined {
	if (input[start] !== open) return undefined;

	let nesting = 0;
	let activeQuote: string | null = null;
	let pos = start;

	while (pos < input.length) {
		const ch = input[pos];
		if (activeQuote !== null) {
			if (ch === "\\") {
				// Skip the escaped character as well.
				pos++;
			} else if (ch === activeQuote) {
				activeQuote = null;
			}
		} else if (ch === `"` || ch === `'`) {
			activeQuote = ch;
		} else if (ch === open) {
			nesting++;
		} else if (ch === close) {
			nesting--;
			if (nesting === 0) {
				const end = pos + 1;
				return { value: input.slice(start, end), end };
			}
		}
		pos++;
	}

	return undefined;
}
|
|
510
|
+
|
|
511
|
+
/**
 * Reads the value portion of a lineup override flag, starting at `start`.
 *
 * Forms are tried in this order: a balanced `[...]`, a balanced `{...}`, a
 * quoted string, and finally a bare run of non-whitespace characters. A
 * structured form that fails to parse (for example an unbalanced bracket)
 * falls through to the bare-token rule.
 *
 * @param input Full string being scanned.
 * @param start Index of the first character after the flag's `=`.
 * @returns The value text and the index just past it; an empty value with
 *   `end === start` when `start` is at or beyond the end of `input`.
 */
function readLineupOverrideValue(input: string, start: number): { value: string; end: number } {
	if (start >= input.length) return { value: "", end: start };

	const structuredReaders = [
		() => readBalancedValue(input, start, "[", "]"),
		() => readBalancedValue(input, start, "{", "}"),
		() => readQuotedValue(input, start),
	];
	for (const read of structuredReaders) {
		const parsed = read();
		if (parsed) return parsed;
	}

	// Bare token: consume up to (not including) the next whitespace character.
	let end = start;
	for (; end < input.length; end++) {
		if (/\s/.test(input[end])) break;
	}
	return { value: input.slice(start, end), end };
}
|
|
530
|
+
|
|
531
|
+
/**
 * Attempts to parse a lineup override flag located exactly at `start`.
 *
 * The first entry of `LINEUP_OVERRIDE_FLAGS` whose `flag` prefix matches at
 * `start` wins; its value is then read with `readLineupOverrideValue`.
 *
 * @param input Full argument string.
 * @param start Index at which a flag may begin.
 * @returns The matched target and mode, the raw (unparsed) value text, and the
 *   index just past the value; `undefined` when no known flag starts here.
 */
function parseLineupOverrideToken(
	input: string,
	start: number,
): { target: "workers" | "reviewers" | "finalApplier"; mode: "replace" | "append"; raw: string; end: number } | undefined {
	const matched = LINEUP_OVERRIDE_FLAGS.find((candidate) => input.startsWith(candidate.flag, start));
	if (!matched) return undefined;

	const { value, end } = readLineupOverrideValue(input, start + matched.flag.length);
	return {
		target: matched.target,
		mode: matched.mode,
		raw: value,
		end,
	};
}
|
|
549
|
+
|
|
550
|
+
/**
 * Extracts lineup override flags from a raw argument string.
 *
 * The string is scanned left to right. Quoted spans and whitespace are skipped.
 * At every other position a lineup override flag is attempted; when one
 * matches, its raw value is handed to `parseLineupOverrideSlots` and the whole
 * flag token is dropped from the returned argument string. Any other token is
 * skipped up to the next whitespace character.
 *
 * NOTE(review): the quoted-span skip here does not treat a backslash as an
 * escape, whereas `readQuotedValue` does. A flag that follows an escaped quote
 * inside a quoted argument would therefore still be extracted. Confirm whether
 * this matches the rest of the argument parsing before changing it.
 *
 * @param argsString Raw argument string as typed by the user.
 * @returns The trimmed arguments with override tokens removed, the parsed
 *   override actions in order of appearance, and any errors collected while
 *   parsing override values.
 */
export function extractLineupOverrides(argsString: string): LineupOverrideExtraction {
	const actions: LineupOverrideAction[] = [];
	const errors: string[] = [];
	// Pieces of the input that survive once override tokens are cut out.
	const keptSegments: string[] = [];
	const total = argsString.length;

	let keptFrom = 0;
	let pos = 0;
	while (pos < total) {
		const current = argsString[pos];

		if (current === '"' || current === "'") {
			// Jump past the matching quote, or to the end when it is unterminated.
			const closing = argsString.indexOf(current, pos + 1);
			pos = closing === -1 ? total : closing + 1;
			continue;
		}

		if (/\s/.test(current)) {
			pos++;
			continue;
		}

		const token = parseLineupOverrideToken(argsString, pos);
		if (token) {
			keptSegments.push(argsString.slice(keptFrom, pos));
			keptFrom = token.end;

			const action = parseLineupOverrideSlots(token.raw, token.target, token.mode, errors);
			if (action) actions.push(action);

			pos = token.end;
			continue;
		}

		// Not an override flag: skip the rest of this token.
		while (pos < total && !/\s/.test(argsString[pos])) pos++;
	}

	keptSegments.push(argsString.slice(keptFrom));
	return { args: keptSegments.join("").trim(), actions, errors };
}
|
|
592
|
+
|
|
293
593
|
export function splitByUnquotedSeparator(input: string, separator: string): string[] {
|
|
294
594
|
const parts: string[] = [];
|
|
295
595
|
let start = 0;
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
---
|
|
2
|
+
description: Best-of-N parallel implementation compare in the current repo with one openai-codex lineup plus an optional final apply phase (worktree needs a clean repo)
|
|
3
|
+
# Usage: /best-of-n fix the flaky auth test
|
|
4
|
+
# Usage: /best-of-n implement the plan: /path/to/plan.md
|
|
5
|
+
bestOfN:
|
|
6
|
+
# Workers run in temporary worktrees; the final apply step edits the current branch.
|
|
7
|
+
worktree: true
|
|
8
|
+
workers:
|
|
9
|
+
# count means "run this exact slot N times in parallel".
|
|
10
|
+
# So the example below runs 3 spark workers and 2 gpt-5.4-mini workers in parallel.
|
|
11
|
+
- model: openai-codex/gpt-5.3-codex-spark:low
|
|
12
|
+
count: 3
|
|
13
|
+
- model: openai-codex/gpt-5.4-mini:high
|
|
14
|
+
count: 2
|
|
15
|
+
reviewers:
|
|
16
|
+
# All reviewers see the same aggregated successful worker results.
|
|
17
|
+
# count works the same way here: it runs the same reviewer slot multiple times in parallel.
|
|
18
|
+
# taskSuffix appends extra instructions to just that slot without replacing the shared prompt body.
|
|
19
|
+
- model: openai-codex/gpt-5.3-codex-spark:medium
|
|
20
|
+
count: 2
|
|
21
|
+
- model: openai-codex/gpt-5.4-mini:high
|
|
22
|
+
taskSuffix: Focus extra attention on regression risk and missing edge cases.
|
|
23
|
+
finalApplier:
|
|
24
|
+
# The final apply step picks or synthesizes from worker/reviewer findings and applies on the current branch.
|
|
25
|
+
model: openai-codex/gpt-5.4-mini:xhigh
|
|
26
|
+
taskSuffix: Apply the final patch directly on the current branch, run best-effort relevant verification, and report changed files plus verification run.
|
|
27
|
+
---
|
|
28
|
+
$@
|