@bastani/atomic 0.8.6 → 0.8.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. package/CHANGELOG.md +12 -0
  2. package/dist/builtin/intercom/package.json +1 -1
  3. package/dist/builtin/mcp/package.json +1 -1
  4. package/dist/builtin/subagents/package.json +1 -1
  5. package/dist/builtin/web-access/package.json +1 -1
  6. package/dist/builtin/workflows/builtin/ralph.ts +368 -52
  7. package/dist/builtin/workflows/package.json +1 -1
  8. package/dist/builtin/workflows/src/extension/index.ts +30 -2
  9. package/dist/builtin/workflows/src/runs/background/status.ts +6 -0
  10. package/dist/builtin/workflows/src/runs/foreground/executor.ts +2 -4
  11. package/dist/builtin/workflows/src/runs/foreground/stage-runner.ts +5 -5
  12. package/dist/builtin/workflows/src/shared/store-types.ts +8 -0
  13. package/dist/builtin/workflows/src/shared/store.ts +39 -4
  14. package/dist/builtin/workflows/src/shared/timing.ts +48 -0
  15. package/dist/builtin/workflows/src/tui/chat-surface-message.ts +21 -2
  16. package/dist/builtin/workflows/src/tui/graph-view.ts +17 -18
  17. package/dist/builtin/workflows/src/tui/inline-form-card.ts +2 -2
  18. package/dist/builtin/workflows/src/tui/inline-form-editor.ts +18 -15
  19. package/dist/builtin/workflows/src/tui/inputs-picker.ts +24 -22
  20. package/dist/builtin/workflows/src/tui/node-card.ts +3 -5
  21. package/dist/builtin/workflows/src/tui/prompt-card.ts +11 -11
  22. package/dist/builtin/workflows/src/tui/run-detail.ts +4 -6
  23. package/dist/builtin/workflows/src/tui/session-confirm.ts +93 -8
  24. package/dist/builtin/workflows/src/tui/session-picker.ts +10 -15
  25. package/dist/builtin/workflows/src/tui/stage-chat-view.ts +93 -22
  26. package/dist/builtin/workflows/src/tui/status-list.ts +4 -6
  27. package/dist/builtin/workflows/src/tui/text-helpers.ts +7 -1
  28. package/dist/builtin/workflows/src/tui/widget.ts +2 -1
  29. package/dist/builtin/workflows/src/tui/workflow-attach-pane.ts +2 -1
  30. package/package.json +1 -1
package/CHANGELOG.md CHANGED
@@ -2,6 +2,18 @@
2
2
 
3
3
  ## [Unreleased]
4
4
 
5
+ ## [0.8.7] - 2026-05-19
6
+
7
+ ### Changed
8
+
9
+ - Prepared the 0.8.7 release.
10
+
11
+ ## [0.8.7-0] - 2026-05-19
12
+
13
+ ### Changed
14
+
15
+ - Prepared the 0.8.7-0 prerelease.
16
+
5
17
  ## [0.8.6] - 2026-05-18
6
18
 
7
19
  ### Changed
package/dist/builtin/intercom/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@bastani/intercom",
3
- "version": "0.8.6",
3
+ "version": "0.8.7",
4
4
  "private": true,
5
5
  "description": "Atomic extension providing a private coordination channel between parent and child agent sessions.",
6
6
  "contributors": [
package/dist/builtin/mcp/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@bastani/mcp",
3
- "version": "0.8.6",
3
+ "version": "0.8.7",
4
4
  "private": true,
5
5
  "description": "Atomic extension that adapts MCP (Model Context Protocol) servers into the coding agent.",
6
6
  "contributors": [
package/dist/builtin/subagents/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@bastani/subagents",
3
- "version": "0.8.6",
3
+ "version": "0.8.7",
4
4
  "private": true,
5
5
  "description": "Atomic extension for delegating tasks to subagents with chains, parallel execution, and TUI clarification.",
6
6
  "contributors": [
package/dist/builtin/web-access/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@bastani/web-access",
3
- "version": "0.8.6",
3
+ "version": "0.8.7",
4
4
  "private": true,
5
5
  "description": "Atomic extension for web search, URL fetching, GitHub repo cloning, PDF/video extraction.",
6
6
  "contributors": [
package/dist/builtin/workflows/builtin/ralph.ts CHANGED
@@ -12,6 +12,136 @@ import type { WorkflowTaskResult } from "../src/shared/types.js";
12
12
 
13
13
  const DEFAULT_MAX_LOOPS = 10;
14
14
 
15
+ type ReviewFinding = {
16
+ readonly title: string;
17
+ readonly body: string;
18
+ readonly confidence_score: number;
19
+ readonly priority?: number | null;
20
+ readonly code_location: {
21
+ readonly absolute_file_path: string;
22
+ readonly line_range: {
23
+ readonly start: number;
24
+ readonly end: number;
25
+ };
26
+ };
27
+ };
28
+
29
+ type ReviewerError = {
30
+ readonly kind:
31
+ | "validation_unavailable"
32
+ | "dependency_unavailable"
33
+ | "tool_failure"
34
+ | "reviewer_failure";
35
+ readonly message: string;
36
+ readonly attempted_recovery: string;
37
+ };
38
+
39
+ type ReviewDecision = {
40
+ readonly findings: readonly ReviewFinding[];
41
+ readonly overall_correctness: "patch is correct" | "patch is incorrect";
42
+ readonly overall_explanation: string;
43
+ readonly overall_confidence_score: number;
44
+ readonly stop_review_loop: boolean;
45
+ readonly reviewer_error?: ReviewerError | null;
46
+ };
47
+
48
+ const reviewDecisionSchema = {
49
+ type: "object",
50
+ additionalProperties: false,
51
+ required: [
52
+ "findings",
53
+ "overall_correctness",
54
+ "overall_explanation",
55
+ "overall_confidence_score",
56
+ "stop_review_loop",
57
+ ],
58
+ properties: {
59
+ findings: {
60
+ type: "array",
61
+ items: {
62
+ type: "object",
63
+ additionalProperties: false,
64
+ required: ["title", "body", "confidence_score", "code_location"],
65
+ properties: {
66
+ title: { type: "string" },
67
+ body: { type: "string" },
68
+ confidence_score: { type: "number", minimum: 0, maximum: 1 },
69
+ priority: { type: ["integer", "null"], minimum: 0, maximum: 3 },
70
+ code_location: {
71
+ type: "object",
72
+ additionalProperties: false,
73
+ required: ["absolute_file_path", "line_range"],
74
+ properties: {
75
+ absolute_file_path: { type: "string" },
76
+ line_range: {
77
+ type: "object",
78
+ additionalProperties: false,
79
+ required: ["start", "end"],
80
+ properties: {
81
+ start: { type: "integer", minimum: 1 },
82
+ end: { type: "integer", minimum: 1 },
83
+ },
84
+ },
85
+ },
86
+ },
87
+ },
88
+ },
89
+ },
90
+ overall_correctness: {
91
+ type: "string",
92
+ enum: ["patch is correct", "patch is incorrect"],
93
+ },
94
+ overall_explanation: { type: "string" },
95
+ overall_confidence_score: { type: "number", minimum: 0, maximum: 1 },
96
+ stop_review_loop: { type: "boolean" },
97
+ reviewer_error: {
98
+ anyOf: [
99
+ { type: "null" },
100
+ {
101
+ type: "object",
102
+ additionalProperties: false,
103
+ required: ["kind", "message", "attempted_recovery"],
104
+ properties: {
105
+ kind: {
106
+ type: "string",
107
+ enum: [
108
+ "validation_unavailable",
109
+ "dependency_unavailable",
110
+ "tool_failure",
111
+ "reviewer_failure",
112
+ ],
113
+ },
114
+ message: { type: "string" },
115
+ attempted_recovery: { type: "string" },
116
+ },
117
+ },
118
+ ],
119
+ },
120
+ },
121
+ } as const;
122
+
123
+ const reviewDecisionTool = {
124
+ name: "review_decision",
125
+ label: "Review Decision",
126
+ description:
127
+ "Emit the final structured review verdict after inspecting the patch.",
128
+ promptSnippet: "Emit the final review verdict as structured data",
129
+ promptGuidelines: [
130
+ "Call review_decision after completing review investigation and validation.",
131
+ "This is a terminating structured-output tool; do not emit another assistant response after calling it.",
132
+ ],
133
+ parameters: reviewDecisionSchema,
134
+ async execute(_toolCallId: string, params: ReviewDecision) {
135
+ return {
136
+ content: [
137
+ { type: "text" as const, text: JSON.stringify(params, null, 2) },
138
+ ],
139
+ details: params,
140
+ terminate: true,
141
+ };
142
+ },
143
+ };
144
+
15
145
  const PLANNER_RFC_TEMPLATE = `
16
146
  # [Project Name] Technical Design Document / RFC
17
147
 
@@ -95,21 +225,59 @@ function positiveInteger(value: number | undefined, fallback: number): number {
95
225
  : fallback;
96
226
  }
97
227
 
98
- function reviewApproved(text: string): boolean {
99
- const normalized = text.toLowerCase();
100
- if (
101
- normalized.includes("patch is correct") ||
102
- normalized.includes("overall_correctness: patch is correct")
103
- ) {
104
- return true;
105
- }
106
- if (
107
- normalized.startsWith("approved") ||
108
- normalized.includes("no actionable findings")
109
- ) {
110
- return true;
228
+ function parseReviewDecision(text: string): ReviewDecision | undefined {
229
+ try {
230
+ const parsed = JSON.parse(text) as Partial<ReviewDecision>;
231
+ if (
232
+ parsed.overall_correctness !== "patch is correct" &&
233
+ parsed.overall_correctness !== "patch is incorrect"
234
+ ) {
235
+ return undefined;
236
+ }
237
+ if (!Array.isArray(parsed.findings)) return undefined;
238
+ if (typeof parsed.stop_review_loop !== "boolean") return undefined;
239
+ if (typeof parsed.overall_explanation !== "string") return undefined;
240
+ if (typeof parsed.overall_confidence_score !== "number") return undefined;
241
+ return parsed as ReviewDecision;
242
+ } catch {
243
+ return undefined;
111
244
  }
112
- return false;
245
+ }
246
+
247
+ function reviewApproved(text: string): boolean {
248
+ const decision = parseReviewDecision(text);
249
+ if (decision === undefined) return false;
250
+ return (
251
+ decision.stop_review_loop === true &&
252
+ decision.overall_correctness === "patch is correct" &&
253
+ decision.findings.length === 0 &&
254
+ decision.reviewer_error == null
255
+ );
256
+ }
257
+
258
+ function reviewerErrorResult(
259
+ iteration: number,
260
+ error: string,
261
+ ): WorkflowTaskResult {
262
+ const decision: ReviewDecision = {
263
+ findings: [],
264
+ overall_correctness: "patch is incorrect",
265
+ overall_explanation:
266
+ "Reviewer execution failed, so the review loop cannot safely approve this iteration.",
267
+ overall_confidence_score: 0,
268
+ stop_review_loop: false,
269
+ reviewer_error: {
270
+ kind: "reviewer_failure",
271
+ message: error,
272
+ attempted_recovery:
273
+ "Model fallbacks were configured for the reviewer stage; continuing the bounded loop without approval.",
274
+ },
275
+ };
276
+ return {
277
+ name: `reviewer-${iteration}-error`,
278
+ stageName: `reviewer-${iteration}-error`,
279
+ text: JSON.stringify(decision, null, 2),
280
+ };
113
281
  }
114
282
 
115
283
  function formatDiscovery(results: readonly WorkflowTaskResult[]): string {
@@ -131,7 +299,7 @@ export default defineWorkflow("ralph")
131
299
  .input("prompt", {
132
300
  type: "text",
133
301
  required: true,
134
- description: "The task or goal for ralph to plan, execute, and refine.",
302
+ description: "The task or goal to plan, execute, and refine.",
135
303
  })
136
304
  .input("max_loops", {
137
305
  type: "number",
@@ -212,6 +380,7 @@ export default defineWorkflow("ralph")
212
380
  ],
213
381
  thinkingLevel: "high" as const,
214
382
  tools: noAskQuestionToolSet,
383
+ customTools: [reviewDecisionTool],
215
384
  };
216
385
 
217
386
  let explorerModelConfig = {
@@ -285,6 +454,22 @@ export default defineWorkflow("ralph")
285
454
  "If prior review findings are present, explicitly address each finding or explain why it is obsolete.",
286
455
  ].join("\n"),
287
456
  ],
457
+ [
458
+ "stage_contract",
459
+ [
460
+ "This stage is investigation-first RFC authoring. The RFC is only valid if it is grounded in repository inspection performed during this stage.",
461
+ "Do not fill the template from generic architecture guesses. Before writing the final RFC, inspect relevant code, docs, tests, configs, and prior design material.",
462
+ "Treat the output format as the report after investigation, not a substitute for investigation.",
463
+ ].join("\n"),
464
+ ],
465
+ [
466
+ "evidence_expectations",
467
+ [
468
+ "Every major design claim should be traceable to concrete evidence: file paths, symbols, commands, docs, tests, configs, or prior RFCs.",
469
+ "Include those concrete references inside the RFC sections where they support the design.",
470
+ "If expected evidence cannot be found, say so in the relevant RFC section or Open Questions rather than papering over the gap.",
471
+ ].join("\n"),
472
+ ],
288
473
  [
289
474
  "output_discipline",
290
475
  [
@@ -315,14 +500,35 @@ export default defineWorkflow("ralph")
315
500
  [
316
501
  "delegation_policy",
317
502
  [
318
- "All non-trivial operations must be delegated to subagents via the `subagent` tool.",
503
+ "You are not the implementer. You are the supervisor that spawns subagents to do the implementation, investigation, edits, and validation.",
504
+ "All non-trivial operations must be delegated to subagents via the `subagent` tool before you claim progress.",
319
505
  "Delegate codebase understanding, impact analysis, and implementation research to codebase-locator, codebase-analyzer, and pattern-finder style subagents when available.",
320
506
  "Delegate shell-heavy work — especially commands likely to produce lots of output, log digging, CLI investigation, and broad grep/find exploration — to subagents that can run those commands rather than doing it in this orchestrator context.",
507
+ "Delegate implementation edits to a focused subagent with clear files, constraints, and validation expectations; do not merely describe the edits yourself.",
321
508
  "Use separate subagents for separate tasks, and launch independent subagents in parallel when useful.",
322
509
  "Do not split highly overlapping tasks across multiple subagents; consolidate overlapping work into one focused delegation to avoid duplicate effort.",
323
510
  "If a subagent takes a long time, do not attempt to do its assigned job yourself while waiting. Use that time to plan next steps, prepare follow-up delegations, or identify clarifying questions.",
324
511
  ].join("\n"),
325
512
  ],
513
+ [
514
+ "execution_contract",
515
+ [
516
+ "The required output format is a completion report, not the task itself.",
517
+ "Do not jump straight to the report. First spawn the necessary subagents, wait for their results, coordinate any follow-up subagents, and only then write the report.",
518
+ "A valid response must be grounded in actual subagent work: name the delegated work, summarize what each subagent did, and distinguish completed changes from recommendations or blockers.",
519
+ "If you cannot spawn or use subagents, treat that as a blocker and report it honestly instead of pretending the requested work was done.",
520
+ ].join("\n"),
521
+ ],
522
+ [
523
+ "subagent_tracking",
524
+ [
525
+ "Use the `todo` tool as your active control ledger for subagent work.",
526
+ "Before launching subagents, create todo items for each delegated task with enough detail to identify owner, purpose, and expected output.",
527
+ "Mark todo items in_progress when the corresponding subagent starts, append progress/results as subagents report back, and close them only after you have incorporated or explicitly rejected their result.",
528
+ "Keep pending, in_progress, blocked, and completed work accurate so you do not lose track of parallel subagents or unresolved follow-ups.",
529
+ "Before writing the final report, review the todo list and resolve every pending/in_progress item as completed, blocked, or deferred with an explanation.",
530
+ ].join("\n"),
531
+ ],
326
532
  [
327
533
  "instructions",
328
534
  [
@@ -338,11 +544,12 @@ export default defineWorkflow("ralph")
338
544
  [
339
545
  "output_format",
340
546
  [
341
- "Markdown with headings:",
342
- "1. Changes made",
343
- "2. Files touched",
344
- "3. Validation run / recommended",
345
- "4. Deferred work or blockers",
547
+ "After subagents have done the work, return Markdown with headings:",
548
+ "1. Delegations performed — subagents spawned and what each completed",
549
+ "2. Changes made — concrete changes from subagent work, not intentions",
550
+ "3. Files touched",
551
+ "4. Validation run / recommended",
552
+ "5. Deferred work or blockers",
346
553
  ].join("\n"),
347
554
  ],
348
555
  ]),
@@ -407,6 +614,27 @@ export default defineWorkflow("ralph")
407
614
  "Limit scope to code recently modified in this iteration/session unless the planner explicitly asked for broader cleanup.",
408
615
  ].join("\n"),
409
616
  ],
617
+ [
618
+ "stage_contract",
619
+ [
620
+ "This is an active code-refinement stage, not just a commentary stage.",
621
+ "Before producing the report, inspect the actual repository state and recently modified files from the planner/orchestrator context.",
622
+ "Apply safe simplifications with edit/write tools when clear behavior-preserving improvements exist. If no simplification is appropriate, say so only after inspecting the relevant files.",
623
+ ].join("\n"),
624
+ ],
625
+ [
626
+ "required_actions_before_output",
627
+ [
628
+ "1. Identify the concrete files/sections changed in this iteration.",
629
+ "2. Read those files before deciding whether to simplify.",
630
+ "3. Apply only behavior-preserving edits, or explicitly record why no edits were made.",
631
+ "4. Run or recommend focused validation tied to the touched files.",
632
+ ].join("\n"),
633
+ ],
634
+ [
635
+ "handoff_expectations",
636
+ "In the final report, distinguish edits actually applied from observations only. Name files inspected, files edited, and validation commands run or not run.",
637
+ ],
410
638
  [
411
639
  "process",
412
640
  [
@@ -445,13 +673,22 @@ export default defineWorkflow("ralph")
445
673
  "objective",
446
674
  `Find review-relevant infrastructure for the task: ${prompt}`,
447
675
  ],
676
+ [
677
+ "stage_contract",
678
+ [
679
+ "This is a repository-discovery stage. Do not answer from assumptions or common project layouts.",
680
+ "Before output, inspect the repository for each infrastructure category: package scripts, test configs, CI workflows, generated artifacts, lint/typecheck setup, and release gates.",
681
+ "The table is a compact handoff after discovery, not a substitute for discovery.",
682
+ ].join("\n"),
683
+ ],
448
684
  [
449
685
  "instructions",
450
686
  [
451
687
  "Locate package scripts, test configs, CI workflows, generated artifacts, lint/typecheck setup, and release gates.",
688
+ "Search/read relevant files such as package manifests, CI workflow directories, test configs, lint/typecheck configs, build scripts, release configs, and generated-artifact markers.",
452
689
  "Prefer exact file paths and commands.",
453
690
  "Explain how each item should influence review or validation.",
454
- "If a category does not exist, state that explicitly.",
691
+ "If a category does not exist, report `not found` and briefly name the paths or patterns checked.",
455
692
  ].join("\n"),
456
693
  ],
457
694
  [
@@ -472,6 +709,14 @@ export default defineWorkflow("ralph")
472
709
  "objective",
473
710
  `Assess infrastructure and changed-code risks for the task: ${prompt}`,
474
711
  ],
712
+ [
713
+ "stage_contract",
714
+ [
715
+ "This stage analyzes actual repository coupling, not generic integration risks.",
716
+ "Before output, inspect the changed-code context plus relevant infrastructure/configuration files discovered or inferable from the repo.",
717
+ "Classify a risk as confirmed only when repository evidence shows the coupling; otherwise mark it speculative.",
718
+ ].join("\n"),
719
+ ],
475
720
  [
476
721
  "instructions",
477
722
  [
@@ -479,8 +724,13 @@ export default defineWorkflow("ralph")
479
724
  "Name the exact validations that would most efficiently detect regressions.",
480
725
  "Separate confirmed risks from speculative risks.",
481
726
  "Do not repeat generic review advice; ground findings in repository evidence.",
727
+ "Copy validation commands from actual repository scripts/configs when available; do not invent commands that are not supported by the repo.",
482
728
  ].join("\n"),
483
729
  ],
730
+ [
731
+ "evidence_expectations",
732
+ "Each confirmed risk must include concrete evidence: path, command, symbol, config key, script name, or file relationship.",
733
+ ],
484
734
  [
485
735
  "output_format",
486
736
  "Markdown with sections: Confirmed risks, Speculative risks, Validation commands, Evidence.",
@@ -499,15 +749,29 @@ export default defineWorkflow("ralph")
499
749
  "objective",
500
750
  `Extract conventions relevant to reviewing this task: ${prompt}`,
501
751
  ],
752
+ [
753
+ "stage_contract",
754
+ [
755
+ "This is an evidence-gathering stage for repository conventions. Do not describe generic best practices.",
756
+ "Before output, find concrete examples in the repository that demonstrate conventions relevant to this task.",
757
+ "Read enough of each example to understand the convention before reporting it.",
758
+ ].join("\n"),
759
+ ],
502
760
  [
503
761
  "instructions",
504
762
  [
505
763
  "Find examples of build/test/style/release/architecture patterns the patch should mirror.",
764
+ "Search for nearby or analogous implementations, tests, configs, scripts, and docs.",
506
765
  "Use concrete paths, commands, or symbols as evidence.",
507
766
  "Highlight conventions that commonly cause subtle review failures.",
508
767
  "If examples conflict, describe the conflict instead of forcing a single rule.",
768
+ "If no relevant example exists, state what was searched and that no pattern was found.",
509
769
  ].join("\n"),
510
770
  ],
771
+ [
772
+ "handoff_expectations",
773
+ "For every required convention or useful example, include the supporting path, command, symbol, or file relationship so reviewers can verify it quickly.",
774
+ ],
511
775
  [
512
776
  "output_format",
513
777
  "Markdown with sections: Required conventions, Useful examples, Exceptions, Review implications.",
@@ -523,7 +787,11 @@ export default defineWorkflow("ralph")
523
787
  const reviewPrompt = taggedPrompt([
524
788
  [
525
789
  "role",
526
- "You are acting as a reviewer for a proposed code change made by another engineer.",
790
+ [
791
+ "You are acting as a reviewer for a proposed code change made by another engineer.",
792
+ "Persona: a grumpy senior developer who has seen too many fragile patches. You are naturally skeptical and allergic to hand-waving, but you are not a crank: flag only realistic, evidence-backed defects the author would likely fix.",
793
+ "Be terse, concrete, and technically fair. Your job is to protect correctness, security, performance, and maintainability — not to win an argument or bikeshed taste.",
794
+ ].join("\n"),
527
795
  ],
528
796
  [
529
797
  "objective",
@@ -538,29 +806,41 @@ export default defineWorkflow("ralph")
538
806
  "Use the repository's AGENTS.md and/or CLAUDE.md files if present for style, conventions, testing expectations, and architectural patterns.",
539
807
  "Project-level norms override these general instructions when they are more specific.",
540
808
  "Flag deviations only when they affect correctness, security, performance, or maintainability — not personal preference.",
809
+ "If validation requires dependencies or tools that are missing, download or install them using the repository-approved package manager/commands rather than bypassing, mocking, or skipping the verification solely because dependencies are absent.",
810
+ ].join("\n"),
811
+ ],
812
+ [
813
+ "validation_expectations",
814
+ [
815
+ "Inspect the actual diff/repository state rather than trusting stage summaries.",
816
+ "Run or delegate focused validation when it is necessary to distinguish a real bug from a hunch.",
817
+ "If tests or typechecks fail because dependencies are missing, install/download the missing dependencies with the repo's documented package manager instead of bypassing the check.",
818
+ "If validation cannot be completed after reasonable recovery, record the limitation in overall_explanation and reviewer_error; do not use missing dependencies as a reason to approve.",
541
819
  ].join("\n"),
542
820
  ],
543
821
  [
544
822
  "bug_selection_guidelines",
545
823
  [
824
+ "Use these default guidelines for deciding whether the author would appreciate the issue being flagged. More specific user, project, or file-level guidance overrides them.",
546
825
  "Flag an issue only when the original author would likely fix it if they knew about it.",
547
826
  "A finding should meaningfully impact accuracy, performance, security, or maintainability.",
548
- "A finding must be discrete and actionable, not a broad complaint about the whole codebase.",
549
- "Do not demand rigor inconsistent with the rest of the repository.",
550
- "Flag only bugs introduced by this iteration's patch; do not flag pre-existing issues.",
827
+ "A finding must be discrete and actionable, not a broad complaint about the whole codebase or a pile of related concerns.",
828
+ "Do not demand rigor inconsistent with the rest of the repository; match the seriousness of existing code and project norms.",
829
+ "Flag only bugs introduced by this iteration's patch; do not flag pre-existing issues unless the patch makes them worse in a concrete way.",
551
830
  "Do not rely on unstated assumptions about author intent or codebase behavior.",
552
831
  "Speculation is insufficient: identify the code path, scenario, environment, or input that is provably affected.",
553
832
  "Do not flag intentional behavior changes as bugs unless they clearly violate the task or documented contract.",
554
833
  "Ignore trivial style unless it obscures meaning or violates documented standards in a way that affects correctness/security/maintainability.",
834
+ "If no finding clears this bar, return an empty findings array, mark the patch correct, and set stop_review_loop true.",
555
835
  ].join("\n"),
556
836
  ],
557
837
  [
558
838
  "comment_guidelines",
559
839
  [
560
- "Each finding title must start with a priority tag such as [P1], [P2], or [P3]. Use [P0] only for universal release/operations blockers.",
561
- "Also include numeric priority: 0 for P0, 1 for P1, 2 for P2, 3 for P3.",
562
- "The body must be one concise paragraph explaining why this is a bug and the exact scenario or inputs required for it to arise.",
563
- "Use a matter-of-fact, non-accusatory tone. Avoid praise such as `Great job` or `Thanks for`.",
840
+ "Each finding title must start with a priority tag: [P0] drop-everything blocker, [P1] urgent next-cycle fix, [P2] normal fix, [P3] low-priority nice-to-have.",
841
+ "Also include numeric priority: 0 for P0, 1 for P1, 2 for P2, 3 for P3; use null only if priority genuinely cannot be determined.",
842
+ "The body must be one concise paragraph explaining why this is a bug and the exact scenario, environment, or inputs required for it to arise.",
843
+ "Use a matter-of-fact, non-accusatory tone. Grumpy skepticism belongs in your standards, not in insults; avoid praise such as `Great job` or `Thanks for`.",
564
844
  "Keep code_location ranges as short as possible, ideally one line and never longer than 5-10 lines unless unavoidable.",
565
845
  "The code_location must overlap the diff/change under review.",
566
846
  "Use one finding per distinct issue. Do not generate a PR fix.",
@@ -576,9 +856,37 @@ export default defineWorkflow("ralph")
576
856
  ].join("\n"),
577
857
  ],
578
858
  [
579
- "output_schema",
859
+ "review_stage_contract",
580
860
  [
581
- "Return JSON only. Do not wrap the JSON in markdown fences or add extra prose.",
861
+ "The structured review decision is only valid after you inspect the actual repository state for this iteration.",
862
+ "Do not approve based solely on orchestrator, simplifier, or discovery summaries.",
863
+ "The tool call is the final verdict after review work, not a shortcut around review work.",
864
+ ].join("\n"),
865
+ ],
866
+ [
867
+ "required_actions_before_tool_call",
868
+ [
869
+ "1. Identify the changed files or diff under review.",
870
+ "2. Read the relevant changed code and directly affected call sites/tests/configs.",
871
+ "3. Run or delegate focused validation when needed to resolve uncertainty.",
872
+ "4. If you cannot inspect or validate enough to approve safely, populate reviewer_error and set stop_review_loop=false.",
873
+ ].join("\n"),
874
+ ],
875
+ [
876
+ "evidence_expectations",
877
+ [
878
+ "The overall_explanation should briefly mention what was inspected and what validation was run or why validation was not completed.",
879
+ "Every finding must cite a concrete changed location and affected scenario.",
880
+ ].join("\n"),
881
+ ],
882
+ [
883
+ "structured_output_contract",
884
+ [
885
+ "You have a structured-output tool named review_decision. Use it after your investigation and validation attempts.",
886
+ "The tool terminates the turn and provides the structured data; do not emit a separate final assistant response after calling it.",
887
+ "The review loop decides whether to stop only by parsing the JSON object returned by this tool; invalid JSON, missing fields, reviewer_error, or stop_review_loop=false are treated as not approved for safety.",
888
+ "Set stop_review_loop=true only when findings is empty, overall_correctness is patch is correct, and reviewer_error is null/omitted.",
889
+ "If you hit a reviewer/tool/validation error, still return the object with stop_review_loop=false and reviewer_error populated instead of pretending the patch is approved.",
582
890
  "The JSON must match this schema exactly:",
583
891
  "{",
584
892
  ' "findings": [',
@@ -586,38 +894,46 @@ export default defineWorkflow("ralph")
586
894
  ' "title": "<≤ 80 chars, imperative, starts with [P0]/[P1]/[P2]/[P3]>",',
587
895
  ' "body": "<one paragraph of valid Markdown explaining why this is a problem; cite files/lines/functions>",',
588
896
  ' "confidence_score": <float 0.0-1.0>,',
589
- ' "priority": <int 0-3, optional>,',
897
+ ' "priority": <int 0-3 or null>,',
590
898
  ' "code_location": {',
591
- ' "file_path": "<repo-relative path>",',
899
+ ' "absolute_file_path": "<absolute file path>",',
592
900
  ' "line_range": {"start": <int>, "end": <int>}',
593
901
  " }",
594
902
  " }",
595
903
  " ],",
596
904
  ' "overall_correctness": "patch is correct" | "patch is incorrect",',
597
905
  ' "overall_explanation": "<1-3 sentence explanation justifying the verdict>",',
598
- ' "overall_confidence_score": <float 0.0-1.0>',
906
+ ' "overall_confidence_score": <float 0.0-1.0>,',
907
+ ' "stop_review_loop": <boolean>,',
908
+ ' "reviewer_error": null | {"kind": "validation_unavailable" | "dependency_unavailable" | "tool_failure" | "reviewer_failure", "message": "<what failed>", "attempted_recovery": "<what you tried>"}',
599
909
  "}",
600
910
  ].join("\n"),
601
911
  ],
602
912
  ]);
603
913
 
604
- const reviews = await ctx.parallel(
605
- [
606
- {
607
- name: `reviewer-${iteration}-a`,
608
- task: reviewPrompt,
609
- previous: [orchestrator, simplifier, ...discovery],
610
- ...reviewerModelConfig,
611
- },
612
- {
613
- name: `reviewer-${iteration}-b`,
614
- task: reviewPrompt,
615
- previous: [orchestrator, simplifier, ...discovery],
616
- ...reviewerModelConfig,
617
- },
618
- ],
619
- { task: prompt },
620
- );
914
+ let reviews: WorkflowTaskResult[];
915
+ try {
916
+ reviews = await ctx.parallel(
917
+ [
918
+ {
919
+ name: `reviewer-${iteration}-a`,
920
+ task: reviewPrompt,
921
+ previous: [orchestrator, simplifier, ...discovery],
922
+ ...reviewerModelConfig,
923
+ },
924
+ {
925
+ name: `reviewer-${iteration}-b`,
926
+ task: reviewPrompt,
927
+ previous: [orchestrator, simplifier, ...discovery],
928
+ ...reviewerModelConfig,
929
+ },
930
+ ],
931
+ { task: prompt, failFast: false },
932
+ );
933
+ } catch (err) {
934
+ const message = err instanceof Error ? err.message : String(err);
935
+ reviews = [reviewerErrorResult(iteration, message)];
936
+ }
621
937
 
622
938
  approved =
623
939
  reviews.length > 0 &&
package/dist/builtin/workflows/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@bastani/workflows",
3
- "version": "0.8.6",
3
+ "version": "0.8.7",
4
4
  "private": true,
5
5
  "description": "pi extension for multi-stage workflow authoring and execution.",
6
6
  "contributors": [