@nbardy/oompa 0.7.0 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,11 +14,13 @@
14
14
  No separate orchestrator - workers self-organize."
15
15
  (:require [agentnet.tasks :as tasks]
16
16
  [agentnet.agent :as agent]
17
+ [agentnet.core :as core]
18
+ [agentnet.harness :as harness]
17
19
  [agentnet.worktree :as worktree]
18
20
  [agentnet.runs :as runs]
19
- [cheshire.core :as json]
20
21
  [babashka.process :as process]
21
22
  [clojure.java.io :as io]
23
+ [clojure.set]
22
24
  [clojure.string :as str]))
23
25
 
24
26
  ;; =============================================================================
@@ -33,25 +35,11 @@
33
35
  ;; git index corruption from parallel checkout+merge operations.
34
36
  (def ^:private merge-lock (Object.))
35
37
 
36
- ;; Resolve absolute paths for CLI binaries at first use.
37
- ;; ProcessBuilder with :dir set can fail to find bare command names on some
38
- ;; platforms (macOS + babashka), so we resolve once via `which` and cache.
39
- (def ^:private binary-paths* (atom {}))
40
-
41
- (defn- resolve-binary!
42
- "Resolve the absolute path of a CLI binary. Caches result.
43
- Throws if binary not found on PATH."
44
- [name]
45
- (or (get @binary-paths* name)
46
- (let [result (try
47
- (process/sh ["which" name] {:out :string :err :string})
48
- (catch Exception _ {:exit -1 :out "" :err ""}))
49
- path (when (zero? (:exit result))
50
- (str/trim (:out result)))]
51
- (if path
52
- (do (swap! binary-paths* assoc name path)
53
- path)
54
- (throw (ex-info (str "Binary not found on PATH: " name) {:binary name}))))))
38
+ ;; Set by JVM shutdown hook (SIGTERM/SIGINT). Workers check this between cycles
39
+ ;; and exit gracefully, finishing the current cycle before stopping.
40
+ (def ^:private shutdown-requested? (atom false))
41
+
42
+ (declare task-root-for-cwd)
55
43
 
56
44
  (defn- load-prompt
57
45
  "Load a prompt file. Tries path as-is first, then from package root."
@@ -59,33 +47,88 @@
59
47
  (or (agent/load-custom-prompt path)
60
48
  (agent/load-custom-prompt (str package-root "/" path))))
61
49
 
50
+ (defn- build-template-tokens
51
+ "Build token map for prompt template {var} substitution.
52
+ Merges core/build-context (rich YAML header, queue, hotspots, etc.)
53
+ with worker-level context (task_status, pending_tasks) and defaults
54
+ for tokens that core/build-context doesn't produce (mode_hint, targets,
55
+ recent_sec). Without these defaults, those {vars} leak into prompts."
56
+ ([worker-context]
57
+ (build-template-tokens worker-context nil))
58
+ ([worker-context cwd]
59
+ (let [pending (tasks/list-pending)
60
+ core-ctx (core/build-context {:tasks pending
61
+ :repo (System/getProperty "user.dir")})
62
+ task-root (task-root-for-cwd (or cwd (System/getProperty "user.dir")))]
63
+ (merge {:mode_hint "propose"
64
+ :targets "*"
65
+ :recent_sec "180"
66
+ :TASK_ROOT task-root
67
+ :TASKS_ROOT task-root}
68
+ core-ctx
69
+ worker-context))))
70
+
71
+ (defn- task-root-for-cwd
72
+ "Return the relative tasks root for commands issued from cwd."
73
+ [cwd]
74
+ (let [cwd-file (io/file cwd)
75
+ local-tasks (io/file cwd-file "tasks")
76
+ parent-tasks (some-> cwd-file .getParentFile (io/file "tasks"))]
77
+ (cond
78
+ (and parent-tasks (.exists parent-tasks)) "../tasks"
79
+ (.exists local-tasks) "tasks"
80
+ :else "tasks")))
81
+
82
+ (defn- render-task-header
83
+ "Inject runtime task path into auto-injected task header."
84
+ [raw-header cwd]
85
+ (let [task-root (task-root-for-cwd cwd)]
86
+ (-> (or raw-header "")
87
+ (str/replace "{{TASK_ROOT}}" task-root)
88
+ (str/replace "{{TASKS_ROOT}}" task-root)
89
+ (str/replace "{TASK_ROOT}" task-root)
90
+ (str/replace "{TASKS_ROOT}" task-root))))
91
+
92
+ (def ^:private default-max-working-resumes 5)
93
+ (def ^:private default-max-wait-for-tasks 600)
94
+
62
95
  (defn create-worker
63
96
  "Create a worker config.
64
97
  :prompts is a string or vector of strings — paths to prompt files.
65
98
  :can-plan when false, worker waits for tasks before starting (backpressure).
66
99
  :reasoning reasoning effort level (e.g. \"low\", \"medium\", \"high\") — codex only.
67
- :review-prompts paths to reviewer prompt files (loaded and concatenated for review)."
68
- [{:keys [id swarm-id harness model iterations prompts can-plan reasoning
69
- review-harness review-model review-prompts]}]
100
+ :review-prompts paths to reviewer prompt files (loaded and concatenated for review).
101
+ :wait-between seconds to sleep between cycles (nil or 0 = no wait).
102
+ :max-wait-for-tasks max seconds a non-planner waits for tasks before giving up (default 600).
103
+ :max-working-resumes max consecutive working resumes before nudge+kill (default 5)."
104
+ [{:keys [id swarm-id harness model runs max-cycles iterations prompts can-plan reasoning
105
+ reviewers wait-between
106
+ max-working-resumes max-wait-for-tasks]}]
107
+ (let [cycle-cap (or max-cycles iterations runs 10)
108
+ run-goal (or runs iterations 10)]
70
109
  {:id id
71
110
  :swarm-id swarm-id
72
111
  :harness (or harness :codex)
73
112
  :model model
74
- :iterations (or iterations 10)
113
+ ;; Legacy compatibility: :iterations remains the cycle cap.
114
+ :iterations cycle-cap
115
+ :max-cycles cycle-cap
116
+ :runs run-goal
75
117
  :prompts (cond
76
118
  (vector? prompts) prompts
77
119
  (string? prompts) [prompts]
78
120
  :else [])
79
121
  :can-plan (if (some? can-plan) can-plan true)
80
122
  :reasoning reasoning
81
- :review-harness review-harness
82
- :review-model review-model
83
- :review-prompts (cond
84
- (vector? review-prompts) review-prompts
85
- (string? review-prompts) [review-prompts]
86
- :else [])
123
+ :wait-between (when (and wait-between (pos? wait-between)) wait-between)
124
+ :max-wait-for-tasks (let [v (or max-wait-for-tasks default-max-wait-for-tasks)]
125
+ (if (and (number? v) (pos? v))
126
+ v
127
+ default-max-wait-for-tasks))
128
+ :reviewers reviewers
129
+ :max-working-resumes (or max-working-resumes default-max-working-resumes)
87
130
  :completed 0
88
- :status :idle})
131
+ :status :idle}))
89
132
 
90
133
  ;; =============================================================================
91
134
  ;; Task Execution
@@ -93,6 +136,18 @@
93
136
 
94
137
  (def ^:private max-review-retries 3)
95
138
 
139
+ ;; Nudge prompt injected when a worker hits max-working-resumes consecutive
140
+ ;; "working" outcomes without signaling. Gives the agent one final chance to
141
+ ;; produce something mergeable before the session is killed.
142
+ (def ^:private nudge-prompt
143
+ (str "You have been working for a long time without signaling completion.\n"
144
+ "You MUST take one of these actions NOW:\n\n"
145
+ "1. If you have meaningful changes: commit them and signal COMPLETE_AND_READY_FOR_MERGE\n"
146
+ "2. If scope is too large: create follow-up tasks in tasks/pending/ for remaining work,\n"
147
+ " commit what you have (even partial notes/design docs), and signal COMPLETE_AND_READY_FOR_MERGE\n"
148
+ "3. If you are stuck and cannot make progress: signal __DONE__\n\n"
149
+ "Do NOT continue working without producing a signal."))
150
+
96
151
  (defn- build-context
97
152
  "Build context for agent prompts"
98
153
  []
@@ -106,66 +161,80 @@
106
161
  :task_status (format "Pending: %d, In Progress: %d, Complete: %d"
107
162
  (count pending) (count current) (count complete))}))
108
163
 
109
- (defn- opencode-attach-url
110
- "Optional opencode server URL for run --attach mode."
111
- []
112
- (let [url (or (System/getenv "OOMPA_OPENCODE_ATTACH")
113
- (System/getenv "OPENCODE_ATTACH"))]
114
- (when (and url (not (str/blank? url)))
115
- url)))
116
-
117
- (defn- parse-opencode-run-output
118
- "Parse `opencode run --format json` output.
119
- Returns {:session-id string|nil, :text string|nil}."
120
- [s]
121
- (let [raw (or s "")
122
- events (->> (str/split-lines raw)
123
- (keep (fn [line]
124
- (try
125
- (json/parse-string line true)
126
- (catch Exception _
127
- nil))))
128
- doall)
129
- session-id (or (some #(or (:sessionID %)
130
- (:sessionId %)
131
- (get-in % [:part :sessionID])
132
- (get-in % [:part :sessionId]))
133
- events)
134
- (some-> (re-find #"(ses_[A-Za-z0-9]+)" raw) second))
135
- text (->> events
136
- (keep (fn [event]
137
- (let [event-type (or (:type event) (get-in event [:part :type]))
138
- chunk (or (:text event) (get-in event [:part :text]))]
139
- (when (and (= event-type "text")
140
- (string? chunk)
141
- (not (str/blank? chunk)))
142
- chunk))))
143
- (str/join ""))]
144
- {:session-id session-id
145
- :text (when-not (str/blank? text) text)}))
164
+
165
+ (defn- execute-claims!
166
+ "Execute CLAIM signal: attempt to claim each task ID from pending/.
167
+ Returns {:claimed [ids], :failed [ids], :resume-prompt string}."
168
+ [claim-ids]
169
+ (let [results (tasks/claim-by-ids! claim-ids)
170
+ claimed (filterv #(= :claimed (:status %)) results)
171
+ failed (filterv #(not= :claimed (:status %)) results)
172
+ claimed-ids (mapv :id claimed)
173
+ failed-ids (mapv :id failed)
174
+ context (build-context)
175
+ prompt (str "## Claim Results\n"
176
+ (if (seq claimed-ids)
177
+ (str "Claimed: " (str/join ", " claimed-ids) "\n")
178
+ "No tasks were successfully claimed.\n")
179
+ (when (seq failed-ids)
180
+ (str "Already taken or not found: "
181
+ (str/join ", " failed-ids) "\n"))
182
+ "\nTask Status: " (:task_status context) "\n"
183
+ "Remaining Pending:\n"
184
+ (if (str/blank? (:pending_tasks context))
185
+ "(none)"
186
+ (:pending_tasks context))
187
+ "\n\n"
188
+ (if (seq claimed-ids)
189
+ "Work on your claimed tasks. Signal COMPLETE_AND_READY_FOR_MERGE when done."
190
+ "No claims succeeded. CLAIM different tasks, or signal __DONE__ if no suitable work remains."))]
191
+ {:claimed claimed-ids
192
+ :failed failed-ids
193
+ :resume-prompt prompt}))
146
194
 
147
195
  (defn- run-agent!
148
- "Run agent with prompt, return {:output string, :done? bool, :merge? bool, :exit int, :session-id string}.
149
- When resume? is true and harness is :claude/:opencode, continues the existing session
150
- with a lighter prompt (just task status + continue instruction)."
151
- [{:keys [id swarm-id harness model prompts reasoning]} worktree-path context session-id resume?]
152
- (let [;; Use provided session-id, otherwise generate one for harnesses that accept custom IDs.
153
- session-id (or session-id
154
- (when (#{:codex :claude} harness)
155
- (str/lower-case (str (java.util.UUID/randomUUID)))))
156
-
157
- ;; Build prompt — lighter for resume (agent already has full context)
158
- prompt (if resume?
196
+ "Run agent with prompt, return {:output :done? :merge? :claim-ids :exit :session-id}.
197
+ When resume? is true, continues the existing session with a lighter prompt.
198
+ resume-prompt-override: when non-nil, replaces the default resume prompt
199
+ (used to inject CLAIM results). All harness-specific CLI knowledge
200
+ is delegated to harness/build-cmd."
201
+ [{:keys [id swarm-id harness model prompts reasoning]} worktree-path context session-id resume?
202
+ & {:keys [resume-prompt-override]}]
203
+ (let [session-id (or session-id (harness/make-session-id harness))
204
+ template-tokens (build-template-tokens context worktree-path)
205
+ resume-prompt-override (when resume-prompt-override
206
+ (-> resume-prompt-override
207
+ (render-task-header worktree-path)
208
+ (agent/tokenize template-tokens)))
209
+
210
+ ;; Build prompt — 3-way: override → standard resume → fresh start
211
+ prompt (cond
212
+ ;; CLAIM results or other injected resume prompt
213
+ resume-prompt-override
214
+ resume-prompt-override
215
+
216
+ ;; Standard resume — lighter (agent already has full context)
217
+ resume?
159
218
  (str "Task Status: " (:task_status context) "\n"
160
219
  "Pending: " (:pending_tasks context) "\n\n"
161
220
  "Continue working. Signal COMPLETE_AND_READY_FOR_MERGE when your current task is done and ready for review.")
162
- (let [task-header (or (load-prompt "config/prompts/_task_header.md") "")
221
+
222
+ ;; Fresh start — full task header + tokenized user prompts
223
+ ;; Template tokens ({context_header}, {queue_md}, etc.) are
224
+ ;; replaced here. Without this, raw {var} placeholders leak
225
+ ;; into the agent prompt verbatim.
226
+ :else
227
+ (let [task-header (render-task-header
228
+ (load-prompt "config/prompts/_task_header.md")
229
+ worktree-path)
163
230
  user-prompts (if (seq prompts)
164
231
  (->> prompts
165
232
  (map load-prompt)
166
233
  (remove nil?)
234
+ (map #(agent/tokenize % template-tokens))
167
235
  (str/join "\n\n"))
168
- (or (load-prompt "config/prompts/worker.md")
236
+ (or (some-> (load-prompt "config/prompts/worker.md")
237
+ (agent/tokenize template-tokens))
169
238
  "You are a worker. Claim tasks, execute them, complete them."))]
170
239
  (str task-header "\n"
171
240
  "Task Status: " (:task_status context) "\n"
@@ -175,61 +244,36 @@
175
244
  swarm-id* (or swarm-id "unknown")
176
245
  tagged-prompt (str "[oompa:" swarm-id* ":" id "] " prompt)
177
246
  abs-worktree (.getAbsolutePath (io/file worktree-path))
178
- opencode-attach (opencode-attach-url)
179
-
180
- ;; Build command — all harnesses run with cwd=worktree, no sandbox
181
- ;; so agents can `..` to reach project root for task management
182
- ;; Claude: --resume flag continues existing session-id conversation
183
- ;; Opencode: -s/--session + --continue continue existing session
184
- ;; and --format json for deterministic per-run session capture.
185
- ;; Codex: no native resume support, always fresh (but worktree state persists)
186
- cmd (case harness
187
- :codex (cond-> [(resolve-binary! "codex") "exec"
188
- "--dangerously-bypass-approvals-and-sandbox"
189
- "--skip-git-repo-check"
190
- "-C" abs-worktree]
191
- model (into ["--model" model])
192
- reasoning (into ["-c" (str "model_reasoning_effort=\"" reasoning "\"")])
193
- true (conj "--" tagged-prompt))
194
- :claude (cond-> [(resolve-binary! "claude") "-p" "--dangerously-skip-permissions"
195
- "--session-id" session-id]
196
- resume? (conj "--resume")
197
- model (into ["--model" model]))
198
- :opencode (cond-> [(resolve-binary! "opencode") "run" "--format" "json"]
199
- model (into ["-m" model])
200
- opencode-attach (into ["--attach" opencode-attach])
201
- (and resume? session-id) (into ["-s" session-id "--continue"])
202
- true (conj tagged-prompt)))
203
-
204
- ;; Run agent — all run with cwd=worktree
247
+
248
+ cmd (harness/build-cmd harness
249
+ {:cwd abs-worktree :model model :reasoning reasoning
250
+ :session-id session-id :resume? resume?
251
+ :prompt tagged-prompt :format? true})
252
+
205
253
  result (try
206
- (if (= harness :claude)
207
- (process/sh cmd {:dir abs-worktree :in tagged-prompt :out :string :err :string})
208
- (process/sh cmd {:dir abs-worktree :out :string :err :string}))
254
+ (process/sh cmd {:dir abs-worktree
255
+ :in (harness/process-stdin harness tagged-prompt)
256
+ :out :string :err :string})
209
257
  (catch Exception e
210
258
  (println (format "[%s] Agent exception: %s" id (.getMessage e)))
211
259
  {:exit -1 :out "" :err (.getMessage e)}))
212
- parsed-opencode (when (= harness :opencode)
213
- (parse-opencode-run-output (:out result)))
214
- output (if (= harness :opencode)
215
- (or (:text parsed-opencode) (:out result))
216
- (:out result))
217
- session-id' (if (= harness :opencode)
218
- (or (:session-id parsed-opencode) session-id)
219
- session-id)]
260
+
261
+ {:keys [output session-id]}
262
+ (harness/parse-output harness (:out result) session-id)]
220
263
 
221
264
  {:output output
222
265
  :exit (:exit result)
223
266
  :done? (agent/done-signal? output)
224
267
  :merge? (agent/merge-signal? output)
225
- :session-id session-id'}))
268
+ :claim-ids (agent/parse-claim-signal output)
269
+ :session-id session-id}))
226
270
 
227
271
  (defn- run-reviewer!
228
272
  "Run reviewer on worktree changes.
229
273
  Uses custom review-prompts when configured, otherwise falls back to default.
230
274
  prev-feedback: vector of previous review outputs (for multi-round context).
231
275
  Returns {:verdict :approved|:needs-changes|:rejected, :comments [...], :output string}"
232
- [{:keys [id swarm-id review-harness review-model review-prompts]} worktree-path prev-feedback]
276
+ [{:keys [id swarm-id reviewers]} worktree-path prev-feedback]
233
277
  (let [;; Get actual diff content (not just stat) — truncate to 8000 chars for prompt budget
234
278
  diff-result (process/sh ["git" "diff" "main"]
235
279
  {:dir worktree-path :out :string :err :string})
@@ -238,75 +282,74 @@
238
282
  (str (subs d 0 8000) "\n... [diff truncated at 8000 chars]")
239
283
  d))
240
284
 
241
- ;; Build review prompt — use custom prompts if configured, else default
242
285
  swarm-id* (or swarm-id "unknown")
243
- custom-prompt (when (seq review-prompts)
244
- (->> review-prompts
245
- (map load-prompt)
246
- (remove nil?)
247
- (str/join "\n\n")))
248
286
 
249
- ;; Include previous review history for multi-round context
287
+ ;; Only include the most recent round's feedback — the worker has already
288
+ ;; attempted fixes based on it, so the reviewer just needs to verify.
250
289
  history-block (when (seq prev-feedback)
251
- (str "\n## Previous Review Rounds\n\n"
252
- "The worker has already attempted fixes based on earlier feedback. "
253
- "Do NOT raise new issues only verify the original issues are resolved.\n\n"
254
- (->> prev-feedback
255
- (map-indexed (fn [i fb]
256
- (str "### Round " (inc i) " feedback:\n" fb)))
257
- (str/join "\n\n"))
258
- "\n\n"))
259
-
260
- review-body (str (or custom-prompt
261
- (str "Review the changes in this worktree.\n"
262
- "Focus on architecture and design, not style.\n"))
263
- "\n\nDiff:\n```\n" diff-content "\n```\n"
264
- (when history-block history-block)
265
- "\nYour verdict MUST be on its own line, exactly one of:\n"
266
- "VERDICT: APPROVED\n"
267
- "VERDICT: NEEDS_CHANGES\n"
268
- "VERDICT: REJECTED\n")
269
- review-prompt (str "[oompa:" swarm-id* ":" id "] " review-body)
290
+ (let [latest (last prev-feedback)
291
+ truncated (if (> (count latest) 2000)
292
+ (str (subs latest 0 2000) "\n... [feedback truncated]")
293
+ latest)]
294
+ (str "\n## Previous Review (Round " (count prev-feedback) ")\n\n"
295
+ "The worker has attempted fixes based on this feedback. "
296
+ "Verify the issues below are resolved. Do NOT raise new issues.\n\n"
297
+ truncated
298
+ "\n\n")))
270
299
 
271
300
  abs-wt (.getAbsolutePath (io/file worktree-path))
272
- opencode-attach (opencode-attach-url)
273
-
274
- ;; Build command — cwd=worktree, no sandbox
275
- cmd (case review-harness
276
- :codex (cond-> [(resolve-binary! "codex") "exec"
277
- "--dangerously-bypass-approvals-and-sandbox"
278
- "--skip-git-repo-check"
279
- "-C" abs-wt]
280
- review-model (into ["--model" review-model])
281
- true (conj "--" review-prompt))
282
- :claude (cond-> [(resolve-binary! "claude") "-p" "--dangerously-skip-permissions"]
283
- review-model (into ["--model" review-model]))
284
- :opencode (cond-> [(resolve-binary! "opencode") "run"]
285
- review-model (into ["-m" review-model])
286
- opencode-attach (into ["--attach" opencode-attach])
287
- true (conj review-prompt)))
288
-
289
- ;; Run reviewer — cwd=worktree
290
- result (try
291
- (if (= review-harness :claude)
292
- (process/sh cmd {:dir abs-wt :in review-prompt :out :string :err :string})
293
- (process/sh cmd {:dir abs-wt :out :string :err :string}))
294
- (catch Exception e
295
- {:exit -1 :out "" :err (.getMessage e)}))
301
+
302
+ ;; Try each reviewer until one succeeds and returns a verdict
303
+ result (reduce (fn [_ {:keys [harness model prompts]}]
304
+ (let [custom-prompt (when (seq prompts)
305
+ (->> prompts
306
+ (map load-prompt)
307
+ (remove nil?)
308
+ (str/join "\n\n")))
309
+ review-body (str (or custom-prompt
310
+ (str "Review the changes in this worktree.\n"
311
+ "Focus on architecture and design, not style.\n"))
312
+ "\n\nDiff:\n```\n" diff-content "\n```\n"
313
+ (when history-block history-block)
314
+ "\nYour verdict MUST be on its own line, exactly one of:\n"
315
+ "VERDICT: APPROVED\n"
316
+ "VERDICT: NEEDS_CHANGES\n\n"
317
+ "Do NOT use REJECTED. Always use NEEDS_CHANGES with specific, "
318
+ "actionable feedback explaining what must change and why. "
319
+ "The worker will attempt fixes based on your feedback.\n"
320
+ "After your verdict line, list every issue as a numbered item with "
321
+ "the file path and what needs to change.\n")
322
+ review-prompt (str "[oompa:" swarm-id* ":" id "] " review-body)
323
+ cmd (harness/build-cmd harness {:cwd abs-wt :model model :prompt review-prompt})
324
+ res (try
325
+ (process/sh cmd {:dir abs-wt
326
+ :in (harness/process-stdin harness review-prompt)
327
+ :out :string :err :string})
328
+ (catch Exception e
329
+ {:exit -1 :out "" :err (.getMessage e)}))
330
+ output (or (:out res) "")
331
+ has-verdict? (or (re-find #"VERDICT:\s*APPROVED" output)
332
+ (re-find #"VERDICT:\s*NEEDS_CHANGES" output)
333
+ (re-find #"VERDICT:\s*REJECTED" output)
334
+ (re-find #"(?i)\bAPPROVED\b" output))]
335
+ (if (and (= (:exit res) 0) has-verdict?)
336
+ (reduced res)
337
+ (do
338
+ (println (format "[%s] Reviewer %s failed or returned no verdict, falling back..." id model))
339
+ res))))
340
+ {:exit -1 :out "" :err "No reviewers configured or no verdict returned"}
341
+ reviewers)
296
342
 
297
343
  output (:out result)
298
344
 
299
- ;; Parse verdict — require explicit VERDICT: prefix to avoid false matches
345
+ ;; Parse verdict
300
346
  verdict (cond
301
347
  (re-find #"VERDICT:\s*APPROVED" output) :approved
302
- (re-find #"VERDICT:\s*REJECTED" output) :rejected
303
348
  (re-find #"VERDICT:\s*NEEDS_CHANGES" output) :needs-changes
304
- ;; Fallback to loose matching if reviewer didn't use prefix
349
+ (re-find #"VERDICT:\s*REJECTED" output) :needs-changes
305
350
  (re-find #"(?i)\bAPPROVED\b" output) :approved
306
- (re-find #"(?i)\bREJECTED\b" output) :rejected
307
351
  :else :needs-changes)]
308
352
 
309
- ;; Log reviewer output (truncated) for visibility
310
353
  (println (format "[%s] Reviewer verdict: %s" id (name verdict)))
311
354
  (let [summary (subs output 0 (min 300 (count output)))]
312
355
  (println (format "[%s] Review: %s%s" id summary
@@ -337,32 +380,94 @@
337
380
  "Fix these issues. Do not add anything the reviewer did not ask for.")
338
381
 
339
382
  abs-wt (.getAbsolutePath (io/file worktree-path))
340
- opencode-attach (opencode-attach-url)
341
-
342
- cmd (case harness
343
- :codex (cond-> [(resolve-binary! "codex") "exec"
344
- "--dangerously-bypass-approvals-and-sandbox"
345
- "--skip-git-repo-check"
346
- "-C" abs-wt]
347
- model (into ["--model" model])
348
- true (conj "--" fix-prompt))
349
- :claude (cond-> [(resolve-binary! "claude") "-p" "--dangerously-skip-permissions"]
350
- model (into ["--model" model]))
351
- :opencode (cond-> [(resolve-binary! "opencode") "run"]
352
- model (into ["-m" model])
353
- opencode-attach (into ["--attach" opencode-attach])
354
- true (conj fix-prompt)))
383
+
384
+ cmd (harness/build-cmd harness
385
+ {:cwd abs-wt :model model :prompt fix-prompt})
355
386
 
356
387
  result (try
357
- (if (= harness :claude)
358
- (process/sh cmd {:dir abs-wt :in fix-prompt :out :string :err :string})
359
- (process/sh cmd {:dir abs-wt :out :string :err :string}))
388
+ (process/sh cmd {:dir abs-wt
389
+ :in (harness/process-stdin harness fix-prompt)
390
+ :out :string :err :string})
360
391
  (catch Exception e
361
392
  {:exit -1 :out "" :err (.getMessage e)}))]
362
393
 
363
394
  {:output (:out result)
364
395
  :exit (:exit result)}))
365
396
 
397
+ (defn- collect-divergence-context
398
+ "Collect context about how a worktree branch has diverged from main.
399
+ Returns a map with :branch-log, :main-log, :diff-stat strings."
400
+ [wt-path]
401
+ (let [git-out (fn [& args] (:out (process/sh (vec args) {:dir wt-path :out :string :err :string})))
402
+ branch-log (git-out "git" "log" "--oneline" "main..HEAD")
403
+ main-log (git-out "git" "log" "--oneline" "HEAD..main")
404
+ diff-stat (git-out "git" "diff" "--stat" "main")]
405
+ {:branch-log (or branch-log "(none)")
406
+ :main-log (or main-log "(none)")
407
+ :diff-stat (or diff-stat "(none)")}))
408
+
409
+ (defn- verify-mergeable?
410
+ "Dry-run merge to verify a worktree branch merges cleanly into main.
411
+ Does NOT leave merge state behind — always cleans up the dry-run.
412
+ Uses --no-commit so no actual commit is created; resets afterward."
413
+ [wt-path]
414
+ (let [result (process/sh ["git" "merge" "--no-commit" "--no-ff" "main"]
415
+ {:dir wt-path :out :string :err :string})
416
+ clean? (zero? (:exit result))]
417
+ ;; Clean up: abort if conflicted, reset if staged but uncommitted
418
+ (if clean?
419
+ (process/sh ["git" "reset" "--hard" "HEAD"] {:dir wt-path})
420
+ (process/sh ["git" "merge" "--abort"] {:dir wt-path}))
421
+ clean?))
422
+
423
+ (defn- sync-worktree-to-main!
424
+ "Sync worktree branch with main before merge-to-main!.
425
+ Fast path: git merge main succeeds cleanly → :synced.
426
+ Conflict path: abort merge, give agent a clean worktree + divergence
427
+ context, let agent make the branch mergeable (rebase, cherry-pick,
428
+ manual resolution — agent's choice), verify with dry-run merge.
429
+ Runs OUTSIDE the merge-lock so the agent doesn't block other workers.
430
+ Returns :synced | :resolved | :failed."
431
+ [worker wt-path worker-id]
432
+ (let [merge-result (process/sh ["git" "merge" "main" "--no-edit"]
433
+ {:dir wt-path :out :string :err :string})]
434
+ (if (zero? (:exit merge-result))
435
+ (do (println (format "[%s] Worktree synced to main" worker-id))
436
+ :synced)
437
+ ;; Conflict — abort merge to restore clean worktree state, then
438
+ ;; hand the problem to the agent with full divergence context.
439
+ (let [_ (process/sh ["git" "merge" "--abort"] {:dir wt-path})
440
+ _ (println (format "[%s] Branch diverged from main, launching resolver agent" worker-id))
441
+ {:keys [branch-log main-log diff-stat]} (collect-divergence-context wt-path)
442
+ resolve-prompt (str "[oompa:" (or (:swarm-id worker) "unknown") ":" worker-id "] "
443
+ "Your branch has diverged from main and cannot merge cleanly.\n\n"
444
+ "Your branch's commits (not on main):\n" branch-log "\n\n"
445
+ "Commits on main since you branched:\n" main-log "\n\n"
446
+ "Divergence scope:\n" diff-stat "\n\n"
447
+ "Make this branch cleanly mergeable into main. "
448
+ "Preserve the intent of your branch's changes.\n"
449
+ "You have full git access — rebase, cherry-pick, resolve conflicts, "
450
+ "whatever works.\n"
451
+ "When done, verify with: git diff main --stat")
452
+ abs-wt (.getAbsolutePath (io/file wt-path))
453
+ cmd (harness/build-cmd (:harness worker)
454
+ {:cwd abs-wt :model (:model worker) :prompt resolve-prompt})
455
+ result (try
456
+ (process/sh cmd {:dir abs-wt
457
+ :in (harness/process-stdin (:harness worker) resolve-prompt)
458
+ :out :string :err :string})
459
+ (catch Exception e
460
+ {:exit -1 :out "" :err (.getMessage e)}))]
461
+ (if (zero? (:exit result))
462
+ ;; Agent ran — verify the branch actually merges cleanly now
463
+ (if (verify-mergeable? wt-path)
464
+ (do (println (format "[%s] Agent resolved divergence, branch is mergeable" worker-id))
465
+ :resolved)
466
+ (do (println (format "[%s] Agent ran but branch still can't merge cleanly" worker-id))
467
+ :failed))
468
+ (do (println (format "[%s] Resolver agent failed (exit %d)" worker-id (:exit result)))
469
+ :failed))))))
470
+
366
471
  (defn- worktree-has-changes?
367
472
  "Check if worktree has committed OR uncommitted changes vs main.
368
473
  Workers commit before signaling merge, so we must check both:
@@ -395,6 +500,47 @@
395
500
  {:dir wt-dir :branch wt-branch}))))
396
501
  {:dir wt-dir :branch wt-branch :path wt-path}))
397
502
 
503
+ (defn- detect-claimed-tasks
504
+ "Diff current/ task IDs before and after agent ran.
505
+ Returns set of task IDs this worker claimed during iteration."
506
+ [pre-current-ids]
507
+ (let [post-ids (tasks/current-task-ids)]
508
+ (clojure.set/difference post-ids pre-current-ids)))
509
+
510
+ (defn- emit-cycle-log!
511
+ "Write cycle event log. Called at every cycle exit point.
512
+ session-id links to the Claude CLI conversation transcript on disk.
513
+ No mutable summary state — all state is derived from immutable cycle logs."
514
+ [swarm-id worker-id cycle run start-ms session-id
515
+ {:keys [outcome claimed-task-ids recycled-tasks error-snippet review-rounds]}]
516
+ (let [duration-ms (- (System/currentTimeMillis) start-ms)]
517
+ (runs/write-cycle-log!
518
+ swarm-id worker-id cycle
519
+ {:run run
520
+ :outcome outcome
521
+ :duration-ms duration-ms
522
+ :claimed-task-ids (vec (or claimed-task-ids []))
523
+ :recycled-tasks (or recycled-tasks [])
524
+ :error-snippet error-snippet
525
+ :review-rounds (or review-rounds 0)
526
+ :session-id session-id})))
527
+
528
+ (defn- recycle-orphaned-tasks!
529
+ "Recycle tasks that a worker claimed but didn't complete.
530
+ Compares current/ task IDs before and after the agent ran —
531
+ new IDs that appeared are tasks this worker claimed. On failure
532
+ or rejection, move them back to pending/ so other workers can
533
+ pick them up. Returns count of recycled tasks."
534
+ [worker-id pre-current-ids]
535
+ (let [post-current-ids (tasks/current-task-ids)
536
+ orphaned-ids (clojure.set/difference post-current-ids pre-current-ids)
537
+ recycled (when (seq orphaned-ids)
538
+ (tasks/recycle-tasks! orphaned-ids))]
539
+ (when (seq recycled)
540
+ (println (format "[%s] Recycled %d orphaned task(s): %s"
541
+ worker-id (count recycled) (str/join ", " recycled))))
542
+ (count (or recycled []))))
543
+
398
544
  (defn- cleanup-worktree!
399
545
  "Remove worktree and branch."
400
546
  [project-root wt-dir wt-branch]
@@ -433,10 +579,10 @@
433
579
 
434
580
  (defn- merge-to-main!
435
581
  "Merge worktree changes to main branch. Serialized via merge-lock to prevent
436
- concurrent workers from corrupting the git index. On success, annotates any
437
- newly-completed tasks with worker metadata. Returns true on success.
438
- review-rounds: number of review rounds (0 for auto-merged task-only changes)."
439
- [wt-path wt-id worker-id project-root review-rounds]
582
+ concurrent workers from corrupting the git index. On success, moves claimed
583
+ tasks current→complete and annotates metadata. Returns true on success.
584
+ claimed-task-ids: set of task IDs this worker claimed (framework owns completion)."
585
+ [wt-path wt-id worker-id project-root review-rounds claimed-task-ids]
440
586
  (locking merge-lock
441
587
  (println (format "[%s] Merging changes to main" worker-id))
442
588
  (let [;; Commit in worktree if needed (no-op if already committed)
@@ -457,10 +603,26 @@
457
603
  (if success
458
604
  (do
459
605
  (println (format "[%s] Merge successful" worker-id))
460
- ;; Annotate completed tasks while still holding merge-lock
606
+ ;; Framework-owned completion: move claimed tasks current→complete
607
+ (when (seq claimed-task-ids)
608
+ (let [completed (tasks/complete-by-ids! claimed-task-ids)]
609
+ (when (seq completed)
610
+ (println (format "[%s] Completed %d task(s): %s"
611
+ worker-id (count completed) (str/join ", " completed))))))
612
+ ;; Annotate completed tasks with metadata while still holding merge-lock
461
613
  (annotate-completed-tasks! project-root worker-id review-rounds))
462
- (when merge-result
463
- (println (format "[%s] MERGE FAILED: %s" worker-id (:err merge-result)))))
614
+ ;; FAILED: Clean up git state before releasing merge-lock.
615
+ ;; Without this, a conflict leaves .git/MERGE_HEAD and poisons the
616
+ ;; shared index — every subsequent worker fails on `git checkout main`.
617
+ (do
618
+ (println (format "[%s] MERGE FAILED: %s" worker-id
619
+ (or (:err merge-result) (:err checkout-result))))
620
+ (let [abort-result (process/sh ["git" "merge" "--abort"]
621
+ {:dir project-root :out :string :err :string})]
622
+ (when-not (zero? (:exit abort-result))
623
+ ;; Abort failed (no merge in progress, or other issue) — hard reset.
624
+ (process/sh ["git" "reset" "--hard" "HEAD"]
625
+ {:dir project-root :out :string :err :string})))))
464
626
  success)))
465
627
 
466
628
  (defn- task-only-diff?
@@ -492,7 +654,7 @@
492
654
  Writes review logs to runs/{swarm-id}/reviews/ for post-mortem analysis.
493
655
  Returns {:approved? bool, :attempts int}"
494
656
  [worker wt-path worker-id iteration]
495
- (if-not (and (:review-harness worker) (:review-model worker))
657
+ (if (empty? (:reviewers worker))
496
658
  ;; No reviewer configured, auto-approve
497
659
  {:approved? true :attempts 0}
498
660
 
@@ -516,12 +678,8 @@
516
678
  (println (format "[%s] Reviewer APPROVED (attempt %d)" worker-id attempt))
517
679
  {:approved? true :attempts attempt})
518
680
 
519
- :rejected
520
- (do
521
- (println (format "[%s] Reviewer REJECTED (attempt %d)" worker-id attempt))
522
- {:approved? false :attempts attempt})
523
-
524
- ;; :needs-changes
681
+ ;; :needs-changes — always give the worker a chance to fix.
682
+ ;; Hard rejection only happens when max review rounds are exhausted.
525
683
  (let [all-feedback (conj prev-feedback output)]
526
684
  (if (>= attempt max-review-retries)
527
685
  (do
@@ -536,183 +694,298 @@
536
694
  ;; Worker Loop
537
695
  ;; =============================================================================
538
696
 
539
- (def ^:private max-wait-for-tasks 60)
540
- (def ^:private wait-poll-interval 5)
541
- (def ^:private max-consecutive-errors 3)
697
+ ;; Workers can wait for tasks before giving up; default is 10 minutes.
698
+ ;; This keeps workers alive while planners/designers ramp up the queue.
699
+ (def ^:private wait-poll-interval 10)
700
+ (def ^:private max-consecutive-errors 5)
701
+
702
+ (defn- backoff-sleep! [id errors]
703
+ (when (< errors max-consecutive-errors)
704
+ (let [wait-sec (* 60 (int (Math/pow 2 (dec errors))))]
705
+ (println (format "[%s] Backing off for %d seconds before next retry (%d/%d)..." id wait-sec errors (dec max-consecutive-errors)))
706
+ (Thread/sleep (* 1000 wait-sec)))))
707
+
542
708
 
543
709
  (defn- wait-for-tasks!
544
- "Wait up to 60s for pending/current tasks to appear. Used for backpressure
545
- on workers that can't create their own tasks (can_plan: false)."
546
- [worker-id]
710
+ "Wait up to max-wait-seconds for pending/current tasks to appear.
711
+ Used for backpressure on workers that can't create their own tasks (can_plan: false).
712
+ Polls every 10 seconds, logs every 60 seconds."
713
+ [worker-id max-wait-seconds]
547
714
  (loop [waited 0]
548
715
  (cond
549
716
  (pos? (tasks/pending-count)) true
550
717
  (pos? (tasks/current-count)) true
551
- (>= waited max-wait-for-tasks)
552
- (do (println (format "[%s] No tasks after %ds, proceeding anyway" worker-id waited))
718
+ (>= waited max-wait-seconds)
719
+ (do (println (format "[%s] No tasks after %ds, giving up" worker-id waited))
553
720
  false)
554
721
  :else
555
- (do (when (zero? (mod waited 15))
556
- (println (format "[%s] Waiting for tasks... (%ds)" worker-id waited)))
722
+ (do (when (zero? (mod waited 60))
723
+ (println (format "[%s] Waiting for tasks... (%ds/%ds)" worker-id waited max-wait-seconds)))
557
724
  (Thread/sleep (* wait-poll-interval 1000))
558
725
  (recur (+ waited wait-poll-interval))))))
559
726
 
727
+ (defn- maybe-sleep-between!
728
+ "Sleep between iterations when wait-between is configured.
729
+ Called at the start of each iteration (except the first)."
730
+ [worker-id wait-between iter]
731
+ (when (and wait-between (> iter 1))
732
+ (println (format "[%s] Sleeping %ds before next iteration" worker-id wait-between))
733
+ (Thread/sleep (* wait-between 1000))))
734
+
560
735
  (defn run-worker!
561
736
  "Run worker loop with persistent sessions.
562
737
 
563
- Sessions persist across iterations — agents resume where they left off.
564
- Worktrees persist until COMPLETE_AND_READY_FOR_MERGE triggers review+merge.
565
- __DONE__ stops the worker entirely (planners only).
566
-
567
- Tracks per-worker metrics: merges, rejections, errors, review-rounds-total.
568
- Returns final worker state with metrics attached."
738
+ A run is a terminal outcome (merged/rejected/error-like).
739
+ A cycle is one worker turn/resume. Multiple cycles may occur in one run.
740
+ Cycle cap is controlled by :max-cycles (legacy key: :iterations)."
569
741
  [worker]
570
742
  (tasks/ensure-dirs!)
571
- (let [{:keys [id iterations]} worker
743
+ (let [{:keys [id runs max-cycles iterations swarm-id wait-between max-wait-for-tasks]} worker
744
+ cycle-cap (or max-cycles iterations 10)
745
+ run-goal (or runs iterations 10)
572
746
  project-root (System/getProperty "user.dir")]
573
- (println (format "[%s] Starting worker (%s:%s%s, %d iterations)"
747
+ (println (format "[%s] Starting worker (%s:%s%s, goal=%d runs, cap=%d cycles%s)"
574
748
  id
575
749
  (name (:harness worker))
576
750
  (or (:model worker) "default")
577
751
  (if (:reasoning worker) (str ":" (:reasoning worker)) "")
578
- iterations))
752
+ run-goal
753
+ cycle-cap
754
+ (if wait-between (format ", %ds between" wait-between) "")))
579
755
 
580
- ;; Backpressure: workers that can't create tasks wait for tasks to exist
581
- (when-not (:can-plan worker)
582
- (wait-for-tasks! id))
756
+ (when (and (not (:can-plan worker))
757
+ (not (pos? (tasks/pending-count)))
758
+ (not (pos? (tasks/current-count))))
759
+ (wait-for-tasks! id max-wait-for-tasks))
583
760
 
584
- ;; metrics tracks: {:merges N :rejections N :errors N :review-rounds-total N}
585
- (loop [iter 1
586
- completed 0
761
+ (loop [cycle 1
762
+ completed-runs 0
587
763
  consec-errors 0
588
- metrics {:merges 0 :rejections 0 :errors 0 :review-rounds-total 0}
589
- session-id nil ;; persistent session-id (nil = start fresh)
590
- wt-state nil] ;; {:dir :branch :path} or nil
764
+ metrics {:merges 0 :rejections 0 :errors 0 :recycled 0 :review-rounds-total 0 :claims 0}
765
+ session-id nil
766
+ wt-state nil
767
+ claimed-ids #{}
768
+ claim-resume-prompt nil
769
+ working-resumes 0]
591
770
  (let [finish (fn [status]
592
- (assoc worker :completed completed :status status
771
+ (assoc worker :completed completed-runs
772
+ :runs-completed completed-runs
773
+ :cycles-completed (dec cycle)
774
+ :status status
593
775
  :merges (:merges metrics)
594
776
  :rejections (:rejections metrics)
595
777
  :errors (:errors metrics)
596
- :review-rounds-total (:review-rounds-total metrics)))]
597
- (if (> iter iterations)
778
+ :recycled (:recycled metrics)
779
+ :review-rounds-total (:review-rounds-total metrics)
780
+ :claims (:claims metrics)))
781
+ current-run (inc completed-runs)]
782
+ (cond
783
+ (> cycle cycle-cap)
598
784
  (do
599
- ;; Cleanup any lingering worktree
600
785
  (when wt-state
601
786
  (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state)))
602
- (println (format "[%s] Completed %d iterations (%d merges, %d rejections, %d errors)"
603
- id completed (:merges metrics) (:rejections metrics) (:errors metrics)))
787
+ (println (format "[%s] Completed %d/%d runs in %d cycles (%d merges, %d claims, %d rejections, %d errors, %d recycled)"
788
+ id completed-runs run-goal (dec cycle)
789
+ (:merges metrics) (:claims metrics) (:rejections metrics) (:errors metrics) (:recycled metrics)))
604
790
  (finish :exhausted))
605
791
 
606
- ;; Ensure worktree exists (create fresh if nil, reuse if persisted)
607
- (let [wt-state (try
608
- (or wt-state (create-iteration-worktree! project-root id iter))
609
- (catch Exception e
610
- (println (format "[%s] Worktree creation failed: %s" id (.getMessage e)))
611
- nil))]
612
- (if (nil? wt-state)
613
- ;; Worktree creation failed — count as error
614
- (let [errors (inc consec-errors)
615
- metrics (update metrics :errors inc)]
616
- (if (>= errors max-consecutive-errors)
617
- (do
618
- (println (format "[%s] %d consecutive errors, stopping" id errors))
619
- (finish :error))
620
- (recur (inc iter) completed errors metrics nil nil)))
621
-
622
- ;; Worktree ready — run agent
623
- (let [resume? (some? session-id)
624
- _ (println (format "[%s] %s iteration %d/%d"
625
- id (if resume? "Resuming" "Starting") iter iterations))
626
- context (build-context)
627
- {:keys [output exit done? merge?] :as agent-result}
628
- (run-agent! worker (:path wt-state) context session-id resume?)
629
- new-session-id (:session-id agent-result)]
630
-
631
- (cond
632
- ;; Agent errored — cleanup, reset session
633
- (not (zero? exit))
634
- (let [errors (inc consec-errors)
635
- metrics (update metrics :errors inc)]
636
- (println (format "[%s] Agent error (exit %d): %s"
637
- id exit (subs (or output "") 0 (min 200 (count (or output ""))))))
638
- (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
639
- (if (>= errors max-consecutive-errors)
640
- (do
641
- (println (format "[%s] %d consecutive errors, stopping" id errors))
642
- (finish :error))
643
- (recur (inc iter) completed errors metrics nil nil)))
644
-
645
- ;; COMPLETE_AND_READY_FOR_MERGE — review, merge, reset session
646
- merge?
647
- (if (worktree-has-changes? (:path wt-state))
648
- (if (task-only-diff? (:path wt-state))
649
- ;; Task-only changes — skip review, auto-merge
650
- (do
651
- (println (format "[%s] Task-only diff, auto-merging" id))
652
- (let [merged? (merge-to-main! (:path wt-state) (:branch wt-state) id project-root 0)
653
- metrics (if merged? (update metrics :merges inc) metrics)]
654
- (println (format "[%s] Iteration %d/%d complete" id iter iterations))
655
- (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
656
- (if (and done? (:can-plan worker))
657
- (do
658
- (println (format "[%s] Worker done after merge" id))
659
- (assoc worker :completed (inc completed) :status :done
660
- :merges (:merges metrics)
661
- :rejections (:rejections metrics)
662
- :errors (:errors metrics)
663
- :review-rounds-total (:review-rounds-total metrics)))
664
- (recur (inc iter) (inc completed) 0 metrics nil nil))))
665
- ;; Code changes — full review loop
666
- (let [{:keys [approved? attempts]} (review-loop! worker (:path wt-state) id iter)
667
- metrics (-> metrics
668
- (update :review-rounds-total + (or attempts 0))
669
- (update (if approved? :merges :rejections) inc))]
670
- (if approved?
671
- (do
672
- (merge-to-main! (:path wt-state) (:branch wt-state) id project-root (or attempts 0))
673
- (println (format "[%s] Iteration %d/%d complete" id iter iterations))
674
- (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
675
- ;; If also __DONE__, stop after merge
676
- (if (and done? (:can-plan worker))
677
- (do
678
- (println (format "[%s] Worker done after merge" id))
679
- (assoc worker :completed (inc completed) :status :done
680
- :merges (:merges metrics)
681
- :rejections (:rejections metrics)
682
- :errors (:errors metrics)
683
- :review-rounds-total (:review-rounds-total metrics)))
684
- (recur (inc iter) (inc completed) 0 metrics nil nil)))
685
- (do
686
- (println (format "[%s] Iteration %d/%d rejected" id iter iterations))
687
- (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
688
- (recur (inc iter) completed 0 metrics nil nil)))))
792
+ (>= completed-runs run-goal)
793
+ (do
794
+ (when wt-state
795
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state)))
796
+ (println (format "[%s] Reached run goal: %d/%d runs in %d cycles"
797
+ id completed-runs run-goal (dec cycle)))
798
+ (finish :completed))
799
+
800
+ @shutdown-requested?
801
+ (do
802
+ (println (format "[%s] Shutdown requested, stopping after %d cycles" id (dec cycle)))
803
+ (when wt-state
804
+ (when (seq claimed-ids)
805
+ (let [recycled (tasks/recycle-tasks! claimed-ids)]
806
+ (when (seq recycled)
807
+ (println (format "[%s] Recycled %d claimed task(s) on shutdown" id (count recycled))))))
808
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state)))
809
+ (emit-cycle-log! swarm-id id cycle current-run (System/currentTimeMillis) session-id
810
+ {:outcome :interrupted})
811
+ (finish :interrupted))
812
+
813
+ :else
814
+ (do
815
+ (maybe-sleep-between! id wait-between cycle)
816
+
817
+ (when (and (not (:can-plan worker))
818
+ (not (pos? (tasks/pending-count)))
819
+ (not (pos? (tasks/current-count))))
820
+ (println (format "[%s] Queue empty, waiting for tasks before cycle %d" id cycle))
821
+ (wait-for-tasks! id max-wait-for-tasks))
822
+
823
+ (let [wt-state (try
824
+ (or wt-state (create-iteration-worktree! project-root id cycle))
825
+ (catch Exception e
826
+ (println (format "[%s] Worktree creation failed: %s" id (.getMessage e)))
827
+ nil))]
828
+ (if (nil? wt-state)
829
+ (let [errors (inc consec-errors)
830
+ metrics (update metrics :errors inc)]
831
+ (if (>= errors max-consecutive-errors)
689
832
  (do
690
- (println (format "[%s] Merge signaled but no changes, skipping" id))
833
+ (println (format "[%s] %d consecutive errors, stopping" id errors))
834
+ (finish :error))
835
+ (do (backoff-sleep! id errors) (recur (inc cycle) completed-runs errors metrics nil nil #{} nil 0))))
836
+
837
+ (let [resume? (or (some? session-id) (some? claim-resume-prompt))
838
+ cycle-start-ms (System/currentTimeMillis)
839
+ pre-current-ids (tasks/current-task-ids)
840
+ _ (println (format "[%s] %s cycle %d/%d (run %d/%d)"
841
+ id (if resume? "Resuming" "Starting") cycle cycle-cap current-run run-goal))
842
+ context (build-context)
843
+ {:keys [output exit done? merge? claim-ids] :as agent-result}
844
+ (run-agent! worker (:path wt-state) context session-id resume?
845
+ :resume-prompt-override claim-resume-prompt)
846
+ new-session-id (:session-id agent-result)
847
+ mv-claimed-tasks (detect-claimed-tasks pre-current-ids)]
848
+ (cond
849
+ (not (zero? exit))
850
+ (let [errors (inc consec-errors)
851
+ recycled (recycle-orphaned-tasks! id pre-current-ids)
852
+ metrics (-> metrics (update :errors inc) (update :recycled + recycled))
853
+ error-msg (subs (or output "") 0 (min 200 (count (or output ""))))]
854
+ (println (format "[%s] Agent error (exit %d): %s" id exit error-msg))
855
+ (emit-cycle-log! swarm-id id cycle current-run cycle-start-ms new-session-id
856
+ {:outcome :error
857
+ :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))
858
+ :recycled-tasks (when (pos? recycled) (vec mv-claimed-tasks))
859
+ :error-snippet error-msg})
860
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
861
+ (if (>= errors max-consecutive-errors)
862
+ (do
863
+ (println (format "[%s] %d consecutive errors, stopping" id errors))
864
+ (finish :error))
865
+ (do (backoff-sleep! id errors) (recur (inc cycle) (inc completed-runs) errors metrics nil nil #{} nil 0))))
866
+
867
+ (and (seq claim-ids) (not merge?) (not done?))
868
+ (let [_ (println (format "[%s] CLAIM signal: %s" id (str/join ", " claim-ids)))
869
+ {:keys [claimed resume-prompt]} (execute-claims! claim-ids)
870
+ new-claimed-ids (into claimed-ids claimed)
871
+ metrics (update metrics :claims + (count claimed))]
872
+ (println (format "[%s] Claimed %d/%d tasks" id (count claimed) (count claim-ids)))
873
+ (emit-cycle-log! swarm-id id cycle current-run cycle-start-ms new-session-id
874
+ {:outcome :claimed :claimed-task-ids (vec claimed)})
875
+ (recur (inc cycle) completed-runs 0 metrics new-session-id wt-state
876
+ new-claimed-ids resume-prompt 0))
877
+
878
+ merge?
879
+ (if (worktree-has-changes? (:path wt-state))
880
+ (if (task-only-diff? (:path wt-state))
881
+ (do
882
+ (println (format "[%s] Task-only diff, auto-merging" id))
883
+ (let [sync-status (sync-worktree-to-main! worker (:path wt-state) id)
884
+ all-claimed (into claimed-ids mv-claimed-tasks)]
885
+ (if (= :failed sync-status)
886
+ (do
887
+ (println (format "[%s] Sync to main failed, skipping merge" id))
888
+ (emit-cycle-log! swarm-id id cycle current-run cycle-start-ms new-session-id
889
+ {:outcome :sync-failed :claimed-task-ids (vec all-claimed)})
890
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
891
+ (recur (inc cycle) (inc completed-runs) 0 metrics nil nil #{} nil 0))
892
+ (let [merged? (merge-to-main! (:path wt-state) (:branch wt-state) id project-root 0 all-claimed)
893
+ metrics (if merged? (update metrics :merges inc) metrics)]
894
+ (println (format "[%s] Cycle %d/%d complete" id cycle cycle-cap))
895
+ (emit-cycle-log! swarm-id id cycle current-run cycle-start-ms new-session-id
896
+ {:outcome (if merged? :merged :merge-failed)
897
+ :claimed-task-ids (vec all-claimed)
898
+ :review-rounds 0})
899
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
900
+ (recur (inc cycle) (inc completed-runs) 0 metrics nil nil #{} nil 0)))))
901
+ (let [{:keys [approved? attempts]} (review-loop! worker (:path wt-state) id cycle)
902
+ metrics (-> metrics
903
+ (update :review-rounds-total + (or attempts 0))
904
+ (cond-> (not approved?) (update :rejections inc)))]
905
+ (if approved?
906
+ (let [sync-status (sync-worktree-to-main! worker (:path wt-state) id)
907
+ all-claimed (into claimed-ids mv-claimed-tasks)]
908
+ (if (= :failed sync-status)
909
+ (do
910
+ (println (format "[%s] Sync to main failed after approval, skipping merge" id))
911
+ (emit-cycle-log! swarm-id id cycle current-run cycle-start-ms new-session-id
912
+ {:outcome :sync-failed
913
+ :claimed-task-ids (vec all-claimed)
914
+ :review-rounds (or attempts 0)})
915
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
916
+ (recur (inc cycle) (inc completed-runs) 0 metrics nil nil #{} nil 0))
917
+ (let [merged? (merge-to-main! (:path wt-state) (:branch wt-state) id project-root (or attempts 0) all-claimed)
918
+ metrics (if merged? (update metrics :merges inc) metrics)]
919
+ (println (format "[%s] Cycle %d/%d complete" id cycle cycle-cap))
920
+ (emit-cycle-log! swarm-id id cycle current-run cycle-start-ms new-session-id
921
+ {:outcome (if merged? :merged :merge-failed)
922
+ :claimed-task-ids (vec all-claimed)
923
+ :review-rounds (or attempts 0)})
924
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
925
+ (recur (inc cycle) (inc completed-runs) 0 metrics nil nil #{} nil 0))))
926
+ (let [recycled (recycle-orphaned-tasks! id pre-current-ids)
927
+ metrics (update metrics :recycled + recycled)]
928
+ (println (format "[%s] Cycle %d/%d rejected" id cycle cycle-cap))
929
+ (emit-cycle-log! swarm-id id cycle current-run cycle-start-ms new-session-id
930
+ {:outcome :rejected
931
+ :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))
932
+ :recycled-tasks (when (pos? recycled) (vec mv-claimed-tasks))
933
+ :review-rounds (or attempts 0)})
934
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
935
+ (recur (inc cycle) (inc completed-runs) 0 metrics nil nil #{} nil 0)))))
936
+ (let [recycled (recycle-orphaned-tasks! id pre-current-ids)
937
+ metrics (update metrics :recycled + recycled)]
938
+ (println (format "[%s] Merge signaled but no changes, skipping" id))
939
+ (emit-cycle-log! swarm-id id cycle current-run cycle-start-ms new-session-id
940
+ {:outcome :no-changes
941
+ :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))
942
+ :recycled-tasks (when (pos? recycled) (vec mv-claimed-tasks))})
943
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
944
+ (recur (inc cycle) (inc completed-runs) 0 metrics nil nil #{} nil 0)))
945
+
946
+ done?
947
+ (let [recycled (recycle-orphaned-tasks! id pre-current-ids)
948
+ metrics (update metrics :recycled + recycled)]
949
+ (println (format "[%s] __DONE__ signal, resetting session (cycle %d/%d)" id cycle cycle-cap))
950
+ (emit-cycle-log! swarm-id id cycle current-run cycle-start-ms new-session-id
951
+ {:outcome :executor-done
952
+ :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))
953
+ :recycled-tasks (when (pos? recycled) (vec mv-claimed-tasks))})
691
954
  (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
692
- (recur (inc iter) completed 0 metrics nil nil)))
693
-
694
- ;; __DONE__ without merge — only honor for planners
695
- (and done? (:can-plan worker))
696
- (do
697
- (println (format "[%s] Received __DONE__ signal" id))
698
- (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
699
- (println (format "[%s] Worker done after %d/%d iterations" id iter iterations))
700
- (finish :done))
701
-
702
- ;; __DONE__ from executor — ignore signal, but reset session since
703
- ;; the agent process exited. Resuming a dead session causes exit 1
704
- ;; which cascades into consecutive errors and premature stopping.
705
- (and done? (not (:can-plan worker)))
706
- (do
707
- (println (format "[%s] Ignoring __DONE__ (executor), resetting session" id))
708
- (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
709
- (recur (inc iter) completed 0 metrics nil nil))
710
-
711
- ;; No signal — agent still working, resume next iteration
712
- :else
713
- (do
714
- (println (format "[%s] Working... (will resume)" id))
715
- (recur (inc iter) completed 0 metrics new-session-id wt-state)))))))))))
955
+ (recur (inc cycle) completed-runs 0 metrics nil nil #{} nil 0))
956
+
957
+ :else
958
+ (let [wr (inc working-resumes)
959
+ max-wr (:max-working-resumes worker)]
960
+ (cond
961
+ (> wr max-wr)
962
+ (let [recycled (recycle-orphaned-tasks! id pre-current-ids)
963
+ metrics (update metrics :recycled + recycled)]
964
+ (println (format "[%s] Stuck after %d working resumes + nudge, resetting session" id wr))
965
+ (emit-cycle-log! swarm-id id cycle current-run cycle-start-ms new-session-id
966
+ {:outcome :stuck
967
+ :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))
968
+ :recycled-tasks (when (pos? recycled) (vec mv-claimed-tasks))})
969
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
970
+ (recur (inc cycle) (inc completed-runs) 0 metrics nil nil #{} nil 0))
971
+
972
+ (= wr max-wr)
973
+ (do
974
+ (println (format "[%s] Working... %d/%d resumes, nudging agent to wrap up" id wr max-wr))
975
+ (emit-cycle-log! swarm-id id cycle current-run cycle-start-ms new-session-id
976
+ {:outcome :working
977
+ :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))})
978
+ (recur (inc cycle) completed-runs 0 metrics new-session-id wt-state
979
+ claimed-ids nudge-prompt wr))
980
+
981
+ :else
982
+ (do
983
+ (println (format "[%s] Working... (will resume, %d/%d)" id wr max-wr))
984
+ (emit-cycle-log! swarm-id id cycle current-run cycle-start-ms new-session-id
985
+ {:outcome :working
986
+ :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))})
987
+ (recur (inc cycle) completed-runs 0 metrics new-session-id wt-state
988
+ claimed-ids nil wr))))))))))))))
716
989
 
717
990
  ;; =============================================================================
718
991
  ;; Multi-Worker Execution
@@ -720,7 +993,7 @@
720
993
 
721
994
  (defn run-workers!
722
995
  "Run multiple workers in parallel.
723
- Writes swarm summary to runs/{swarm-id}/summary.edn on completion.
996
+ Writes stopped event to runs/{swarm-id}/stopped.json on completion.
724
997
 
725
998
  Arguments:
726
999
  workers - seq of worker configs
@@ -731,32 +1004,53 @@
731
1004
  (let [swarm-id (-> workers first :swarm-id)]
732
1005
  (println (format "Launching %d workers..." (count workers)))
733
1006
 
734
- (let [futures (doall
735
- (map-indexed
736
- (fn [idx worker]
737
- (let [worker (assoc worker :id (or (:id worker) (str "w" idx)))]
738
- (future (run-worker! worker))))
739
- workers))]
740
-
741
- (println "All workers launched. Waiting for completion...")
742
- (let [results (mapv deref futures)]
743
- (println "\nAll workers complete.")
744
- (doseq [w results]
745
- (println (format " [%s] %s - %d completed, %d merges, %d rejections, %d errors, %d review rounds"
746
- (:id w)
747
- (name (:status w))
748
- (:completed w)
749
- (or (:merges w) 0)
750
- (or (:rejections w) 0)
751
- (or (:errors w) 0)
752
- (or (:review-rounds-total w) 0))))
753
-
754
- ;; Write swarm summary to disk
755
- (when swarm-id
756
- (runs/write-summary! swarm-id results)
757
- (println (format "\nSwarm summary written to runs/%s/summary.edn" swarm-id)))
758
-
759
- results))))
1007
+ ;; Register JVM shutdown hook so SIGTERM/SIGINT triggers graceful stop.
1008
+ ;; Sets the shutdown atom — workers check it between cycles and exit cleanly.
1009
+ ;; The hook waits for workers to finish, then writes stopped.json only if
1010
+ ;; the clean exit path hasn't already done so (guarded by the atom).
1011
+ (let [hook (Thread. (fn []
1012
+ (println "\nShutdown signal received, stopping workers after current cycle...")
1013
+ (reset! shutdown-requested? true)
1014
+ ;; Give workers time to finish current cycle and cleanup.
1015
+ ;; After sleep, write stopped.json only if still in shutdown
1016
+ ;; (clean exit resets the atom to false before writing :completed).
1017
+ (Thread/sleep 10000)
1018
+ (when (and swarm-id @shutdown-requested?)
1019
+ (runs/write-stopped! swarm-id :interrupted))))]
1020
+ (.addShutdownHook (Runtime/getRuntime) hook)
1021
+
1022
+ (let [futures (doall
1023
+ (map-indexed
1024
+ (fn [idx worker]
1025
+ (let [worker (assoc worker :id (or (:id worker) (str "w" idx)))]
1026
+ (future (run-worker! worker))))
1027
+ workers))]
1028
+
1029
+ (println "All workers launched. Waiting for completion...")
1030
+ (let [results (mapv deref futures)]
1031
+ ;; Clean exit — tell shutdown hook not to write stopped.json
1032
+ (reset! shutdown-requested? false)
1033
+ ;; Remove the hook so it doesn't accumulate across calls
1034
+ (try (.removeShutdownHook (Runtime/getRuntime) hook) (catch Exception _))
1035
+ (println "\nAll workers complete.")
1036
+ (doseq [w results]
1037
+ (println (format " [%s] %s - %d completed, %d merges, %d claims, %d rejections, %d errors, %d recycled, %d review rounds"
1038
+ (:id w)
1039
+ (name (:status w))
1040
+ (:completed w)
1041
+ (or (:merges w) 0)
1042
+ (or (:claims w) 0)
1043
+ (or (:rejections w) 0)
1044
+ (or (:errors w) 0)
1045
+ (or (:recycled w) 0)
1046
+ (or (:review-rounds-total w) 0))))
1047
+
1048
+ ;; Write stopped event — all state derivable from cycle logs
1049
+ (when swarm-id
1050
+ (runs/write-stopped! swarm-id :completed)
1051
+ (println (format "\nStopped event written to runs/%s/stopped.json" swarm-id)))
1052
+
1053
+ results)))))
760
1054
 
761
1055
  ;; =============================================================================
762
1056
  ;; Planner — first-class config concept, NOT a worker
@@ -781,10 +1075,12 @@
781
1075
  {:tasks-created 0})
782
1076
  ;; Run agent
783
1077
  (let [context (build-context)
1078
+ template-tokens (build-template-tokens context)
784
1079
  prompt-text (str (when (seq prompts)
785
1080
  (->> prompts
786
1081
  (map load-prompt)
787
1082
  (remove nil?)
1083
+ (map #(agent/tokenize % template-tokens))
788
1084
  (str/join "\n\n")))
789
1085
  "\n\nTask Status: " (:task_status context) "\n"
790
1086
  "Pending: " (:pending_tasks context) "\n\n"
@@ -794,29 +1090,17 @@
794
1090
  swarm-id* (or swarm-id "unknown")
795
1091
  tagged-prompt (str "[oompa:" swarm-id* ":planner] " prompt-text)
796
1092
  abs-root (.getAbsolutePath (io/file project-root))
797
- opencode-attach (opencode-attach-url)
798
-
799
- cmd (case harness
800
- :codex (cond-> [(resolve-binary! "codex") "exec"
801
- "--dangerously-bypass-approvals-and-sandbox"
802
- "--skip-git-repo-check"
803
- "-C" abs-root]
804
- model (into ["--model" model])
805
- true (conj "--" tagged-prompt))
806
- :claude (cond-> [(resolve-binary! "claude") "-p" "--dangerously-skip-permissions"]
807
- model (into ["--model" model]))
808
- :opencode (cond-> [(resolve-binary! "opencode") "run"]
809
- model (into ["-m" model])
810
- opencode-attach (into ["--attach" opencode-attach])
811
- true (conj tagged-prompt)))
1093
+
1094
+ cmd (harness/build-cmd harness
1095
+ {:cwd abs-root :model model :prompt tagged-prompt})
812
1096
 
813
1097
  _ (println (format "[planner] Running (%s:%s, max_pending: %d, current: %d)"
814
1098
  (name harness) (or model "default") max-pending pending-before))
815
1099
 
816
1100
  result (try
817
- (if (= harness :claude)
818
- (process/sh cmd {:dir abs-root :in tagged-prompt :out :string :err :string})
819
- (process/sh cmd {:dir abs-root :out :string :err :string}))
1101
+ (process/sh cmd {:dir abs-root
1102
+ :in (harness/process-stdin harness tagged-prompt)
1103
+ :out :string :err :string})
820
1104
  (catch Exception e
821
1105
  (println (format "[planner] Agent exception: %s" (.getMessage e)))
822
1106
  {:exit -1 :out "" :err (.getMessage e)}))