@nbardy/oompa 0.7.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,11 +14,13 @@
14
14
  No separate orchestrator - workers self-organize."
15
15
  (:require [agentnet.tasks :as tasks]
16
16
  [agentnet.agent :as agent]
17
+ [agentnet.core :as core]
18
+ [agentnet.harness :as harness]
17
19
  [agentnet.worktree :as worktree]
18
20
  [agentnet.runs :as runs]
19
- [cheshire.core :as json]
20
21
  [babashka.process :as process]
21
22
  [clojure.java.io :as io]
23
+ [clojure.set]
22
24
  [clojure.string :as str]))
23
25
 
24
26
  ;; =============================================================================
@@ -33,25 +35,11 @@
33
35
  ;; git index corruption from parallel checkout+merge operations.
34
36
  (def ^:private merge-lock (Object.))
35
37
 
36
- ;; Resolve absolute paths for CLI binaries at first use.
37
- ;; ProcessBuilder with :dir set can fail to find bare command names on some
38
- ;; platforms (macOS + babashka), so we resolve once via `which` and cache.
39
- (def ^:private binary-paths* (atom {}))
40
-
41
- (defn- resolve-binary!
42
- "Resolve the absolute path of a CLI binary. Caches result.
43
- Throws if binary not found on PATH."
44
- [name]
45
- (or (get @binary-paths* name)
46
- (let [result (try
47
- (process/sh ["which" name] {:out :string :err :string})
48
- (catch Exception _ {:exit -1 :out "" :err ""}))
49
- path (when (zero? (:exit result))
50
- (str/trim (:out result)))]
51
- (if path
52
- (do (swap! binary-paths* assoc name path)
53
- path)
54
- (throw (ex-info (str "Binary not found on PATH: " name) {:binary name}))))))
38
+ ;; Set by JVM shutdown hook (SIGTERM/SIGINT). Workers check this between cycles
39
+ ;; and exit gracefully after finishing the current cycle.
40
+ (def ^:private shutdown-requested? (atom false))
41
+
42
+ (declare task-root-for-cwd)
55
43
 
56
44
  (defn- load-prompt
57
45
  "Load a prompt file. Tries path as-is first, then from package root."
@@ -59,14 +47,61 @@
59
47
  (or (agent/load-custom-prompt path)
60
48
  (agent/load-custom-prompt (str package-root "/" path))))
61
49
 
50
(defn- build-template-tokens
  "Assemble the token map used for {var} substitution in prompt templates.

  Three layers are merged, later layers winning on key collisions:
    1. fallback values for :mode_hint, :targets and :recent_sec, plus
       :TASK_ROOT / :TASKS_ROOT as computed by task-root-for-cwd
       (the fallbacks exist so those {vars} never reach a prompt unreplaced);
    2. the map returned by core/build-context for the pending tasks;
    3. worker-context, the caller-supplied map.

  cwd is the directory the tasks root is computed relative to. When it is
  nil (or the one-argument arity is used) the JVM's user.dir is used."
  ([worker-context]
   (build-template-tokens worker-context nil))
  ([worker-context cwd]
   (let [process-dir (System/getProperty "user.dir")
         shared-ctx  (core/build-context {:tasks (tasks/list-pending)
                                          :repo  process-dir})
         root        (task-root-for-cwd (or cwd process-dir))
         fallbacks   {:mode_hint  "propose"
                      :targets    "*"
                      :recent_sec "180"
                      :TASK_ROOT  root
                      :TASKS_ROOT root}]
     (merge fallbacks shared-ctx worker-context))))
70
+
71
(defn- task-root-for-cwd
  "Return the tasks root, relative to cwd, for commands issued from cwd.

  Result is one of:
    \"tasks\"     - cwd itself contains a `tasks` entry, or none was found
                    (the default);
    \"../tasks\"  - cwd has no `tasks` entry but its parent directory does.

  cwd may be absolute or relative. It is resolved to an absolute, lexically
  normalized path before its parent is looked up: a bare relative path such
  as \".w1\" has no parent according to java.io.File, which would otherwise
  make the \"../tasks\" case unreachable for it."
  [cwd]
  (let [dir       (-> (io/file cwd) .toPath .toAbsolutePath .normalize .toFile)
        ;; true when directory d is non-nil and has a `tasks` entry
        tasks-in? (fn [^java.io.File d]
                    (boolean (and d (.exists (io/file d "tasks")))))]
    (cond
      (tasks-in? dir)                  "tasks"
      ;; .getParentFile is nil only at a filesystem root
      (tasks-in? (.getParentFile dir)) "../tasks"
      :else                            "tasks")))
81
+
82
(defn- render-task-header
  "Substitute the runtime tasks root into the auto-injected task header.

  raw-header is the header template text (nil is treated as \"\").
  cwd is handed to task-root-for-cwd to compute the replacement value.
  Both spellings of the placeholder (TASK_ROOT / TASKS_ROOT) are replaced,
  in double-brace and single-brace form. Double-brace forms are handled
  first so the single-brace pass cannot leave stray braces behind."
  [raw-header cwd]
  (let [root (task-root-for-cwd cwd)]
    (reduce (fn [text placeholder]
              (str/replace text placeholder root))
            (or raw-header "")
            ["{{TASK_ROOT}}" "{{TASKS_ROOT}}" "{TASK_ROOT}" "{TASKS_ROOT}"])))
91
+
92
+ (def ^:private default-max-working-resumes 5)
93
+
62
94
  (defn create-worker
63
95
  "Create a worker config.
64
96
  :prompts is a string or vector of strings — paths to prompt files.
65
97
  :can-plan when false, worker waits for tasks before starting (backpressure).
66
98
  :reasoning reasoning effort level (e.g. \"low\", \"medium\", \"high\") — codex only.
67
- :review-prompts paths to reviewer prompt files (loaded and concatenated for review)."
99
+ :review-prompts paths to reviewer prompt files (loaded and concatenated for review).
100
+ :wait-between seconds to sleep between iterations (nil or 0 = no wait).
101
+ :max-working-resumes max consecutive working resumes before nudge+kill (default 5)."
68
102
  [{:keys [id swarm-id harness model iterations prompts can-plan reasoning
69
- review-harness review-model review-prompts]}]
103
+ review-harness review-model review-prompts wait-between
104
+ max-working-resumes]}]
70
105
  {:id id
71
106
  :swarm-id swarm-id
72
107
  :harness (or harness :codex)
@@ -78,12 +113,14 @@
78
113
  :else [])
79
114
  :can-plan (if (some? can-plan) can-plan true)
80
115
  :reasoning reasoning
116
+ :wait-between (when (and wait-between (pos? wait-between)) wait-between)
81
117
  :review-harness review-harness
82
118
  :review-model review-model
83
119
  :review-prompts (cond
84
120
  (vector? review-prompts) review-prompts
85
121
  (string? review-prompts) [review-prompts]
86
122
  :else [])
123
+ :max-working-resumes (or max-working-resumes default-max-working-resumes)
87
124
  :completed 0
88
125
  :status :idle})
89
126
 
@@ -93,6 +130,18 @@
93
130
 
94
131
  (def ^:private max-review-retries 3)
95
132
 
133
+ ;; Nudge prompt injected when a worker hits max-working-resumes consecutive
134
+ ;; "working" outcomes without signaling. Gives the agent one final chance to
135
+ ;; produce something mergeable before the session is killed.
136
+ (def ^:private nudge-prompt
137
+ (str "You have been working for a long time without signaling completion.\n"
138
+ "You MUST take one of these actions NOW:\n\n"
139
+ "1. If you have meaningful changes: commit them and signal COMPLETE_AND_READY_FOR_MERGE\n"
140
+ "2. If scope is too large: create follow-up tasks in tasks/pending/ for remaining work,\n"
141
+ " commit what you have (even partial notes/design docs), and signal COMPLETE_AND_READY_FOR_MERGE\n"
142
+ "3. If you are stuck and cannot make progress: signal __DONE__\n\n"
143
+ "Do NOT continue working without producing a signal."))
144
+
96
145
  (defn- build-context
97
146
  "Build context for agent prompts"
98
147
  []
@@ -106,66 +155,80 @@
106
155
  :task_status (format "Pending: %d, In Progress: %d, Complete: %d"
107
156
  (count pending) (count current) (count complete))}))
108
157
 
109
- (defn- opencode-attach-url
110
- "Optional opencode server URL for run --attach mode."
111
- []
112
- (let [url (or (System/getenv "OOMPA_OPENCODE_ATTACH")
113
- (System/getenv "OPENCODE_ATTACH"))]
114
- (when (and url (not (str/blank? url)))
115
- url)))
116
-
117
- (defn- parse-opencode-run-output
118
- "Parse `opencode run --format json` output.
119
- Returns {:session-id string|nil, :text string|nil}."
120
- [s]
121
- (let [raw (or s "")
122
- events (->> (str/split-lines raw)
123
- (keep (fn [line]
124
- (try
125
- (json/parse-string line true)
126
- (catch Exception _
127
- nil))))
128
- doall)
129
- session-id (or (some #(or (:sessionID %)
130
- (:sessionId %)
131
- (get-in % [:part :sessionID])
132
- (get-in % [:part :sessionId]))
133
- events)
134
- (some-> (re-find #"(ses_[A-Za-z0-9]+)" raw) second))
135
- text (->> events
136
- (keep (fn [event]
137
- (let [event-type (or (:type event) (get-in event [:part :type]))
138
- chunk (or (:text event) (get-in event [:part :text]))]
139
- (when (and (= event-type "text")
140
- (string? chunk)
141
- (not (str/blank? chunk)))
142
- chunk))))
143
- (str/join ""))]
144
- {:session-id session-id
145
- :text (when-not (str/blank? text) text)}))
158
+
159
(defn- execute-claims!
  "Carry out a CLAIM signal by asking tasks/claim-by-ids! to claim every id
  in claim-ids, then compose the prompt used to resume the agent.

  A result whose :status is :claimed counts as a success; any other status
  counts as a failure. Task status and the remaining pending list are read
  from build-context after the claim attempt.

  Returns {:claimed [ids], :failed [ids], :resume-prompt string}."
  [claim-ids]
  (let [outcomes   (tasks/claim-by-ids! claim-ids)
        won?       (fn [outcome] (= :claimed (:status outcome)))
        won-ids    (into [] (comp (filter won?) (map :id)) outcomes)
        lost-ids   (into [] (comp (remove won?) (map :id)) outcomes)
        any-won?   (seq won-ids)
        {:keys [task_status pending_tasks]} (build-context)
        claim-line (if any-won?
                     (str "Claimed: " (str/join ", " won-ids) "\n")
                     "No tasks were successfully claimed.\n")
        ;; nil when nothing failed; str renders nil as the empty string
        lost-line  (when (seq lost-ids)
                     (str "Already taken or not found: "
                          (str/join ", " lost-ids) "\n"))
        pending    (if (str/blank? pending_tasks) "(none)" pending_tasks)
        next-step  (if any-won?
                     "Work on your claimed tasks. Signal COMPLETE_AND_READY_FOR_MERGE when done."
                     "No claims succeeded. CLAIM different tasks, or signal __DONE__ if no suitable work remains.")]
    {:claimed       won-ids
     :failed        lost-ids
     :resume-prompt (str "## Claim Results\n"
                         claim-line
                         lost-line
                         "\nTask Status: " task_status "\n"
                         "Remaining Pending:\n"
                         pending
                         "\n\n"
                         next-step)}))
146
188
 
147
189
  (defn- run-agent!
148
- "Run agent with prompt, return {:output string, :done? bool, :merge? bool, :exit int, :session-id string}.
149
- When resume? is true and harness is :claude/:opencode, continues the existing session
150
- with a lighter prompt (just task status + continue instruction)."
151
- [{:keys [id swarm-id harness model prompts reasoning]} worktree-path context session-id resume?]
152
- (let [;; Use provided session-id, otherwise generate one for harnesses that accept custom IDs.
153
- session-id (or session-id
154
- (when (#{:codex :claude} harness)
155
- (str/lower-case (str (java.util.UUID/randomUUID)))))
156
-
157
- ;; Build prompt lighter for resume (agent already has full context)
158
- prompt (if resume?
190
+ "Run agent with prompt, return {:output :done? :merge? :claim-ids :exit :session-id}.
191
+ When resume? is true, continues the existing session with a lighter prompt.
192
+ resume-prompt-override: when non-nil, replaces the default resume prompt
193
+ (used to inject CLAIM results). All harness-specific CLI knowledge
194
+ is delegated to harness/build-cmd."
195
+ [{:keys [id swarm-id harness model prompts reasoning]} worktree-path context session-id resume?
196
+ & {:keys [resume-prompt-override]}]
197
+ (let [session-id (or session-id (harness/make-session-id harness))
198
+ template-tokens (build-template-tokens context worktree-path)
199
+ resume-prompt-override (when resume-prompt-override
200
+ (-> resume-prompt-override
201
+ (render-task-header worktree-path)
202
+ (agent/tokenize template-tokens)))
203
+
204
+ ;; Build prompt — 3-way: override → standard resume → fresh start
205
+ prompt (cond
206
+ ;; CLAIM results or other injected resume prompt
207
+ resume-prompt-override
208
+ resume-prompt-override
209
+
210
+ ;; Standard resume — lighter (agent already has full context)
211
+ resume?
159
212
  (str "Task Status: " (:task_status context) "\n"
160
213
  "Pending: " (:pending_tasks context) "\n\n"
161
214
  "Continue working. Signal COMPLETE_AND_READY_FOR_MERGE when your current task is done and ready for review.")
162
- (let [task-header (or (load-prompt "config/prompts/_task_header.md") "")
215
+
216
+ ;; Fresh start — full task header + tokenized user prompts
217
+ ;; Template tokens ({context_header}, {queue_md}, etc.) are
218
+ ;; replaced here. Without this, raw {var} placeholders leak
219
+ ;; into the agent prompt verbatim.
220
+ :else
221
+ (let [task-header (render-task-header
222
+ (load-prompt "config/prompts/_task_header.md")
223
+ worktree-path)
163
224
  user-prompts (if (seq prompts)
164
225
  (->> prompts
165
226
  (map load-prompt)
166
227
  (remove nil?)
228
+ (map #(agent/tokenize % template-tokens))
167
229
  (str/join "\n\n"))
168
- (or (load-prompt "config/prompts/worker.md")
230
+ (or (some-> (load-prompt "config/prompts/worker.md")
231
+ (agent/tokenize template-tokens))
169
232
  "You are a worker. Claim tasks, execute them, complete them."))]
170
233
  (str task-header "\n"
171
234
  "Task Status: " (:task_status context) "\n"
@@ -175,54 +238,29 @@
175
238
  swarm-id* (or swarm-id "unknown")
176
239
  tagged-prompt (str "[oompa:" swarm-id* ":" id "] " prompt)
177
240
  abs-worktree (.getAbsolutePath (io/file worktree-path))
178
- opencode-attach (opencode-attach-url)
179
-
180
- ;; Build command all harnesses run with cwd=worktree, no sandbox
181
- ;; so agents can `..` to reach project root for task management
182
- ;; Claude: --resume flag continues existing session-id conversation
183
- ;; Opencode: -s/--session + --continue continue existing session
184
- ;; and --format json for deterministic per-run session capture.
185
- ;; Codex: no native resume support, always fresh (but worktree state persists)
186
- cmd (case harness
187
- :codex (cond-> [(resolve-binary! "codex") "exec"
188
- "--dangerously-bypass-approvals-and-sandbox"
189
- "--skip-git-repo-check"
190
- "-C" abs-worktree]
191
- model (into ["--model" model])
192
- reasoning (into ["-c" (str "model_reasoning_effort=\"" reasoning "\"")])
193
- true (conj "--" tagged-prompt))
194
- :claude (cond-> [(resolve-binary! "claude") "-p" "--dangerously-skip-permissions"
195
- "--session-id" session-id]
196
- resume? (conj "--resume")
197
- model (into ["--model" model]))
198
- :opencode (cond-> [(resolve-binary! "opencode") "run" "--format" "json"]
199
- model (into ["-m" model])
200
- opencode-attach (into ["--attach" opencode-attach])
201
- (and resume? session-id) (into ["-s" session-id "--continue"])
202
- true (conj tagged-prompt)))
203
-
204
- ;; Run agent — all run with cwd=worktree
241
+
242
+ cmd (harness/build-cmd harness
243
+ {:cwd abs-worktree :model model :reasoning reasoning
244
+ :session-id session-id :resume? resume?
245
+ :prompt tagged-prompt :format? true})
246
+
205
247
  result (try
206
- (if (= harness :claude)
207
- (process/sh cmd {:dir abs-worktree :in tagged-prompt :out :string :err :string})
208
- (process/sh cmd {:dir abs-worktree :out :string :err :string}))
248
+ (process/sh cmd {:dir abs-worktree
249
+ :in (harness/process-stdin harness tagged-prompt)
250
+ :out :string :err :string})
209
251
  (catch Exception e
210
252
  (println (format "[%s] Agent exception: %s" id (.getMessage e)))
211
253
  {:exit -1 :out "" :err (.getMessage e)}))
212
- parsed-opencode (when (= harness :opencode)
213
- (parse-opencode-run-output (:out result)))
214
- output (if (= harness :opencode)
215
- (or (:text parsed-opencode) (:out result))
216
- (:out result))
217
- session-id' (if (= harness :opencode)
218
- (or (:session-id parsed-opencode) session-id)
219
- session-id)]
254
+
255
+ {:keys [output session-id]}
256
+ (harness/parse-output harness (:out result) session-id)]
220
257
 
221
258
  {:output output
222
259
  :exit (:exit result)
223
260
  :done? (agent/done-signal? output)
224
261
  :merge? (agent/merge-signal? output)
225
- :session-id session-id'}))
262
+ :claim-ids (agent/parse-claim-signal output)
263
+ :session-id session-id}))
226
264
 
227
265
  (defn- run-reviewer!
228
266
  "Run reviewer on worktree changes.
@@ -238,7 +276,6 @@
238
276
  (str (subs d 0 8000) "\n... [diff truncated at 8000 chars]")
239
277
  d))
240
278
 
241
- ;; Build review prompt — use custom prompts if configured, else default
242
279
  swarm-id* (or swarm-id "unknown")
243
280
  custom-prompt (when (seq review-prompts)
244
281
  (->> review-prompts
@@ -246,16 +283,19 @@
246
283
  (remove nil?)
247
284
  (str/join "\n\n")))
248
285
 
249
- ;; Include previous review history for multi-round context
286
+ ;; Only include the most recent round's feedback — the worker has already
287
+ ;; attempted fixes based on it, so the reviewer just needs to verify.
288
+ ;; Including all prior rounds bloats the prompt and causes empty output.
250
289
  history-block (when (seq prev-feedback)
251
- (str "\n## Previous Review Rounds\n\n"
252
- "The worker has already attempted fixes based on earlier feedback. "
253
- "Do NOT raise new issues only verify the original issues are resolved.\n\n"
254
- (->> prev-feedback
255
- (map-indexed (fn [i fb]
256
- (str "### Round " (inc i) " feedback:\n" fb)))
257
- (str/join "\n\n"))
258
- "\n\n"))
290
+ (let [latest (last prev-feedback)
291
+ truncated (if (> (count latest) 2000)
292
+ (str (subs latest 0 2000) "\n... [feedback truncated]")
293
+ latest)]
294
+ (str "\n## Previous Review (Round " (count prev-feedback) ")\n\n"
295
+ "The worker has attempted fixes based on this feedback. "
296
+ "Verify the issues below are resolved. Do NOT raise new issues.\n\n"
297
+ truncated
298
+ "\n\n")))
259
299
 
260
300
  review-body (str (or custom-prompt
261
301
  (str "Review the changes in this worktree.\n"
@@ -264,49 +304,40 @@
264
304
  (when history-block history-block)
265
305
  "\nYour verdict MUST be on its own line, exactly one of:\n"
266
306
  "VERDICT: APPROVED\n"
267
- "VERDICT: NEEDS_CHANGES\n"
268
- "VERDICT: REJECTED\n")
307
+ "VERDICT: NEEDS_CHANGES\n\n"
308
+ "Do NOT use REJECTED. Always use NEEDS_CHANGES with specific, "
309
+ "actionable feedback explaining what must change and why. "
310
+ "The worker will attempt fixes based on your feedback.\n"
311
+ "After your verdict line, list every issue as a numbered item with "
312
+ "the file path and what needs to change.\n")
269
313
  review-prompt (str "[oompa:" swarm-id* ":" id "] " review-body)
270
314
 
271
315
  abs-wt (.getAbsolutePath (io/file worktree-path))
272
- opencode-attach (opencode-attach-url)
273
-
274
- ;; Build command — cwd=worktree, no sandbox
275
- cmd (case review-harness
276
- :codex (cond-> [(resolve-binary! "codex") "exec"
277
- "--dangerously-bypass-approvals-and-sandbox"
278
- "--skip-git-repo-check"
279
- "-C" abs-wt]
280
- review-model (into ["--model" review-model])
281
- true (conj "--" review-prompt))
282
- :claude (cond-> [(resolve-binary! "claude") "-p" "--dangerously-skip-permissions"]
283
- review-model (into ["--model" review-model]))
284
- :opencode (cond-> [(resolve-binary! "opencode") "run"]
285
- review-model (into ["-m" review-model])
286
- opencode-attach (into ["--attach" opencode-attach])
287
- true (conj review-prompt)))
288
-
289
- ;; Run reviewer — cwd=worktree
316
+
317
+ ;; No session, no resume, no format flags — reviewer is stateless one-shot
318
+ cmd (harness/build-cmd review-harness
319
+ {:cwd abs-wt :model review-model :prompt review-prompt})
320
+
290
321
  result (try
291
- (if (= review-harness :claude)
292
- (process/sh cmd {:dir abs-wt :in review-prompt :out :string :err :string})
293
- (process/sh cmd {:dir abs-wt :out :string :err :string}))
322
+ (process/sh cmd {:dir abs-wt
323
+ :in (harness/process-stdin review-harness review-prompt)
324
+ :out :string :err :string})
294
325
  (catch Exception e
295
326
  {:exit -1 :out "" :err (.getMessage e)}))
296
327
 
297
328
  output (:out result)
298
329
 
299
- ;; Parse verdict — require explicit VERDICT: prefix to avoid false matches
330
+ ;; Parse verdict — require explicit VERDICT: prefix to avoid false matches.
331
+ ;; REJECTED is treated as NEEDS_CHANGES: the reviewer must always give
332
+ ;; actionable feedback so the worker can attempt fixes. Hard rejection
333
+ ;; only happens when max review rounds are exhausted.
300
334
  verdict (cond
301
335
  (re-find #"VERDICT:\s*APPROVED" output) :approved
302
- (re-find #"VERDICT:\s*REJECTED" output) :rejected
303
336
  (re-find #"VERDICT:\s*NEEDS_CHANGES" output) :needs-changes
304
- ;; Fallback to loose matching if reviewer didn't use prefix
337
+ (re-find #"VERDICT:\s*REJECTED" output) :needs-changes
305
338
  (re-find #"(?i)\bAPPROVED\b" output) :approved
306
- (re-find #"(?i)\bREJECTED\b" output) :rejected
307
339
  :else :needs-changes)]
308
340
 
309
- ;; Log reviewer output (truncated) for visibility
310
341
  (println (format "[%s] Reviewer verdict: %s" id (name verdict)))
311
342
  (let [summary (subs output 0 (min 300 (count output)))]
312
343
  (println (format "[%s] Review: %s%s" id summary
@@ -337,32 +368,94 @@
337
368
  "Fix these issues. Do not add anything the reviewer did not ask for.")
338
369
 
339
370
  abs-wt (.getAbsolutePath (io/file worktree-path))
340
- opencode-attach (opencode-attach-url)
341
-
342
- cmd (case harness
343
- :codex (cond-> [(resolve-binary! "codex") "exec"
344
- "--dangerously-bypass-approvals-and-sandbox"
345
- "--skip-git-repo-check"
346
- "-C" abs-wt]
347
- model (into ["--model" model])
348
- true (conj "--" fix-prompt))
349
- :claude (cond-> [(resolve-binary! "claude") "-p" "--dangerously-skip-permissions"]
350
- model (into ["--model" model]))
351
- :opencode (cond-> [(resolve-binary! "opencode") "run"]
352
- model (into ["-m" model])
353
- opencode-attach (into ["--attach" opencode-attach])
354
- true (conj fix-prompt)))
371
+
372
+ cmd (harness/build-cmd harness
373
+ {:cwd abs-wt :model model :prompt fix-prompt})
355
374
 
356
375
  result (try
357
- (if (= harness :claude)
358
- (process/sh cmd {:dir abs-wt :in fix-prompt :out :string :err :string})
359
- (process/sh cmd {:dir abs-wt :out :string :err :string}))
376
+ (process/sh cmd {:dir abs-wt
377
+ :in (harness/process-stdin harness fix-prompt)
378
+ :out :string :err :string})
360
379
  (catch Exception e
361
380
  {:exit -1 :out "" :err (.getMessage e)}))]
362
381
 
363
382
  {:output (:out result)
364
383
  :exit (:exit result)}))
365
384
 
385
(defn- collect-divergence-context
  "Collect a textual summary of how the branch checked out at wt-path has
  diverged from main.

  Runs three git commands with wt-path as the working directory:
    :branch-log  `git log --oneline main..HEAD`  (commits only on the branch)
    :main-log    `git log --oneline HEAD..main`  (commits only on main)
    :diff-stat   `git diff --stat main`

  Returns a map of those three keys to strings. When a command prints
  nothing (or its output is nil) the value is the placeholder \"(none)\".
  Captured stdout is an empty string in that case, not nil, so blankness
  has to be tested explicitly rather than with `or`."
  [wt-path]
  (let [git-out    (fn [& args]
                     (let [out (:out (process/sh (vec args)
                                                 {:dir wt-path :out :string :err :string}))]
                       (if (str/blank? out) "(none)" out)))
        branch-log (git-out "git" "log" "--oneline" "main..HEAD")
        main-log   (git-out "git" "log" "--oneline" "HEAD..main")
        diff-stat  (git-out "git" "diff" "--stat" "main")]
    {:branch-log branch-log
     :main-log   main-log
     :diff-stat  diff-stat}))
396
+
397
(defn- verify-mergeable?
  "Dry-run check that the branch checked out at wt-path and main merge
  without conflicts.

  Runs `git merge --no-commit --no-ff main` inside the worktree, so no merge
  commit is ever created, and returns true when that command exits 0.

  The attempted merge is always undone with `git merge --abort`, whether it
  was clean or conflicted. Unlike `git reset --hard HEAD`, aborting only
  rolls back the merge itself and does not wipe unrelated uncommitted work
  the agent may have left in the worktree. When there is nothing to abort
  (e.g. the branch already contains main) the abort exits non-zero, which is
  deliberately ignored."
  [wt-path]
  (let [attempt (process/sh ["git" "merge" "--no-commit" "--no-ff" "main"]
                            {:dir wt-path :out :string :err :string})
        clean?  (zero? (:exit attempt))]
    ;; Best-effort cleanup; the exit status is intentionally not inspected.
    (process/sh ["git" "merge" "--abort"] {:dir wt-path})
    clean?))
410
+
411
(defn- sync-worktree-to-main!
  "Bring the worktree branch up to date with main ahead of merge-to-main!.

  Fast path: `git merge main --no-edit` exits 0 -> :synced.
  Conflict path: the merge is aborted to restore a clean worktree, then a
  resolver agent (same harness and model as the worker) is launched with a
  prompt describing the divergence. How the branch is repaired (rebase,
  cherry-pick, manual resolution) is left to the agent. If the agent exits 0
  and verify-mergeable? confirms a clean merge the result is :resolved; in
  every other case it is :failed.

  Design note carried over from the original: this is meant to be called
  outside merge-lock so a slow resolver agent does not block other workers.

  Returns :synced | :resolved | :failed."
  [worker wt-path worker-id]
  (let [log!  (fn [fmt & args]
                (println (apply format (str "[%s] " fmt) worker-id args)))
        quick (process/sh ["git" "merge" "main" "--no-edit"]
                          {:dir wt-path :out :string :err :string})]
    (if-not (zero? (:exit quick))
      (do
        ;; Restore a clean worktree before handing the problem to the agent.
        (process/sh ["git" "merge" "--abort"] {:dir wt-path})
        (log! "Branch diverged from main, launching resolver agent")
        (let [{:keys [branch-log main-log diff-stat]} (collect-divergence-context wt-path)
              agent-harness (:harness worker)
              tag           (str "[oompa:" (or (:swarm-id worker) "unknown") ":" worker-id "] ")
              resolve-prompt (str tag
                                  "Your branch has diverged from main and cannot merge cleanly.\n\n"
                                  "Your branch's commits (not on main):\n" branch-log "\n\n"
                                  "Commits on main since you branched:\n" main-log "\n\n"
                                  "Divergence scope:\n" diff-stat "\n\n"
                                  "Make this branch cleanly mergeable into main. "
                                  "Preserve the intent of your branch's changes.\n"
                                  "You have full git access — rebase, cherry-pick, resolve conflicts, "
                                  "whatever works.\n"
                                  "When done, verify with: git diff main --stat")
              abs-wt        (.getAbsolutePath (io/file wt-path))
              cmd           (harness/build-cmd agent-harness
                                               {:cwd abs-wt :model (:model worker) :prompt resolve-prompt})
              ;; Any exception while launching the agent is folded into a
              ;; synthetic failed result rather than propagated.
              run           (try
                              (process/sh cmd {:dir abs-wt
                                               :in (harness/process-stdin agent-harness resolve-prompt)
                                               :out :string :err :string})
                              (catch Exception e
                                {:exit -1 :out "" :err (.getMessage e)}))]
          (cond
            (not (zero? (:exit run)))
            (do (log! "Resolver agent failed (exit %d)" (:exit run))
                :failed)

            ;; Agent exited 0: confirm the branch really merges cleanly now.
            (verify-mergeable? wt-path)
            (do (log! "Agent resolved divergence, branch is mergeable")
                :resolved)

            :else
            (do (log! "Agent ran but branch still can't merge cleanly")
                :failed))))
      (do (log! "Worktree synced to main")
          :synced))))
458
+
366
459
  (defn- worktree-has-changes?
367
460
  "Check if worktree has committed OR uncommitted changes vs main.
368
461
  Workers commit before signaling merge, so we must check both:
@@ -395,6 +488,46 @@
395
488
  {:dir wt-dir :branch wt-branch}))))
396
489
  {:dir wt-dir :branch wt-branch :path wt-path}))
397
490
 
491
(defn- detect-claimed-tasks
  "Work out which tasks appeared in current/ while the agent was running.

  pre-current-ids is the set of current/ task IDs captured before the agent
  ran. The result is the set of IDs reported by tasks/current-task-ids now
  that were not in that earlier snapshot."
  [pre-current-ids]
  (-> (tasks/current-task-ids)
      (clojure.set/difference pre-current-ids)))
497
+
498
(defn- emit-cycle-log!
  "Record one cycle's event log via runs/write-cycle-log!.

  start-ms is the cycle's start time in epoch milliseconds; the logged
  :duration-ms is the wall-clock time elapsed since then. session-id is
  stored verbatim in the entry. The final argument is a map of optional
  details; missing values are normalized before writing:
    :claimed-task-ids -> vector (empty when absent)
    :recycled-tasks   -> [] when absent
    :review-rounds    -> 0 when absent
    :outcome, :error-snippet are passed through unchanged.

  No running summary is kept here; each call writes a self-contained entry."
  [swarm-id worker-id cycle start-ms session-id
   {:keys [outcome claimed-task-ids recycled-tasks error-snippet review-rounds]}]
  (let [elapsed-ms (- (System/currentTimeMillis) start-ms)
        entry      {:outcome          outcome
                    :duration-ms      elapsed-ms
                    :claimed-task-ids (vec (or claimed-task-ids []))
                    :recycled-tasks   (or recycled-tasks [])
                    :error-snippet    error-snippet
                    :review-rounds    (or review-rounds 0)
                    :session-id       session-id}]
    (runs/write-cycle-log! swarm-id worker-id cycle entry)))
514
+
515
(defn- recycle-orphaned-tasks!
  "Hand back tasks this worker claimed but did not finish.

  pre-current-ids is the set of current/ task IDs captured before the agent
  ran. Any ID now in current/ that was not in that snapshot is treated as
  claimed by this worker and passed to tasks/recycle-tasks! so other workers
  can pick it up again. A log line is printed when at least one task was
  recycled.

  Returns the number of recycled tasks (0 when there was nothing to do)."
  [worker-id pre-current-ids]
  (let [orphans  (clojure.set/difference (tasks/current-task-ids) pre-current-ids)
        returned (when (seq orphans)
                   (tasks/recycle-tasks! orphans))
        n        (count returned)]
    (when (pos? n)
      (println (format "[%s] Recycled %d orphaned task(s): %s"
                       worker-id n (str/join ", " returned))))
    n))
530
+
398
531
  (defn- cleanup-worktree!
399
532
  "Remove worktree and branch."
400
533
  [project-root wt-dir wt-branch]
@@ -433,10 +566,10 @@
433
566
 
434
567
  (defn- merge-to-main!
435
568
  "Merge worktree changes to main branch. Serialized via merge-lock to prevent
436
- concurrent workers from corrupting the git index. On success, annotates any
437
- newly-completed tasks with worker metadata. Returns true on success.
438
- review-rounds: number of review rounds (0 for auto-merged task-only changes)."
439
- [wt-path wt-id worker-id project-root review-rounds]
569
+ concurrent workers from corrupting the git index. On success, moves claimed
570
+ tasks current→complete and annotates metadata. Returns true on success.
571
+ claimed-task-ids: set of task IDs this worker claimed (framework owns completion)."
572
+ [wt-path wt-id worker-id project-root review-rounds claimed-task-ids]
440
573
  (locking merge-lock
441
574
  (println (format "[%s] Merging changes to main" worker-id))
442
575
  (let [;; Commit in worktree if needed (no-op if already committed)
@@ -457,10 +590,26 @@
457
590
  (if success
458
591
  (do
459
592
  (println (format "[%s] Merge successful" worker-id))
460
- ;; Annotate completed tasks while still holding merge-lock
593
+ ;; Framework-owned completion: move claimed tasks current→complete
594
+ (when (seq claimed-task-ids)
595
+ (let [completed (tasks/complete-by-ids! claimed-task-ids)]
596
+ (when (seq completed)
597
+ (println (format "[%s] Completed %d task(s): %s"
598
+ worker-id (count completed) (str/join ", " completed))))))
599
+ ;; Annotate completed tasks with metadata while still holding merge-lock
461
600
  (annotate-completed-tasks! project-root worker-id review-rounds))
462
- (when merge-result
463
- (println (format "[%s] MERGE FAILED: %s" worker-id (:err merge-result)))))
601
+ ;; FAILED: Clean up git state before releasing merge-lock.
602
+ ;; Without this, a conflict leaves .git/MERGE_HEAD and poisons the
603
+ ;; shared index — every subsequent worker fails on `git checkout main`.
604
+ (do
605
+ (println (format "[%s] MERGE FAILED: %s" worker-id
606
+ (or (:err merge-result) (:err checkout-result))))
607
+ (let [abort-result (process/sh ["git" "merge" "--abort"]
608
+ {:dir project-root :out :string :err :string})]
609
+ (when-not (zero? (:exit abort-result))
610
+ ;; Abort failed (no merge in progress, or other issue) — hard reset.
611
+ (process/sh ["git" "reset" "--hard" "HEAD"]
612
+ {:dir project-root :out :string :err :string})))))
464
613
  success)))
465
614
 
466
615
  (defn- task-only-diff?
@@ -516,12 +665,8 @@
516
665
  (println (format "[%s] Reviewer APPROVED (attempt %d)" worker-id attempt))
517
666
  {:approved? true :attempts attempt})
518
667
 
519
- :rejected
520
- (do
521
- (println (format "[%s] Reviewer REJECTED (attempt %d)" worker-id attempt))
522
- {:approved? false :attempts attempt})
523
-
524
- ;; :needs-changes
668
+ ;; :needs-changes — always give the worker a chance to fix.
669
+ ;; Hard rejection only happens when max review rounds are exhausted.
525
670
  (let [all-feedback (conj prev-feedback output)]
526
671
  (if (>= attempt max-review-retries)
527
672
  (do
@@ -536,27 +681,38 @@
536
681
  ;; Worker Loop
537
682
  ;; =============================================================================
538
683
 
539
- (def ^:private max-wait-for-tasks 60)
540
- (def ^:private wait-poll-interval 5)
684
+ ;; Workers wait up to 10 minutes for tasks to appear before giving up.
685
+ ;; This keeps workers alive while planners/designers ramp up the queue.
686
+ (def ^:private max-wait-for-tasks 600)
687
+ (def ^:private wait-poll-interval 10)
541
688
  (def ^:private max-consecutive-errors 3)
542
689
 
543
690
  (defn- wait-for-tasks!
544
- "Wait up to 60s for pending/current tasks to appear. Used for backpressure
545
- on workers that can't create their own tasks (can_plan: false)."
691
+ "Wait up to 10 minutes for pending/current tasks to appear. Used for
692
+ backpressure on workers that can't create their own tasks (can_plan: false).
693
+ Polls every 10 seconds, logs every 60 seconds."
546
694
  [worker-id]
547
695
  (loop [waited 0]
548
696
  (cond
549
697
  (pos? (tasks/pending-count)) true
550
698
  (pos? (tasks/current-count)) true
551
699
  (>= waited max-wait-for-tasks)
552
- (do (println (format "[%s] No tasks after %ds, proceeding anyway" worker-id waited))
700
+ (do (println (format "[%s] No tasks after %ds, giving up" worker-id waited))
553
701
  false)
554
702
  :else
555
- (do (when (zero? (mod waited 15))
556
- (println (format "[%s] Waiting for tasks... (%ds)" worker-id waited)))
703
+ (do (when (zero? (mod waited 60))
704
+ (println (format "[%s] Waiting for tasks... (%ds/%ds)" worker-id waited max-wait-for-tasks)))
557
705
  (Thread/sleep (* wait-poll-interval 1000))
558
706
  (recur (+ waited wait-poll-interval))))))
559
707
 
708
(defn- maybe-sleep-between!
  "Pause before an iteration when a wait is configured.

  wait-between is the pause in seconds (nil disables it). iter is the
  1-based iteration number; the first iteration never sleeps. Logs the
  pause, then blocks the calling thread for its duration. Always returns nil."
  [worker-id wait-between iter]
  (when-not (or (not wait-between) (<= iter 1))
    (println (format "[%s] Sleeping %ds before next iteration" worker-id wait-between))
    (Thread/sleep (* wait-between 1000))))
715
+
560
716
  (defn run-worker!
561
717
  "Run worker loop with persistent sessions.
562
718
 
@@ -568,41 +724,74 @@
568
724
  Returns final worker state with metrics attached."
569
725
  [worker]
570
726
  (tasks/ensure-dirs!)
571
- (let [{:keys [id iterations]} worker
727
+ (let [{:keys [id iterations swarm-id wait-between]} worker
572
728
  project-root (System/getProperty "user.dir")]
573
- (println (format "[%s] Starting worker (%s:%s%s, %d iterations)"
729
+ (println (format "[%s] Starting worker (%s:%s%s, %d iterations%s)"
574
730
  id
575
731
  (name (:harness worker))
576
732
  (or (:model worker) "default")
577
733
  (if (:reasoning worker) (str ":" (:reasoning worker)) "")
578
- iterations))
734
+ iterations
735
+ (if wait-between (format ", %ds between" wait-between) "")))
579
736
 
580
737
  ;; Backpressure: workers that can't create tasks wait for tasks to exist
581
738
  (when-not (:can-plan worker)
582
739
  (wait-for-tasks! id))
583
740
 
584
- ;; metrics tracks: {:merges N :rejections N :errors N :review-rounds-total N}
741
+ ;; metrics tracks: {:merges N :rejections N :errors N :recycled N :review-rounds-total N :claims N}
585
742
  (loop [iter 1
586
743
  completed 0
587
744
  consec-errors 0
588
- metrics {:merges 0 :rejections 0 :errors 0 :review-rounds-total 0}
589
- session-id nil ;; persistent session-id (nil = start fresh)
590
- wt-state nil] ;; {:dir :branch :path} or nil
745
+ metrics {:merges 0 :rejections 0 :errors 0 :recycled 0 :review-rounds-total 0 :claims 0}
746
+ session-id nil ;; persistent session-id (nil = start fresh)
747
+ wt-state nil ;; {:dir :branch :path} or nil
748
+ claimed-ids #{} ;; task IDs claimed this session (reset on worktree destroy)
749
+ claim-resume-prompt nil ;; override prompt for next iteration (from CLAIM results)
750
+ working-resumes 0] ;; consecutive "working" outcomes in current session
591
751
  (let [finish (fn [status]
592
752
  (assoc worker :completed completed :status status
593
753
  :merges (:merges metrics)
594
754
  :rejections (:rejections metrics)
595
755
  :errors (:errors metrics)
596
- :review-rounds-total (:review-rounds-total metrics)))]
597
- (if (> iter iterations)
756
+ :recycled (:recycled metrics)
757
+ :review-rounds-total (:review-rounds-total metrics)
758
+ :claims (:claims metrics)))]
759
+ (cond
760
+ (> iter iterations)
598
761
  (do
599
762
  ;; Cleanup any lingering worktree
600
763
  (when wt-state
601
764
  (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state)))
602
- (println (format "[%s] Completed %d iterations (%d merges, %d rejections, %d errors)"
603
- id completed (:merges metrics) (:rejections metrics) (:errors metrics)))
765
+ (println (format "[%s] Completed %d iterations (%d merges, %d claims, %d rejections, %d errors, %d recycled)"
766
+ id completed (:merges metrics) (:claims metrics) (:rejections metrics) (:errors metrics) (:recycled metrics)))
604
767
  (finish :exhausted))
605
768
 
769
+ @shutdown-requested?
770
+ (do
771
+ (println (format "[%s] Shutdown requested, stopping after %d iterations" id (dec iter)))
772
+ (when wt-state
773
+ ;; Recycle any claimed tasks back to pending so other workers can pick them up
774
+ (when (seq claimed-ids)
775
+ (let [recycled (tasks/recycle-tasks! claimed-ids)]
776
+ (when (seq recycled)
777
+ (println (format "[%s] Recycled %d claimed task(s) on shutdown" id (count recycled))))))
778
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state)))
779
+ (emit-cycle-log! swarm-id id iter (System/currentTimeMillis) session-id
780
+ {:outcome :interrupted})
781
+ (finish :interrupted))
782
+
783
+ :else
784
+ (do
785
+ ;; Sleep between iterations when wait_between is configured
786
+ (maybe-sleep-between! id wait-between iter)
787
+
788
+ ;; Backpressure: non-planner workers wait for tasks between iterations too
789
+ (when (and (not (:can-plan worker))
790
+ (not (pos? (tasks/pending-count)))
791
+ (not (pos? (tasks/current-count))))
792
+ (println (format "[%s] Queue empty, waiting for tasks before iteration %d" id iter))
793
+ (wait-for-tasks! id))
794
+
606
795
  ;; Ensure worktree exists (create fresh if nil, reuse if persisted)
607
796
  (let [wt-state (try
608
797
  (or wt-state (create-iteration-worktree! project-root id iter))
@@ -617,102 +806,177 @@
617
806
  (do
618
807
  (println (format "[%s] %d consecutive errors, stopping" id errors))
619
808
  (finish :error))
620
- (recur (inc iter) completed errors metrics nil nil)))
809
+ (recur (inc iter) completed errors metrics nil nil #{} nil 0)))
621
810
 
622
811
  ;; Worktree ready — run agent
623
- (let [resume? (some? session-id)
812
+ (let [resume? (or (some? session-id) (some? claim-resume-prompt))
813
+ iter-start-ms (System/currentTimeMillis)
814
+ ;; Snapshot current/ task IDs before agent runs so we can
815
+ ;; detect any direct mv claims (safety net for old behavior).
816
+ pre-current-ids (tasks/current-task-ids)
624
817
  _ (println (format "[%s] %s iteration %d/%d"
625
818
  id (if resume? "Resuming" "Starting") iter iterations))
626
819
  context (build-context)
627
- {:keys [output exit done? merge?] :as agent-result}
628
- (run-agent! worker (:path wt-state) context session-id resume?)
629
- new-session-id (:session-id agent-result)]
820
+ {:keys [output exit done? merge? claim-ids] :as agent-result}
821
+ (run-agent! worker (:path wt-state) context session-id resume?
822
+ :resume-prompt-override claim-resume-prompt)
823
+ new-session-id (:session-id agent-result)
824
+ ;; Safety net: detect any direct mv claims (old behavior)
825
+ mv-claimed-tasks (detect-claimed-tasks pre-current-ids)]
630
826
 
631
827
  (cond
632
- ;; Agent errored — cleanup, reset session
828
+ ;; Agent errored — recycle claimed tasks, cleanup, reset session
633
829
  (not (zero? exit))
634
830
  (let [errors (inc consec-errors)
635
- metrics (update metrics :errors inc)]
636
- (println (format "[%s] Agent error (exit %d): %s"
637
- id exit (subs (or output "") 0 (min 200 (count (or output ""))))))
831
+ recycled (recycle-orphaned-tasks! id pre-current-ids)
832
+ metrics (-> metrics
833
+ (update :errors inc)
834
+ (update :recycled + recycled))
835
+ error-msg (subs (or output "") 0 (min 200 (count (or output ""))))]
836
+ (println (format "[%s] Agent error (exit %d): %s" id exit error-msg))
837
+ (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
838
+ {:outcome :error :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))
839
+ :recycled-tasks (when (pos? recycled) (vec mv-claimed-tasks))
840
+ :error-snippet error-msg})
638
841
  (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
639
842
  (if (>= errors max-consecutive-errors)
640
843
  (do
641
844
  (println (format "[%s] %d consecutive errors, stopping" id errors))
642
845
  (finish :error))
643
- (recur (inc iter) completed errors metrics nil nil)))
846
+ (recur (inc iter) completed errors metrics nil nil #{} nil 0)))
847
+
848
+ ;; CLAIM signal — framework claims tasks, resumes agent with results
849
+ ;; Only honored when no MERGE or DONE signal (lowest priority)
850
+ (and (seq claim-ids) (not merge?) (not done?))
851
+ (let [_ (println (format "[%s] CLAIM signal: %s" id (str/join ", " claim-ids)))
852
+ {:keys [claimed failed resume-prompt]} (execute-claims! claim-ids)
853
+ new-claimed-ids (into claimed-ids claimed)
854
+ metrics (update metrics :claims + (count claimed))]
855
+ (println (format "[%s] Claimed %d/%d tasks" id (count claimed) (count claim-ids)))
856
+ (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
857
+ {:outcome :claimed :claimed-task-ids (vec claimed)})
858
+ (recur (inc iter) completed 0 metrics new-session-id wt-state
859
+ new-claimed-ids resume-prompt 0))
644
860
 
645
861
  ;; COMPLETE_AND_READY_FOR_MERGE — review, merge, reset session
646
862
  merge?
647
863
  (if (worktree-has-changes? (:path wt-state))
648
864
  (if (task-only-diff? (:path wt-state))
649
- ;; Task-only changes — skip review, auto-merge
865
+ ;; Task-only changes — skip review, sync to main, auto-merge
650
866
  (do
651
867
  (println (format "[%s] Task-only diff, auto-merging" id))
652
- (let [merged? (merge-to-main! (:path wt-state) (:branch wt-state) id project-root 0)
653
- metrics (if merged? (update metrics :merges inc) metrics)]
654
- (println (format "[%s] Iteration %d/%d complete" id iter iterations))
655
- (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
656
- (if (and done? (:can-plan worker))
868
+ (let [sync-status (sync-worktree-to-main! worker (:path wt-state) id)
869
+ all-claimed (into claimed-ids mv-claimed-tasks)]
870
+ (if (= :failed sync-status)
871
+ ;; Sync failed — cannot merge safely, skip
657
872
  (do
658
- (println (format "[%s] Worker done after merge" id))
659
- (assoc worker :completed (inc completed) :status :done
660
- :merges (:merges metrics)
661
- :rejections (:rejections metrics)
662
- :errors (:errors metrics)
663
- :review-rounds-total (:review-rounds-total metrics)))
664
- (recur (inc iter) (inc completed) 0 metrics nil nil))))
873
+ (println (format "[%s] Sync to main failed, skipping merge" id))
874
+ (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
875
+ {:outcome :sync-failed :claimed-task-ids (vec all-claimed)})
876
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
877
+ (recur (inc iter) completed 0 metrics nil nil #{} nil 0))
878
+ ;; Synced — proceed with merge
879
+ (let [merged? (merge-to-main! (:path wt-state) (:branch wt-state) id project-root 0 all-claimed)
880
+ metrics (if merged? (update metrics :merges inc) metrics)]
881
+ (println (format "[%s] Cycle %d/%d complete" id iter iterations))
882
+ (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
883
+ {:outcome :merged :claimed-task-ids (vec all-claimed) :review-rounds 0})
884
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
885
+ (recur (inc iter) (inc completed) 0 metrics nil nil #{} nil 0)))))
665
886
  ;; Code changes — full review loop
666
887
  (let [{:keys [approved? attempts]} (review-loop! worker (:path wt-state) id iter)
888
+ ;; Don't pre-increment :merges — defer to after actual merge succeeds
667
889
  metrics (-> metrics
668
890
  (update :review-rounds-total + (or attempts 0))
669
- (update (if approved? :merges :rejections) inc))]
891
+ (cond-> (not approved?) (update :rejections inc)))]
670
892
  (if approved?
671
- (do
672
- (merge-to-main! (:path wt-state) (:branch wt-state) id project-root (or attempts 0))
673
- (println (format "[%s] Iteration %d/%d complete" id iter iterations))
674
- (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
675
- ;; If also __DONE__, stop after merge
676
- (if (and done? (:can-plan worker))
893
+ (let [sync-status (sync-worktree-to-main! worker (:path wt-state) id)
894
+ all-claimed (into claimed-ids mv-claimed-tasks)]
895
+ (if (= :failed sync-status)
896
+ ;; Sync failed after approval — treat as sync failure, skip merge
677
897
  (do
678
- (println (format "[%s] Worker done after merge" id))
679
- (assoc worker :completed (inc completed) :status :done
680
- :merges (:merges metrics)
681
- :rejections (:rejections metrics)
682
- :errors (:errors metrics)
683
- :review-rounds-total (:review-rounds-total metrics)))
684
- (recur (inc iter) (inc completed) 0 metrics nil nil)))
685
- (do
686
- (println (format "[%s] Iteration %d/%d rejected" id iter iterations))
898
+ (println (format "[%s] Sync to main failed after approval, skipping merge" id))
899
+ (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
900
+ {:outcome :sync-failed :claimed-task-ids (vec all-claimed)
901
+ :review-rounds (or attempts 0)})
902
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
903
+ (recur (inc iter) completed 0 metrics nil nil #{} nil 0))
904
+ ;; Synced — proceed with merge, capture return value
905
+ (let [merged? (merge-to-main! (:path wt-state) (:branch wt-state) id project-root (or attempts 0) all-claimed)
906
+ metrics (if merged? (update metrics :merges inc) metrics)]
907
+ (println (format "[%s] Cycle %d/%d complete" id iter iterations))
908
+ (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
909
+ {:outcome (if merged? :merged :merge-failed)
910
+ :claimed-task-ids (vec all-claimed)
911
+ :review-rounds (or attempts 0)})
912
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
913
+ (recur (inc iter) (inc completed) 0 metrics nil nil #{} nil 0))))
914
+ (let [recycled (recycle-orphaned-tasks! id pre-current-ids)
915
+ metrics (update metrics :recycled + recycled)]
916
+ (println (format "[%s] Cycle %d/%d rejected" id iter iterations))
917
+ (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
918
+ {:outcome :rejected :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))
919
+ :recycled-tasks (when (pos? recycled) (vec mv-claimed-tasks))
920
+ :review-rounds (or attempts 0)})
687
921
  (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
688
- (recur (inc iter) completed 0 metrics nil nil)))))
689
- (do
922
+ (recur (inc iter) completed 0 metrics nil nil #{} nil 0)))))
923
+ (let [recycled (recycle-orphaned-tasks! id pre-current-ids)
924
+ metrics (update metrics :recycled + recycled)]
690
925
  (println (format "[%s] Merge signaled but no changes, skipping" id))
926
+ (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
927
+ {:outcome :no-changes :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))
928
+ :recycled-tasks (when (pos? recycled) (vec mv-claimed-tasks))})
691
929
  (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
692
- (recur (inc iter) completed 0 metrics nil nil)))
693
-
694
- ;; __DONE__ without merge — only honor for planners
695
- (and done? (:can-plan worker))
696
- (do
697
- (println (format "[%s] Received __DONE__ signal" id))
698
- (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
699
- (println (format "[%s] Worker done after %d/%d iterations" id iter iterations))
700
- (finish :done))
701
-
702
- ;; __DONE__ from executor — ignore signal, but reset session since
703
- ;; the agent process exited. Resuming a dead session causes exit 1
704
- ;; which cascades into consecutive errors and premature stopping.
705
- (and done? (not (:can-plan worker)))
706
- (do
707
- (println (format "[%s] Ignoring __DONE__ (executor), resetting session" id))
930
+ (recur (inc iter) completed 0 metrics nil nil #{} nil 0)))
931
+
932
+ ;; __DONE__ — agent signaled it finished this cycle's work.
933
+ ;; Always reset session and continue to next iteration.
934
+ ;; Planners re-plan as tasks complete; executors pick up new tasks.
935
+ done?
936
+ (let [recycled (recycle-orphaned-tasks! id pre-current-ids)
937
+ metrics (update metrics :recycled + recycled)]
938
+ (println (format "[%s] __DONE__ signal, resetting session (iter %d/%d)" id iter iterations))
939
+ (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
940
+ {:outcome :executor-done :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))
941
+ :recycled-tasks (when (pos? recycled) (vec mv-claimed-tasks))})
708
942
  (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
709
- (recur (inc iter) completed 0 metrics nil nil))
943
+ (recur (inc iter) completed 0 metrics nil nil #{} nil 0))
710
944
 
711
- ;; No signal — agent still working, resume next iteration
945
+ ;; No signal — agent still working, resume next iteration.
946
+ ;; Track consecutive working resumes. After max-working-resumes,
947
+ ;; inject a nudge prompt. If still no signal after nudge, kill session.
712
948
  :else
713
- (do
714
- (println (format "[%s] Working... (will resume)" id))
715
- (recur (inc iter) completed 0 metrics new-session-id wt-state)))))))))))
949
+ (let [wr (inc working-resumes)
950
+ max-wr (:max-working-resumes worker)]
951
+ (cond
952
+ ;; Already nudged last iteration, still no signal — stuck
953
+ (> wr max-wr)
954
+ (let [recycled (recycle-orphaned-tasks! id pre-current-ids)
955
+ metrics (update metrics :recycled + recycled)]
956
+ (println (format "[%s] Stuck after %d working resumes + nudge, resetting session" id wr))
957
+ (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
958
+ {:outcome :stuck :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))
959
+ :recycled-tasks (when (pos? recycled) (vec mv-claimed-tasks))})
960
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
961
+ (recur (inc iter) completed 0 metrics nil nil #{} nil 0))
962
+
963
+ ;; Hit the limit — nudge on next resume
964
+ (= wr max-wr)
965
+ (do
966
+ (println (format "[%s] Working... %d/%d resumes, nudging agent to wrap up" id wr max-wr))
967
+ (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
968
+ {:outcome :working :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))})
969
+ (recur (inc iter) completed 0 metrics new-session-id wt-state
970
+ claimed-ids nudge-prompt wr))
971
+
972
+ ;; Under limit — normal resume
973
+ :else
974
+ (do
975
+ (println (format "[%s] Working... (will resume, %d/%d)" id wr max-wr))
976
+ (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
977
+ {:outcome :working :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))})
978
+ (recur (inc iter) completed 0 metrics new-session-id wt-state
979
+ claimed-ids nil wr))))))))))))))
716
980
 
717
981
  ;; =============================================================================
718
982
  ;; Multi-Worker Execution
@@ -720,7 +984,7 @@
720
984
 
721
985
  (defn run-workers!
722
986
  "Run multiple workers in parallel.
723
- Writes swarm summary to runs/{swarm-id}/summary.edn on completion.
987
+ Writes stopped event to runs/{swarm-id}/stopped.json on completion.
724
988
 
725
989
  Arguments:
726
990
  workers - seq of worker configs
@@ -731,32 +995,53 @@
731
995
  (let [swarm-id (-> workers first :swarm-id)]
732
996
  (println (format "Launching %d workers..." (count workers)))
733
997
 
734
- (let [futures (doall
735
- (map-indexed
736
- (fn [idx worker]
737
- (let [worker (assoc worker :id (or (:id worker) (str "w" idx)))]
738
- (future (run-worker! worker))))
739
- workers))]
740
-
741
- (println "All workers launched. Waiting for completion...")
742
- (let [results (mapv deref futures)]
743
- (println "\nAll workers complete.")
744
- (doseq [w results]
745
- (println (format " [%s] %s - %d completed, %d merges, %d rejections, %d errors, %d review rounds"
746
- (:id w)
747
- (name (:status w))
748
- (:completed w)
749
- (or (:merges w) 0)
750
- (or (:rejections w) 0)
751
- (or (:errors w) 0)
752
- (or (:review-rounds-total w) 0))))
753
-
754
- ;; Write swarm summary to disk
755
- (when swarm-id
756
- (runs/write-summary! swarm-id results)
757
- (println (format "\nSwarm summary written to runs/%s/summary.edn" swarm-id)))
758
-
759
- results))))
998
+ ;; Register JVM shutdown hook so SIGTERM/SIGINT triggers graceful stop.
999
+ ;; Sets the shutdown atom — workers check it between cycles and exit cleanly.
1000
+ ;; The hook sleeps a fixed 10s so workers can finish, then writes stopped.json only if
1001
+ ;; the clean exit path hasn't already done so (guarded by the atom).
1002
+ (let [hook (Thread. (fn []
1003
+ (println "\nShutdown signal received, stopping workers after current cycle...")
1004
+ (reset! shutdown-requested? true)
1005
+ ;; Give workers time to finish current cycle and cleanup.
1006
+ ;; After sleep, write stopped.json only if still in shutdown
1007
+ ;; (clean exit resets the atom to false before writing :completed).
1008
+ (Thread/sleep 10000)
1009
+ (when (and swarm-id @shutdown-requested?)
1010
+ (runs/write-stopped! swarm-id :interrupted))))]
1011
+ (.addShutdownHook (Runtime/getRuntime) hook)
1012
+
1013
+ (let [futures (doall
1014
+ (map-indexed
1015
+ (fn [idx worker]
1016
+ (let [worker (assoc worker :id (or (:id worker) (str "w" idx)))]
1017
+ (future (run-worker! worker))))
1018
+ workers))]
1019
+
1020
+ (println "All workers launched. Waiting for completion...")
1021
+ (let [results (mapv deref futures)]
1022
+ ;; Clean exit — tell shutdown hook not to write stopped.json
1023
+ (reset! shutdown-requested? false)
1024
+ ;; Remove the hook so it doesn't accumulate across calls
1025
+ (try (.removeShutdownHook (Runtime/getRuntime) hook) (catch Exception _))
1026
+ (println "\nAll workers complete.")
1027
+ (doseq [w results]
1028
+ (println (format " [%s] %s - %d completed, %d merges, %d claims, %d rejections, %d errors, %d recycled, %d review rounds"
1029
+ (:id w)
1030
+ (name (:status w))
1031
+ (:completed w)
1032
+ (or (:merges w) 0)
1033
+ (or (:claims w) 0)
1034
+ (or (:rejections w) 0)
1035
+ (or (:errors w) 0)
1036
+ (or (:recycled w) 0)
1037
+ (or (:review-rounds-total w) 0))))
1038
+
1039
+ ;; Write stopped event — all state derivable from cycle logs
1040
+ (when swarm-id
1041
+ (runs/write-stopped! swarm-id :completed)
1042
+ (println (format "\nStopped event written to runs/%s/stopped.json" swarm-id)))
1043
+
1044
+ results)))))
760
1045
 
761
1046
  ;; =============================================================================
762
1047
  ;; Planner — first-class config concept, NOT a worker
@@ -781,10 +1066,12 @@
781
1066
  {:tasks-created 0})
782
1067
  ;; Run agent
783
1068
  (let [context (build-context)
1069
+ template-tokens (build-template-tokens context)
784
1070
  prompt-text (str (when (seq prompts)
785
1071
  (->> prompts
786
1072
  (map load-prompt)
787
1073
  (remove nil?)
1074
+ (map #(agent/tokenize % template-tokens))
788
1075
  (str/join "\n\n")))
789
1076
  "\n\nTask Status: " (:task_status context) "\n"
790
1077
  "Pending: " (:pending_tasks context) "\n\n"
@@ -794,29 +1081,17 @@
794
1081
  swarm-id* (or swarm-id "unknown")
795
1082
  tagged-prompt (str "[oompa:" swarm-id* ":planner] " prompt-text)
796
1083
  abs-root (.getAbsolutePath (io/file project-root))
797
- opencode-attach (opencode-attach-url)
798
-
799
- cmd (case harness
800
- :codex (cond-> [(resolve-binary! "codex") "exec"
801
- "--dangerously-bypass-approvals-and-sandbox"
802
- "--skip-git-repo-check"
803
- "-C" abs-root]
804
- model (into ["--model" model])
805
- true (conj "--" tagged-prompt))
806
- :claude (cond-> [(resolve-binary! "claude") "-p" "--dangerously-skip-permissions"]
807
- model (into ["--model" model]))
808
- :opencode (cond-> [(resolve-binary! "opencode") "run"]
809
- model (into ["-m" model])
810
- opencode-attach (into ["--attach" opencode-attach])
811
- true (conj tagged-prompt)))
1084
+
1085
+ cmd (harness/build-cmd harness
1086
+ {:cwd abs-root :model model :prompt tagged-prompt})
812
1087
 
813
1088
  _ (println (format "[planner] Running (%s:%s, max_pending: %d, current: %d)"
814
1089
  (name harness) (or model "default") max-pending pending-before))
815
1090
 
816
1091
  result (try
817
- (if (= harness :claude)
818
- (process/sh cmd {:dir abs-root :in tagged-prompt :out :string :err :string})
819
- (process/sh cmd {:dir abs-root :out :string :err :string}))
1092
+ (process/sh cmd {:dir abs-root
1093
+ :in (harness/process-stdin harness tagged-prompt)
1094
+ :out :string :err :string})
820
1095
  (catch Exception e
821
1096
  (println (format "[planner] Agent exception: %s" (.getMessage e)))
822
1097
  {:exit -1 :out "" :err (.getMessage e)}))