@nbardy/oompa 0.6.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,81 +14,15 @@
14
14
  No separate orchestrator - workers self-organize."
15
15
  (:require [agentnet.tasks :as tasks]
16
16
  [agentnet.agent :as agent]
17
+ [agentnet.core :as core]
18
+ [agentnet.harness :as harness]
17
19
  [agentnet.worktree :as worktree]
20
+ [agentnet.runs :as runs]
18
21
  [babashka.process :as process]
19
22
  [clojure.java.io :as io]
23
+ [clojure.set]
20
24
  [clojure.string :as str]))
21
25
 
22
- ;; =============================================================================
23
- ;; codex-persist integration
24
- ;; =============================================================================
25
-
26
- (def ^:private persist-cmd* (atom nil))
27
- (def ^:private persist-missing-warned?* (atom false))
28
-
29
- (defn- command-ok?
30
- "Return true if command vector is executable (exit code ignored)."
31
- [cmd]
32
- (try
33
- (do
34
- (process/sh (vec cmd) {:out :string :err :string :continue true})
35
- true)
36
- (catch Exception _
37
- false)))
38
-
39
- (defn- resolve-codex-persist-cmd
40
- "Resolve codex-persist command vector.
41
- Order:
42
- 1) CODEX_PERSIST_BIN env var
43
- 2) codex-persist on PATH
44
- 3) node ~/git/codex-persist/dist/cli.js"
45
- []
46
- (let [cached @persist-cmd*]
47
- (if (some? cached)
48
- cached
49
- (let [env-bin (System/getenv "CODEX_PERSIST_BIN")
50
- env-cmd (when (and env-bin (not (str/blank? env-bin)))
51
- [env-bin])
52
- path-cmd ["codex-persist"]
53
- local-cli (str (System/getProperty "user.home") "/git/codex-persist/dist/cli.js")
54
- local-cmd (when (.exists (io/file local-cli))
55
- ["node" local-cli])
56
- cmd (cond
57
- (and env-cmd (command-ok? env-cmd)) env-cmd
58
- (command-ok? path-cmd) path-cmd
59
- (and local-cmd (command-ok? local-cmd)) local-cmd
60
- :else false)]
61
- (reset! persist-cmd* cmd)
62
- cmd))))
63
-
64
- (defn- safe-assistant-content
65
- "Pick a non-empty assistant message payload for persistence."
66
- [result]
67
- (let [out (or (:out result) "")
68
- err (or (:err result) "")
69
- exit-code (or (:exit result) -1)]
70
- (cond
71
- (not (str/blank? out)) out
72
- (not (str/blank? err)) (str "[agent stderr] " err)
73
- :else (str "[agent exit " exit-code "]"))))
74
-
75
- (defn- persist-message!
76
- "Write a single message to codex-persist; no-op if unavailable."
77
- [worker-id session-id cwd role content]
78
- (let [resolved (resolve-codex-persist-cmd)]
79
- (if (and resolved (not= resolved false))
80
- (let [persist-cmd resolved
81
- payload (if (str/blank? content) "(empty)" content)
82
- result (try
83
- (process/sh (into persist-cmd ["write" session-id cwd role payload])
84
- {:out :string :err :string})
85
- (catch Exception e
86
- {:exit -1 :out "" :err (.getMessage e)}))]
87
- (when-not (zero? (:exit result))
88
- (println (format "[%s] codex-persist write failed (%s)" worker-id role))))
89
- (when (compare-and-set! persist-missing-warned?* false true)
90
- (println "[oompa] codex-persist not found; set CODEX_PERSIST_BIN or install/link codex-persist")))))
91
-
92
26
  ;; =============================================================================
93
27
  ;; Worker State
94
28
  ;; =============================================================================
@@ -97,25 +31,15 @@
97
31
  "Root of the oompa package — set by bin/oompa.js, falls back to cwd."
98
32
  (or (System/getenv "OOMPA_PACKAGE_ROOT") "."))
99
33
 
100
- ;; Resolve absolute paths for CLI binaries at first use.
101
- ;; ProcessBuilder with :dir set can fail to find bare command names on some
102
- ;; platforms (macOS + babashka), so we resolve once via `which` and cache.
103
- (def ^:private binary-paths* (atom {}))
104
-
105
- (defn- resolve-binary!
106
- "Resolve the absolute path of a CLI binary. Caches result.
107
- Throws if binary not found on PATH."
108
- [name]
109
- (or (get @binary-paths* name)
110
- (let [result (try
111
- (process/sh ["which" name] {:out :string :err :string})
112
- (catch Exception _ {:exit -1 :out "" :err ""}))
113
- path (when (zero? (:exit result))
114
- (str/trim (:out result)))]
115
- (if path
116
- (do (swap! binary-paths* assoc name path)
117
- path)
118
- (throw (ex-info (str "Binary not found on PATH: " name) {:binary name}))))))
34
+ ;; Serializes merge-to-main! calls across concurrent workers to prevent
35
+ ;; git index corruption from parallel checkout+merge operations.
36
+ (def ^:private merge-lock (Object.))
37
+
38
+ ;; Set by JVM shutdown hook (SIGTERM/SIGINT). Workers check this between cycles
39
+ ;; and exit gracefully — finishing the current cycle before stopping.
40
+ (def ^:private shutdown-requested? (atom false))
41
+
42
+ (declare task-root-for-cwd)
119
43
 
120
44
  (defn- load-prompt
121
45
  "Load a prompt file. Tries path as-is first, then from package root."
@@ -123,12 +47,61 @@
123
47
  (or (agent/load-custom-prompt path)
124
48
  (agent/load-custom-prompt (str package-root "/" path))))
125
49
 
50
+ (defn- build-template-tokens
51
+ "Build token map for prompt template {var} substitution.
52
+ Merges core/build-context (rich YAML header, queue, hotspots, etc.)
53
+ with worker-level context (task_status, pending_tasks) and defaults
54
+ for tokens that core/build-context doesn't produce (mode_hint, targets,
55
+ recent_sec). Without these defaults, those {vars} leak into prompts."
56
+ ([worker-context]
57
+ (build-template-tokens worker-context nil))
58
+ ([worker-context cwd]
59
+ (let [pending (tasks/list-pending)
60
+ core-ctx (core/build-context {:tasks pending
61
+ :repo (System/getProperty "user.dir")})
62
+ task-root (task-root-for-cwd (or cwd (System/getProperty "user.dir")))]
63
+ (merge {:mode_hint "propose"
64
+ :targets "*"
65
+ :recent_sec "180"
66
+ :TASK_ROOT task-root
67
+ :TASKS_ROOT task-root}
68
+ core-ctx
69
+ worker-context))))
70
+
71
+ (defn- task-root-for-cwd
72
+ "Return the relative tasks root for commands issued from cwd."
73
+ [cwd]
74
+ (let [cwd-file (io/file cwd)
75
+ local-tasks (io/file cwd-file "tasks")
76
+ parent-tasks (some-> cwd-file .getParentFile (io/file "tasks"))]
77
+ (cond
78
+ (.exists local-tasks) "tasks"
79
+ (and parent-tasks (.exists parent-tasks)) "../tasks"
80
+ :else "tasks")))
81
+
82
+ (defn- render-task-header
83
+ "Inject runtime task path into auto-injected task header."
84
+ [raw-header cwd]
85
+ (let [task-root (task-root-for-cwd cwd)]
86
+ (-> (or raw-header "")
87
+ (str/replace "{{TASK_ROOT}}" task-root)
88
+ (str/replace "{{TASKS_ROOT}}" task-root)
89
+ (str/replace "{TASK_ROOT}" task-root)
90
+ (str/replace "{TASKS_ROOT}" task-root))))
91
+
92
+ (def ^:private default-max-working-resumes 5)
93
+
126
94
  (defn create-worker
127
95
  "Create a worker config.
128
96
  :prompts is a string or vector of strings — paths to prompt files.
129
97
  :can-plan when false, worker waits for tasks before starting (backpressure).
130
- :reasoning reasoning effort level (e.g. \"low\", \"medium\", \"high\") — codex only."
131
- [{:keys [id swarm-id harness model iterations prompts can-plan reasoning review-harness review-model]}]
98
+ :reasoning reasoning effort level (e.g. \"low\", \"medium\", \"high\") — codex only.
99
+ :review-prompts paths to reviewer prompt files (loaded and concatenated for review).
100
+ :wait-between seconds to sleep between iterations (nil or 0 = no wait).
101
+ :max-working-resumes max consecutive working resumes before nudge+kill (default 5)."
102
+ [{:keys [id swarm-id harness model iterations prompts can-plan reasoning
103
+ review-harness review-model review-prompts wait-between
104
+ max-working-resumes]}]
132
105
  {:id id
133
106
  :swarm-id swarm-id
134
107
  :harness (or harness :codex)
@@ -140,8 +113,14 @@
140
113
  :else [])
141
114
  :can-plan (if (some? can-plan) can-plan true)
142
115
  :reasoning reasoning
116
+ :wait-between (when (and wait-between (pos? wait-between)) wait-between)
143
117
  :review-harness review-harness
144
118
  :review-model review-model
119
+ :review-prompts (cond
120
+ (vector? review-prompts) review-prompts
121
+ (string? review-prompts) [review-prompts]
122
+ :else [])
123
+ :max-working-resumes (or max-working-resumes default-max-working-resumes)
145
124
  :completed 0
146
125
  :status :idle})
147
126
 
@@ -151,6 +130,18 @@
151
130
 
152
131
  (def ^:private max-review-retries 3)
153
132
 
133
+ ;; Nudge prompt injected when a worker hits max-working-resumes consecutive
134
+ ;; "working" outcomes without signaling. Gives the agent one final chance to
135
+ ;; produce something mergeable before the session is killed.
136
+ (def ^:private nudge-prompt
137
+ (str "You have been working for a long time without signaling completion.\n"
138
+ "You MUST take one of these actions NOW:\n\n"
139
+ "1. If you have meaningful changes: commit them and signal COMPLETE_AND_READY_FOR_MERGE\n"
140
+ "2. If scope is too large: create follow-up tasks in tasks/pending/ for remaining work,\n"
141
+ " commit what you have (even partial notes/design docs), and signal COMPLETE_AND_READY_FOR_MERGE\n"
142
+ "3. If you are stuck and cannot make progress: signal __DONE__\n\n"
143
+ "Do NOT continue working without producing a signal."))
144
+
154
145
  (defn- build-context
155
146
  "Build context for agent prompts"
156
147
  []
@@ -164,26 +155,80 @@
164
155
  :task_status (format "Pending: %d, In Progress: %d, Complete: %d"
165
156
  (count pending) (count current) (count complete))}))
166
157
 
158
+
159
+ (defn- execute-claims!
160
+ "Execute CLAIM signal: attempt to claim each task ID from pending/.
161
+ Returns {:claimed [ids], :failed [ids], :resume-prompt string}."
162
+ [claim-ids]
163
+ (let [results (tasks/claim-by-ids! claim-ids)
164
+ claimed (filterv #(= :claimed (:status %)) results)
165
+ failed (filterv #(not= :claimed (:status %)) results)
166
+ claimed-ids (mapv :id claimed)
167
+ failed-ids (mapv :id failed)
168
+ context (build-context)
169
+ prompt (str "## Claim Results\n"
170
+ (if (seq claimed-ids)
171
+ (str "Claimed: " (str/join ", " claimed-ids) "\n")
172
+ "No tasks were successfully claimed.\n")
173
+ (when (seq failed-ids)
174
+ (str "Already taken or not found: "
175
+ (str/join ", " failed-ids) "\n"))
176
+ "\nTask Status: " (:task_status context) "\n"
177
+ "Remaining Pending:\n"
178
+ (if (str/blank? (:pending_tasks context))
179
+ "(none)"
180
+ (:pending_tasks context))
181
+ "\n\n"
182
+ (if (seq claimed-ids)
183
+ "Work on your claimed tasks. Signal COMPLETE_AND_READY_FOR_MERGE when done."
184
+ "No claims succeeded. CLAIM different tasks, or signal __DONE__ if no suitable work remains."))]
185
+ {:claimed claimed-ids
186
+ :failed failed-ids
187
+ :resume-prompt prompt}))
188
+
167
189
  (defn- run-agent!
168
- "Run agent with prompt, return {:output string, :done? bool, :merge? bool, :exit int, :session-id string}.
169
- When resume? is true and harness is :claude, uses --resume to continue the existing session
170
- with a lighter prompt (just task status + continue instruction)."
171
- [{:keys [id swarm-id harness model prompts reasoning]} worktree-path context session-id resume?]
172
- (let [;; Use provided session-id or generate fresh one
173
- session-id (or session-id (str/lower-case (str (java.util.UUID/randomUUID))))
174
-
175
- ;; Build prompt lighter for resume (agent already has full context)
176
- prompt (if resume?
190
+ "Run agent with prompt, return {:output :done? :merge? :claim-ids :exit :session-id}.
191
+ When resume? is true, continues the existing session with a lighter prompt.
192
+ resume-prompt-override: when non-nil, replaces the default resume prompt
193
+ (used to inject CLAIM results). All harness-specific CLI knowledge
194
+ is delegated to harness/build-cmd."
195
+ [{:keys [id swarm-id harness model prompts reasoning]} worktree-path context session-id resume?
196
+ & {:keys [resume-prompt-override]}]
197
+ (let [session-id (or session-id (harness/make-session-id harness))
198
+ template-tokens (build-template-tokens context worktree-path)
199
+ resume-prompt-override (when resume-prompt-override
200
+ (-> resume-prompt-override
201
+ (render-task-header worktree-path)
202
+ (agent/tokenize template-tokens)))
203
+
204
+ ;; Build prompt — 3-way: override → standard resume → fresh start
205
+ prompt (cond
206
+ ;; CLAIM results or other injected resume prompt
207
+ resume-prompt-override
208
+ resume-prompt-override
209
+
210
+ ;; Standard resume — lighter (agent already has full context)
211
+ resume?
177
212
  (str "Task Status: " (:task_status context) "\n"
178
213
  "Pending: " (:pending_tasks context) "\n\n"
179
214
  "Continue working. Signal COMPLETE_AND_READY_FOR_MERGE when your current task is done and ready for review.")
180
- (let [task-header (or (load-prompt "config/prompts/_task_header.md") "")
215
+
216
+ ;; Fresh start — full task header + tokenized user prompts
217
+ ;; Template tokens ({context_header}, {queue_md}, etc.) are
218
+ ;; replaced here. Without this, raw {var} placeholders leak
219
+ ;; into the agent prompt verbatim.
220
+ :else
221
+ (let [task-header (render-task-header
222
+ (load-prompt "config/prompts/_task_header.md")
223
+ worktree-path)
181
224
  user-prompts (if (seq prompts)
182
225
  (->> prompts
183
226
  (map load-prompt)
184
227
  (remove nil?)
228
+ (map #(agent/tokenize % template-tokens))
185
229
  (str/join "\n\n"))
186
- (or (load-prompt "config/prompts/worker.md")
230
+ (or (some-> (load-prompt "config/prompts/worker.md")
231
+ (agent/tokenize template-tokens))
187
232
  "You are a worker. Claim tasks, execute them, complete them."))]
188
233
  (str task-header "\n"
189
234
  "Task Status: " (:task_status context) "\n"
@@ -194,135 +239,237 @@
194
239
  tagged-prompt (str "[oompa:" swarm-id* ":" id "] " prompt)
195
240
  abs-worktree (.getAbsolutePath (io/file worktree-path))
196
241
 
197
- ;; Build command — both harnesses run with cwd=worktree, no sandbox
198
- ;; so agents can `..` to reach project root for task management
199
- ;; Claude: --resume flag continues existing session-id conversation
200
- ;; Codex: no resume support, always fresh (but worktree state persists)
201
- cmd (case harness
202
- :codex (cond-> [(resolve-binary! "codex") "exec"
203
- "--dangerously-bypass-approvals-and-sandbox"
204
- "--skip-git-repo-check"
205
- "-C" abs-worktree]
206
- model (into ["--model" model])
207
- reasoning (into ["-c" (str "model_reasoning_effort=\"" reasoning "\"")])
208
- true (conj "--" tagged-prompt))
209
- :claude (cond-> [(resolve-binary! "claude") "-p" "--dangerously-skip-permissions"
210
- "--session-id" session-id]
211
- resume? (conj "--resume")
212
- model (into ["--model" model])))
213
-
214
- _ (when (= harness :codex)
215
- (persist-message! id session-id abs-worktree "user" tagged-prompt))
216
-
217
- ;; Run agent — both run with cwd=worktree
242
+ cmd (harness/build-cmd harness
243
+ {:cwd abs-worktree :model model :reasoning reasoning
244
+ :session-id session-id :resume? resume?
245
+ :prompt tagged-prompt :format? true})
246
+
218
247
  result (try
219
- (if (= harness :claude)
220
- (process/sh cmd {:dir abs-worktree :in tagged-prompt :out :string :err :string})
221
- (process/sh cmd {:dir abs-worktree :out :string :err :string}))
248
+ (process/sh cmd {:dir abs-worktree
249
+ :in (harness/process-stdin harness tagged-prompt)
250
+ :out :string :err :string})
222
251
  (catch Exception e
223
252
  (println (format "[%s] Agent exception: %s" id (.getMessage e)))
224
- {:exit -1 :out "" :err (.getMessage e)}))]
253
+ {:exit -1 :out "" :err (.getMessage e)}))
225
254
 
226
- (when (= harness :codex)
227
- (persist-message! id session-id abs-worktree "assistant" (safe-assistant-content result)))
255
+ {:keys [output session-id]}
256
+ (harness/parse-output harness (:out result) session-id)]
228
257
 
229
- {:output (:out result)
258
+ {:output output
230
259
  :exit (:exit result)
231
- :done? (agent/done-signal? (:out result))
232
- :merge? (agent/merge-signal? (:out result))
260
+ :done? (agent/done-signal? output)
261
+ :merge? (agent/merge-signal? output)
262
+ :claim-ids (agent/parse-claim-signal output)
233
263
  :session-id session-id}))
234
264
 
235
265
  (defn- run-reviewer!
236
266
  "Run reviewer on worktree changes.
237
- Returns {:verdict :approved|:needs-changes|:rejected, :comments [...]}"
238
- [{:keys [id swarm-id review-harness review-model]} worktree-path]
239
- (let [;; Get diff for context
240
- diff-result (process/sh ["git" "diff" "main" "--stat"]
267
+ Uses custom review-prompts when configured, otherwise falls back to default.
268
+ prev-feedback: vector of previous review outputs (for multi-round context).
269
+ Returns {:verdict :approved|:needs-changes|:rejected, :comments [...], :output string}"
270
+ [{:keys [id swarm-id review-harness review-model review-prompts]} worktree-path prev-feedback]
271
+ (let [;; Get actual diff content (not just stat) — truncate to 8000 chars for prompt budget
272
+ diff-result (process/sh ["git" "diff" "main"]
241
273
  {:dir worktree-path :out :string :err :string})
242
- diff-summary (:out diff-result)
274
+ diff-content (let [d (:out diff-result)]
275
+ (if (> (count d) 8000)
276
+ (str (subs d 0 8000) "\n... [diff truncated at 8000 chars]")
277
+ d))
243
278
 
244
- ;; Build review prompt (tagged for claude-web-view worker detection)
245
279
  swarm-id* (or swarm-id "unknown")
246
- review-prompt (str "[oompa:" swarm-id* ":" id "] "
247
- "Review the changes in this worktree.\n\n"
248
- "Diff summary:\n" diff-summary "\n\n"
249
- "Check for:\n"
250
- "- Code correctness\n"
251
- "- Matches the intended task\n"
252
- "- No obvious bugs or issues\n\n"
253
- "Respond with:\n"
254
- "- APPROVED if changes are good\n"
255
- "- NEEDS_CHANGES with bullet points of issues\n"
256
- "- REJECTED if fundamentally wrong")
280
+ custom-prompt (when (seq review-prompts)
281
+ (->> review-prompts
282
+ (map load-prompt)
283
+ (remove nil?)
284
+ (str/join "\n\n")))
285
+
286
+ ;; Only include the most recent round's feedback — the worker has already
287
+ ;; attempted fixes based on it, so the reviewer just needs to verify.
288
+ ;; Including all prior rounds bloats the prompt and causes empty output.
289
+ history-block (when (seq prev-feedback)
290
+ (let [latest (last prev-feedback)
291
+ truncated (if (> (count latest) 2000)
292
+ (str (subs latest 0 2000) "\n... [feedback truncated]")
293
+ latest)]
294
+ (str "\n## Previous Review (Round " (count prev-feedback) ")\n\n"
295
+ "The worker has attempted fixes based on this feedback. "
296
+ "Verify the issues below are resolved. Do NOT raise new issues.\n\n"
297
+ truncated
298
+ "\n\n")))
299
+
300
+ review-body (str (or custom-prompt
301
+ (str "Review the changes in this worktree.\n"
302
+ "Focus on architecture and design, not style.\n"))
303
+ "\n\nDiff:\n```\n" diff-content "\n```\n"
304
+ (when history-block history-block)
305
+ "\nYour verdict MUST be on its own line, exactly one of:\n"
306
+ "VERDICT: APPROVED\n"
307
+ "VERDICT: NEEDS_CHANGES\n\n"
308
+ "Do NOT use REJECTED. Always use NEEDS_CHANGES with specific, "
309
+ "actionable feedback explaining what must change and why. "
310
+ "The worker will attempt fixes based on your feedback.\n"
311
+ "After your verdict line, list every issue as a numbered item with "
312
+ "the file path and what needs to change.\n")
313
+ review-prompt (str "[oompa:" swarm-id* ":" id "] " review-body)
257
314
 
258
315
  abs-wt (.getAbsolutePath (io/file worktree-path))
259
316
 
260
- ;; Build command cwd=worktree, no sandbox
261
- cmd (case review-harness
262
- :codex (cond-> [(resolve-binary! "codex") "exec"
263
- "--dangerously-bypass-approvals-and-sandbox"
264
- "--skip-git-repo-check"
265
- "-C" abs-wt]
266
- review-model (into ["--model" review-model])
267
- true (conj "--" review-prompt))
268
- :claude (cond-> [(resolve-binary! "claude") "-p" "--dangerously-skip-permissions"]
269
- review-model (into ["--model" review-model])))
270
-
271
- ;; Run reviewer — cwd=worktree
317
+ ;; No session, no resume, no format flags — reviewer is stateless one-shot
318
+ cmd (harness/build-cmd review-harness
319
+ {:cwd abs-wt :model review-model :prompt review-prompt})
320
+
272
321
  result (try
273
- (if (= review-harness :claude)
274
- (process/sh cmd {:dir abs-wt :in review-prompt :out :string :err :string})
275
- (process/sh cmd {:dir abs-wt :out :string :err :string}))
322
+ (process/sh cmd {:dir abs-wt
323
+ :in (harness/process-stdin review-harness review-prompt)
324
+ :out :string :err :string})
276
325
  (catch Exception e
277
326
  {:exit -1 :out "" :err (.getMessage e)}))
278
327
 
279
- output (:out result)]
280
-
281
- {:verdict (cond
282
- (re-find #"(?i)\bAPPROVED\b" output) :approved
283
- (re-find #"(?i)\bREJECTED\b" output) :rejected
284
- :else :needs-changes)
328
+ output (:out result)
329
+
330
+ ;; Parse verdict — require explicit VERDICT: prefix to avoid false matches.
331
+ ;; REJECTED is treated as NEEDS_CHANGES: the reviewer must always give
332
+ ;; actionable feedback so the worker can attempt fixes. Hard rejection
333
+ ;; only happens when max review rounds are exhausted.
334
+ verdict (cond
335
+ (re-find #"VERDICT:\s*APPROVED" output) :approved
336
+ (re-find #"VERDICT:\s*NEEDS_CHANGES" output) :needs-changes
337
+ (re-find #"VERDICT:\s*REJECTED" output) :needs-changes
338
+ (re-find #"(?i)\bAPPROVED\b" output) :approved
339
+ :else :needs-changes)]
340
+
341
+ (println (format "[%s] Reviewer verdict: %s" id (name verdict)))
342
+ (let [summary (subs output 0 (min 300 (count output)))]
343
+ (println (format "[%s] Review: %s%s" id summary
344
+ (if (> (count output) 300) "..." ""))))
345
+
346
+ {:verdict verdict
285
347
  :comments (when (not= (:exit result) 0)
286
348
  [(:err result)])
287
349
  :output output}))
288
350
 
289
351
  (defn- run-fix!
290
352
  "Ask worker to fix issues based on reviewer feedback.
353
+ all-feedback: vector of all reviewer outputs so far (accumulated across rounds).
291
354
  Returns {:output string, :exit int}"
292
- [{:keys [id swarm-id harness model]} worktree-path feedback]
355
+ [{:keys [id swarm-id harness model]} worktree-path all-feedback]
293
356
  (let [swarm-id* (or swarm-id "unknown")
357
+ feedback-text (if (> (count all-feedback) 1)
358
+ (str "The reviewer has given feedback across " (count all-feedback) " rounds.\n"
359
+ "Fix ALL outstanding issues:\n\n"
360
+ (->> all-feedback
361
+ (map-indexed (fn [i fb]
362
+ (str "--- Round " (inc i) " ---\n" fb)))
363
+ (str/join "\n\n")))
364
+ (str "The reviewer found issues with your changes:\n\n"
365
+ (first all-feedback)))
294
366
  fix-prompt (str "[oompa:" swarm-id* ":" id "] "
295
- "The reviewer found issues with your changes:\n\n"
296
- feedback "\n\n"
297
- "Please fix these issues in the worktree.")
367
+ feedback-text "\n\n"
368
+ "Fix these issues. Do not add anything the reviewer did not ask for.")
298
369
 
299
370
  abs-wt (.getAbsolutePath (io/file worktree-path))
300
371
 
301
- cmd (case harness
302
- :codex (cond-> [(resolve-binary! "codex") "exec"
303
- "--dangerously-bypass-approvals-and-sandbox"
304
- "--skip-git-repo-check"
305
- "-C" abs-wt]
306
- model (into ["--model" model])
307
- true (conj "--" fix-prompt))
308
- :claude (cond-> [(resolve-binary! "claude") "-p" "--dangerously-skip-permissions"]
309
- model (into ["--model" model])))
372
+ cmd (harness/build-cmd harness
373
+ {:cwd abs-wt :model model :prompt fix-prompt})
310
374
 
311
375
  result (try
312
- (if (= harness :claude)
313
- (process/sh cmd {:dir abs-wt :in fix-prompt :out :string :err :string})
314
- (process/sh cmd {:dir abs-wt :out :string :err :string}))
376
+ (process/sh cmd {:dir abs-wt
377
+ :in (harness/process-stdin harness fix-prompt)
378
+ :out :string :err :string})
315
379
  (catch Exception e
316
380
  {:exit -1 :out "" :err (.getMessage e)}))]
317
381
 
318
382
  {:output (:out result)
319
383
  :exit (:exit result)}))
320
384
 
385
+ (defn- collect-divergence-context
386
+ "Collect context about how a worktree branch has diverged from main.
387
+ Returns a map with :branch-log, :main-log, :diff-stat strings."
388
+ [wt-path]
389
+ (let [git-out (fn [& args] (:out (process/sh (vec args) {:dir wt-path :out :string :err :string})))
390
+ branch-log (git-out "git" "log" "--oneline" "main..HEAD")
391
+ main-log (git-out "git" "log" "--oneline" "HEAD..main")
392
+ diff-stat (git-out "git" "diff" "--stat" "main")]
393
+ {:branch-log (or branch-log "(none)")
394
+ :main-log (or main-log "(none)")
395
+ :diff-stat (or diff-stat "(none)")}))
396
+
397
+ (defn- verify-mergeable?
398
+ "Dry-run merge to verify a worktree branch merges cleanly into main.
399
+ Does NOT leave merge state behind — always cleans up the dry-run.
400
+ Uses --no-commit so no actual commit is created; resets afterward."
401
+ [wt-path]
402
+ (let [result (process/sh ["git" "merge" "--no-commit" "--no-ff" "main"]
403
+ {:dir wt-path :out :string :err :string})
404
+ clean? (zero? (:exit result))]
405
+ ;; Clean up: abort if conflicted, reset if staged but uncommitted
406
+ (if clean?
407
+ (process/sh ["git" "reset" "--hard" "HEAD"] {:dir wt-path})
408
+ (process/sh ["git" "merge" "--abort"] {:dir wt-path}))
409
+ clean?))
410
+
411
+ (defn- sync-worktree-to-main!
412
+ "Sync worktree branch with main before merge-to-main!.
413
+ Fast path: git merge main succeeds cleanly → :synced.
414
+ Conflict path: abort merge, give agent a clean worktree + divergence
415
+ context, let agent make the branch mergeable (rebase, cherry-pick,
416
+ manual resolution — agent's choice), verify with dry-run merge.
417
+ Runs OUTSIDE the merge-lock so the agent doesn't block other workers.
418
+ Returns :synced | :resolved | :failed."
419
+ [worker wt-path worker-id]
420
+ (let [merge-result (process/sh ["git" "merge" "main" "--no-edit"]
421
+ {:dir wt-path :out :string :err :string})]
422
+ (if (zero? (:exit merge-result))
423
+ (do (println (format "[%s] Worktree synced to main" worker-id))
424
+ :synced)
425
+ ;; Conflict — abort merge to restore clean worktree state, then
426
+ ;; hand the problem to the agent with full divergence context.
427
+ (let [_ (process/sh ["git" "merge" "--abort"] {:dir wt-path})
428
+ _ (println (format "[%s] Branch diverged from main, launching resolver agent" worker-id))
429
+ {:keys [branch-log main-log diff-stat]} (collect-divergence-context wt-path)
430
+ resolve-prompt (str "[oompa:" (or (:swarm-id worker) "unknown") ":" worker-id "] "
431
+ "Your branch has diverged from main and cannot merge cleanly.\n\n"
432
+ "Your branch's commits (not on main):\n" branch-log "\n\n"
433
+ "Commits on main since you branched:\n" main-log "\n\n"
434
+ "Divergence scope:\n" diff-stat "\n\n"
435
+ "Make this branch cleanly mergeable into main. "
436
+ "Preserve the intent of your branch's changes.\n"
437
+ "You have full git access — rebase, cherry-pick, resolve conflicts, "
438
+ "whatever works.\n"
439
+ "When done, verify with: git diff main --stat")
440
+ abs-wt (.getAbsolutePath (io/file wt-path))
441
+ cmd (harness/build-cmd (:harness worker)
442
+ {:cwd abs-wt :model (:model worker) :prompt resolve-prompt})
443
+ result (try
444
+ (process/sh cmd {:dir abs-wt
445
+ :in (harness/process-stdin (:harness worker) resolve-prompt)
446
+ :out :string :err :string})
447
+ (catch Exception e
448
+ {:exit -1 :out "" :err (.getMessage e)}))]
449
+ (if (zero? (:exit result))
450
+ ;; Agent ran — verify the branch actually merges cleanly now
451
+ (if (verify-mergeable? wt-path)
452
+ (do (println (format "[%s] Agent resolved divergence, branch is mergeable" worker-id))
453
+ :resolved)
454
+ (do (println (format "[%s] Agent ran but branch still can't merge cleanly" worker-id))
455
+ :failed))
456
+ (do (println (format "[%s] Resolver agent failed (exit %d)" worker-id (:exit result)))
457
+ :failed))))))
458
+
321
459
  (defn- worktree-has-changes?
322
- "Check if worktree has any uncommitted changes (new/modified/deleted files)."
460
+ "Check if worktree has committed OR uncommitted changes vs main.
461
+ Workers commit before signaling merge, so we must check both:
462
+ 1. Uncommitted changes (git status --porcelain)
463
+ 2. Commits ahead of main (git rev-list --count main..HEAD)"
323
464
  [wt-path]
324
- (let [result (process/sh ["git" "status" "--porcelain"] {:dir wt-path :out :string :err :string})]
325
- (not (str/blank? (:out result)))))
465
+ (let [uncommitted (process/sh ["git" "status" "--porcelain"]
466
+ {:dir wt-path :out :string :err :string})
467
+ ahead (process/sh ["git" "rev-list" "--count" "main..HEAD"]
468
+ {:dir wt-path :out :string :err :string})
469
+ ahead-count (try (Integer/parseInt (str/trim (:out ahead)))
470
+ (catch Exception _ 0))]
471
+ (or (not (str/blank? (:out uncommitted)))
472
+ (pos? ahead-count))))
326
473
 
327
474
  (defn- create-iteration-worktree!
328
475
  "Create a fresh worktree for an iteration. Returns {:dir :branch :path}.
@@ -341,87 +488,231 @@
341
488
  {:dir wt-dir :branch wt-branch}))))
342
489
  {:dir wt-dir :branch wt-branch :path wt-path}))
343
490
 
491
+ (defn- detect-claimed-tasks
492
+ "Diff current/ task IDs before and after agent ran.
493
+ Returns set of task IDs this worker claimed during iteration."
494
+ [pre-current-ids]
495
+ (let [post-ids (tasks/current-task-ids)]
496
+ (clojure.set/difference post-ids pre-current-ids)))
497
+
498
+ (defn- emit-cycle-log!
499
+ "Write cycle event log. Called at every cycle exit point.
500
+ session-id links to the Claude CLI conversation transcript on disk.
501
+ No mutable summary state — all state is derived from immutable cycle logs."
502
+ [swarm-id worker-id cycle start-ms session-id
503
+ {:keys [outcome claimed-task-ids recycled-tasks error-snippet review-rounds]}]
504
+ (let [duration-ms (- (System/currentTimeMillis) start-ms)]
505
+ (runs/write-cycle-log!
506
+ swarm-id worker-id cycle
507
+ {:outcome outcome
508
+ :duration-ms duration-ms
509
+ :claimed-task-ids (vec (or claimed-task-ids []))
510
+ :recycled-tasks (or recycled-tasks [])
511
+ :error-snippet error-snippet
512
+ :review-rounds (or review-rounds 0)
513
+ :session-id session-id})))
514
+
515
+ (defn- recycle-orphaned-tasks!
516
+ "Recycle tasks that a worker claimed but didn't complete.
517
+ Compares current/ task IDs before and after the agent ran —
518
+ new IDs that appeared are tasks this worker claimed. On failure
519
+ or rejection, move them back to pending/ so other workers can
520
+ pick them up. Returns count of recycled tasks."
521
+ [worker-id pre-current-ids]
522
+ (let [post-current-ids (tasks/current-task-ids)
523
+ orphaned-ids (clojure.set/difference post-current-ids pre-current-ids)
524
+ recycled (when (seq orphaned-ids)
525
+ (tasks/recycle-tasks! orphaned-ids))]
526
+ (when (seq recycled)
527
+ (println (format "[%s] Recycled %d orphaned task(s): %s"
528
+ worker-id (count recycled) (str/join ", " recycled))))
529
+ (count (or recycled []))))
530
+
344
531
  (defn- cleanup-worktree!
345
532
  "Remove worktree and branch."
346
533
  [project-root wt-dir wt-branch]
347
534
  (process/sh ["git" "worktree" "remove" wt-dir "--force"] {:dir project-root})
348
535
  (process/sh ["git" "branch" "-D" wt-branch] {:dir project-root}))
349
536
 
537
+ (defn- get-head-hash
538
+ "Get the short HEAD commit hash."
539
+ [dir]
540
+ (let [result (process/sh ["git" "rev-parse" "--short" "HEAD"]
541
+ {:dir dir :out :string :err :string})]
542
+ (when (zero? (:exit result))
543
+ (str/trim (:out result)))))
544
+
545
+ (defn- annotate-completed-tasks!
546
+ "After a successful merge (called under merge-lock), annotate any tasks in
547
+ complete/ that lack metadata. Adds :completed-by, :completed-at,
548
+ :review-rounds, :merged-commit."
549
+ [project-root worker-id review-rounds]
550
+ (let [commit-hash (get-head-hash project-root)
551
+ complete-dir (io/file project-root "tasks" "complete")]
552
+ (when (.exists complete-dir)
553
+ (doseq [f (.listFiles complete-dir)]
554
+ (when (str/ends-with? (.getName f) ".edn")
555
+ (try
556
+ (let [task (read-string (slurp f))]
557
+ (when-not (:completed-by task)
558
+ (spit f (pr-str (assoc task
559
+ :completed-by worker-id
560
+ :completed-at (str (java.time.Instant/now))
561
+ :review-rounds (or review-rounds 0)
562
+ :merged-commit (or commit-hash "unknown"))))))
563
+ (catch Exception e
564
+ (println (format "[%s] Failed to annotate task %s: %s"
565
+ worker-id (.getName f) (.getMessage e))))))))))
566
+
350
567
  (defn- merge-to-main!
351
- "Merge worktree changes to main branch"
352
- [wt-path wt-id worker-id project-root]
353
- (println (format "[%s] Merging changes to main" worker-id))
354
- (let [;; Commit in worktree if needed
355
- _ (process/sh ["git" "add" "-A"] {:dir wt-path})
356
- _ (process/sh ["git" "commit" "-m" (str "Work from " wt-id)]
357
- {:dir wt-path})
358
- ;; Checkout main and merge (in project root, not worktree)
359
- checkout-result (process/sh ["git" "checkout" "main"]
360
- {:dir project-root :out :string :err :string})
361
- merge-result (when (zero? (:exit checkout-result))
362
- (process/sh ["git" "merge" wt-id "--no-edit"]
363
- {:dir project-root :out :string :err :string}))]
364
- (and (zero? (:exit checkout-result))
365
- (zero? (:exit merge-result)))))
568
+ "Merge worktree changes to main branch. Serialized via merge-lock to prevent
569
+ concurrent workers from corrupting the git index. On success, moves claimed
570
+ tasks current→complete and annotates metadata. Returns true on success.
571
+ claimed-task-ids: set of task IDs this worker claimed (framework owns completion)."
572
+ [wt-path wt-id worker-id project-root review-rounds claimed-task-ids]
573
+ (locking merge-lock
574
+ (println (format "[%s] Merging changes to main" worker-id))
575
+ (let [;; Commit in worktree if needed (no-op if already committed)
576
+ _ (process/sh ["git" "add" "-A"] {:dir wt-path})
577
+ _ (process/sh ["git" "commit" "-m" (str "Work from " wt-id)]
578
+ {:dir wt-path})
579
+ ;; Checkout main and merge (in project root, not worktree)
580
+ checkout-result (process/sh ["git" "checkout" "main"]
581
+ {:dir project-root :out :string :err :string})
582
+ _ (when-not (zero? (:exit checkout-result))
583
+ (println (format "[%s] MERGE FAILED: could not checkout main: %s"
584
+ worker-id (:err checkout-result))))
585
+ merge-result (when (zero? (:exit checkout-result))
586
+ (process/sh ["git" "merge" wt-id "--no-edit"]
587
+ {:dir project-root :out :string :err :string}))
588
+ success (and (zero? (:exit checkout-result))
589
+ (zero? (:exit merge-result)))]
590
+ (if success
591
+ (do
592
+ (println (format "[%s] Merge successful" worker-id))
593
+ ;; Framework-owned completion: move claimed tasks current→complete
594
+ (when (seq claimed-task-ids)
595
+ (let [completed (tasks/complete-by-ids! claimed-task-ids)]
596
+ (when (seq completed)
597
+ (println (format "[%s] Completed %d task(s): %s"
598
+ worker-id (count completed) (str/join ", " completed))))))
599
+ ;; Annotate completed tasks with metadata while still holding merge-lock
600
+ (annotate-completed-tasks! project-root worker-id review-rounds))
601
+ ;; FAILED: Clean up git state before releasing merge-lock.
602
+ ;; Without this, a conflict leaves .git/MERGE_HEAD and poisons the
603
+ ;; shared index — every subsequent worker fails on `git checkout main`.
604
+ (do
605
+ (println (format "[%s] MERGE FAILED: %s" worker-id
606
+ (or (:err merge-result) (:err checkout-result))))
607
+ (let [abort-result (process/sh ["git" "merge" "--abort"]
608
+ {:dir project-root :out :string :err :string})]
609
+ (when-not (zero? (:exit abort-result))
610
+ ;; Abort failed (no merge in progress, or other issue) — hard reset.
611
+ (process/sh ["git" "reset" "--hard" "HEAD"]
612
+ {:dir project-root :out :string :err :string})))))
613
+ success)))
614
+
615
+ (defn- task-only-diff?
616
+ "Check if all changes in worktree are task files only (no code changes).
617
+ Returns true if diff only touches files under tasks/ directory."
618
+ [wt-path]
619
+ (let [result (process/sh ["git" "diff" "main" "--name-only"]
620
+ {:dir wt-path :out :string :err :string})
621
+ files (when (zero? (:exit result))
622
+ (->> (str/split-lines (:out result))
623
+ (remove str/blank?)))]
624
+ (and (seq files)
625
+ (every? #(str/starts-with? % "tasks/") files))))
626
+
627
+ (defn- diff-file-names
628
+ "Get list of changed file names vs main."
629
+ [wt-path]
630
+ (let [result (process/sh ["git" "diff" "main" "--name-only"]
631
+ {:dir wt-path :out :string :err :string})]
632
+ (when (zero? (:exit result))
633
+ (->> (str/split-lines (:out result))
634
+ (remove str/blank?)
635
+ vec))))
366
636
 
367
637
  (defn- review-loop!
368
638
  "Run review loop: reviewer checks → if issues, fix & retry → back to reviewer.
639
+ Accumulates feedback across rounds so reviewer doesn't raise new issues
640
+ and fixer has full context of all prior feedback.
641
+ Writes review logs to runs/{swarm-id}/reviews/ for post-mortem analysis.
369
642
  Returns {:approved? bool, :attempts int}"
370
- [worker wt-path worker-id]
643
+ [worker wt-path worker-id iteration]
371
644
  (if-not (and (:review-harness worker) (:review-model worker))
372
645
  ;; No reviewer configured, auto-approve
373
646
  {:approved? true :attempts 0}
374
647
 
375
- ;; Run review loop
376
- (loop [attempt 1]
648
+ ;; Run review loop with accumulated feedback
649
+ (loop [attempt 1
650
+ prev-feedback []]
377
651
  (println (format "[%s] Review attempt %d/%d" worker-id attempt max-review-retries))
378
- (let [{:keys [verdict output]} (run-reviewer! worker wt-path)]
652
+ (let [{:keys [verdict output]} (run-reviewer! worker wt-path prev-feedback)
653
+ diff-files (diff-file-names wt-path)]
654
+
655
+ ;; Persist review log for this round
656
+ (when (:swarm-id worker)
657
+ (runs/write-review-log! (:swarm-id worker) worker-id iteration attempt
658
+ {:verdict verdict
659
+ :output output
660
+ :diff-files (or diff-files [])}))
661
+
379
662
  (case verdict
380
663
  :approved
381
664
  (do
382
- (println (format "[%s] Reviewer APPROVED" worker-id))
665
+ (println (format "[%s] Reviewer APPROVED (attempt %d)" worker-id attempt))
383
666
  {:approved? true :attempts attempt})
384
667
 
385
- :rejected
386
- (do
387
- (println (format "[%s] Reviewer REJECTED" worker-id))
388
- {:approved? false :attempts attempt})
389
-
390
- ;; :needs-changes
391
- (if (>= attempt max-review-retries)
392
- (do
393
- (println (format "[%s] Max review retries reached" worker-id))
394
- {:approved? false :attempts attempt})
395
- (do
396
- (println (format "[%s] Reviewer requested changes, fixing..." worker-id))
397
- (run-fix! worker wt-path output)
398
- (recur (inc attempt)))))))))
668
+ ;; :needs-changes — always give the worker a chance to fix.
669
+ ;; Hard rejection only happens when max review rounds are exhausted.
670
+ (let [all-feedback (conj prev-feedback output)]
671
+ (if (>= attempt max-review-retries)
672
+ (do
673
+ (println (format "[%s] Max review retries reached (%d rounds)" worker-id attempt))
674
+ {:approved? false :attempts attempt})
675
+ (do
676
+ (println (format "[%s] Reviewer requested changes, fixing..." worker-id))
677
+ (run-fix! worker wt-path all-feedback)
678
+ (recur (inc attempt) all-feedback)))))))))
399
679
 
400
680
  ;; =============================================================================
401
681
  ;; Worker Loop
402
682
  ;; =============================================================================
403
683
 
404
- (def ^:private max-wait-for-tasks 60)
405
- (def ^:private wait-poll-interval 5)
684
+ ;; Workers wait up to 10 minutes for tasks to appear before giving up.
685
+ ;; This keeps workers alive while planners/designers ramp up the queue.
686
+ (def ^:private max-wait-for-tasks 600)
687
+ (def ^:private wait-poll-interval 10)
406
688
  (def ^:private max-consecutive-errors 3)
407
689
 
408
690
  (defn- wait-for-tasks!
409
- "Wait up to 60s for pending/current tasks to appear. Used for backpressure
410
- on workers that can't create their own tasks (can_plan: false)."
691
+ "Wait up to 10 minutes for pending/current tasks to appear. Used for
692
+ backpressure on workers that can't create their own tasks (can_plan: false).
693
+ Polls every 10 seconds, logs every 60 seconds."
411
694
  [worker-id]
412
695
  (loop [waited 0]
413
696
  (cond
414
697
  (pos? (tasks/pending-count)) true
415
698
  (pos? (tasks/current-count)) true
416
699
  (>= waited max-wait-for-tasks)
417
- (do (println (format "[%s] No tasks after %ds, proceeding anyway" worker-id waited))
700
+ (do (println (format "[%s] No tasks after %ds, giving up" worker-id waited))
418
701
  false)
419
702
  :else
420
- (do (when (zero? (mod waited 15))
421
- (println (format "[%s] Waiting for tasks... (%ds)" worker-id waited)))
703
+ (do (when (zero? (mod waited 60))
704
+ (println (format "[%s] Waiting for tasks... (%ds/%ds)" worker-id waited max-wait-for-tasks)))
422
705
  (Thread/sleep (* wait-poll-interval 1000))
423
706
  (recur (+ waited wait-poll-interval))))))
424
707
 
708
+ (defn- maybe-sleep-between!
709
+ "Sleep between iterations when wait-between is configured.
710
+ Called at the start of each iteration (except the first)."
711
+ [worker-id wait-between iter]
712
+ (when (and wait-between (> iter 1))
713
+ (println (format "[%s] Sleeping %ds before next iteration" worker-id wait-between))
714
+ (Thread/sleep (* wait-between 1000))))
715
+
425
716
  (defn run-worker!
426
717
  "Run worker loop with persistent sessions.
427
718
 
@@ -429,115 +720,263 @@
429
720
  Worktrees persist until COMPLETE_AND_READY_FOR_MERGE triggers review+merge.
430
721
  __DONE__ stops the worker entirely (planners only).
431
722
 
432
- Returns final worker state."
723
+ Tracks per-worker metrics: merges, rejections, errors, review-rounds-total.
724
+ Returns final worker state with metrics attached."
433
725
  [worker]
434
726
  (tasks/ensure-dirs!)
435
- (let [{:keys [id iterations]} worker
727
+ (let [{:keys [id iterations swarm-id wait-between]} worker
436
728
  project-root (System/getProperty "user.dir")]
437
- (println (format "[%s] Starting worker (%s:%s%s, %d iterations)"
729
+ (println (format "[%s] Starting worker (%s:%s%s, %d iterations%s)"
438
730
  id
439
731
  (name (:harness worker))
440
732
  (or (:model worker) "default")
441
733
  (if (:reasoning worker) (str ":" (:reasoning worker)) "")
442
- iterations))
734
+ iterations
735
+ (if wait-between (format ", %ds between" wait-between) "")))
443
736
 
444
737
  ;; Backpressure: workers that can't create tasks wait for tasks to exist
445
738
  (when-not (:can-plan worker)
446
739
  (wait-for-tasks! id))
447
740
 
741
+ ;; metrics tracks: {:merges N :rejections N :errors N :recycled N :review-rounds-total N :claims N}
448
742
  (loop [iter 1
449
743
  completed 0
450
744
  consec-errors 0
451
- session-id nil ;; persistent session-id (nil = start fresh)
452
- wt-state nil] ;; {:dir :branch :path} or nil
453
- (if (> iter iterations)
454
- (do
455
- ;; Cleanup any lingering worktree
456
- (when wt-state
457
- (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state)))
458
- (println (format "[%s] Completed %d iterations" id completed))
459
- (assoc worker :completed completed :status :exhausted))
460
-
461
- ;; Ensure worktree exists (create fresh if nil, reuse if persisted)
462
- (let [wt-state (try
463
- (or wt-state (create-iteration-worktree! project-root id iter))
464
- (catch Exception e
465
- (println (format "[%s] Worktree creation failed: %s" id (.getMessage e)))
466
- nil))]
467
- (if (nil? wt-state)
468
- ;; Worktree creation failed — count as error
469
- (let [errors (inc consec-errors)]
470
- (if (>= errors max-consecutive-errors)
471
- (do
472
- (println (format "[%s] %d consecutive errors, stopping" id errors))
473
- (assoc worker :completed completed :status :error))
474
- (recur (inc iter) completed errors nil nil)))
475
-
476
- ;; Worktree ready — run agent
477
- (let [resume? (some? session-id)
478
- _ (println (format "[%s] %s iteration %d/%d"
479
- id (if resume? "Resuming" "Starting") iter iterations))
480
- context (build-context)
481
- {:keys [output exit done? merge?] :as agent-result}
482
- (run-agent! worker (:path wt-state) context session-id resume?)
483
- new-session-id (:session-id agent-result)]
484
-
485
- (cond
486
- ;; Agent errored — cleanup, reset session
487
- (not (zero? exit))
488
- (let [errors (inc consec-errors)]
489
- (println (format "[%s] Agent error (exit %d): %s"
490
- id exit (subs (or output "") 0 (min 200 (count (or output ""))))))
491
- (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
492
- (if (>= errors max-consecutive-errors)
493
- (do
494
- (println (format "[%s] %d consecutive errors, stopping" id errors))
495
- (assoc worker :completed completed :status :error))
496
- (recur (inc iter) completed errors nil nil)))
497
-
498
- ;; COMPLETE_AND_READY_FOR_MERGE review, merge, reset session
499
- merge?
500
- (if (worktree-has-changes? (:path wt-state))
501
- (let [{:keys [approved?]} (review-loop! worker (:path wt-state) id)]
502
- (if approved?
745
+ metrics {:merges 0 :rejections 0 :errors 0 :recycled 0 :review-rounds-total 0 :claims 0}
746
+ session-id nil ;; persistent session-id (nil = start fresh)
747
+ wt-state nil ;; {:dir :branch :path} or nil
748
+ claimed-ids #{} ;; task IDs claimed this session (reset on worktree destroy)
749
+ claim-resume-prompt nil ;; override prompt for next iteration (from CLAIM results)
750
+ working-resumes 0] ;; consecutive "working" outcomes in current session
751
+ (let [finish (fn [status]
752
+ (assoc worker :completed completed :status status
753
+ :merges (:merges metrics)
754
+ :rejections (:rejections metrics)
755
+ :errors (:errors metrics)
756
+ :recycled (:recycled metrics)
757
+ :review-rounds-total (:review-rounds-total metrics)
758
+ :claims (:claims metrics)))]
759
+ (cond
760
+ (> iter iterations)
761
+ (do
762
+ ;; Cleanup any lingering worktree
763
+ (when wt-state
764
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state)))
765
+ (println (format "[%s] Completed %d iterations (%d merges, %d claims, %d rejections, %d errors, %d recycled)"
766
+ id completed (:merges metrics) (:claims metrics) (:rejections metrics) (:errors metrics) (:recycled metrics)))
767
+ (finish :exhausted))
768
+
769
+ @shutdown-requested?
770
+ (do
771
+ (println (format "[%s] Shutdown requested, stopping after %d iterations" id (dec iter)))
772
+ (when wt-state
773
+ ;; Recycle any claimed tasks back to pending so other workers can pick them up
774
+ (when (seq claimed-ids)
775
+ (let [recycled (tasks/recycle-tasks! claimed-ids)]
776
+ (when (seq recycled)
777
+ (println (format "[%s] Recycled %d claimed task(s) on shutdown" id (count recycled))))))
778
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state)))
779
+ (emit-cycle-log! swarm-id id iter (System/currentTimeMillis) session-id
780
+ {:outcome :interrupted})
781
+ (finish :interrupted))
782
+
783
+ :else
784
+ (do
785
+ ;; Sleep between iterations when wait_between is configured
786
+ (maybe-sleep-between! id wait-between iter)
787
+
788
+ ;; Backpressure: non-planner workers wait for tasks between iterations too
789
+ (when (and (not (:can-plan worker))
790
+ (not (pos? (tasks/pending-count)))
791
+ (not (pos? (tasks/current-count))))
792
+ (println (format "[%s] Queue empty, waiting for tasks before iteration %d" id iter))
793
+ (wait-for-tasks! id))
794
+
795
+ ;; Ensure worktree exists (create fresh if nil, reuse if persisted)
796
+ (let [wt-state (try
797
+ (or wt-state (create-iteration-worktree! project-root id iter))
798
+ (catch Exception e
799
+ (println (format "[%s] Worktree creation failed: %s" id (.getMessage e)))
800
+ nil))]
801
+ (if (nil? wt-state)
802
+ ;; Worktree creation failed — count as error
803
+ (let [errors (inc consec-errors)
804
+ metrics (update metrics :errors inc)]
805
+ (if (>= errors max-consecutive-errors)
806
+ (do
807
+ (println (format "[%s] %d consecutive errors, stopping" id errors))
808
+ (finish :error))
809
+ (recur (inc iter) completed errors metrics nil nil #{} nil 0)))
810
+
811
+ ;; Worktree ready — run agent
812
+ (let [resume? (or (some? session-id) (some? claim-resume-prompt))
813
+ iter-start-ms (System/currentTimeMillis)
814
+ ;; Snapshot current/ task IDs before agent runs so we can
815
+ ;; detect any direct mv claims (safety net for old behavior).
816
+ pre-current-ids (tasks/current-task-ids)
817
+ _ (println (format "[%s] %s iteration %d/%d"
818
+ id (if resume? "Resuming" "Starting") iter iterations))
819
+ context (build-context)
820
+ {:keys [output exit done? merge? claim-ids] :as agent-result}
821
+ (run-agent! worker (:path wt-state) context session-id resume?
822
+ :resume-prompt-override claim-resume-prompt)
823
+ new-session-id (:session-id agent-result)
824
+ ;; Safety net: detect any direct mv claims (old behavior)
825
+ mv-claimed-tasks (detect-claimed-tasks pre-current-ids)]
826
+
827
+ (cond
828
+ ;; Agent errored — recycle claimed tasks, cleanup, reset session
829
+ (not (zero? exit))
830
+ (let [errors (inc consec-errors)
831
+ recycled (recycle-orphaned-tasks! id pre-current-ids)
832
+ metrics (-> metrics
833
+ (update :errors inc)
834
+ (update :recycled + recycled))
835
+ error-msg (subs (or output "") 0 (min 200 (count (or output ""))))]
836
+ (println (format "[%s] Agent error (exit %d): %s" id exit error-msg))
837
+ (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
838
+ {:outcome :error :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))
839
+ :recycled-tasks (when (pos? recycled) (vec mv-claimed-tasks))
840
+ :error-snippet error-msg})
841
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
842
+ (if (>= errors max-consecutive-errors)
503
843
  (do
504
- (merge-to-main! (:path wt-state) (:branch wt-state) id project-root)
505
- (println (format "[%s] Iteration %d/%d complete" id iter iterations))
506
- (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
507
- ;; If also __DONE__, stop after merge
508
- (if (and done? (:can-plan worker))
509
- (do
510
- (println (format "[%s] Worker done after merge" id))
511
- (assoc worker :completed (inc completed) :status :done))
512
- (recur (inc iter) (inc completed) 0 nil nil)))
844
+ (println (format "[%s] %d consecutive errors, stopping" id errors))
845
+ (finish :error))
846
+ (recur (inc iter) completed errors metrics nil nil #{} nil 0)))
847
+
848
+ ;; CLAIM signal framework claims tasks, resumes agent with results
849
+ ;; Only honored when no MERGE or DONE signal (lowest priority)
850
+ (and (seq claim-ids) (not merge?) (not done?))
851
+ (let [_ (println (format "[%s] CLAIM signal: %s" id (str/join ", " claim-ids)))
852
+ {:keys [claimed failed resume-prompt]} (execute-claims! claim-ids)
853
+ new-claimed-ids (into claimed-ids claimed)
854
+ metrics (update metrics :claims + (count claimed))]
855
+ (println (format "[%s] Claimed %d/%d tasks" id (count claimed) (count claim-ids)))
856
+ (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
857
+ {:outcome :claimed :claimed-task-ids (vec claimed)})
858
+ (recur (inc iter) completed 0 metrics new-session-id wt-state
859
+ new-claimed-ids resume-prompt 0))
860
+
861
+ ;; COMPLETE_AND_READY_FOR_MERGE — review, merge, reset session
862
+ merge?
863
+ (if (worktree-has-changes? (:path wt-state))
864
+ (if (task-only-diff? (:path wt-state))
865
+ ;; Task-only changes — skip review, sync to main, auto-merge
513
866
  (do
514
- (println (format "[%s] Iteration %d/%d rejected" id iter iterations))
515
- (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
516
- (recur (inc iter) completed 0 nil nil))))
517
- (do
518
- (println (format "[%s] Merge signaled but no changes, skipping" id))
867
+ (println (format "[%s] Task-only diff, auto-merging" id))
868
+ (let [sync-status (sync-worktree-to-main! worker (:path wt-state) id)
869
+ all-claimed (into claimed-ids mv-claimed-tasks)]
870
+ (if (= :failed sync-status)
871
+ ;; Sync failed cannot merge safely, skip
872
+ (do
873
+ (println (format "[%s] Sync to main failed, skipping merge" id))
874
+ (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
875
+ {:outcome :sync-failed :claimed-task-ids (vec all-claimed)})
876
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
877
+ (recur (inc iter) completed 0 metrics nil nil #{} nil 0))
878
+ ;; Synced — proceed with merge
879
+ (let [merged? (merge-to-main! (:path wt-state) (:branch wt-state) id project-root 0 all-claimed)
880
+ metrics (if merged? (update metrics :merges inc) metrics)]
881
+ (println (format "[%s] Cycle %d/%d complete" id iter iterations))
882
+ (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
883
+ {:outcome :merged :claimed-task-ids (vec all-claimed) :review-rounds 0})
884
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
885
+ (recur (inc iter) (inc completed) 0 metrics nil nil #{} nil 0)))))
886
+ ;; Code changes — full review loop
887
+ (let [{:keys [approved? attempts]} (review-loop! worker (:path wt-state) id iter)
888
+ ;; Don't pre-increment :merges — defer to after actual merge succeeds
889
+ metrics (-> metrics
890
+ (update :review-rounds-total + (or attempts 0))
891
+ (cond-> (not approved?) (update :rejections inc)))]
892
+ (if approved?
893
+ (let [sync-status (sync-worktree-to-main! worker (:path wt-state) id)
894
+ all-claimed (into claimed-ids mv-claimed-tasks)]
895
+ (if (= :failed sync-status)
896
+ ;; Sync failed after approval — treat as sync failure, skip merge
897
+ (do
898
+ (println (format "[%s] Sync to main failed after approval, skipping merge" id))
899
+ (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
900
+ {:outcome :sync-failed :claimed-task-ids (vec all-claimed)
901
+ :review-rounds (or attempts 0)})
902
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
903
+ (recur (inc iter) completed 0 metrics nil nil #{} nil 0))
904
+ ;; Synced — proceed with merge, capture return value
905
+ (let [merged? (merge-to-main! (:path wt-state) (:branch wt-state) id project-root (or attempts 0) all-claimed)
906
+ metrics (if merged? (update metrics :merges inc) metrics)]
907
+ (println (format "[%s] Cycle %d/%d complete" id iter iterations))
908
+ (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
909
+ {:outcome (if merged? :merged :merge-failed)
910
+ :claimed-task-ids (vec all-claimed)
911
+ :review-rounds (or attempts 0)})
912
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
913
+ (recur (inc iter) (inc completed) 0 metrics nil nil #{} nil 0))))
914
+ (let [recycled (recycle-orphaned-tasks! id pre-current-ids)
915
+ metrics (update metrics :recycled + recycled)]
916
+ (println (format "[%s] Cycle %d/%d rejected" id iter iterations))
917
+ (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
918
+ {:outcome :rejected :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))
919
+ :recycled-tasks (when (pos? recycled) (vec mv-claimed-tasks))
920
+ :review-rounds (or attempts 0)})
921
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
922
+ (recur (inc iter) completed 0 metrics nil nil #{} nil 0)))))
923
+ (let [recycled (recycle-orphaned-tasks! id pre-current-ids)
924
+ metrics (update metrics :recycled + recycled)]
925
+ (println (format "[%s] Merge signaled but no changes, skipping" id))
926
+ (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
927
+ {:outcome :no-changes :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))
928
+ :recycled-tasks (when (pos? recycled) (vec mv-claimed-tasks))})
929
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
930
+ (recur (inc iter) completed 0 metrics nil nil #{} nil 0)))
931
+
932
+ ;; __DONE__ — agent signaled it finished this cycle's work.
933
+ ;; Always reset session and continue to next iteration.
934
+ ;; Planners re-plan as tasks complete; executors pick up new tasks.
935
+ done?
936
+ (let [recycled (recycle-orphaned-tasks! id pre-current-ids)
937
+ metrics (update metrics :recycled + recycled)]
938
+ (println (format "[%s] __DONE__ signal, resetting session (iter %d/%d)" id iter iterations))
939
+ (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
940
+ {:outcome :executor-done :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))
941
+ :recycled-tasks (when (pos? recycled) (vec mv-claimed-tasks))})
519
942
  (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
520
- (recur (inc iter) completed 0 nil nil)))
521
-
522
- ;; __DONE__ without merge only honor for planners
523
- (and done? (:can-plan worker))
524
- (do
525
- (println (format "[%s] Received __DONE__ signal" id))
526
- (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
527
- (println (format "[%s] Worker done after %d/%d iterations" id iter iterations))
528
- (assoc worker :completed completed :status :done))
529
-
530
- ;; __DONE__ from executor — ignore, keep working
531
- (and done? (not (:can-plan worker)))
532
- (do
533
- (println (format "[%s] Ignoring __DONE__ (executor)" id))
534
- (recur (inc iter) completed consec-errors new-session-id wt-state))
535
-
536
- ;; No signal agent still working, resume next iteration
537
- :else
538
- (do
539
- (println (format "[%s] Working... (will resume)" id))
540
- (recur (inc iter) completed 0 new-session-id wt-state))))))))))
943
+ (recur (inc iter) completed 0 metrics nil nil #{} nil 0))
944
+
945
+ ;; No signalagent still working, resume next iteration.
946
+ ;; Track consecutive working resumes. After max-working-resumes,
947
+ ;; inject a nudge prompt. If still no signal after nudge, kill session.
948
+ :else
949
+ (let [wr (inc working-resumes)
950
+ max-wr (:max-working-resumes worker)]
951
+ (cond
952
+ ;; Already nudged last iteration, still no signal — stuck
953
+ (> wr max-wr)
954
+ (let [recycled (recycle-orphaned-tasks! id pre-current-ids)
955
+ metrics (update metrics :recycled + recycled)]
956
+ (println (format "[%s] Stuck after %d working resumes + nudge, resetting session" id wr))
957
+ (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
958
+ {:outcome :stuck :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))
959
+ :recycled-tasks (when (pos? recycled) (vec mv-claimed-tasks))})
960
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
961
+ (recur (inc iter) completed 0 metrics nil nil #{} nil 0))
962
+
963
+ ;; Hit the limit nudge on next resume
964
+ (= wr max-wr)
965
+ (do
966
+ (println (format "[%s] Working... %d/%d resumes, nudging agent to wrap up" id wr max-wr))
967
+ (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
968
+ {:outcome :working :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))})
969
+ (recur (inc iter) completed 0 metrics new-session-id wt-state
970
+ claimed-ids nudge-prompt wr))
971
+
972
+ ;; Under limit — normal resume
973
+ :else
974
+ (do
975
+ (println (format "[%s] Working... (will resume, %d/%d)" id wr max-wr))
976
+ (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
977
+ {:outcome :working :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))})
978
+ (recur (inc iter) completed 0 metrics new-session-id wt-state
979
+ claimed-ids nil wr))))))))))))))
541
980
 
542
981
  ;; =============================================================================
543
982
  ;; Multi-Worker Execution
@@ -545,6 +984,7 @@
545
984
 
546
985
  (defn run-workers!
547
986
  "Run multiple workers in parallel.
987
+ Writes stopped event to runs/{swarm-id}/stopped.json on completion.
548
988
 
549
989
  Arguments:
550
990
  workers - seq of worker configs
@@ -552,21 +992,117 @@
552
992
  Returns seq of final worker states."
553
993
  [workers]
554
994
  (tasks/ensure-dirs!)
555
- (println (format "Launching %d workers..." (count workers)))
556
-
557
- (let [futures (doall
558
- (map-indexed
559
- (fn [idx worker]
560
- (let [worker (assoc worker :id (or (:id worker) (str "w" idx)))]
561
- (future (run-worker! worker))))
562
- workers))]
563
-
564
- (println "All workers launched. Waiting for completion...")
565
- (let [results (mapv deref futures)]
566
- (println "\nAll workers complete.")
567
- (doseq [w results]
568
- (println (format " [%s] %s - %d iterations"
569
- (:id w)
570
- (name (:status w))
571
- (:completed w))))
572
- results)))
995
+ (let [swarm-id (-> workers first :swarm-id)]
996
+ (println (format "Launching %d workers..." (count workers)))
997
+
998
+ ;; Register JVM shutdown hook so SIGTERM/SIGINT triggers graceful stop.
999
+ ;; Sets the shutdown atom — workers check it between cycles and exit cleanly.
1000
+ ;; The hook waits for workers to finish, then writes stopped.json only if
1001
+ ;; the clean exit path hasn't already done so (guarded by the atom).
1002
+ (let [hook (Thread. (fn []
1003
+ (println "\nShutdown signal received, stopping workers after current cycle...")
1004
+ (reset! shutdown-requested? true)
1005
+ ;; Give workers time to finish current cycle and cleanup.
1006
+ ;; After sleep, write stopped.json only if still in shutdown
1007
+ ;; (clean exit resets the atom to false before writing :completed).
1008
+ (Thread/sleep 10000)
1009
+ (when (and swarm-id @shutdown-requested?)
1010
+ (runs/write-stopped! swarm-id :interrupted))))]
1011
+ (.addShutdownHook (Runtime/getRuntime) hook)
1012
+
1013
+ (let [futures (doall
1014
+ (map-indexed
1015
+ (fn [idx worker]
1016
+ (let [worker (assoc worker :id (or (:id worker) (str "w" idx)))]
1017
+ (future (run-worker! worker))))
1018
+ workers))]
1019
+
1020
+ (println "All workers launched. Waiting for completion...")
1021
+ (let [results (mapv deref futures)]
1022
+ ;; Clean exit — tell shutdown hook not to write stopped.json
1023
+ (reset! shutdown-requested? false)
1024
+ ;; Remove the hook so it doesn't accumulate across calls
1025
+ (try (.removeShutdownHook (Runtime/getRuntime) hook) (catch Exception _))
1026
+ (println "\nAll workers complete.")
1027
+ (doseq [w results]
1028
+ (println (format " [%s] %s - %d completed, %d merges, %d claims, %d rejections, %d errors, %d recycled, %d review rounds"
1029
+ (:id w)
1030
+ (name (:status w))
1031
+ (:completed w)
1032
+ (or (:merges w) 0)
1033
+ (or (:claims w) 0)
1034
+ (or (:rejections w) 0)
1035
+ (or (:errors w) 0)
1036
+ (or (:recycled w) 0)
1037
+ (or (:review-rounds-total w) 0))))
1038
+
1039
+ ;; Write stopped event — all state derivable from cycle logs
1040
+ (when swarm-id
1041
+ (runs/write-stopped! swarm-id :completed)
1042
+ (println (format "\nStopped event written to runs/%s/stopped.json" swarm-id)))
1043
+
1044
+ results)))))
1045
+
1046
+ ;; =============================================================================
1047
+ ;; Planner — first-class config concept, NOT a worker
1048
+ ;; =============================================================================
1049
+ ;; The planner creates task EDN files in tasks/pending/.
1050
+ ;; It runs in the project root (no worktree), has no review/merge cycle,
1051
+ ;; and respects max_pending backpressure to avoid flooding the queue.
1052
+
1053
+ (defn run-planner!
1054
+ "Run planner agent to create tasks. No worktree, no review, no merge.
1055
+ Runs in project root. Respects max_pending cap.
1056
+ Returns {:tasks-created N}"
1057
+ [{:keys [harness model prompts max-pending swarm-id]}]
1058
+ (tasks/ensure-dirs!)
1059
+ (let [project-root (System/getProperty "user.dir")
1060
+ pending-before (tasks/pending-count)
1061
+ max-pending (or max-pending 10)]
1062
+ ;; Backpressure: skip if queue is full
1063
+ (if (>= pending-before max-pending)
1064
+ (do
1065
+ (println (format "[planner] Skipping — %d pending tasks (max: %d)" pending-before max-pending))
1066
+ {:tasks-created 0})
1067
+ ;; Run agent
1068
+ (let [context (build-context)
1069
+ template-tokens (build-template-tokens context)
1070
+ prompt-text (str (when (seq prompts)
1071
+ (->> prompts
1072
+ (map load-prompt)
1073
+ (remove nil?)
1074
+ (map #(agent/tokenize % template-tokens))
1075
+ (str/join "\n\n")))
1076
+ "\n\nTask Status: " (:task_status context) "\n"
1077
+ "Pending: " (:pending_tasks context) "\n\n"
1078
+ "Create tasks in tasks/pending/ as .edn files.\n"
1079
+ "Maximum " (- max-pending pending-before) " new tasks.\n"
1080
+ "Signal __DONE__ when finished planning.")
1081
+ swarm-id* (or swarm-id "unknown")
1082
+ tagged-prompt (str "[oompa:" swarm-id* ":planner] " prompt-text)
1083
+ abs-root (.getAbsolutePath (io/file project-root))
1084
+
1085
+ cmd (harness/build-cmd harness
1086
+ {:cwd abs-root :model model :prompt tagged-prompt})
1087
+
1088
+ _ (println (format "[planner] Running (%s:%s, max_pending: %d, current: %d)"
1089
+ (name harness) (or model "default") max-pending pending-before))
1090
+
1091
+ result (try
1092
+ (process/sh cmd {:dir abs-root
1093
+ :in (harness/process-stdin harness tagged-prompt)
1094
+ :out :string :err :string})
1095
+ (catch Exception e
1096
+ (println (format "[planner] Agent exception: %s" (.getMessage e)))
1097
+ {:exit -1 :out "" :err (.getMessage e)}))
1098
+
1099
+ ;; Commit any new task files
1100
+ _ (process/sh ["git" "add" "tasks/pending/"] {:dir abs-root})
1101
+ _ (process/sh ["git" "commit" "-m" "Planner: add tasks"]
1102
+ {:dir abs-root :out :string :err :string})
1103
+
1104
+ pending-after (tasks/pending-count)
1105
+ created (- pending-after pending-before)]
1106
+
1107
+ (println (format "[planner] Done. Created %d tasks (pending: %d)" created pending-after))
1108
+ {:tasks-created created}))))