@nbardy/oompa 0.7.1 → 0.7.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -21,6 +21,7 @@
21
21
  [babashka.process :as process]
22
22
  [clojure.java.io :as io]
23
23
  [clojure.set]
24
+ [clojure.pprint :refer [print-table]]
24
25
  [clojure.string :as str]))
25
26
 
26
27
  ;; =============================================================================
@@ -40,6 +41,13 @@
40
41
  (def ^:private shutdown-requested? (atom false))
41
42
 
42
43
  (declare task-root-for-cwd)
44
+ (declare verify-mergeable?)
45
+
46
(defn- log-ts
  "Current local date-time rendered as \"yyyy-MM-dd HH:mm:ss\", for prefixing
  worker log lines."
  []
  (let [fmt (java.time.format.DateTimeFormatter/ofPattern "yyyy-MM-dd HH:mm:ss")
        now (java.time.LocalDateTime/now)]
    (.format fmt now)))
43
51
 
44
52
  (defn- load-prompt
45
53
  "Load a prompt file. Tries path as-is first, then from package root."
@@ -47,6 +55,11 @@
47
55
  (or (agent/load-custom-prompt path)
48
56
  (agent/load-custom-prompt (str package-root "/" path))))
49
57
 
58
(defn- snippet
  "Return the first `limit` characters of `s`.

  - nil `s` is treated as the empty string.
  - A `limit` larger than the string returns the whole string.
  - A negative `limit` is clamped to 0 and yields \"\" instead of letting
    `subs` throw StringIndexOutOfBoundsException."
  [s limit]
  (let [text (or s "")
        ;; Clamp into [0, (count text)] so `subs` always gets a valid end index.
        end (-> limit (max 0) (min (count text)))]
    (subs text 0 end)))
62
+
50
63
  (defn- build-template-tokens
51
64
  "Build token map for prompt template {var} substitution.
52
65
  Merges core/build-context (rich YAML header, queue, hotspots, etc.)
@@ -75,8 +88,8 @@
75
88
  local-tasks (io/file cwd-file "tasks")
76
89
  parent-tasks (some-> cwd-file .getParentFile (io/file "tasks"))]
77
90
  (cond
78
- (.exists local-tasks) "tasks"
79
91
  (and parent-tasks (.exists parent-tasks)) "../tasks"
92
+ (.exists local-tasks) "tasks"
80
93
  :else "tasks")))
81
94
 
82
95
  (defn- render-task-header
@@ -90,6 +103,8 @@
90
103
  (str/replace "{TASKS_ROOT}" task-root))))
91
104
 
92
105
  (def ^:private default-max-working-resumes 5)
106
+ (def ^:private default-max-needs-followups 1)
107
+ (def ^:private default-max-wait-for-tasks 600)
93
108
 
94
109
  (defn create-worker
95
110
  "Create a worker config.
@@ -97,16 +112,23 @@
97
112
  :can-plan when false, worker waits for tasks before starting (backpressure).
98
113
  :reasoning reasoning effort level (e.g. \"low\", \"medium\", \"high\") — codex only.
99
114
  :review-prompts paths to reviewer prompt files (loaded and concatenated for review).
100
- :wait-between seconds to sleep between iterations (nil or 0 = no wait).
101
- :max-working-resumes max consecutive working resumes before nudge+kill (default 5)."
102
- [{:keys [id swarm-id harness model iterations prompts can-plan reasoning
103
- review-harness review-model review-prompts wait-between
104
- max-working-resumes]}]
115
+ :wait-between seconds to sleep between cycles (nil or 0 = no wait).
116
+ :max-wait-for-tasks max seconds a non-planner waits for tasks before giving up (default 600).
117
+ :max-working-resumes max consecutive working resumes before nudge+kill (default 5).
118
+ :max-needs-followups max NEEDS_FOLLOWUP continuations in one cycle (default 1)."
119
+ [{:keys [id swarm-id harness model runs max-cycles iterations prompts can-plan reasoning
120
+ reviewers wait-between
121
+ max-working-resumes max-needs-followups max-wait-for-tasks]}]
122
+ (let [cycle-cap (or max-cycles iterations runs 10)
123
+ run-goal (or runs iterations 10)]
105
124
  {:id id
106
125
  :swarm-id swarm-id
107
126
  :harness (or harness :codex)
108
127
  :model model
109
- :iterations (or iterations 10)
128
+ ;; Legacy compatibility: :iterations remains the cycle cap.
129
+ :iterations cycle-cap
130
+ :max-cycles cycle-cap
131
+ :runs run-goal
110
132
  :prompts (cond
111
133
  (vector? prompts) prompts
112
134
  (string? prompts) [prompts]
@@ -114,15 +136,15 @@
114
136
  :can-plan (if (some? can-plan) can-plan true)
115
137
  :reasoning reasoning
116
138
  :wait-between (when (and wait-between (pos? wait-between)) wait-between)
117
- :review-harness review-harness
118
- :review-model review-model
119
- :review-prompts (cond
120
- (vector? review-prompts) review-prompts
121
- (string? review-prompts) [review-prompts]
122
- :else [])
139
+ :max-wait-for-tasks (let [v (or max-wait-for-tasks default-max-wait-for-tasks)]
140
+ (if (and (number? v) (pos? v))
141
+ v
142
+ default-max-wait-for-tasks))
143
+ :reviewers reviewers
123
144
  :max-working-resumes (or max-working-resumes default-max-working-resumes)
145
+ :max-needs-followups (or max-needs-followups default-max-needs-followups)
124
146
  :completed 0
125
- :status :idle})
147
+ :status :idle}))
126
148
 
127
149
  ;; =============================================================================
128
150
  ;; Task Execution
@@ -139,7 +161,9 @@
139
161
  "1. If you have meaningful changes: commit them and signal COMPLETE_AND_READY_FOR_MERGE\n"
140
162
  "2. If scope is too large: create follow-up tasks in tasks/pending/ for remaining work,\n"
141
163
  " commit what you have (even partial notes/design docs), and signal COMPLETE_AND_READY_FOR_MERGE\n"
142
- "3. If you are stuck and cannot make progress: signal __DONE__\n\n"
164
+ "3. If you truly cannot produce a merge-ready artifact this turn, signal NEEDS_FOLLOWUP\n"
165
+ " and explain the remaining work. The framework will keep your claimed tasks and give you\n"
166
+ " one targeted follow-up prompt. This is not success.\n\n"
143
167
  "Do NOT continue working without producing a signal."))
144
168
 
145
169
  (defn- build-context
@@ -181,11 +205,63 @@
181
205
  "\n\n"
182
206
  (if (seq claimed-ids)
183
207
  "Work on your claimed tasks. Signal COMPLETE_AND_READY_FOR_MERGE when done."
184
- "No claims succeeded. CLAIM different tasks, or signal __DONE__ if no suitable work remains."))]
208
+ "No claims succeeded. CLAIM different tasks. If you cannot finish a mergeable artifact after trying hard, signal NEEDS_FOLLOWUP with a short explanation."))]
185
209
  {:claimed claimed-ids
186
210
  :failed failed-ids
187
211
  :resume-prompt prompt}))
188
212
 
213
(defn- active-claimed-task-ids
  "Set of every task ID held in this cycle: the IDs in `claimed-ids` combined
  with the IDs in `mv-claimed-tasks`. Either argument may be nil."
  [claimed-ids mv-claimed-tasks]
  (reduce conj (set claimed-ids) mv-claimed-tasks))
219
+
220
(defn- recycle-task-id-set!
  "Hand the non-nil IDs in `task-ids` to `tasks/recycle-tasks!` and log what
  came back. `tasks/recycle-tasks!` is skipped entirely when no IDs remain
  after nil removal. Always returns a vector (possibly empty) of the IDs
  reported as recycled."
  [worker-id task-ids]
  (let [ids (into #{} (remove nil?) task-ids)
        recycled (if (empty? ids)
                   nil
                   (tasks/recycle-tasks! ids))]
    (when (seq recycled)
      (println (format "[%s] Recycled %d claimed task(s): %s"
                       worker-id
                       (count recycled)
                       (str/join ", " recycled))))
    (if recycled
      (vec recycled)
      [])))
231
+
232
(defn- recycle-active-claims!
  "Recycle every claim active in the current cycle: the union of `claimed-ids`
  and `mv-claimed-tasks`. Returns the vector produced by `recycle-task-id-set!`."
  [worker-id claimed-ids mv-claimed-tasks]
  (let [active-ids (active-claimed-task-ids claimed-ids mv-claimed-tasks)]
    (recycle-task-id-set! worker-id active-ids)))
236
+
237
(defn- build-needs-followup-prompt
  "Compose the prompt sent back to a worker after it signalled NEEDS_FOLLOWUP.

  claimed-ids: task IDs the worker still owns (listed sorted; may be empty).
  output:      the worker's previous output; a leading NEEDS_FOLLOWUP marker
               and trailing punctuation/whitespace are stripped and the rest
               is echoed back as its explanation.

  The prompt also embeds :task_status and :pending_tasks from `build-context`."
  [claimed-ids output]
  (let [ctx (build-context)
        prior-note (when (some? output)
                     (-> output
                         (str/replace #"(?is)^\s*NEEDS_FOLLOWUP\b[\s:.-]*" "")
                         str/trim))
        ownership (if (seq claimed-ids)
                    (str "You still own these claimed tasks: "
                         (str/join ", " (sort claimed-ids))
                         "\n\n")
                    "You do not currently own any claimed tasks.\n\n")
        ;; nil when there is nothing to echo; `str` renders nil as "".
        explanation-block (when (seq prior-note)
                            (str "Your previous explanation:\n"
                                 prior-note
                                 "\n\n"))
        pending (:pending_tasks ctx)
        pending-block (if (str/blank? pending) "(none)" pending)]
    (str "## NEEDS_FOLLOWUP Follow-up\n\n"
         ownership
         "Continue the SAME cycle and finish a merge-ready artifact.\n"
         "Do not output NEEDS_FOLLOWUP again unless you are still blocked after this follow-up.\n"
         "Prefer the smallest useful diff. If scope is too large, create concrete follow-up tasks in the pending queue and still ship the artifact you have.\n\n"
         explanation-block
         "Task Status: " (:task_status ctx) "\n"
         "Remaining Pending:\n"
         pending-block
         "\n\nWhen ready, signal COMPLETE_AND_READY_FOR_MERGE.")))
264
+
189
265
  (defn- run-agent!
190
266
  "Run agent with prompt, return {:output :done? :merge? :claim-ids :exit :session-id}.
191
267
  When resume? is true, continues the existing session with a lighter prompt.
@@ -239,36 +315,40 @@
239
315
  tagged-prompt (str "[oompa:" swarm-id* ":" id "] " prompt)
240
316
  abs-worktree (.getAbsolutePath (io/file worktree-path))
241
317
 
242
- cmd (harness/build-cmd harness
243
- {:cwd abs-worktree :model model :reasoning reasoning
244
- :session-id session-id :resume? resume?
245
- :prompt tagged-prompt :format? true})
246
-
247
318
  result (try
248
- (process/sh cmd {:dir abs-worktree
249
- :in (harness/process-stdin harness tagged-prompt)
250
- :out :string :err :string})
319
+ (harness/run-command! harness
320
+ {:cwd abs-worktree :model model :reasoning reasoning
321
+ :session-id session-id :resume? resume?
322
+ :prompt tagged-prompt :format? true})
251
323
  (catch Exception e
252
324
  (println (format "[%s] Agent exception: %s" id (.getMessage e)))
253
325
  {:exit -1 :out "" :err (.getMessage e)}))
254
326
 
255
- {:keys [output session-id]}
256
- (harness/parse-output harness (:out result) session-id)]
327
+ {:keys [output session-id warning raw-snippet]}
328
+ (harness/parse-output harness (:out result) session-id)
329
+ stderr-snippet (let [stderr (some-> (:err result) str/trim)]
330
+ (when (seq stderr)
331
+ (subs stderr 0 (min 400 (count stderr)))))]
257
332
 
258
333
  {:output output
259
334
  :exit (:exit result)
260
335
  :done? (agent/done-signal? output)
261
336
  :merge? (agent/merge-signal? output)
337
+ :needs-followup? (agent/needs-followup-signal? output)
262
338
  :claim-ids (agent/parse-claim-signal output)
263
- :session-id session-id}))
339
+ :session-id session-id
340
+ :parse-warning warning
341
+ :raw-snippet raw-snippet
342
+ :stderr-snippet stderr-snippet}))
264
343
 
265
344
  (defn- run-reviewer!
266
345
  "Run reviewer on worktree changes.
267
346
  Uses custom review-prompts when configured, otherwise falls back to default.
268
347
  prev-feedback: vector of previous review outputs (for multi-round context).
269
348
  Returns {:verdict :approved|:needs-changes|:rejected, :comments [...], :output string}"
270
- [{:keys [id swarm-id review-harness review-model review-prompts]} worktree-path prev-feedback]
271
- (let [;; Get actual diff content (not just stat) — truncate to 8000 chars for prompt budget
349
+ [{:keys [id swarm-id reviewers]} worktree-path prev-feedback]
350
+ (let [start-ms (System/currentTimeMillis)
351
+ ;; Get actual diff content (not just stat) — truncate to 8000 chars for prompt budget
272
352
  diff-result (process/sh ["git" "diff" "main"]
273
353
  {:dir worktree-path :out :string :err :string})
274
354
  diff-content (let [d (:out diff-result)]
@@ -277,15 +357,9 @@
277
357
  d))
278
358
 
279
359
  swarm-id* (or swarm-id "unknown")
280
- custom-prompt (when (seq review-prompts)
281
- (->> review-prompts
282
- (map load-prompt)
283
- (remove nil?)
284
- (str/join "\n\n")))
285
360
 
286
361
  ;; Only include the most recent round's feedback — the worker has already
287
362
  ;; attempted fixes based on it, so the reviewer just needs to verify.
288
- ;; Including all prior rounds bloats the prompt and causes empty output.
289
363
  history-block (when (seq prev-feedback)
290
364
  (let [latest (last prev-feedback)
291
365
  truncated (if (> (count latest) 2000)
@@ -297,46 +371,57 @@
297
371
  truncated
298
372
  "\n\n")))
299
373
 
300
- review-body (str (or custom-prompt
301
- (str "Review the changes in this worktree.\n"
302
- "Focus on architecture and design, not style.\n"))
303
- "\n\nDiff:\n```\n" diff-content "\n```\n"
304
- (when history-block history-block)
305
- "\nYour verdict MUST be on its own line, exactly one of:\n"
306
- "VERDICT: APPROVED\n"
307
- "VERDICT: NEEDS_CHANGES\n\n"
308
- "Do NOT use REJECTED. Always use NEEDS_CHANGES with specific, "
309
- "actionable feedback explaining what must change and why. "
310
- "The worker will attempt fixes based on your feedback.\n"
311
- "After your verdict line, list every issue as a numbered item with "
312
- "the file path and what needs to change.\n")
313
- review-prompt (str "[oompa:" swarm-id* ":" id "] " review-body)
314
-
315
374
  abs-wt (.getAbsolutePath (io/file worktree-path))
316
375
 
317
- ;; No session, no resume, no format flags reviewer is stateless one-shot
318
- cmd (harness/build-cmd review-harness
319
- {:cwd abs-wt :model review-model :prompt review-prompt})
320
-
321
- result (try
322
- (process/sh cmd {:dir abs-wt
323
- :in (harness/process-stdin review-harness review-prompt)
324
- :out :string :err :string})
325
- (catch Exception e
326
- {:exit -1 :out "" :err (.getMessage e)}))
376
+ ;; Try each reviewer until one succeeds and returns a verdict
377
+ result (reduce (fn [_ {:keys [harness model prompts]}]
378
+ (let [custom-prompt (when (seq prompts)
379
+ (->> prompts
380
+ (map load-prompt)
381
+ (remove nil?)
382
+ (str/join "\n\n")))
383
+ review-body (str (or custom-prompt
384
+ (str "Review the changes in this worktree.\n"
385
+ "Focus on architecture and design, not style.\n"))
386
+ "\n\nDiff:\n```\n" diff-content "\n```\n"
387
+ (when history-block history-block)
388
+ "\nYour verdict MUST be on its own line, exactly one of:\n"
389
+ "VERDICT: APPROVED\n"
390
+ "VERDICT: NEEDS_CHANGES\n\n"
391
+ "Do NOT use REJECTED. Always use NEEDS_CHANGES with specific, "
392
+ "actionable feedback explaining what must change and why. "
393
+ "The worker will attempt fixes based on your feedback.\n"
394
+ "After your verdict line, list every issue as a numbered item with "
395
+ "the file path and what needs to change.\n")
396
+ review-prompt (str "[oompa:" swarm-id* ":" id "] " review-body)
397
+ res (try
398
+ (harness/run-command! harness {:cwd abs-wt :model model :prompt review-prompt})
399
+ (catch Exception e
400
+ {:exit -1 :out "" :err (.getMessage e)}))
401
+ parsed (harness/parse-output harness (:out res) nil)
402
+ output (or (:output parsed) "")
403
+ has-verdict? (or (re-find #"VERDICT:\s*APPROVED" output)
404
+ (re-find #"VERDICT:\s*NEEDS_CHANGES" output)
405
+ (re-find #"VERDICT:\s*REJECTED" output)
406
+ (re-find #"(?i)\bAPPROVED\b" output))]
407
+ (if (and (= (:exit res) 0) has-verdict?)
408
+ (reduced res)
409
+ (do
410
+ (println (format "[%s] Reviewer %s failed or returned no verdict, falling back..." id model))
411
+ res))))
412
+ {:exit -1 :out "" :err "No reviewers configured or no verdict returned"}
413
+ reviewers)
327
414
 
328
415
  output (:out result)
329
416
 
330
- ;; Parse verdict — require explicit VERDICT: prefix to avoid false matches.
331
- ;; REJECTED is treated as NEEDS_CHANGES: the reviewer must always give
332
- ;; actionable feedback so the worker can attempt fixes. Hard rejection
333
- ;; only happens when max review rounds are exhausted.
417
+ ;; Parse verdict
334
418
  verdict (cond
335
419
  (re-find #"VERDICT:\s*APPROVED" output) :approved
336
420
  (re-find #"VERDICT:\s*NEEDS_CHANGES" output) :needs-changes
337
421
  (re-find #"VERDICT:\s*REJECTED" output) :needs-changes
338
422
  (re-find #"(?i)\bAPPROVED\b" output) :approved
339
- :else :needs-changes)]
423
+ :else :needs-changes)
424
+ duration-ms (- (System/currentTimeMillis) start-ms)]
340
425
 
341
426
  (println (format "[%s] Reviewer verdict: %s" id (name verdict)))
342
427
  (let [summary (subs output 0 (min 300 (count output)))]
@@ -346,14 +431,16 @@
346
431
  {:verdict verdict
347
432
  :comments (when (not= (:exit result) 0)
348
433
  [(:err result)])
349
- :output output}))
434
+ :output output
435
+ :duration-ms duration-ms}))
350
436
 
351
437
  (defn- run-fix!
352
438
  "Ask worker to fix issues based on reviewer feedback.
353
439
  all-feedback: vector of all reviewer outputs so far (accumulated across rounds).
354
440
  Returns {:output string, :exit int}"
355
441
  [{:keys [id swarm-id harness model]} worktree-path all-feedback]
356
- (let [swarm-id* (or swarm-id "unknown")
442
+ (let [start-ms (System/currentTimeMillis)
443
+ swarm-id* (or swarm-id "unknown")
357
444
  feedback-text (if (> (count all-feedback) 1)
358
445
  (str "The reviewer has given feedback across " (count all-feedback) " rounds.\n"
359
446
  "Fix ALL outstanding issues:\n\n"
@@ -369,18 +456,17 @@
369
456
 
370
457
  abs-wt (.getAbsolutePath (io/file worktree-path))
371
458
 
372
- cmd (harness/build-cmd harness
373
- {:cwd abs-wt :model model :prompt fix-prompt})
374
-
375
459
  result (try
376
- (process/sh cmd {:dir abs-wt
377
- :in (harness/process-stdin harness fix-prompt)
378
- :out :string :err :string})
460
+ (harness/run-command! harness
461
+ {:cwd abs-wt :model model :prompt fix-prompt})
379
462
  (catch Exception e
380
- {:exit -1 :out "" :err (.getMessage e)}))]
463
+ {:exit -1 :out "" :err (.getMessage e)}))
464
+ parsed (harness/parse-output harness (:out result) nil)
465
+ duration-ms (- (System/currentTimeMillis) start-ms)]
381
466
 
382
- {:output (:out result)
383
- :exit (:exit result)}))
467
+ {:output (:output parsed)
468
+ :exit (:exit result)
469
+ :duration-ms duration-ms}))
384
470
 
385
471
  (defn- collect-divergence-context
386
472
  "Collect context about how a worktree branch has diverged from main.
@@ -394,6 +480,65 @@
394
480
  :main-log (or main-log "(none)")
395
481
  :diff-stat (or diff-stat "(none)")}))
396
482
 
483
(defn- first-nonblank-line
  "First line of `s` that is not blank, or nil when `s` is nil or has no
  non-blank line. Used to keep log output compact."
  [s]
  (->> (str/split-lines (or s ""))
       (drop-while str/blank?)
       first))
490
+
491
(defn- classify-merge-failure
  "Map git merge/checkout failure text to a keyword for logging.
  Patterns are tried in order; the first match wins:
    :untracked-overwrite, :conflict, :local-changes-overwrite, else :unknown.
  nil text is treated as empty and classifies as :unknown."
  [failure-text]
  (let [text (or failure-text "")]
    (condp re-find text
      #"untracked working tree files would be overwritten by merge"
      :untracked-overwrite

      #"CONFLICT|Merge conflict"
      :conflict

      #"Your local changes to the following files would be overwritten"
      :local-changes-overwrite

      :unknown)))
503
+
504
(defn- run-resolver-agent!
  "Launch the worker's harness with a prompt asking it to make the worktree
  branch mergeable into main.

  worker:         map read for :swarm-id, :harness and :model.
  wt-path:        worktree directory; also passed to `verify-mergeable?`.
  worker-id:      used in log prefixes and the prompt tag.
  reason-details: optional failure text from the previous merge attempt;
                  included in the log line and the prompt when not blank.

  Returns :resolved only when the harness exits 0 AND `verify-mergeable?`
  succeeds afterwards; otherwise :failed. An exception from the harness is
  treated as exit -1."
  [worker wt-path worker-id reason-details]
  (let [has-reason? (not (str/blank? reason-details))]
    (println (format "[%s] Branch diverged from main, launching resolver agent%s"
                     worker-id
                     (if has-reason?
                       (str " (" reason-details ")")
                       "")))
    (let [{:keys [branch-log main-log diff-stat]} (collect-divergence-context wt-path)
          tag (str "[oompa:" (or (:swarm-id worker) "unknown") ":" worker-id "] ")
          failure-block (when has-reason?
                          (str "Failure context from previous merge attempt:\n"
                               reason-details "\n\n"))
          resolve-prompt (str tag
                              "Your branch cannot currently be merged safely into main.\n\n"
                              failure-block
                              "Your branch's commits (not on main):\n" branch-log "\n\n"
                              "Commits on main since you branched:\n" main-log "\n\n"
                              "Divergence scope:\n" diff-stat "\n\n"
                              "Make this branch cleanly mergeable into main. "
                              "Preserve the intent of your branch's changes.\n"
                              "You have full git access — rebase, cherry-pick, resolve conflicts, "
                              "or clean up merge blockers.\n"
                              "When done, verify with: git diff main --stat")
          abs-wt (.getAbsolutePath (io/file wt-path))
          result (try
                   (harness/run-command! (:harness worker)
                                         {:cwd abs-wt
                                          :model (:model worker)
                                          :prompt resolve-prompt})
                   (catch Exception e
                     {:exit -1 :out "" :err (.getMessage e)}))]
      (cond
        (not (zero? (:exit result)))
        (do (println (format "[%s] Resolver agent failed (exit %d)" worker-id (:exit result)))
            :failed)

        ;; The agent exited cleanly; only trust it if the branch now verifies as mergeable.
        (verify-mergeable? wt-path)
        (do (println (format "[%s] Agent resolved divergence, branch is mergeable" worker-id))
            :resolved)

        :else
        (do (println (format "[%s] Agent ran but branch still can't merge cleanly" worker-id))
            :failed)))))
541
+
397
542
  (defn- verify-mergeable?
398
543
  "Dry-run merge to verify a worktree branch merges cleanly into main.
399
544
  Does NOT leave merge state behind — always cleans up the dry-run.
@@ -425,36 +570,10 @@
425
570
  ;; Conflict — abort merge to restore clean worktree state, then
426
571
  ;; hand the problem to the agent with full divergence context.
427
572
  (let [_ (process/sh ["git" "merge" "--abort"] {:dir wt-path})
428
- _ (println (format "[%s] Branch diverged from main, launching resolver agent" worker-id))
429
- {:keys [branch-log main-log diff-stat]} (collect-divergence-context wt-path)
430
- resolve-prompt (str "[oompa:" (or (:swarm-id worker) "unknown") ":" worker-id "] "
431
- "Your branch has diverged from main and cannot merge cleanly.\n\n"
432
- "Your branch's commits (not on main):\n" branch-log "\n\n"
433
- "Commits on main since you branched:\n" main-log "\n\n"
434
- "Divergence scope:\n" diff-stat "\n\n"
435
- "Make this branch cleanly mergeable into main. "
436
- "Preserve the intent of your branch's changes.\n"
437
- "You have full git access — rebase, cherry-pick, resolve conflicts, "
438
- "whatever works.\n"
439
- "When done, verify with: git diff main --stat")
440
- abs-wt (.getAbsolutePath (io/file wt-path))
441
- cmd (harness/build-cmd (:harness worker)
442
- {:cwd abs-wt :model (:model worker) :prompt resolve-prompt})
443
- result (try
444
- (process/sh cmd {:dir abs-wt
445
- :in (harness/process-stdin (:harness worker) resolve-prompt)
446
- :out :string :err :string})
447
- (catch Exception e
448
- {:exit -1 :out "" :err (.getMessage e)}))]
449
- (if (zero? (:exit result))
450
- ;; Agent ran — verify the branch actually merges cleanly now
451
- (if (verify-mergeable? wt-path)
452
- (do (println (format "[%s] Agent resolved divergence, branch is mergeable" worker-id))
453
- :resolved)
454
- (do (println (format "[%s] Agent ran but branch still can't merge cleanly" worker-id))
455
- :failed))
456
- (do (println (format "[%s] Resolver agent failed (exit %d)" worker-id (:exit result)))
457
- :failed))))))
573
+ failure-snippet (first-nonblank-line (str (:out merge-result) "\n" (:err merge-result)))]
574
+ (run-resolver-agent! worker wt-path worker-id
575
+ (str "sync_worktree_to_main failed"
576
+ (when failure-snippet (str ": " failure-snippet))))))))
458
577
 
459
578
  (defn- worktree-has-changes?
460
579
  "Check if worktree has committed OR uncommitted changes vs main.
@@ -474,9 +593,11 @@
474
593
  (defn- create-iteration-worktree!
475
594
  "Create a fresh worktree for an iteration. Returns {:dir :branch :path}.
476
595
  Force-removes stale worktree+branch from previous failed runs first."
477
- [project-root worker-id iteration]
478
- (let [wt-dir (format ".w%s-i%d" worker-id iteration)
479
- wt-branch (format "oompa/%s-i%d" worker-id iteration)
596
+ [project-root swarm-id worker-id iteration]
597
+ (let [swarm-token (or swarm-id (subs (str (java.util.UUID/randomUUID)) 0 8))
598
+ work-id (format "s%s-%s-i%d" swarm-token worker-id iteration)
599
+ wt-dir (format ".w%s" work-id)
600
+ wt-branch (format "oompa/%s" work-id)
480
601
  wt-path (str project-root "/" wt-dir)]
481
602
  ;; Clean stale worktree/branch from previous failed runs
482
603
  (process/sh ["git" "worktree" "remove" wt-dir "--force"] {:dir project-root})
@@ -495,38 +616,201 @@
495
616
  (let [post-ids (tasks/current-task-ids)]
496
617
  (clojure.set/difference post-ids pre-current-ids)))
497
618
 
619
(defn- now-ms
  "Current wall-clock time in epoch milliseconds."
  []
  (System/currentTimeMillis))
622
+
623
(defn- ms->seconds
  "Convert a millisecond count to seconds as a double."
  [ms]
  (let [millis-per-second 1000.0]
    (/ ms millis-per-second)))
626
+
627
(defn- pct-of
  "Percentage (0.0-100.0 scale, as a double) that `part` is of `total`.
  Returns 0.0 when `total` is zero or negative to avoid dividing by zero."
  [part total]
  (if-not (pos? total)
    0.0
    (* 100.0 (/ part (double total)))))
632
+
633
(defn- init-cycle-timing
  "Fresh, empty timing record for one cycle: each duration section and the
  :llm-calls log start as an empty vector."
  []
  (zipmap [:implementation-rounds-ms
           :reviewer-response-ms
           :review-fixes-ms
           :optional-review-ms
           :llm-calls]
          (repeat [])))
640
+
641
(defn- add-llm-call
  "Record one LLM call in a cycle timing map and return the updated map.

  timing:       existing timing map, or nil to start from `init-cycle-timing`.
  section-name: key of the duration vector to append to
                (e.g. :implementation-rounds-ms).
  call-name:    label stored with the call in :llm-calls.
  duration-ms:  call duration; nil counts as 0 and negatives are clamped to 0.

  Both the section vector and :llm-calls are appended with `(fnil conj [])`,
  so a timing map missing either key still gets a vector in call order.
  A bare `conj` on a missing :llm-calls would build a list and prepend
  later calls, reversing their order."
  [timing section-name call-name duration-ms]
  (let [timing (or timing (init-cycle-timing))
        duration-ms (max 0 (long (or duration-ms 0)))
        append (fnil conj [])]
    (-> timing
        (update section-name append duration-ms)
        (update :llm-calls append {:name call-name
                                   :section section-name
                                   :duration-ms duration-ms}))))
650
+
651
(defn- cycle-llm-total-ms
  "Total milliseconds across the four LLM duration sections of `timing`
  (implementation, reviewer, fixes, optional review). Missing or nil
  sections count as zero; :llm-calls is not consulted."
  [timing]
  (reduce (fn [total section]
            (+ total (reduce + 0 (or (get timing section) []))))
          0
          [:implementation-rounds-ms
           :reviewer-response-ms
           :review-fixes-ms
           :optional-review-ms]))
657
+
658
(defn- with-call-percent
  "Return `timing` with every entry of :llm-calls given a :percent key: that
  call's :duration-ms as a percentage of `total-ms`. :llm-calls is always a
  vector in the result, even when it was absent."
  [timing total-ms]
  (let [annotate (fn [call]
                   (assoc call :percent (pct-of (:duration-ms call) total-ms)))]
    (assoc timing :llm-calls (mapv annotate (:llm-calls timing)))))
665
+
666
(defn- format-timing-segment
  "Render one timing section as
  \"<label>=[<d1>s (<p1>%), ...] <sum>s (<psum>%)\".
  Each duration and the section sum are shown in seconds with their share of
  `total-ms`; a section with no durations lists \"-\"."
  [label durations total-ms]
  (let [ds (vec (or durations []))
        render-one (fn [d]
                     (format "%.2fs (%.1f%%)" (ms->seconds d) (pct-of d total-ms)))
        listing (if (empty? ds)
                  "-"
                  (str/join ", " (map render-one ds)))
        subtotal (reduce + 0 ds)]
    (format "%s=[%s] %.2fs (%.1f%%)"
            label
            listing
            (ms->seconds subtotal)
            (pct-of subtotal total-ms))))
681
+
682
(defn- format-cycle-timing
  "One-line timing summary for a cycle: each LLM section, then the LLM total,
  the harness remainder (total minus LLM time, floored at 0) and the overall
  total. Shares are relative to `total-ms`."
  [{:keys [implementation-rounds-ms reviewer-response-ms review-fixes-ms optional-review-ms]}
   total-ms]
  (let [sections {:implementation-rounds-ms implementation-rounds-ms
                  :reviewer-response-ms reviewer-response-ms
                  :review-fixes-ms review-fixes-ms
                  :optional-review-ms optional-review-ms}
        llm-ms (cycle-llm-total-ms sections)
        harness-ms (max 0 (- total-ms llm-ms))
        segment (fn [label durations]
                  (format-timing-segment label durations total-ms))
        share (fn [ms]
                (format "%.2fs (%.1f%%)" (ms->seconds ms) (pct-of ms total-ms)))]
    (str "timing: "
         (segment "Implementation" implementation-rounds-ms)
         " | "
         (segment "Reviewer" reviewer-response-ms)
         " | "
         (segment "Fixes" review-fixes-ms)
         " | "
         (segment "OptionalReview" optional-review-ms)
         " | LLM="
         (share llm-ms)
         " | Harness="
         (share harness-ms)
         " | Total="
         (format "%.2fs" (ms->seconds total-ms)))))
704
+
705
(defn- safe-number
  "Coerce `v` to a long; anything that is not a number becomes 0."
  [v]
  (if-not (number? v)
    0
    (long v)))
708
+
709
(defn- safe-sum
  "Sum the numbers in `v`; a nil collection sums to 0."
  [v]
  (let [xs (or v [])]
    (reduce (fn [acc x] (+ acc x)) 0 xs)))
712
+
713
(defn- format-ms
  "Render a millisecond value as seconds with two decimals and an \"s\" suffix.
  Non-numeric input is rendered as zero."
  [ms]
  (->> ms
       safe-number
       ms->seconds
       (format "%.2fs")))
716
+
717
(defn- cycle-time-sum
  "Collapse one cycle's timing vectors into per-section millisecond totals.
  Returns a map with :implementation-ms, :review-ms, :fixes-ms,
  :optional-review-ms, their sum as :llm-ms, :total-ms (from `duration-ms`,
  0 when not a number) and :harness-ms (total minus LLM, floored at 0)."
  [{:keys [implementation-rounds-ms reviewer-response-ms review-fixes-ms optional-review-ms]}
   duration-ms]
  (let [[impl review fixes optional] (mapv safe-sum [implementation-rounds-ms
                                                     reviewer-response-ms
                                                     review-fixes-ms
                                                     optional-review-ms])
        total (safe-number duration-ms)
        llm (+ impl review fixes optional)]
    {:implementation-ms impl
     :review-ms review
     :fixes-ms fixes
     :optional-review-ms optional
     :llm-ms llm
     :harness-ms (max 0 (- total llm))
     :total-ms total}))
734
+
735
;; Zeroed per-worker totals; the starting value when summing cycle timings.
(def ^:private empty-cycle-total
  (zipmap [:implementation-ms
           :review-ms
           :fixes-ms
           :optional-review-ms
           :llm-ms
           :harness-ms
           :total-ms]
          (repeat 0)))
743
+
744
(defn- aggregate-cycle-timings-by-worker
  "Build a map of worker-id -> summed timing totals from the cycles that
  `runs/list-cycles` returns for `swarm-id`. A nil `swarm-id`, or no cycles,
  yields an empty map."
  [swarm-id]
  (let [cycles (or (and swarm-id (runs/list-cycles swarm-id)) [])
        add-cycle (fn [totals {:keys [worker-id timing-ms duration-ms]}]
                    (let [so-far (or (get totals worker-id) empty-cycle-total)]
                      (assoc totals worker-id
                             (merge-with + so-far (cycle-time-sum timing-ms duration-ms)))))]
    (reduce add-cycle {} cycles)))
753
+
754
(defn- worker-summary-row
  "Build one summary-table row for a worker.

  First argument: worker map. Counters default to 0 when missing, and :Runs
  falls back from :completed to :cycles-completed.
  Second argument: that worker's aggregated timing totals in milliseconds.

  The *Ms columns hold strings formatted by `format-ms`, i.e. seconds with
  two decimals and an \"s\" suffix.

  A nil :status is shown as \"unknown\". `name` throws on nil, which would
  otherwise abort the whole summary for one incomplete worker record."
  [{:keys [id status completed cycles-completed merges claims rejections errors recycled review-rounds-total] :as _worker}
   {:keys [implementation-ms review-ms fixes-ms harness-ms total-ms]}]
  {:Worker id
   :Runs (or completed cycles-completed 0)
   :Cycles (or cycles-completed 0)
   :Status (if (some? status) (name status) "unknown")
   :Merges (or merges 0)
   :Claims (or claims 0)
   :Rejects (or rejections 0)
   :Errors (or errors 0)
   :Recycled (or recycled 0)
   :ReviewRounds (or review-rounds-total 0)
   :ImplMs (format-ms implementation-ms)
   :ReviewMs (format-ms review-ms)
   :FixMs (format-ms fixes-ms)
   :HarnessMs (format-ms harness-ms)
   :TotalMs (format-ms total-ms)})
772
+
498
773
(defn- emit-cycle-log!
  "Write one cycle-attempt record via `runs/write-cycle-log!` and print a
  short console summary.

  swarm-id, worker-id, cycle: passed to `runs/write-cycle-log!` as the key.
  attempt, run:   stored in the record; attempt also appears in the
                  \"continuing\" log line.
  start-ms:       epoch millis when the attempt began; duration is measured
                  from it.
  session-id:     stored in the record.
  Final map:      :outcome, :claimed-task-ids, :recycled-tasks,
                  :error-snippet, :review-rounds, :timing-ms, and optional
                  :worktree-path / :signals (written only when present).

  Timing is completed with :harness-ms (duration minus LLM time, floored at
  0) and a :percent on each LLM call. Terminal outcomes print the timing
  line, worktree and signals; any other outcome prints a \"continuing\" line."
  [swarm-id worker-id cycle attempt run start-ms session-id
   {:keys [outcome claimed-task-ids recycled-tasks error-snippet review-rounds timing-ms
           worktree-path signals]}]
  (let [elapsed-ms (- (now-ms) start-ms)
        base-timing (or timing-ms (init-cycle-timing))
        harness-ms (max 0 (- elapsed-ms (cycle-llm-total-ms base-timing)))
        full-timing (-> base-timing
                        (assoc :harness-ms harness-ms
                               :llm-calls (or (:llm-calls base-timing) []))
                        (with-call-percent elapsed-ms))
        entry (cond-> {:run run
                       :attempt attempt
                       :outcome outcome
                       :duration-ms elapsed-ms
                       :claimed-task-ids (vec (or claimed-task-ids []))
                       :recycled-tasks (or recycled-tasks [])
                       :error-snippet error-snippet
                       :review-rounds (or review-rounds 0)
                       :session-id session-id
                       :timing-ms full-timing}
                worktree-path (assoc :worktree-path worktree-path)
                (seq signals) (assoc :signals (vec signals)))
        terminal-outcomes #{:merged :merge-failed :rejected :sync-failed :no-changes
                            :executor-done :stuck :error :interrupted :needs-followup}
        terminal? (and outcome (contains? terminal-outcomes outcome))]
    (runs/write-cycle-log! swarm-id worker-id cycle entry)
    (if-not terminal?
      (println (format "[%s] Cycle %d attempt %d continuing"
                       worker-id cycle attempt))
      (do
        (println (format "[%s] %s" worker-id (format-cycle-timing full-timing elapsed-ms)))
        (when worktree-path
          (println (format "[%s] worktree: %s" worker-id worktree-path)))
        (when (seq signals)
          (println (format "[%s] signals: %s" worker-id (str/join " → " signals))))))))
812
+
813
+
530
814
 
531
815
  (defn- cleanup-worktree!
532
816
  "Remove worktree and branch."
@@ -567,7 +851,8 @@
567
851
  (defn- merge-to-main!
568
852
  "Merge worktree changes to main branch. Serialized via merge-lock to prevent
569
853
  concurrent workers from corrupting the git index. On success, moves claimed
570
- tasks current→complete and annotates metadata. Returns true on success.
854
+ tasks current→complete and annotates metadata. Returns
855
+ {:ok? bool :reason keyword :message string}.
571
856
  claimed-task-ids: set of task IDs this worker claimed (framework owns completion)."
572
857
  [wt-path wt-id worker-id project-root review-rounds claimed-task-ids]
573
858
  (locking merge-lock
@@ -586,31 +871,66 @@
586
871
  (process/sh ["git" "merge" wt-id "--no-edit"]
587
872
  {:dir project-root :out :string :err :string}))
588
873
  success (and (zero? (:exit checkout-result))
589
- (zero? (:exit merge-result)))]
874
+ (zero? (:exit merge-result)))
875
+ failure-text (str/join "\n"
876
+ (remove str/blank?
877
+ [(:out checkout-result)
878
+ (:err checkout-result)
879
+ (when merge-result (:out merge-result))
880
+ (when merge-result (:err merge-result))]))
881
+ failure-reason (if (not (zero? (:exit checkout-result)))
882
+ :checkout-failed
883
+ (classify-merge-failure failure-text))]
590
884
  (if success
591
- (do
885
+ (let [completed (when (seq claimed-task-ids)
886
+ (tasks/complete-by-ids! claimed-task-ids))
887
+ completed-count (count (or completed []))]
592
888
  (println (format "[%s] Merge successful" worker-id))
593
889
  ;; Framework-owned completion: move claimed tasks current→complete
594
- (when (seq claimed-task-ids)
595
- (let [completed (tasks/complete-by-ids! claimed-task-ids)]
596
- (when (seq completed)
597
- (println (format "[%s] Completed %d task(s): %s"
598
- worker-id (count completed) (str/join ", " completed))))))
890
+ (when (seq completed)
891
+ (println (format "[%s] Completed %d task(s): %s"
892
+ worker-id completed-count (str/join ", " completed))))
599
893
  ;; Annotate completed tasks with metadata while still holding merge-lock
600
- (annotate-completed-tasks! project-root worker-id review-rounds))
894
+ (annotate-completed-tasks! project-root worker-id review-rounds)
895
+ {:ok? true
896
+ :reason :merged
897
+ :message "merge successful"
898
+ :completed-count completed-count})
601
899
  ;; FAILED: Clean up git state before releasing merge-lock.
602
900
  ;; Without this, a conflict leaves .git/MERGE_HEAD and poisons the
603
901
  ;; shared index — every subsequent worker fails on `git checkout main`.
604
902
  (do
605
- (println (format "[%s] MERGE FAILED: %s" worker-id
606
- (or (:err merge-result) (:err checkout-result))))
903
+ (println (format "[%s] MERGE FAILED (%s): %s"
904
+ worker-id
905
+ (name failure-reason)
906
+ (or (first-nonblank-line failure-text)
907
+ "no output")))
607
908
  (let [abort-result (process/sh ["git" "merge" "--abort"]
608
909
  {:dir project-root :out :string :err :string})]
609
910
  (when-not (zero? (:exit abort-result))
610
911
  ;; Abort failed (no merge in progress, or other issue) — hard reset.
611
912
  (process/sh ["git" "reset" "--hard" "HEAD"]
612
- {:dir project-root :out :string :err :string})))))
613
- success)))
913
+ {:dir project-root :out :string :err :string})))
914
+ {:ok? false
915
+ :reason failure-reason
916
+ :message (or (first-nonblank-line failure-text) "merge failed")})))))
917
+
918
+ (defn- recover-merge-failure!
919
+ "On merge-to-main failure, launch resolver agent and retry merge once.
920
+ Must run outside merge-lock to avoid blocking other workers."
921
+ [worker wt-path wt-id worker-id project-root review-rounds claimed-task-ids merge-result]
922
+ (let [reason (:reason merge-result)
923
+ msg (:message merge-result)
924
+ _ (println (format "[%s] Launching resolver after merge failure (%s): %s"
925
+ worker-id (name (or reason :unknown)) (or msg "merge failed")))
926
+ resolve-status (run-resolver-agent! worker wt-path worker-id
927
+ (str "merge_to_main failed (" (name (or reason :unknown)) ")"
928
+ (when msg (str ": " msg))))]
929
+ (if (= :failed resolve-status)
930
+ merge-result
931
+ (do
932
+ (println (format "[%s] Retrying merge after resolver" worker-id))
933
+ (merge-to-main! wt-path wt-id worker-id project-root review-rounds claimed-task-ids)))))
614
934
 
615
935
  (defn- task-only-diff?
616
936
  "Check if all changes in worktree are task files only (no code changes).
@@ -640,16 +960,21 @@
640
960
  and fixer has full context of all prior feedback.
641
961
  Writes review logs to runs/{swarm-id}/reviews/ for post-mortem analysis.
642
962
  Returns {:approved? bool, :attempts int}"
643
- [worker wt-path worker-id iteration]
644
- (if-not (and (:review-harness worker) (:review-model worker))
963
+ [worker wt-path worker-id iteration & [cycle-timing]]
964
+ (if (empty? (:reviewers worker))
645
965
  ;; No reviewer configured, auto-approve
646
- {:approved? true :attempts 0}
966
+ {:approved? true :attempts 0 :timing (or cycle-timing (init-cycle-timing))}
647
967
 
648
968
  ;; Run review loop with accumulated feedback
649
969
  (loop [attempt 1
650
- prev-feedback []]
970
+ prev-feedback []
971
+ timing (or cycle-timing (init-cycle-timing))]
651
972
  (println (format "[%s] Review attempt %d/%d" worker-id attempt max-review-retries))
652
- (let [{:keys [verdict output]} (run-reviewer! worker wt-path prev-feedback)
973
+ (let [{:keys [verdict output duration-ms]} (run-reviewer! worker wt-path prev-feedback)
974
+ timing (add-llm-call timing
975
+ :reviewer-response-ms
976
+ (str "review_" attempt)
977
+ (or duration-ms 0))
653
978
  diff-files (diff-file-names wt-path)]
654
979
 
655
980
  ;; Persist review log for this round
@@ -657,13 +982,14 @@
657
982
  (runs/write-review-log! (:swarm-id worker) worker-id iteration attempt
658
983
  {:verdict verdict
659
984
  :output output
985
+ :duration-ms (or duration-ms 0)
660
986
  :diff-files (or diff-files [])}))
661
987
 
662
988
  (case verdict
663
989
  :approved
664
990
  (do
665
991
  (println (format "[%s] Reviewer APPROVED (attempt %d)" worker-id attempt))
666
- {:approved? true :attempts attempt})
992
+ {:approved? true :attempts attempt :timing timing})
667
993
 
668
994
  ;; :needs-changes — always give the worker a chance to fix.
669
995
  ;; Hard rejection only happens when max review rounds are exhausted.
@@ -671,37 +997,49 @@
671
997
  (if (>= attempt max-review-retries)
672
998
  (do
673
999
  (println (format "[%s] Max review retries reached (%d rounds)" worker-id attempt))
674
- {:approved? false :attempts attempt})
1000
+ {:approved? false :attempts attempt :timing timing})
675
1001
  (do
676
1002
  (println (format "[%s] Reviewer requested changes, fixing..." worker-id))
677
- (run-fix! worker wt-path all-feedback)
678
- (recur (inc attempt) all-feedback)))))))))
1003
+ (let [{:keys [duration-ms]} (run-fix! worker wt-path all-feedback)
1004
+ timing (add-llm-call timing
1005
+ :review-fixes-ms
1006
+ (str "fix_" attempt)
1007
+ (or duration-ms 0))]
1008
+ (recur (inc attempt) all-feedback timing))))))))))
679
1009
 
680
1010
  ;; =============================================================================
681
1011
  ;; Worker Loop
682
1012
  ;; =============================================================================
683
1013
 
684
- ;; Workers wait up to 10 minutes for tasks to appear before giving up.
1014
+ ;; Workers can wait for tasks before giving up; default is 10 minutes.
685
1015
  ;; This keeps workers alive while planners/designers ramp up the queue.
686
- (def ^:private max-wait-for-tasks 600)
687
1016
  (def ^:private wait-poll-interval 10)
688
- (def ^:private max-consecutive-errors 3)
1017
+ (def ^:private max-consecutive-errors 5)
1018
+
1019
+ (defn- backoff-sleep! [id errors]
1020
+ (when (< errors max-consecutive-errors)
1021
+ (let [wait-sec (* 60 (int (Math/pow 2 (dec errors))))]
1022
+ (println (format "[%s] Backing off for %d seconds before next retry (%d/%d)..." id wait-sec errors (dec max-consecutive-errors)))
1023
+ (Thread/sleep (* 1000 wait-sec)))))
1024
+
689
1025
 
690
1026
  (defn- wait-for-tasks!
691
- "Wait up to 10 minutes for pending/current tasks to appear. Used for
692
- backpressure on workers that can't create their own tasks (can_plan: false).
1027
+ "Wait up to max-wait-seconds for pending/current tasks to appear.
1028
+ Used for backpressure on workers that can't create their own tasks (can_plan: false).
693
1029
  Polls every 10 seconds, logs every 60 seconds."
694
- [worker-id]
1030
+ [worker-id max-wait-seconds]
695
1031
  (loop [waited 0]
696
1032
  (cond
697
1033
  (pos? (tasks/pending-count)) true
698
1034
  (pos? (tasks/current-count)) true
699
- (>= waited max-wait-for-tasks)
700
- (do (println (format "[%s] No tasks after %ds, giving up" worker-id waited))
1035
+ (>= waited max-wait-seconds)
1036
+ (do (println (format "[%s] [%s] No tasks after %ds, giving up"
1037
+ worker-id (log-ts) waited))
701
1038
  false)
702
1039
  :else
703
1040
  (do (when (zero? (mod waited 60))
704
- (println (format "[%s] Waiting for tasks... (%ds/%ds)" worker-id waited max-wait-for-tasks)))
1041
+ (println (format "[%s] [%s] Waiting for tasks... (%ds/%ds)"
1042
+ worker-id (log-ts) waited max-wait-seconds)))
705
1043
  (Thread/sleep (* wait-poll-interval 1000))
706
1044
  (recur (+ waited wait-poll-interval))))))
707
1045
 
@@ -716,267 +1054,374 @@
716
1054
  (defn run-worker!
717
1055
  "Run worker loop with persistent sessions.
718
1056
 
719
- Sessions persist across iterations — agents resume where they left off.
720
- Worktrees persist until COMPLETE_AND_READY_FOR_MERGE triggers review+merge.
721
- __DONE__ stops the worker entirely (planners only).
722
-
723
- Tracks per-worker metrics: merges, rejections, errors, review-rounds-total.
724
- Returns final worker state with metrics attached."
1057
+ A run is a terminal outcome (merged/rejected/error-like).
1058
+ A cycle is one worker turn/resume. Multiple cycles may occur in one run.
1059
+ Cycle cap is controlled by :max-cycles (legacy key: :iterations)."
725
1060
  [worker]
726
1061
  (tasks/ensure-dirs!)
727
- (let [{:keys [id iterations swarm-id wait-between]} worker
1062
+ (let [{:keys [id runs max-cycles iterations swarm-id wait-between
1063
+ max-wait-for-tasks max-needs-followups]} worker
1064
+ cycle-cap (or max-cycles iterations 10)
1065
+ run-goal (or runs iterations 10)
728
1066
  project-root (System/getProperty "user.dir")]
729
- (println (format "[%s] Starting worker (%s:%s%s, %d iterations%s)"
1067
+ (println (format "[%s] Starting worker (%s:%s%s, goal=%d runs, cap=%d cycles%s)"
730
1068
  id
731
1069
  (name (:harness worker))
732
1070
  (or (:model worker) "default")
733
1071
  (if (:reasoning worker) (str ":" (:reasoning worker)) "")
734
- iterations
1072
+ run-goal
1073
+ cycle-cap
735
1074
  (if wait-between (format ", %ds between" wait-between) "")))
736
1075
 
737
- ;; Backpressure: workers that can't create tasks wait for tasks to exist
738
- (when-not (:can-plan worker)
739
- (wait-for-tasks! id))
1076
+ (when (and (not (:can-plan worker))
1077
+ (not (pos? (tasks/pending-count)))
1078
+ (not (pos? (tasks/current-count))))
1079
+ (wait-for-tasks! id max-wait-for-tasks))
740
1080
 
741
- ;; metrics tracks: {:merges N :rejections N :errors N :recycled N :review-rounds-total N :claims N}
742
- (loop [iter 1
743
- completed 0
1081
+ (loop [cycle 1
1082
+ attempt 1
1083
+ completed-runs 0
744
1084
  consec-errors 0
745
1085
  metrics {:merges 0 :rejections 0 :errors 0 :recycled 0 :review-rounds-total 0 :claims 0}
746
- session-id nil ;; persistent session-id (nil = start fresh)
747
- wt-state nil ;; {:dir :branch :path} or nil
748
- claimed-ids #{} ;; task IDs claimed this session (reset on worktree destroy)
749
- claim-resume-prompt nil ;; override prompt for next iteration (from CLAIM results)
750
- working-resumes 0] ;; consecutive "working" outcomes in current session
1086
+ session-id nil
1087
+ wt-state nil
1088
+ claimed-ids #{}
1089
+ claim-resume-prompt nil
1090
+ working-resumes 0
1091
+ needs-followups 0
1092
+ signals []]
751
1093
  (let [finish (fn [status]
752
- (assoc worker :completed completed :status status
1094
+ (assoc worker :completed completed-runs
1095
+ :runs-completed completed-runs
1096
+ :cycles-completed (dec cycle)
1097
+ :status status
753
1098
  :merges (:merges metrics)
754
1099
  :rejections (:rejections metrics)
755
1100
  :errors (:errors metrics)
756
1101
  :recycled (:recycled metrics)
757
1102
  :review-rounds-total (:review-rounds-total metrics)
758
- :claims (:claims metrics)))]
1103
+ :claims (:claims metrics)))
1104
+ current-run (inc completed-runs)]
759
1105
  (cond
760
- (> iter iterations)
1106
+ (> cycle cycle-cap)
761
1107
  (do
762
- ;; Cleanup any lingering worktree
763
1108
  (when wt-state
1109
+ (when (seq claimed-ids)
1110
+ (recycle-task-id-set! id claimed-ids))
764
1111
  (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state)))
765
- (println (format "[%s] Completed %d iterations (%d merges, %d claims, %d rejections, %d errors, %d recycled)"
766
- id completed (:merges metrics) (:claims metrics) (:rejections metrics) (:errors metrics) (:recycled metrics)))
1112
+ (println (format "[%s] Completed %d/%d runs in %d cycles (%d merges, %d claims, %d rejections, %d errors, %d recycled)"
1113
+ id completed-runs run-goal (dec cycle)
1114
+ (:merges metrics) (:claims metrics) (:rejections metrics) (:errors metrics) (:recycled metrics)))
767
1115
  (finish :exhausted))
768
1116
 
1117
+ (>= completed-runs run-goal)
1118
+ (do
1119
+ (when wt-state
1120
+ (when (seq claimed-ids)
1121
+ (recycle-task-id-set! id claimed-ids))
1122
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state)))
1123
+ (println (format "[%s] Reached run goal: %d/%d runs in %d cycles"
1124
+ id completed-runs run-goal (dec cycle)))
1125
+ (finish :completed))
1126
+
769
1127
  @shutdown-requested?
770
1128
  (do
771
- (println (format "[%s] Shutdown requested, stopping after %d iterations" id (dec iter)))
1129
+ (println (format "[%s] Shutdown requested, stopping after %d cycles" id (dec cycle)))
772
1130
  (when wt-state
773
- ;; Recycle any claimed tasks back to pending so other workers can pick them up
774
1131
  (when (seq claimed-ids)
775
1132
  (let [recycled (tasks/recycle-tasks! claimed-ids)]
776
1133
  (when (seq recycled)
777
1134
  (println (format "[%s] Recycled %d claimed task(s) on shutdown" id (count recycled))))))
778
1135
  (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state)))
779
- (emit-cycle-log! swarm-id id iter (System/currentTimeMillis) session-id
780
- {:outcome :interrupted})
1136
+ (emit-cycle-log! swarm-id id cycle attempt current-run (now-ms) session-id
1137
+ {:timing-ms (init-cycle-timing)
1138
+ :outcome :interrupted})
781
1139
  (finish :interrupted))
782
1140
 
783
1141
  :else
784
1142
  (do
785
- ;; Sleep between iterations when wait_between is configured
786
- (maybe-sleep-between! id wait-between iter)
787
-
788
- ;; Backpressure: non-planner workers wait for tasks between iterations too
789
- (when (and (not (:can-plan worker))
790
- (not (pos? (tasks/pending-count)))
791
- (not (pos? (tasks/current-count))))
792
- (println (format "[%s] Queue empty, waiting for tasks before iteration %d" id iter))
793
- (wait-for-tasks! id))
794
-
795
- ;; Ensure worktree exists (create fresh if nil, reuse if persisted)
796
- (let [wt-state (try
797
- (or wt-state (create-iteration-worktree! project-root id iter))
798
- (catch Exception e
799
- (println (format "[%s] Worktree creation failed: %s" id (.getMessage e)))
800
- nil))]
801
- (if (nil? wt-state)
802
- ;; Worktree creation failed — count as error
803
- (let [errors (inc consec-errors)
804
- metrics (update metrics :errors inc)]
805
- (if (>= errors max-consecutive-errors)
806
- (do
807
- (println (format "[%s] %d consecutive errors, stopping" id errors))
808
- (finish :error))
809
- (recur (inc iter) completed errors metrics nil nil #{} nil 0)))
810
-
811
- ;; Worktree ready — run agent
812
- (let [resume? (or (some? session-id) (some? claim-resume-prompt))
813
- iter-start-ms (System/currentTimeMillis)
814
- ;; Snapshot current/ task IDs before agent runs so we can
815
- ;; detect any direct mv claims (safety net for old behavior).
816
- pre-current-ids (tasks/current-task-ids)
817
- _ (println (format "[%s] %s iteration %d/%d"
818
- id (if resume? "Resuming" "Starting") iter iterations))
819
- context (build-context)
820
- {:keys [output exit done? merge? claim-ids] :as agent-result}
821
- (run-agent! worker (:path wt-state) context session-id resume?
822
- :resume-prompt-override claim-resume-prompt)
823
- new-session-id (:session-id agent-result)
824
- ;; Safety net: detect any direct mv claims (old behavior)
825
- mv-claimed-tasks (detect-claimed-tasks pre-current-ids)]
826
-
827
- (cond
828
- ;; Agent errored — recycle claimed tasks, cleanup, reset session
829
- (not (zero? exit))
830
- (let [errors (inc consec-errors)
831
- recycled (recycle-orphaned-tasks! id pre-current-ids)
832
- metrics (-> metrics
833
- (update :errors inc)
834
- (update :recycled + recycled))
835
- error-msg (subs (or output "") 0 (min 200 (count (or output ""))))]
836
- (println (format "[%s] Agent error (exit %d): %s" id exit error-msg))
837
- (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
838
- {:outcome :error :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))
839
- :recycled-tasks (when (pos? recycled) (vec mv-claimed-tasks))
840
- :error-snippet error-msg})
841
- (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
842
- (if (>= errors max-consecutive-errors)
843
- (do
844
- (println (format "[%s] %d consecutive errors, stopping" id errors))
845
- (finish :error))
846
- (recur (inc iter) completed errors metrics nil nil #{} nil 0)))
847
-
848
- ;; CLAIM signal — framework claims tasks, resumes agent with results
849
- ;; Only honored when no MERGE or DONE signal (lowest priority)
850
- (and (seq claim-ids) (not merge?) (not done?))
851
- (let [_ (println (format "[%s] CLAIM signal: %s" id (str/join ", " claim-ids)))
852
- {:keys [claimed failed resume-prompt]} (execute-claims! claim-ids)
853
- new-claimed-ids (into claimed-ids claimed)
854
- metrics (update metrics :claims + (count claimed))]
855
- (println (format "[%s] Claimed %d/%d tasks" id (count claimed) (count claim-ids)))
856
- (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
857
- {:outcome :claimed :claimed-task-ids (vec claimed)})
858
- (recur (inc iter) completed 0 metrics new-session-id wt-state
859
- new-claimed-ids resume-prompt 0))
860
-
861
- ;; COMPLETE_AND_READY_FOR_MERGE — review, merge, reset session
862
- merge?
863
- (if (worktree-has-changes? (:path wt-state))
864
- (if (task-only-diff? (:path wt-state))
865
- ;; Task-only changes — skip review, sync to main, auto-merge
866
- (do
867
- (println (format "[%s] Task-only diff, auto-merging" id))
868
- (let [sync-status (sync-worktree-to-main! worker (:path wt-state) id)
869
- all-claimed (into claimed-ids mv-claimed-tasks)]
870
- (if (= :failed sync-status)
871
- ;; Sync failed — cannot merge safely, skip
872
- (do
873
- (println (format "[%s] Sync to main failed, skipping merge" id))
874
- (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
875
- {:outcome :sync-failed :claimed-task-ids (vec all-claimed)})
876
- (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
877
- (recur (inc iter) completed 0 metrics nil nil #{} nil 0))
878
- ;; Synced — proceed with merge
879
- (let [merged? (merge-to-main! (:path wt-state) (:branch wt-state) id project-root 0 all-claimed)
880
- metrics (if merged? (update metrics :merges inc) metrics)]
881
- (println (format "[%s] Cycle %d/%d complete" id iter iterations))
882
- (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
883
- {:outcome :merged :claimed-task-ids (vec all-claimed) :review-rounds 0})
884
- (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
885
- (recur (inc iter) (inc completed) 0 metrics nil nil #{} nil 0)))))
886
- ;; Code changes — full review loop
887
- (let [{:keys [approved? attempts]} (review-loop! worker (:path wt-state) id iter)
888
- ;; Don't pre-increment :merges — defer to after actual merge succeeds
889
- metrics (-> metrics
890
- (update :review-rounds-total + (or attempts 0))
891
- (cond-> (not approved?) (update :rejections inc)))]
892
- (if approved?
893
- (let [sync-status (sync-worktree-to-main! worker (:path wt-state) id)
894
- all-claimed (into claimed-ids mv-claimed-tasks)]
1143
+ (maybe-sleep-between! id wait-between cycle)
1144
+
1145
+ (when (and (not (:can-plan worker))
1146
+ (not (pos? (tasks/pending-count)))
1147
+ (not (pos? (tasks/current-count))))
1148
+ (println (format "[%s] Queue empty, waiting for tasks before cycle %d" id cycle))
1149
+ (wait-for-tasks! id max-wait-for-tasks))
1150
+
1151
+ (let [wt-state (try
1152
+ (or wt-state (create-iteration-worktree! project-root swarm-id id cycle))
1153
+ (catch Exception e
1154
+ (println (format "[%s] Worktree creation failed: %s" id (.getMessage e)))
1155
+ nil))]
1156
+ (if (nil? wt-state)
1157
+ (let [errors (inc consec-errors)
1158
+ metrics (update metrics :errors inc)]
1159
+ (if (>= errors max-consecutive-errors)
1160
+ (do
1161
+ (println (format "[%s] %d consecutive errors, stopping" id errors))
1162
+ (finish :error))
1163
+ (do (backoff-sleep! id errors)
1164
+ (recur (inc cycle) 1 completed-runs errors metrics nil nil #{} nil 0 0 []))))
1165
+
1166
+ (let [resume? (or (some? session-id) (some? claim-resume-prompt))
1167
+ cycle-start-ms (now-ms)
1168
+ cycle-timing (init-cycle-timing)
1169
+ pre-current-ids (tasks/current-task-ids)
1170
+ _ (println (format "[%s] %s cycle %d/%d (run %d/%d, attempt %d)"
1171
+ id
1172
+ (if (= attempt 1) "Starting" "Resuming")
1173
+ cycle cycle-cap current-run run-goal attempt))
1174
+ context (build-context)
1175
+ agent-start-ms (now-ms)
1176
+ {:keys [output exit done? merge? needs-followup? claim-ids parse-warning raw-snippet] :as agent-result}
1177
+ (run-agent! worker (:path wt-state) context session-id resume?
1178
+ :resume-prompt-override claim-resume-prompt)
1179
+ cycle-timing (add-llm-call cycle-timing
1180
+ :implementation-rounds-ms
1181
+ "implementation"
1182
+ (- (now-ms) agent-start-ms))
1183
+ new-session-id (:session-id agent-result)
1184
+ stderr-snippet (:stderr-snippet agent-result)
1185
+ mv-claimed-tasks (detect-claimed-tasks pre-current-ids)
1186
+ active-claimed-ids (active-claimed-task-ids claimed-ids mv-claimed-tasks)
1187
+ wt-path (:path wt-state)
1188
+ ;; Classify the signal for this attempt
1189
+ signal-label (cond
1190
+ (not (zero? exit)) (str "error:exit-" exit)
1191
+ (and (seq claim-ids) (not merge?) (not done?))
1192
+ (str "claim:" (str/join "," claim-ids))
1193
+ merge? "merge"
1194
+ done? "done"
1195
+ needs-followup? "needs-followup"
1196
+ :else "working")
1197
+ signals (conj signals signal-label)
1198
+ emit! (fn [opts]
1199
+ (emit-cycle-log! swarm-id id cycle attempt current-run cycle-start-ms new-session-id
1200
+ (merge {:worktree-path wt-path :signals signals} opts)))]
1201
+ (cond
1202
+ (not (zero? exit))
1203
+ (let [errors (inc consec-errors)
1204
+ recycled (recycle-active-claims! id claimed-ids mv-claimed-tasks)
1205
+ metrics (-> metrics (update :errors inc) (update :recycled + (count recycled)))
1206
+ error-msg (subs (or output "") 0 (min 200 (count (or output ""))))]
1207
+ (println (format "[%s] Agent error (exit %d): %s" id exit error-msg))
1208
+ (when (seq stderr-snippet)
1209
+ (println (format "[%s] Agent stderr snippet: %s"
1210
+ id
1211
+ (snippet (str/replace stderr-snippet #"\s+" " ") 240))))
1212
+ (emit!
1213
+ {:timing-ms cycle-timing
1214
+ :outcome :error
1215
+ :claimed-task-ids (vec active-claimed-ids)
1216
+ :recycled-tasks (seq recycled)
1217
+ :error-snippet error-msg})
1218
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
1219
+ (if (>= errors max-consecutive-errors)
1220
+ (do
1221
+ (println (format "[%s] %d consecutive errors, stopping" id errors))
1222
+ (finish :error))
1223
+ (do (backoff-sleep! id errors)
1224
+ (recur (inc cycle) 1 (inc completed-runs) errors metrics nil nil #{} nil 0 0 []))))
1225
+
1226
+ (and (seq claim-ids) (not merge?) (not done?))
1227
+ (let [_ (println (format "[%s] CLAIM signal: %s" id (str/join ", " claim-ids)))
1228
+ {:keys [claimed resume-prompt]} (execute-claims! claim-ids)
1229
+ new-claimed-ids (into active-claimed-ids claimed)
1230
+ metrics (update metrics :claims + (count claimed))]
1231
+ (println (format "[%s] Claimed %d/%d tasks" id (count claimed) (count claim-ids)))
1232
+ (emit!
1233
+ {:timing-ms cycle-timing
1234
+ :outcome :claimed :claimed-task-ids (vec claimed)})
1235
+ (recur cycle (inc attempt) completed-runs 0 metrics new-session-id wt-state
1236
+ new-claimed-ids resume-prompt 0 0 signals))
1237
+
1238
+ merge?
1239
+ (if (worktree-has-changes? (:path wt-state))
1240
+ (if (task-only-diff? (:path wt-state))
1241
+ (let [all-claimed active-claimed-ids]
1242
+ (println (format "[%s] Task-only diff, auto-merging" id))
1243
+ (let [sync-status (sync-worktree-to-main! worker (:path wt-state) id)]
895
1244
  (if (= :failed sync-status)
896
- ;; Sync failed after approval — treat as sync failure, skip merge
897
- (do
898
- (println (format "[%s] Sync to main failed after approval, skipping merge" id))
899
- (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
900
- {:outcome :sync-failed :claimed-task-ids (vec all-claimed)
901
- :review-rounds (or attempts 0)})
1245
+ (let [recycled (recycle-task-id-set! id all-claimed)
1246
+ metrics (update metrics :recycled + (count recycled))]
1247
+ (println (format "[%s] Sync to main failed, skipping merge" id))
1248
+ (emit!
1249
+ {:timing-ms cycle-timing
1250
+ :outcome :sync-failed
1251
+ :claimed-task-ids (vec all-claimed)
1252
+ :recycled-tasks (seq recycled)})
902
1253
  (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
903
- (recur (inc iter) completed 0 metrics nil nil #{} nil 0))
904
- ;; Synced — proceed with merge, capture return value
905
- (let [merged? (merge-to-main! (:path wt-state) (:branch wt-state) id project-root (or attempts 0) all-claimed)
906
- metrics (if merged? (update metrics :merges inc) metrics)]
907
- (println (format "[%s] Cycle %d/%d complete" id iter iterations))
908
- (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
909
- {:outcome (if merged? :merged :merge-failed)
910
- :claimed-task-ids (vec all-claimed)
911
- :review-rounds (or attempts 0)})
1254
+ (recur (inc cycle) 1 (inc completed-runs) 0 metrics nil nil #{} nil 0 0 []))
1255
+ (let [merge-result (merge-to-main! (:path wt-state) (:branch wt-state) id project-root 0 all-claimed)
1256
+ merge-result (if (:ok? merge-result)
1257
+ merge-result
1258
+ (recover-merge-failure! worker (:path wt-state) (:branch wt-state)
1259
+ id project-root 0 all-claimed merge-result))
1260
+ merged? (:ok? merge-result)
1261
+ recycled (when-not merged?
1262
+ (recycle-task-id-set! id all-claimed))
1263
+ completed-count (or (:completed-count merge-result) 0)
1264
+ metrics (cond-> metrics
1265
+ (and merged? (pos? completed-count)) (update :merges inc)
1266
+ (seq recycled) (update :recycled + (count recycled)))]
1267
+ (println (format "[%s] Cycle %d/%d complete" id cycle cycle-cap))
1268
+ (emit!
1269
+ {:timing-ms cycle-timing
1270
+ :outcome (if merged? :merged :merge-failed)
1271
+ :claimed-task-ids (vec all-claimed)
1272
+ :recycled-tasks (seq recycled)
1273
+ :review-rounds 0})
912
1274
  (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
913
- (recur (inc iter) (inc completed) 0 metrics nil nil #{} nil 0))))
914
- (let [recycled (recycle-orphaned-tasks! id pre-current-ids)
915
- metrics (update metrics :recycled + recycled)]
916
- (println (format "[%s] Cycle %d/%d rejected" id iter iterations))
917
- (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
918
- {:outcome :rejected :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))
919
- :recycled-tasks (when (pos? recycled) (vec mv-claimed-tasks))
920
- :review-rounds (or attempts 0)})
921
- (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
922
- (recur (inc iter) completed 0 metrics nil nil #{} nil 0)))))
923
- (let [recycled (recycle-orphaned-tasks! id pre-current-ids)
924
- metrics (update metrics :recycled + recycled)]
925
- (println (format "[%s] Merge signaled but no changes, skipping" id))
926
- (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
927
- {:outcome :no-changes :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))
928
- :recycled-tasks (when (pos? recycled) (vec mv-claimed-tasks))})
929
- (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
930
- (recur (inc iter) completed 0 metrics nil nil #{} nil 0)))
931
-
932
- ;; __DONE__ — agent signaled it finished this cycle's work.
933
- ;; Always reset session and continue to next iteration.
934
- ;; Planners re-plan as tasks complete; executors pick up new tasks.
935
- done?
936
- (let [recycled (recycle-orphaned-tasks! id pre-current-ids)
937
- metrics (update metrics :recycled + recycled)]
938
- (println (format "[%s] __DONE__ signal, resetting session (iter %d/%d)" id iter iterations))
939
- (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
940
- {:outcome :executor-done :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))
941
- :recycled-tasks (when (pos? recycled) (vec mv-claimed-tasks))})
942
- (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
943
- (recur (inc iter) completed 0 metrics nil nil #{} nil 0))
944
-
945
- ;; No signal — agent still working, resume next iteration.
946
- ;; Track consecutive working resumes. After max-working-resumes,
947
- ;; inject a nudge prompt. If still no signal after nudge, kill session.
948
- :else
949
- (let [wr (inc working-resumes)
950
- max-wr (:max-working-resumes worker)]
951
- (cond
952
- ;; Already nudged last iteration, still no signal — stuck
953
- (> wr max-wr)
954
- (let [recycled (recycle-orphaned-tasks! id pre-current-ids)
955
- metrics (update metrics :recycled + recycled)]
956
- (println (format "[%s] Stuck after %d working resumes + nudge, resetting session" id wr))
957
- (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
958
- {:outcome :stuck :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))
959
- :recycled-tasks (when (pos? recycled) (vec mv-claimed-tasks))})
1275
+ (recur (inc cycle) 1 (inc completed-runs) 0 metrics nil nil #{} nil 0 0 [])))))
1276
+ (let [{:keys [approved? attempts timing]} (review-loop! worker (:path wt-state) id cycle cycle-timing)
1277
+ cycle-timing (or timing cycle-timing)
1278
+ metrics (-> metrics
1279
+ (update :review-rounds-total + (or attempts 0))
1280
+ (cond-> (not approved?) (update :rejections inc)))]
1281
+ (if approved?
1282
+ (let [sync-status (sync-worktree-to-main! worker (:path wt-state) id)
1283
+ all-claimed active-claimed-ids]
1284
+ (if (= :failed sync-status)
1285
+ (let [recycled (recycle-task-id-set! id all-claimed)
1286
+ metrics (update metrics :recycled + (count recycled))]
1287
+ (println (format "[%s] Sync to main failed after approval, skipping merge" id))
1288
+ (emit!
1289
+ {:timing-ms cycle-timing
1290
+ :outcome :sync-failed
1291
+ :claimed-task-ids (vec all-claimed)
1292
+ :recycled-tasks (seq recycled)
1293
+ :review-rounds (or attempts 0)})
1294
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
1295
+ (recur (inc cycle) 1 (inc completed-runs) 0 metrics nil nil #{} nil 0 0 []))
1296
+ (let [merge-result (merge-to-main! (:path wt-state) (:branch wt-state) id project-root (or attempts 0) all-claimed)
1297
+ merge-result (if (:ok? merge-result)
1298
+ merge-result
1299
+ (recover-merge-failure! worker (:path wt-state) (:branch wt-state)
1300
+ id project-root (or attempts 0) all-claimed merge-result))
1301
+ merged? (:ok? merge-result)
1302
+ recycled (when-not merged?
1303
+ (recycle-task-id-set! id all-claimed))
1304
+ completed-count (or (:completed-count merge-result) 0)
1305
+ metrics (cond-> metrics
1306
+ (and merged? (pos? completed-count)) (update :merges inc)
1307
+ (seq recycled) (update :recycled + (count recycled)))]
1308
+ (println (format "[%s] Cycle %d/%d complete" id cycle cycle-cap))
1309
+ (emit!
1310
+ {:timing-ms cycle-timing
1311
+ :outcome (if merged? :merged :merge-failed)
1312
+ :claimed-task-ids (vec all-claimed)
1313
+ :recycled-tasks (seq recycled)
1314
+ :review-rounds (or attempts 0)})
1315
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
1316
+ (recur (inc cycle) 1 (inc completed-runs) 0 metrics nil nil #{} nil 0 0 []))))
1317
+ (let [recycled (recycle-active-claims! id claimed-ids mv-claimed-tasks)
1318
+ metrics (update metrics :recycled + (count recycled))]
1319
+ (println (format "[%s] Cycle %d/%d rejected" id cycle cycle-cap))
1320
+ (emit!
1321
+ {:timing-ms cycle-timing
1322
+ :outcome :rejected
1323
+ :claimed-task-ids (vec active-claimed-ids)
1324
+ :recycled-tasks (seq recycled)
1325
+ :review-rounds (or attempts 0)})
1326
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
1327
+ (recur (inc cycle) 1 (inc completed-runs) 0 metrics nil nil #{} nil 0 0 [])))))
1328
+ (let [recycled (recycle-active-claims! id claimed-ids mv-claimed-tasks)
1329
+ metrics (update metrics :recycled + (count recycled))]
1330
+ (println (format "[%s] Merge signaled but no changes, skipping" id))
1331
+ (emit!
1332
+ {:timing-ms cycle-timing
1333
+ :outcome :no-changes
1334
+ :claimed-task-ids (vec active-claimed-ids)
1335
+ :recycled-tasks (seq recycled)})
960
1336
  (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
961
- (recur (inc iter) completed 0 metrics nil nil #{} nil 0))
962
-
963
- ;; Hit the limit — nudge on next resume
964
- (= wr max-wr)
965
- (do
966
- (println (format "[%s] Working... %d/%d resumes, nudging agent to wrap up" id wr max-wr))
967
- (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
968
- {:outcome :working :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))})
969
- (recur (inc iter) completed 0 metrics new-session-id wt-state
970
- claimed-ids nudge-prompt wr))
971
-
972
- ;; Under limit — normal resume
973
- :else
974
- (do
975
- (println (format "[%s] Working... (will resume, %d/%d)" id wr max-wr))
976
- (emit-cycle-log! swarm-id id iter iter-start-ms new-session-id
977
- {:outcome :working :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))})
978
- (recur (inc iter) completed 0 metrics new-session-id wt-state
979
- claimed-ids nil wr))))))))))))))
1337
+ (recur (inc cycle) 1 (inc completed-runs) 0 metrics nil nil #{} nil 0 0 [])))
1338
+
1339
+ done?
1340
+ (let [recycled (recycle-active-claims! id claimed-ids mv-claimed-tasks)
1341
+ metrics (-> metrics
1342
+ (update :recycled + (count recycled))
1343
+ (update :errors inc))]
1344
+ (println (format "[%s] Invalid __DONE__ signal from executor; stopping worker (cycle %d/%d)" id cycle cycle-cap))
1345
+ (emit!
1346
+ {:timing-ms cycle-timing
1347
+ :outcome :error
1348
+ :claimed-task-ids (vec active-claimed-ids)
1349
+ :recycled-tasks (seq recycled)
1350
+ :error-snippet "__DONE__ is not a valid executor signal; use CLAIM(...) or COMPLETE_AND_READY_FOR_MERGE"})
1351
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
1352
+ (finish :error))
1353
+
1354
+ needs-followup?
1355
+ (let [summary (subs (or output "") 0 (min 240 (count (or output ""))))
1356
+ next-followups (inc needs-followups)]
1357
+ (emit!
1358
+ {:timing-ms cycle-timing
1359
+ :outcome :needs-followup
1360
+ :claimed-task-ids (vec active-claimed-ids)
1361
+ :error-snippet summary})
1362
+ (if (> next-followups max-needs-followups)
1363
+ (let [recycled (recycle-active-claims! id claimed-ids mv-claimed-tasks)
1364
+ metrics (-> metrics
1365
+ (update :recycled + (count recycled))
1366
+ (update :errors inc))]
1367
+ (println (format "[%s] NEEDS_FOLLOWUP exhausted (%d/%d); stopping worker" id next-followups max-needs-followups))
1368
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
1369
+ (finish :error))
1370
+ (let [followup-prompt (build-needs-followup-prompt active-claimed-ids output)]
1371
+ (println (format "[%s] NEEDS_FOLLOWUP signal; continuing cycle with follow-up prompt (%d/%d)"
1372
+ id next-followups max-needs-followups))
1373
+ (recur cycle (inc attempt) completed-runs 0 metrics new-session-id wt-state
1374
+ active-claimed-ids followup-prompt 0 next-followups signals))))
1375
+
1376
+ :else
1377
+ (let [wr (inc working-resumes)
1378
+ max-wr (:max-working-resumes worker)]
1379
+ (when parse-warning
1380
+ (if (str/includes? parse-warning "AUTH_REQUIRED:")
1381
+ (println (format "[%s] LOGIN ISSUE: %s"
1382
+ id
1383
+ (str/replace parse-warning #"^AUTH_REQUIRED:\s*" "")))
1384
+ (println (format "[%s] WARNING: %s" id parse-warning))))
1385
+ (when (and parse-warning (seq raw-snippet))
1386
+ (println (format "[%s] Raw output snippet: %s"
1387
+ id
1388
+ (snippet (str/replace raw-snippet #"\s+" " ") 240))))
1389
+ (when (seq stderr-snippet)
1390
+ (println (format "[%s] Agent stderr snippet: %s"
1391
+ id
1392
+ (snippet (str/replace stderr-snippet #"\s+" " ") 240))))
1393
+ (cond
1394
+ (> wr max-wr)
1395
+ (let [recycled (recycle-active-claims! id claimed-ids mv-claimed-tasks)
1396
+ metrics (update metrics :recycled + (count recycled))]
1397
+ (println (format "[%s] Stuck after %d working resumes + nudge, resetting session" id wr))
1398
+ (emit!
1399
+ {:timing-ms cycle-timing
1400
+ :outcome :stuck
1401
+ :claimed-task-ids (vec active-claimed-ids)
1402
+ :recycled-tasks (seq recycled)})
1403
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
1404
+ (recur (inc cycle) 1 (inc completed-runs) 0 metrics nil nil #{} nil 0 0 []))
1405
+
1406
+ (= wr max-wr)
1407
+ (do
1408
+ (println (format "[%s] Working... %d/%d resumes, nudging agent to wrap up" id wr max-wr))
1409
+ (emit!
1410
+ {:timing-ms cycle-timing
1411
+ :outcome :working
1412
+ :claimed-task-ids (vec active-claimed-ids)})
1413
+ (recur cycle (inc attempt) completed-runs 0 metrics new-session-id wt-state
1414
+ active-claimed-ids nudge-prompt wr needs-followups signals))
1415
+
1416
+ :else
1417
+ (do
1418
+ (println (format "[%s] Working... (will resume, %d/%d)" id wr max-wr))
1419
+ (emit!
1420
+ {:timing-ms cycle-timing
1421
+ :outcome :working
1422
+ :claimed-task-ids (vec active-claimed-ids)})
1423
+ (recur cycle (inc attempt) completed-runs 0 metrics new-session-id wt-state
1424
+ active-claimed-ids nil wr needs-followups signals))))))))))))))
980
1425
 
981
1426
  ;; =============================================================================
982
1427
  ;; Multi-Worker Execution
@@ -992,7 +1437,14 @@
992
1437
  Returns seq of final worker states."
993
1438
  [workers]
994
1439
  (tasks/ensure-dirs!)
995
- (let [swarm-id (-> workers first :swarm-id)]
1440
+ (let [swarm-id (-> workers first :swarm-id)
1441
+ stale-current (tasks/list-current)]
1442
+ (when (seq stale-current)
1443
+ (println (format "WARNING: %d task(s) already in current/ from a previous run. These may be stale claims."
1444
+ (count stale-current)))
1445
+ (doseq [t stale-current]
1446
+ (println (format " - %s: %s" (:id t) (:summary t))))
1447
+ (println " Run `oompa requeue` to move them back to pending/ if they are stale."))
996
1448
  (println (format "Launching %d workers..." (count workers)))
997
1449
 
998
1450
  ;; Register JVM shutdown hook so SIGTERM/SIGINT triggers graceful stop.
@@ -1014,27 +1466,38 @@
1014
1466
  (map-indexed
1015
1467
  (fn [idx worker]
1016
1468
  (let [worker (assoc worker :id (or (:id worker) (str "w" idx)))]
1017
- (future (run-worker! worker))))
1469
+ (future
1470
+ (try
1471
+ (run-worker! worker)
1472
+ (catch Exception e
1473
+ (println (format "[%s] FATAL: %s" (:id worker) (.getMessage e)))
1474
+ (.printStackTrace e)
1475
+ (throw e))))))
1018
1476
  workers))]
1019
1477
 
1020
1478
  (println "All workers launched. Waiting for completion...")
1021
- (let [results (mapv deref futures)]
1479
+ (let [results (mapv (fn [f]
1480
+ (try
1481
+ (deref f)
1482
+ (catch Exception e
1483
+ (println (format "Worker future failed: %s" (.getMessage e)))
1484
+ {:status :fatal-error :error (.getMessage e)})))
1485
+ futures)]
1022
1486
  ;; Clean exit — tell shutdown hook not to write stopped.json
1023
1487
  (reset! shutdown-requested? false)
1024
1488
  ;; Remove the hook so it doesn't accumulate across calls
1025
1489
  (try (.removeShutdownHook (Runtime/getRuntime) hook) (catch Exception _))
1026
1490
  (println "\nAll workers complete.")
1027
- (doseq [w results]
1028
- (println (format " [%s] %s - %d completed, %d merges, %d claims, %d rejections, %d errors, %d recycled, %d review rounds"
1029
- (:id w)
1030
- (name (:status w))
1031
- (:completed w)
1032
- (or (:merges w) 0)
1033
- (or (:claims w) 0)
1034
- (or (:rejections w) 0)
1035
- (or (:errors w) 0)
1036
- (or (:recycled w) 0)
1037
- (or (:review-rounds-total w) 0))))
1491
+ (let [timing-by-worker (aggregate-cycle-timings-by-worker swarm-id)
1492
+ rows (mapv (fn [result]
1493
+ (let [row-id (or (:id result) "")
1494
+ totals (get timing-by-worker row-id empty-cycle-total)]
1495
+ (worker-summary-row result totals)))
1496
+ results)]
1497
+ (println "\nWorker Summary")
1498
+ (print-table [:Worker :Runs :Cycles :Status :Merges :Claims :Rejects :Errors :Recycled
1499
+ :ReviewRounds :ImplMs :ReviewMs :FixMs :HarnessMs :TotalMs]
1500
+ rows))
1038
1501
 
1039
1502
  ;; Write stopped event — all state derivable from cycle logs
1040
1503
  (when swarm-id
@@ -1082,16 +1545,12 @@
1082
1545
  tagged-prompt (str "[oompa:" swarm-id* ":planner] " prompt-text)
1083
1546
  abs-root (.getAbsolutePath (io/file project-root))
1084
1547
 
1085
- cmd (harness/build-cmd harness
1086
- {:cwd abs-root :model model :prompt tagged-prompt})
1087
-
1088
1548
  _ (println (format "[planner] Running (%s:%s, max_pending: %d, current: %d)"
1089
1549
  (name harness) (or model "default") max-pending pending-before))
1090
1550
 
1091
1551
  result (try
1092
- (process/sh cmd {:dir abs-root
1093
- :in (harness/process-stdin harness tagged-prompt)
1094
- :out :string :err :string})
1552
+ (harness/run-command! harness
1553
+ {:cwd abs-root :model model :prompt tagged-prompt})
1095
1554
  (catch Exception e
1096
1555
  (println (format "[planner] Agent exception: %s" (.getMessage e)))
1097
1556
  {:exit -1 :out "" :err (.getMessage e)}))