@nbardy/oompa 0.7.2 → 0.7.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -21,6 +21,7 @@
21
21
  [babashka.process :as process]
22
22
  [clojure.java.io :as io]
23
23
  [clojure.set]
24
+ [clojure.pprint :refer [print-table]]
24
25
  [clojure.string :as str]))
25
26
 
26
27
  ;; =============================================================================
@@ -40,6 +41,13 @@
40
41
  (def ^:private shutdown-requested? (atom false))
41
42
 
42
43
  (declare task-root-for-cwd)
44
+ (declare verify-mergeable?)
45
+
46
+ (defn- log-ts
47
+ "Readable wall-clock timestamp for worker log lines."
48
+ []
49
+ (.format (java.time.format.DateTimeFormatter/ofPattern "yyyy-MM-dd HH:mm:ss")
50
+ (java.time.LocalDateTime/now)))
43
51
 
44
52
  (defn- load-prompt
45
53
  "Load a prompt file. Tries path as-is first, then from package root."
@@ -47,6 +55,11 @@
47
55
  (or (agent/load-custom-prompt path)
48
56
  (agent/load-custom-prompt (str package-root "/" path))))
49
57
 
58
+ (defn- snippet
59
+ [s limit]
60
+ (let [s (or s "")]
61
+ (subs s 0 (min limit (count s)))))
62
+
50
63
  (defn- build-template-tokens
51
64
  "Build token map for prompt template {var} substitution.
52
65
  Merges core/build-context (rich YAML header, queue, hotspots, etc.)
@@ -90,6 +103,7 @@
90
103
  (str/replace "{TASKS_ROOT}" task-root))))
91
104
 
92
105
  (def ^:private default-max-working-resumes 5)
106
+ (def ^:private default-max-needs-followups 1)
93
107
  (def ^:private default-max-wait-for-tasks 600)
94
108
 
95
109
  (defn create-worker
@@ -100,10 +114,11 @@
100
114
  :review-prompts paths to reviewer prompt files (loaded and concatenated for review).
101
115
  :wait-between seconds to sleep between cycles (nil or 0 = no wait).
102
116
  :max-wait-for-tasks max seconds a non-planner waits for tasks before giving up (default 600).
103
- :max-working-resumes max consecutive working resumes before nudge+kill (default 5)."
117
+ :max-working-resumes max consecutive working resumes before nudge+kill (default 5).
118
+ :max-needs-followups max NEEDS_FOLLOWUP continuations in one cycle (default 1)."
104
119
  [{:keys [id swarm-id harness model runs max-cycles iterations prompts can-plan reasoning
105
120
  reviewers wait-between
106
- max-working-resumes max-wait-for-tasks]}]
121
+ max-working-resumes max-needs-followups max-wait-for-tasks]}]
107
122
  (let [cycle-cap (or max-cycles iterations runs 10)
108
123
  run-goal (or runs iterations 10)]
109
124
  {:id id
@@ -127,6 +142,7 @@
127
142
  default-max-wait-for-tasks))
128
143
  :reviewers reviewers
129
144
  :max-working-resumes (or max-working-resumes default-max-working-resumes)
145
+ :max-needs-followups (or max-needs-followups default-max-needs-followups)
130
146
  :completed 0
131
147
  :status :idle}))
132
148
 
@@ -145,7 +161,9 @@
145
161
  "1. If you have meaningful changes: commit them and signal COMPLETE_AND_READY_FOR_MERGE\n"
146
162
  "2. If scope is too large: create follow-up tasks in tasks/pending/ for remaining work,\n"
147
163
  " commit what you have (even partial notes/design docs), and signal COMPLETE_AND_READY_FOR_MERGE\n"
148
- "3. If you are stuck and cannot make progress: signal __DONE__\n\n"
164
+ "3. If you truly cannot produce a merge-ready artifact this turn, signal NEEDS_FOLLOWUP\n"
165
+ " and explain the remaining work. The framework will keep your claimed tasks and give you\n"
166
+ " one targeted follow-up prompt. This is not success.\n\n"
149
167
  "Do NOT continue working without producing a signal."))
150
168
 
151
169
  (defn- build-context
@@ -187,11 +205,63 @@
187
205
  "\n\n"
188
206
  (if (seq claimed-ids)
189
207
  "Work on your claimed tasks. Signal COMPLETE_AND_READY_FOR_MERGE when done."
190
- "No claims succeeded. CLAIM different tasks, or signal __DONE__ if no suitable work remains."))]
208
+ "No claims succeeded. CLAIM different tasks. If you cannot finish a mergeable artifact after trying hard, signal NEEDS_FOLLOWUP with a short explanation."))]
191
209
  {:claimed claimed-ids
192
210
  :failed failed-ids
193
211
  :resume-prompt prompt}))
194
212
 
213
+ (defn- active-claimed-task-ids
214
+ "Union of tasks claimed earlier in the cycle and tasks moved into current/
215
+ during the latest attempt."
216
+ [claimed-ids mv-claimed-tasks]
217
+ (-> (set claimed-ids)
218
+ (into mv-claimed-tasks)))
219
+
220
+ (defn- recycle-task-id-set!
221
+ "Recycle a set of claimed task IDs from current/ back to pending/.
222
+ Returns a vector of recycled IDs."
223
+ [worker-id task-ids]
224
+ (let [task-ids (set (remove nil? task-ids))
225
+ recycled (when (seq task-ids)
226
+ (tasks/recycle-tasks! task-ids))]
227
+ (when (seq recycled)
228
+ (println (format "[%s] Recycled %d claimed task(s): %s"
229
+ worker-id (count recycled) (str/join ", " recycled))))
230
+ (vec (or recycled []))))
231
+
232
+ (defn- recycle-active-claims!
233
+ "Recycle all claims active in the current cycle."
234
+ [worker-id claimed-ids mv-claimed-tasks]
235
+ (recycle-task-id-set! worker-id (active-claimed-task-ids claimed-ids mv-claimed-tasks)))
236
+
237
+ (defn- build-needs-followup-prompt
238
+ "Prompt injected after NEEDS_FOLLOWUP so the worker keeps ownership and
239
+ closes the loop in the same cycle."
240
+ [claimed-ids output]
241
+ (let [context (build-context)
242
+ explanation (some-> output
243
+ (str/replace #"(?is)^\s*NEEDS_FOLLOWUP\b[\s:.-]*" "")
244
+ str/trim)]
245
+ (str "## NEEDS_FOLLOWUP Follow-up\n\n"
246
+ (if (seq claimed-ids)
247
+ (str "You still own these claimed tasks: "
248
+ (str/join ", " (sort claimed-ids))
249
+ "\n\n")
250
+ "You do not currently own any claimed tasks.\n\n")
251
+ "Continue the SAME cycle and finish a merge-ready artifact.\n"
252
+ "Do not output NEEDS_FOLLOWUP again unless you are still blocked after this follow-up.\n"
253
+ "Prefer the smallest useful diff. If scope is too large, create concrete follow-up tasks in the pending queue and still ship the artifact you have.\n\n"
254
+ (when (seq explanation)
255
+ (str "Your previous explanation:\n"
256
+ explanation
257
+ "\n\n"))
258
+ "Task Status: " (:task_status context) "\n"
259
+ "Remaining Pending:\n"
260
+ (if (str/blank? (:pending_tasks context))
261
+ "(none)"
262
+ (:pending_tasks context))
263
+ "\n\nWhen ready, signal COMPLETE_AND_READY_FOR_MERGE.")))
264
+
195
265
  (defn- run-agent!
196
266
  "Run agent with prompt, return {:output :done? :merge? :claim-ids :exit :session-id}.
197
267
  When resume? is true, continues the existing session with a lighter prompt.
@@ -245,28 +315,31 @@
245
315
  tagged-prompt (str "[oompa:" swarm-id* ":" id "] " prompt)
246
316
  abs-worktree (.getAbsolutePath (io/file worktree-path))
247
317
 
248
- cmd (harness/build-cmd harness
249
- {:cwd abs-worktree :model model :reasoning reasoning
250
- :session-id session-id :resume? resume?
251
- :prompt tagged-prompt :format? true})
252
-
253
318
  result (try
254
- (process/sh cmd {:dir abs-worktree
255
- :in (harness/process-stdin harness tagged-prompt)
256
- :out :string :err :string})
319
+ (harness/run-command! harness
320
+ {:cwd abs-worktree :model model :reasoning reasoning
321
+ :session-id session-id :resume? resume?
322
+ :prompt tagged-prompt :format? true})
257
323
  (catch Exception e
258
324
  (println (format "[%s] Agent exception: %s" id (.getMessage e)))
259
325
  {:exit -1 :out "" :err (.getMessage e)}))
260
326
 
261
- {:keys [output session-id]}
262
- (harness/parse-output harness (:out result) session-id)]
327
+ {:keys [output session-id warning raw-snippet]}
328
+ (harness/parse-output harness (:out result) session-id)
329
+ stderr-snippet (let [stderr (some-> (:err result) str/trim)]
330
+ (when (seq stderr)
331
+ (subs stderr 0 (min 400 (count stderr)))))]
263
332
 
264
333
  {:output output
265
334
  :exit (:exit result)
266
335
  :done? (agent/done-signal? output)
267
336
  :merge? (agent/merge-signal? output)
337
+ :needs-followup? (agent/needs-followup-signal? output)
268
338
  :claim-ids (agent/parse-claim-signal output)
269
- :session-id session-id}))
339
+ :session-id session-id
340
+ :parse-warning warning
341
+ :raw-snippet raw-snippet
342
+ :stderr-snippet stderr-snippet}))
270
343
 
271
344
  (defn- run-reviewer!
272
345
  "Run reviewer on worktree changes.
@@ -274,7 +347,8 @@
274
347
  prev-feedback: vector of previous review outputs (for multi-round context).
275
348
  Returns {:verdict :approved|:needs-changes|:rejected, :comments [...], :output string}"
276
349
  [{:keys [id swarm-id reviewers]} worktree-path prev-feedback]
277
- (let [;; Get actual diff content (not just stat) — truncate to 8000 chars for prompt budget
350
+ (let [start-ms (System/currentTimeMillis)
351
+ ;; Get actual diff content (not just stat) — truncate to 8000 chars for prompt budget
278
352
  diff-result (process/sh ["git" "diff" "main"]
279
353
  {:dir worktree-path :out :string :err :string})
280
354
  diff-content (let [d (:out diff-result)]
@@ -320,14 +394,12 @@
320
394
  "After your verdict line, list every issue as a numbered item with "
321
395
  "the file path and what needs to change.\n")
322
396
  review-prompt (str "[oompa:" swarm-id* ":" id "] " review-body)
323
- cmd (harness/build-cmd harness {:cwd abs-wt :model model :prompt review-prompt})
324
397
  res (try
325
- (process/sh cmd {:dir abs-wt
326
- :in (harness/process-stdin harness review-prompt)
327
- :out :string :err :string})
398
+ (harness/run-command! harness {:cwd abs-wt :model model :prompt review-prompt})
328
399
  (catch Exception e
329
400
  {:exit -1 :out "" :err (.getMessage e)}))
330
- output (or (:out res) "")
401
+ parsed (harness/parse-output harness (:out res) nil)
402
+ output (or (:output parsed) "")
331
403
  has-verdict? (or (re-find #"VERDICT:\s*APPROVED" output)
332
404
  (re-find #"VERDICT:\s*NEEDS_CHANGES" output)
333
405
  (re-find #"VERDICT:\s*REJECTED" output)
@@ -348,7 +420,8 @@
348
420
  (re-find #"VERDICT:\s*NEEDS_CHANGES" output) :needs-changes
349
421
  (re-find #"VERDICT:\s*REJECTED" output) :needs-changes
350
422
  (re-find #"(?i)\bAPPROVED\b" output) :approved
351
- :else :needs-changes)]
423
+ :else :needs-changes)
424
+ duration-ms (- (System/currentTimeMillis) start-ms)]
352
425
 
353
426
  (println (format "[%s] Reviewer verdict: %s" id (name verdict)))
354
427
  (let [summary (subs output 0 (min 300 (count output)))]
@@ -358,14 +431,16 @@
358
431
  {:verdict verdict
359
432
  :comments (when (not= (:exit result) 0)
360
433
  [(:err result)])
361
- :output output}))
434
+ :output output
435
+ :duration-ms duration-ms}))
362
436
 
363
437
  (defn- run-fix!
364
438
  "Ask worker to fix issues based on reviewer feedback.
365
439
  all-feedback: vector of all reviewer outputs so far (accumulated across rounds).
366
440
  Returns {:output string, :exit int}"
367
441
  [{:keys [id swarm-id harness model]} worktree-path all-feedback]
368
- (let [swarm-id* (or swarm-id "unknown")
442
+ (let [start-ms (System/currentTimeMillis)
443
+ swarm-id* (or swarm-id "unknown")
369
444
  feedback-text (if (> (count all-feedback) 1)
370
445
  (str "The reviewer has given feedback across " (count all-feedback) " rounds.\n"
371
446
  "Fix ALL outstanding issues:\n\n"
@@ -381,18 +456,17 @@
381
456
 
382
457
  abs-wt (.getAbsolutePath (io/file worktree-path))
383
458
 
384
- cmd (harness/build-cmd harness
385
- {:cwd abs-wt :model model :prompt fix-prompt})
386
-
387
459
  result (try
388
- (process/sh cmd {:dir abs-wt
389
- :in (harness/process-stdin harness fix-prompt)
390
- :out :string :err :string})
460
+ (harness/run-command! harness
461
+ {:cwd abs-wt :model model :prompt fix-prompt})
391
462
  (catch Exception e
392
- {:exit -1 :out "" :err (.getMessage e)}))]
463
+ {:exit -1 :out "" :err (.getMessage e)}))
464
+ parsed (harness/parse-output harness (:out result) nil)
465
+ duration-ms (- (System/currentTimeMillis) start-ms)]
393
466
 
394
- {:output (:out result)
395
- :exit (:exit result)}))
467
+ {:output (:output parsed)
468
+ :exit (:exit result)
469
+ :duration-ms duration-ms}))
396
470
 
397
471
  (defn- collect-divergence-context
398
472
  "Collect context about how a worktree branch has diverged from main.
@@ -406,6 +480,65 @@
406
480
  :main-log (or main-log "(none)")
407
481
  :diff-stat (or diff-stat "(none)")}))
408
482
 
483
+ (defn- first-nonblank-line
484
+ "Return first non-blank line from text for compact logging."
485
+ [s]
486
+ (some->> (or s "")
487
+ str/split-lines
488
+ (remove str/blank?)
489
+ first))
490
+
491
+ (defn- classify-merge-failure
492
+ "Classify git merge/checkout failure text for better logs."
493
+ [failure-text]
494
+ (cond
495
+ (re-find #"untracked working tree files would be overwritten by merge" (or failure-text ""))
496
+ :untracked-overwrite
497
+ (re-find #"CONFLICT|Merge conflict" (or failure-text ""))
498
+ :conflict
499
+ (re-find #"Your local changes to the following files would be overwritten" (or failure-text ""))
500
+ :local-changes-overwrite
501
+ :else
502
+ :unknown))
503
+
504
+ (defn- run-resolver-agent!
505
+ "Run resolver agent with divergence + failure context.
506
+ Returns :resolved when branch verifies as mergeable, else :failed."
507
+ [worker wt-path worker-id reason-details]
508
+ (println (format "[%s] Branch diverged from main, launching resolver agent%s"
509
+ worker-id
510
+ (if (str/blank? reason-details)
511
+ ""
512
+ (str " (" reason-details ")"))))
513
+ (let [{:keys [branch-log main-log diff-stat]} (collect-divergence-context wt-path)
514
+ resolve-prompt (str "[oompa:" (or (:swarm-id worker) "unknown") ":" worker-id "] "
515
+ "Your branch cannot currently be merged safely into main.\n\n"
516
+ (when-not (str/blank? reason-details)
517
+ (str "Failure context from previous merge attempt:\n"
518
+ reason-details "\n\n"))
519
+ "Your branch's commits (not on main):\n" branch-log "\n\n"
520
+ "Commits on main since you branched:\n" main-log "\n\n"
521
+ "Divergence scope:\n" diff-stat "\n\n"
522
+ "Make this branch cleanly mergeable into main. "
523
+ "Preserve the intent of your branch's changes.\n"
524
+ "You have full git access — rebase, cherry-pick, resolve conflicts, "
525
+ "or clean up merge blockers.\n"
526
+ "When done, verify with: git diff main --stat")
527
+ abs-wt (.getAbsolutePath (io/file wt-path))
528
+ result (try
529
+ (harness/run-command! (:harness worker)
530
+ {:cwd abs-wt :model (:model worker) :prompt resolve-prompt})
531
+ (catch Exception e
532
+ {:exit -1 :out "" :err (.getMessage e)}))]
533
+ (if (zero? (:exit result))
534
+ (if (verify-mergeable? wt-path)
535
+ (do (println (format "[%s] Agent resolved divergence, branch is mergeable" worker-id))
536
+ :resolved)
537
+ (do (println (format "[%s] Agent ran but branch still can't merge cleanly" worker-id))
538
+ :failed))
539
+ (do (println (format "[%s] Resolver agent failed (exit %d)" worker-id (:exit result)))
540
+ :failed))))
541
+
409
542
  (defn- verify-mergeable?
410
543
  "Dry-run merge to verify a worktree branch merges cleanly into main.
411
544
  Does NOT leave merge state behind — always cleans up the dry-run.
@@ -437,36 +570,10 @@
437
570
  ;; Conflict — abort merge to restore clean worktree state, then
438
571
  ;; hand the problem to the agent with full divergence context.
439
572
  (let [_ (process/sh ["git" "merge" "--abort"] {:dir wt-path})
440
- _ (println (format "[%s] Branch diverged from main, launching resolver agent" worker-id))
441
- {:keys [branch-log main-log diff-stat]} (collect-divergence-context wt-path)
442
- resolve-prompt (str "[oompa:" (or (:swarm-id worker) "unknown") ":" worker-id "] "
443
- "Your branch has diverged from main and cannot merge cleanly.\n\n"
444
- "Your branch's commits (not on main):\n" branch-log "\n\n"
445
- "Commits on main since you branched:\n" main-log "\n\n"
446
- "Divergence scope:\n" diff-stat "\n\n"
447
- "Make this branch cleanly mergeable into main. "
448
- "Preserve the intent of your branch's changes.\n"
449
- "You have full git access — rebase, cherry-pick, resolve conflicts, "
450
- "whatever works.\n"
451
- "When done, verify with: git diff main --stat")
452
- abs-wt (.getAbsolutePath (io/file wt-path))
453
- cmd (harness/build-cmd (:harness worker)
454
- {:cwd abs-wt :model (:model worker) :prompt resolve-prompt})
455
- result (try
456
- (process/sh cmd {:dir abs-wt
457
- :in (harness/process-stdin (:harness worker) resolve-prompt)
458
- :out :string :err :string})
459
- (catch Exception e
460
- {:exit -1 :out "" :err (.getMessage e)}))]
461
- (if (zero? (:exit result))
462
- ;; Agent ran — verify the branch actually merges cleanly now
463
- (if (verify-mergeable? wt-path)
464
- (do (println (format "[%s] Agent resolved divergence, branch is mergeable" worker-id))
465
- :resolved)
466
- (do (println (format "[%s] Agent ran but branch still can't merge cleanly" worker-id))
467
- :failed))
468
- (do (println (format "[%s] Resolver agent failed (exit %d)" worker-id (:exit result)))
469
- :failed))))))
573
+ failure-snippet (first-nonblank-line (str (:out merge-result) "\n" (:err merge-result)))]
574
+ (run-resolver-agent! worker wt-path worker-id
575
+ (str "sync_worktree_to_main failed"
576
+ (when failure-snippet (str ": " failure-snippet))))))))
470
577
 
471
578
  (defn- worktree-has-changes?
472
579
  "Check if worktree has committed OR uncommitted changes vs main.
@@ -486,9 +593,11 @@
486
593
  (defn- create-iteration-worktree!
487
594
  "Create a fresh worktree for an iteration. Returns {:dir :branch :path}.
488
595
  Force-removes stale worktree+branch from previous failed runs first."
489
- [project-root worker-id iteration]
490
- (let [wt-dir (format ".w%s-i%d" worker-id iteration)
491
- wt-branch (format "oompa/%s-i%d" worker-id iteration)
596
+ [project-root swarm-id worker-id iteration]
597
+ (let [swarm-token (or swarm-id (subs (str (java.util.UUID/randomUUID)) 0 8))
598
+ work-id (format "s%s-%s-i%d" swarm-token worker-id iteration)
599
+ wt-dir (format ".w%s" work-id)
600
+ wt-branch (format "oompa/%s" work-id)
492
601
  wt-path (str project-root "/" wt-dir)]
493
602
  ;; Clean stale worktree/branch from previous failed runs
494
603
  (process/sh ["git" "worktree" "remove" wt-dir "--force"] {:dir project-root})
@@ -507,39 +616,201 @@
507
616
  (let [post-ids (tasks/current-task-ids)]
508
617
  (clojure.set/difference post-ids pre-current-ids)))
509
618
 
619
+ (defn- now-ms
620
+ []
621
+ (System/currentTimeMillis))
622
+
623
+ (defn- ms->seconds
624
+ [ms]
625
+ (/ ms 1000.0))
626
+
627
+ (defn- pct-of
628
+ [part total]
629
+ (if (pos? total)
630
+ (* 100.0 (/ part (double total)))
631
+ 0.0))
632
+
633
+ (defn- init-cycle-timing
634
+ []
635
+ {:implementation-rounds-ms []
636
+ :reviewer-response-ms []
637
+ :review-fixes-ms []
638
+ :optional-review-ms []
639
+ :llm-calls []})
640
+
641
+ (defn- add-llm-call
642
+ [timing section-name call-name duration-ms]
643
+ (let [timing (or timing (init-cycle-timing))
644
+ duration-ms (max 0 (long (or duration-ms 0)))]
645
+ (-> timing
646
+ (update section-name (fnil conj []) duration-ms)
647
+ (update :llm-calls conj {:name call-name
648
+ :section section-name
649
+ :duration-ms duration-ms}))))
650
+
651
+ (defn- cycle-llm-total-ms
652
+ [timing]
653
+ (let [sections [:implementation-rounds-ms :reviewer-response-ms :review-fixes-ms :optional-review-ms]]
654
+ (->> sections
655
+ (map #(reduce + 0 (or (get timing %) [])))
656
+ (reduce + 0))))
657
+
658
+ (defn- with-call-percent
659
+ [timing total-ms]
660
+ (update timing :llm-calls
661
+ (fn [calls]
662
+ (mapv (fn [{:keys [duration-ms] :as call}]
663
+ (assoc call :percent (pct-of duration-ms total-ms)))
664
+ calls))))
665
+
666
+ (defn- format-timing-segment
667
+ [label durations total-ms]
668
+ (let [durations (vec (or durations []))
669
+ items (if (seq durations)
670
+ (str/join ", "
671
+ (map #(format "%.2fs (%.1f%%)"
672
+ (ms->seconds %) (pct-of % total-ms))
673
+ durations))
674
+ "-")
675
+ section-ms (reduce + 0 durations)]
676
+ (format "%s=[%s] %.2fs (%.1f%%)"
677
+ label
678
+ items
679
+ (ms->seconds section-ms)
680
+ (pct-of section-ms total-ms))))
681
+
682
+ (defn- format-cycle-timing
683
+ [{:keys [implementation-rounds-ms reviewer-response-ms review-fixes-ms optional-review-ms]}
684
+ total-ms]
685
+ (let [llm-ms (cycle-llm-total-ms {:implementation-rounds-ms implementation-rounds-ms
686
+ :reviewer-response-ms reviewer-response-ms
687
+ :review-fixes-ms review-fixes-ms
688
+ :optional-review-ms optional-review-ms})
689
+ harness-ms (max 0 (- total-ms llm-ms))]
690
+ (str "timing: "
691
+ (format-timing-segment "Implementation" implementation-rounds-ms total-ms)
692
+ " | "
693
+ (format-timing-segment "Reviewer" reviewer-response-ms total-ms)
694
+ " | "
695
+ (format-timing-segment "Fixes" review-fixes-ms total-ms)
696
+ " | "
697
+ (format-timing-segment "OptionalReview" optional-review-ms total-ms)
698
+ " | LLM="
699
+ (format "%.2fs (%.1f%%)" (ms->seconds llm-ms) (pct-of llm-ms total-ms))
700
+ " | Harness="
701
+ (format "%.2fs (%.1f%%)" (ms->seconds harness-ms) (pct-of harness-ms total-ms))
702
+ " | Total="
703
+ (format "%.2fs" (ms->seconds total-ms)))))
704
+
705
+ (defn- safe-number
706
+ [v]
707
+ (if (number? v) (long v) 0))
708
+
709
+ (defn- safe-sum
710
+ [v]
711
+ (reduce + 0 (or v [])))
712
+
713
+ (defn- format-ms
714
+ [ms]
715
+ (format "%.2fs" (ms->seconds (safe-number ms))))
716
+
717
+ (defn- cycle-time-sum
718
+ [{:keys [implementation-rounds-ms reviewer-response-ms review-fixes-ms optional-review-ms] :as timing-ms}
719
+ duration-ms]
720
+ (let [impl (safe-sum implementation-rounds-ms)
721
+ review (safe-sum reviewer-response-ms)
722
+ fixes (safe-sum review-fixes-ms)
723
+ optional (safe-sum optional-review-ms)
724
+ total (safe-number duration-ms)
725
+ llm (+ impl review fixes optional)
726
+ harness (max 0 (- total llm))]
727
+ {:implementation-ms impl
728
+ :review-ms review
729
+ :fixes-ms fixes
730
+ :optional-review-ms optional
731
+ :llm-ms llm
732
+ :harness-ms harness
733
+ :total-ms total}))
734
+
735
+ (def ^:private empty-cycle-total
736
+ {:implementation-ms 0
737
+ :review-ms 0
738
+ :fixes-ms 0
739
+ :optional-review-ms 0
740
+ :llm-ms 0
741
+ :harness-ms 0
742
+ :total-ms 0})
743
+
744
+ (defn- aggregate-cycle-timings-by-worker
745
+ [swarm-id]
746
+ (reduce (fn [acc {:keys [worker-id timing-ms duration-ms]}]
747
+ (update acc worker-id
748
+ (fn [current]
749
+ (merge-with + (or current empty-cycle-total)
750
+ (cycle-time-sum timing-ms duration-ms)))))
751
+ {}
752
+ (or (when swarm-id (runs/list-cycles swarm-id)) [])))
753
+
754
+ (defn- worker-summary-row
755
+ [{:keys [id status completed cycles-completed merges claims rejections errors recycled review-rounds-total] :as _worker}
756
+ {:keys [implementation-ms review-ms fixes-ms harness-ms total-ms]}]
757
+ {:Worker id
758
+ :Runs (or completed cycles-completed 0)
759
+ :Cycles (or cycles-completed 0)
760
+ :Status (name status)
761
+ :Merges (or merges 0)
762
+ :Claims (or claims 0)
763
+ :Rejects (or rejections 0)
764
+ :Errors (or errors 0)
765
+ :Recycled (or recycled 0)
766
+ :ReviewRounds (or review-rounds-total 0)
767
+ :ImplMs (format-ms implementation-ms)
768
+ :ReviewMs (format-ms review-ms)
769
+ :FixMs (format-ms fixes-ms)
770
+ :HarnessMs (format-ms harness-ms)
771
+ :TotalMs (format-ms total-ms)})
772
+
510
773
  (defn- emit-cycle-log!
511
- "Write cycle event log. Called at every cycle exit point.
774
+ "Write cycle event log. Called at every cycle attempt exit point.
512
775
  session-id links to the Claude CLI conversation transcript on disk.
513
776
  No mutable summary state — all state is derived from immutable cycle logs."
514
- [swarm-id worker-id cycle run start-ms session-id
515
- {:keys [outcome claimed-task-ids recycled-tasks error-snippet review-rounds]}]
516
- (let [duration-ms (- (System/currentTimeMillis) start-ms)]
777
+ [swarm-id worker-id cycle attempt run start-ms session-id
778
+ {:keys [outcome claimed-task-ids recycled-tasks error-snippet review-rounds timing-ms
779
+ worktree-path signals]}]
780
+ (let [duration-ms (- (now-ms) start-ms)
781
+ timing-ms (or timing-ms (init-cycle-timing))
782
+ harness-ms (max 0 (- duration-ms (cycle-llm-total-ms timing-ms)))
783
+ timing-ms (with-call-percent (assoc timing-ms
784
+ :harness-ms harness-ms
785
+ :llm-calls (or (:llm-calls timing-ms) []))
786
+ duration-ms)]
517
787
  (runs/write-cycle-log!
518
788
  swarm-id worker-id cycle
519
- {:run run
520
- :outcome outcome
521
- :duration-ms duration-ms
522
- :claimed-task-ids (vec (or claimed-task-ids []))
523
- :recycled-tasks (or recycled-tasks [])
524
- :error-snippet error-snippet
525
- :review-rounds (or review-rounds 0)
526
- :session-id session-id})))
527
-
528
- (defn- recycle-orphaned-tasks!
529
- "Recycle tasks that a worker claimed but didn't complete.
530
- Compares current/ task IDs before and after the agent ran —
531
- new IDs that appeared are tasks this worker claimed. On failure
532
- or rejection, move them back to pending/ so other workers can
533
- pick them up. Returns count of recycled tasks."
534
- [worker-id pre-current-ids]
535
- (let [post-current-ids (tasks/current-task-ids)
536
- orphaned-ids (clojure.set/difference post-current-ids pre-current-ids)
537
- recycled (when (seq orphaned-ids)
538
- (tasks/recycle-tasks! orphaned-ids))]
539
- (when (seq recycled)
540
- (println (format "[%s] Recycled %d orphaned task(s): %s"
541
- worker-id (count recycled) (str/join ", " recycled))))
542
- (count (or recycled []))))
789
+ (cond-> {:run run
790
+ :attempt attempt
791
+ :outcome outcome
792
+ :duration-ms duration-ms
793
+ :claimed-task-ids (vec (or claimed-task-ids []))
794
+ :recycled-tasks (or recycled-tasks [])
795
+ :error-snippet error-snippet
796
+ :review-rounds (or review-rounds 0)
797
+ :session-id session-id
798
+ :timing-ms timing-ms}
799
+ worktree-path (assoc :worktree-path worktree-path)
800
+ (seq signals) (assoc :signals (vec signals))))
801
+ (let [terminal-outcomes #{:merged :merge-failed :rejected :sync-failed :no-changes
802
+ :executor-done :stuck :error :interrupted :needs-followup}]
803
+ (if (and outcome (contains? terminal-outcomes outcome))
804
+ (do
805
+ (println (format "[%s] %s" worker-id (format-cycle-timing timing-ms duration-ms)))
806
+ (when worktree-path
807
+ (println (format "[%s] worktree: %s" worker-id worktree-path)))
808
+ (when (seq signals)
809
+ (println (format "[%s] signals: %s" worker-id (str/join " → " signals)))))
810
+ (println (format "[%s] Cycle %d attempt %d continuing"
811
+ worker-id cycle attempt))))))
812
+
813
+
543
814
 
544
815
  (defn- cleanup-worktree!
545
816
  "Remove worktree and branch."
@@ -580,7 +851,8 @@
580
851
  (defn- merge-to-main!
581
852
  "Merge worktree changes to main branch. Serialized via merge-lock to prevent
582
853
  concurrent workers from corrupting the git index. On success, moves claimed
583
- tasks current→complete and annotates metadata. Returns true on success.
854
+ tasks current→complete and annotates metadata. Returns
855
+ {:ok? bool :reason keyword :message string}.
584
856
  claimed-task-ids: set of task IDs this worker claimed (framework owns completion)."
585
857
  [wt-path wt-id worker-id project-root review-rounds claimed-task-ids]
586
858
  (locking merge-lock
@@ -599,31 +871,66 @@
599
871
  (process/sh ["git" "merge" wt-id "--no-edit"]
600
872
  {:dir project-root :out :string :err :string}))
601
873
  success (and (zero? (:exit checkout-result))
602
- (zero? (:exit merge-result)))]
874
+ (zero? (:exit merge-result)))
875
+ failure-text (str/join "\n"
876
+ (remove str/blank?
877
+ [(:out checkout-result)
878
+ (:err checkout-result)
879
+ (when merge-result (:out merge-result))
880
+ (when merge-result (:err merge-result))]))
881
+ failure-reason (if (not (zero? (:exit checkout-result)))
882
+ :checkout-failed
883
+ (classify-merge-failure failure-text))]
603
884
  (if success
604
- (do
885
+ (let [completed (when (seq claimed-task-ids)
886
+ (tasks/complete-by-ids! claimed-task-ids))
887
+ completed-count (count (or completed []))]
605
888
  (println (format "[%s] Merge successful" worker-id))
606
889
  ;; Framework-owned completion: move claimed tasks current→complete
607
- (when (seq claimed-task-ids)
608
- (let [completed (tasks/complete-by-ids! claimed-task-ids)]
609
- (when (seq completed)
610
- (println (format "[%s] Completed %d task(s): %s"
611
- worker-id (count completed) (str/join ", " completed))))))
890
+ (when (seq completed)
891
+ (println (format "[%s] Completed %d task(s): %s"
892
+ worker-id completed-count (str/join ", " completed))))
612
893
  ;; Annotate completed tasks with metadata while still holding merge-lock
613
- (annotate-completed-tasks! project-root worker-id review-rounds))
894
+ (annotate-completed-tasks! project-root worker-id review-rounds)
895
+ {:ok? true
896
+ :reason :merged
897
+ :message "merge successful"
898
+ :completed-count completed-count})
614
899
  ;; FAILED: Clean up git state before releasing merge-lock.
615
900
  ;; Without this, a conflict leaves .git/MERGE_HEAD and poisons the
616
901
  ;; shared index — every subsequent worker fails on `git checkout main`.
617
902
  (do
618
- (println (format "[%s] MERGE FAILED: %s" worker-id
619
- (or (:err merge-result) (:err checkout-result))))
903
+ (println (format "[%s] MERGE FAILED (%s): %s"
904
+ worker-id
905
+ (name failure-reason)
906
+ (or (first-nonblank-line failure-text)
907
+ "no output")))
620
908
  (let [abort-result (process/sh ["git" "merge" "--abort"]
621
909
  {:dir project-root :out :string :err :string})]
622
910
  (when-not (zero? (:exit abort-result))
623
911
  ;; Abort failed (no merge in progress, or other issue) — hard reset.
624
912
  (process/sh ["git" "reset" "--hard" "HEAD"]
625
- {:dir project-root :out :string :err :string})))))
626
- success)))
913
+ {:dir project-root :out :string :err :string})))
914
+ {:ok? false
915
+ :reason failure-reason
916
+ :message (or (first-nonblank-line failure-text) "merge failed")})))))
917
+
918
+ (defn- recover-merge-failure!
919
+ "On merge-to-main failure, launch resolver agent and retry merge once.
920
+ Must run outside merge-lock to avoid blocking other workers."
921
+ [worker wt-path wt-id worker-id project-root review-rounds claimed-task-ids merge-result]
922
+ (let [reason (:reason merge-result)
923
+ msg (:message merge-result)
924
+ _ (println (format "[%s] Launching resolver after merge failure (%s): %s"
925
+ worker-id (name (or reason :unknown)) (or msg "merge failed")))
926
+ resolve-status (run-resolver-agent! worker wt-path worker-id
927
+ (str "merge_to_main failed (" (name (or reason :unknown)) ")"
928
+ (when msg (str ": " msg))))]
929
+ (if (= :failed resolve-status)
930
+ merge-result
931
+ (do
932
+ (println (format "[%s] Retrying merge after resolver" worker-id))
933
+ (merge-to-main! wt-path wt-id worker-id project-root review-rounds claimed-task-ids)))))
627
934
 
628
935
  (defn- task-only-diff?
629
936
  "Check if all changes in worktree are task files only (no code changes).
@@ -653,16 +960,21 @@
653
960
  and fixer has full context of all prior feedback.
654
961
  Writes review logs to runs/{swarm-id}/reviews/ for post-mortem analysis.
655
962
  Returns {:approved? bool, :attempts int}"
656
- [worker wt-path worker-id iteration]
963
+ [worker wt-path worker-id iteration & [cycle-timing]]
657
964
  (if (empty? (:reviewers worker))
658
965
  ;; No reviewer configured, auto-approve
659
- {:approved? true :attempts 0}
966
+ {:approved? true :attempts 0 :timing (or cycle-timing (init-cycle-timing))}
660
967
 
661
968
  ;; Run review loop with accumulated feedback
662
969
  (loop [attempt 1
663
- prev-feedback []]
970
+ prev-feedback []
971
+ timing (or cycle-timing (init-cycle-timing))]
664
972
  (println (format "[%s] Review attempt %d/%d" worker-id attempt max-review-retries))
665
- (let [{:keys [verdict output]} (run-reviewer! worker wt-path prev-feedback)
973
+ (let [{:keys [verdict output duration-ms]} (run-reviewer! worker wt-path prev-feedback)
974
+ timing (add-llm-call timing
975
+ :reviewer-response-ms
976
+ (str "review_" attempt)
977
+ (or duration-ms 0))
666
978
  diff-files (diff-file-names wt-path)]
667
979
 
668
980
  ;; Persist review log for this round
@@ -670,13 +982,14 @@
670
982
  (runs/write-review-log! (:swarm-id worker) worker-id iteration attempt
671
983
  {:verdict verdict
672
984
  :output output
985
+ :duration-ms (or duration-ms 0)
673
986
  :diff-files (or diff-files [])}))
674
987
 
675
988
  (case verdict
676
989
  :approved
677
990
  (do
678
991
  (println (format "[%s] Reviewer APPROVED (attempt %d)" worker-id attempt))
679
- {:approved? true :attempts attempt})
992
+ {:approved? true :attempts attempt :timing timing})
680
993
 
681
994
  ;; :needs-changes — always give the worker a chance to fix.
682
995
  ;; Hard rejection only happens when max review rounds are exhausted.
@@ -684,11 +997,15 @@
684
997
  (if (>= attempt max-review-retries)
685
998
  (do
686
999
  (println (format "[%s] Max review retries reached (%d rounds)" worker-id attempt))
687
- {:approved? false :attempts attempt})
1000
+ {:approved? false :attempts attempt :timing timing})
688
1001
  (do
689
1002
  (println (format "[%s] Reviewer requested changes, fixing..." worker-id))
690
- (run-fix! worker wt-path all-feedback)
691
- (recur (inc attempt) all-feedback)))))))))
1003
+ (let [{:keys [duration-ms]} (run-fix! worker wt-path all-feedback)
1004
+ timing (add-llm-call timing
1005
+ :review-fixes-ms
1006
+ (str "fix_" attempt)
1007
+ (or duration-ms 0))]
1008
+ (recur (inc attempt) all-feedback timing))))))))))
692
1009
 
693
1010
  ;; =============================================================================
694
1011
  ;; Worker Loop
@@ -716,11 +1033,13 @@
716
1033
  (pos? (tasks/pending-count)) true
717
1034
  (pos? (tasks/current-count)) true
718
1035
  (>= waited max-wait-seconds)
719
- (do (println (format "[%s] No tasks after %ds, giving up" worker-id waited))
1036
+ (do (println (format "[%s] [%s] No tasks after %ds, giving up"
1037
+ worker-id (log-ts) waited))
720
1038
  false)
721
1039
  :else
722
1040
  (do (when (zero? (mod waited 60))
723
- (println (format "[%s] Waiting for tasks... (%ds/%ds)" worker-id waited max-wait-seconds)))
1041
+ (println (format "[%s] [%s] Waiting for tasks... (%ds/%ds)"
1042
+ worker-id (log-ts) waited max-wait-seconds)))
724
1043
  (Thread/sleep (* wait-poll-interval 1000))
725
1044
  (recur (+ waited wait-poll-interval))))))
726
1045
 
@@ -740,7 +1059,8 @@
740
1059
  Cycle cap is controlled by :max-cycles (legacy key: :iterations)."
741
1060
  [worker]
742
1061
  (tasks/ensure-dirs!)
743
- (let [{:keys [id runs max-cycles iterations swarm-id wait-between max-wait-for-tasks]} worker
1062
+ (let [{:keys [id runs max-cycles iterations swarm-id wait-between
1063
+ max-wait-for-tasks max-needs-followups]} worker
744
1064
  cycle-cap (or max-cycles iterations 10)
745
1065
  run-goal (or runs iterations 10)
746
1066
  project-root (System/getProperty "user.dir")]
@@ -759,6 +1079,7 @@
759
1079
  (wait-for-tasks! id max-wait-for-tasks))
760
1080
 
761
1081
  (loop [cycle 1
1082
+ attempt 1
762
1083
  completed-runs 0
763
1084
  consec-errors 0
764
1085
  metrics {:merges 0 :rejections 0 :errors 0 :recycled 0 :review-rounds-total 0 :claims 0}
@@ -766,7 +1087,9 @@
766
1087
  wt-state nil
767
1088
  claimed-ids #{}
768
1089
  claim-resume-prompt nil
769
- working-resumes 0]
1090
+ working-resumes 0
1091
+ needs-followups 0
1092
+ signals []]
770
1093
  (let [finish (fn [status]
771
1094
  (assoc worker :completed completed-runs
772
1095
  :runs-completed completed-runs
@@ -783,6 +1106,8 @@
783
1106
  (> cycle cycle-cap)
784
1107
  (do
785
1108
  (when wt-state
1109
+ (when (seq claimed-ids)
1110
+ (recycle-task-id-set! id claimed-ids))
786
1111
  (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state)))
787
1112
  (println (format "[%s] Completed %d/%d runs in %d cycles (%d merges, %d claims, %d rejections, %d errors, %d recycled)"
788
1113
  id completed-runs run-goal (dec cycle)
@@ -792,6 +1117,8 @@
792
1117
  (>= completed-runs run-goal)
793
1118
  (do
794
1119
  (when wt-state
1120
+ (when (seq claimed-ids)
1121
+ (recycle-task-id-set! id claimed-ids))
795
1122
  (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state)))
796
1123
  (println (format "[%s] Reached run goal: %d/%d runs in %d cycles"
797
1124
  id completed-runs run-goal (dec cycle)))
@@ -806,8 +1133,9 @@
806
1133
  (when (seq recycled)
807
1134
  (println (format "[%s] Recycled %d claimed task(s) on shutdown" id (count recycled))))))
808
1135
  (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state)))
809
- (emit-cycle-log! swarm-id id cycle current-run (System/currentTimeMillis) session-id
810
- {:outcome :interrupted})
1136
+ (emit-cycle-log! swarm-id id cycle attempt current-run (now-ms) session-id
1137
+ {:timing-ms (init-cycle-timing)
1138
+ :outcome :interrupted})
811
1139
  (finish :interrupted))
812
1140
 
813
1141
  :else
@@ -820,8 +1148,8 @@
820
1148
  (println (format "[%s] Queue empty, waiting for tasks before cycle %d" id cycle))
821
1149
  (wait-for-tasks! id max-wait-for-tasks))
822
1150
 
823
- (let [wt-state (try
824
- (or wt-state (create-iteration-worktree! project-root id cycle))
1151
+ (let [wt-state (try
1152
+ (or wt-state (create-iteration-worktree! project-root swarm-id id cycle))
825
1153
  (catch Exception e
826
1154
  (println (format "[%s] Worktree creation failed: %s" id (.getMessage e)))
827
1155
  nil))]
@@ -832,160 +1160,268 @@
832
1160
  (do
833
1161
  (println (format "[%s] %d consecutive errors, stopping" id errors))
834
1162
  (finish :error))
835
- (do (backoff-sleep! id errors) (recur (inc cycle) completed-runs errors metrics nil nil #{} nil 0))))
1163
+ (do (backoff-sleep! id errors)
1164
+ (recur (inc cycle) 1 completed-runs errors metrics nil nil #{} nil 0 0 []))))
836
1165
 
837
1166
  (let [resume? (or (some? session-id) (some? claim-resume-prompt))
838
- cycle-start-ms (System/currentTimeMillis)
1167
+ cycle-start-ms (now-ms)
1168
+ cycle-timing (init-cycle-timing)
839
1169
  pre-current-ids (tasks/current-task-ids)
840
- _ (println (format "[%s] %s cycle %d/%d (run %d/%d)"
841
- id (if resume? "Resuming" "Starting") cycle cycle-cap current-run run-goal))
1170
+ _ (println (format "[%s] %s cycle %d/%d (run %d/%d, attempt %d)"
1171
+ id
1172
+ (if (= attempt 1) "Starting" "Resuming")
1173
+ cycle cycle-cap current-run run-goal attempt))
842
1174
  context (build-context)
843
- {:keys [output exit done? merge? claim-ids] :as agent-result}
1175
+ agent-start-ms (now-ms)
1176
+ {:keys [output exit done? merge? needs-followup? claim-ids parse-warning raw-snippet] :as agent-result}
844
1177
  (run-agent! worker (:path wt-state) context session-id resume?
845
1178
  :resume-prompt-override claim-resume-prompt)
1179
+ cycle-timing (add-llm-call cycle-timing
1180
+ :implementation-rounds-ms
1181
+ "implementation"
1182
+ (- (now-ms) agent-start-ms))
846
1183
  new-session-id (:session-id agent-result)
847
- mv-claimed-tasks (detect-claimed-tasks pre-current-ids)]
1184
+ stderr-snippet (:stderr-snippet agent-result)
1185
+ mv-claimed-tasks (detect-claimed-tasks pre-current-ids)
1186
+ active-claimed-ids (active-claimed-task-ids claimed-ids mv-claimed-tasks)
1187
+ wt-path (:path wt-state)
1188
+ ;; Classify the signal for this attempt
1189
+ signal-label (cond
1190
+ (not (zero? exit)) (str "error:exit-" exit)
1191
+ (and (seq claim-ids) (not merge?) (not done?))
1192
+ (str "claim:" (str/join "," claim-ids))
1193
+ merge? "merge"
1194
+ done? "done"
1195
+ needs-followup? "needs-followup"
1196
+ :else "working")
1197
+ signals (conj signals signal-label)
1198
+ emit! (fn [opts]
1199
+ (emit-cycle-log! swarm-id id cycle attempt current-run cycle-start-ms new-session-id
1200
+ (merge {:worktree-path wt-path :signals signals} opts)))]
848
1201
  (cond
849
1202
  (not (zero? exit))
850
1203
  (let [errors (inc consec-errors)
851
- recycled (recycle-orphaned-tasks! id pre-current-ids)
852
- metrics (-> metrics (update :errors inc) (update :recycled + recycled))
1204
+ recycled (recycle-active-claims! id claimed-ids mv-claimed-tasks)
1205
+ metrics (-> metrics (update :errors inc) (update :recycled + (count recycled)))
853
1206
  error-msg (subs (or output "") 0 (min 200 (count (or output ""))))]
854
1207
  (println (format "[%s] Agent error (exit %d): %s" id exit error-msg))
855
- (emit-cycle-log! swarm-id id cycle current-run cycle-start-ms new-session-id
856
- {:outcome :error
857
- :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))
858
- :recycled-tasks (when (pos? recycled) (vec mv-claimed-tasks))
1208
+ (when (seq stderr-snippet)
1209
+ (println (format "[%s] Agent stderr snippet: %s"
1210
+ id
1211
+ (snippet (str/replace stderr-snippet #"\s+" " ") 240))))
1212
+ (emit!
1213
+ {:timing-ms cycle-timing
1214
+ :outcome :error
1215
+ :claimed-task-ids (vec active-claimed-ids)
1216
+ :recycled-tasks (seq recycled)
859
1217
  :error-snippet error-msg})
860
1218
  (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
861
1219
  (if (>= errors max-consecutive-errors)
862
1220
  (do
863
1221
  (println (format "[%s] %d consecutive errors, stopping" id errors))
864
1222
  (finish :error))
865
- (do (backoff-sleep! id errors) (recur (inc cycle) (inc completed-runs) errors metrics nil nil #{} nil 0))))
1223
+ (do (backoff-sleep! id errors)
1224
+ (recur (inc cycle) 1 (inc completed-runs) errors metrics nil nil #{} nil 0 0 []))))
866
1225
 
867
1226
  (and (seq claim-ids) (not merge?) (not done?))
868
1227
  (let [_ (println (format "[%s] CLAIM signal: %s" id (str/join ", " claim-ids)))
869
1228
  {:keys [claimed resume-prompt]} (execute-claims! claim-ids)
870
- new-claimed-ids (into claimed-ids claimed)
1229
+ new-claimed-ids (into active-claimed-ids claimed)
871
1230
  metrics (update metrics :claims + (count claimed))]
872
1231
  (println (format "[%s] Claimed %d/%d tasks" id (count claimed) (count claim-ids)))
873
- (emit-cycle-log! swarm-id id cycle current-run cycle-start-ms new-session-id
874
- {:outcome :claimed :claimed-task-ids (vec claimed)})
875
- (recur (inc cycle) completed-runs 0 metrics new-session-id wt-state
876
- new-claimed-ids resume-prompt 0))
1232
+ (emit!
1233
+ {:timing-ms cycle-timing
1234
+ :outcome :claimed :claimed-task-ids (vec claimed)})
1235
+ (recur cycle (inc attempt) completed-runs 0 metrics new-session-id wt-state
1236
+ new-claimed-ids resume-prompt 0 0 signals))
877
1237
 
878
1238
  merge?
879
1239
  (if (worktree-has-changes? (:path wt-state))
880
1240
  (if (task-only-diff? (:path wt-state))
881
- (do
1241
+ (let [all-claimed active-claimed-ids]
882
1242
  (println (format "[%s] Task-only diff, auto-merging" id))
883
- (let [sync-status (sync-worktree-to-main! worker (:path wt-state) id)
884
- all-claimed (into claimed-ids mv-claimed-tasks)]
1243
+ (let [sync-status (sync-worktree-to-main! worker (:path wt-state) id)]
885
1244
  (if (= :failed sync-status)
886
- (do
1245
+ (let [recycled (recycle-task-id-set! id all-claimed)
1246
+ metrics (update metrics :recycled + (count recycled))]
887
1247
  (println (format "[%s] Sync to main failed, skipping merge" id))
888
- (emit-cycle-log! swarm-id id cycle current-run cycle-start-ms new-session-id
889
- {:outcome :sync-failed :claimed-task-ids (vec all-claimed)})
1248
+ (emit!
1249
+ {:timing-ms cycle-timing
1250
+ :outcome :sync-failed
1251
+ :claimed-task-ids (vec all-claimed)
1252
+ :recycled-tasks (seq recycled)})
890
1253
  (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
891
- (recur (inc cycle) (inc completed-runs) 0 metrics nil nil #{} nil 0))
892
- (let [merged? (merge-to-main! (:path wt-state) (:branch wt-state) id project-root 0 all-claimed)
893
- metrics (if merged? (update metrics :merges inc) metrics)]
1254
+ (recur (inc cycle) 1 (inc completed-runs) 0 metrics nil nil #{} nil 0 0 []))
1255
+ (let [merge-result (merge-to-main! (:path wt-state) (:branch wt-state) id project-root 0 all-claimed)
1256
+ merge-result (if (:ok? merge-result)
1257
+ merge-result
1258
+ (recover-merge-failure! worker (:path wt-state) (:branch wt-state)
1259
+ id project-root 0 all-claimed merge-result))
1260
+ merged? (:ok? merge-result)
1261
+ recycled (when-not merged?
1262
+ (recycle-task-id-set! id all-claimed))
1263
+ completed-count (or (:completed-count merge-result) 0)
1264
+ metrics (cond-> metrics
1265
+ (and merged? (pos? completed-count)) (update :merges inc)
1266
+ (seq recycled) (update :recycled + (count recycled)))]
894
1267
  (println (format "[%s] Cycle %d/%d complete" id cycle cycle-cap))
895
- (emit-cycle-log! swarm-id id cycle current-run cycle-start-ms new-session-id
896
- {:outcome (if merged? :merged :merge-failed)
1268
+ (emit!
1269
+ {:timing-ms cycle-timing
1270
+ :outcome (if merged? :merged :merge-failed)
897
1271
  :claimed-task-ids (vec all-claimed)
1272
+ :recycled-tasks (seq recycled)
898
1273
  :review-rounds 0})
899
1274
  (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
900
- (recur (inc cycle) (inc completed-runs) 0 metrics nil nil #{} nil 0)))))
901
- (let [{:keys [approved? attempts]} (review-loop! worker (:path wt-state) id cycle)
902
- metrics (-> metrics
903
- (update :review-rounds-total + (or attempts 0))
904
- (cond-> (not approved?) (update :rejections inc)))]
1275
+ (recur (inc cycle) 1 (inc completed-runs) 0 metrics nil nil #{} nil 0 0 [])))))
1276
+ (let [{:keys [approved? attempts timing]} (review-loop! worker (:path wt-state) id cycle cycle-timing)
1277
+ cycle-timing (or timing cycle-timing)
1278
+ metrics (-> metrics
1279
+ (update :review-rounds-total + (or attempts 0))
1280
+ (cond-> (not approved?) (update :rejections inc)))]
905
1281
  (if approved?
906
1282
  (let [sync-status (sync-worktree-to-main! worker (:path wt-state) id)
907
- all-claimed (into claimed-ids mv-claimed-tasks)]
1283
+ all-claimed active-claimed-ids]
908
1284
  (if (= :failed sync-status)
909
- (do
1285
+ (let [recycled (recycle-task-id-set! id all-claimed)
1286
+ metrics (update metrics :recycled + (count recycled))]
910
1287
  (println (format "[%s] Sync to main failed after approval, skipping merge" id))
911
- (emit-cycle-log! swarm-id id cycle current-run cycle-start-ms new-session-id
912
- {:outcome :sync-failed
1288
+ (emit!
1289
+ {:timing-ms cycle-timing
1290
+ :outcome :sync-failed
913
1291
  :claimed-task-ids (vec all-claimed)
1292
+ :recycled-tasks (seq recycled)
914
1293
  :review-rounds (or attempts 0)})
915
1294
  (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
916
- (recur (inc cycle) (inc completed-runs) 0 metrics nil nil #{} nil 0))
917
- (let [merged? (merge-to-main! (:path wt-state) (:branch wt-state) id project-root (or attempts 0) all-claimed)
918
- metrics (if merged? (update metrics :merges inc) metrics)]
1295
+ (recur (inc cycle) 1 (inc completed-runs) 0 metrics nil nil #{} nil 0 0 []))
1296
+ (let [merge-result (merge-to-main! (:path wt-state) (:branch wt-state) id project-root (or attempts 0) all-claimed)
1297
+ merge-result (if (:ok? merge-result)
1298
+ merge-result
1299
+ (recover-merge-failure! worker (:path wt-state) (:branch wt-state)
1300
+ id project-root (or attempts 0) all-claimed merge-result))
1301
+ merged? (:ok? merge-result)
1302
+ recycled (when-not merged?
1303
+ (recycle-task-id-set! id all-claimed))
1304
+ completed-count (or (:completed-count merge-result) 0)
1305
+ metrics (cond-> metrics
1306
+ (and merged? (pos? completed-count)) (update :merges inc)
1307
+ (seq recycled) (update :recycled + (count recycled)))]
919
1308
  (println (format "[%s] Cycle %d/%d complete" id cycle cycle-cap))
920
- (emit-cycle-log! swarm-id id cycle current-run cycle-start-ms new-session-id
921
- {:outcome (if merged? :merged :merge-failed)
1309
+ (emit!
1310
+ {:timing-ms cycle-timing
1311
+ :outcome (if merged? :merged :merge-failed)
922
1312
  :claimed-task-ids (vec all-claimed)
1313
+ :recycled-tasks (seq recycled)
923
1314
  :review-rounds (or attempts 0)})
924
1315
  (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
925
- (recur (inc cycle) (inc completed-runs) 0 metrics nil nil #{} nil 0))))
926
- (let [recycled (recycle-orphaned-tasks! id pre-current-ids)
927
- metrics (update metrics :recycled + recycled)]
1316
+ (recur (inc cycle) 1 (inc completed-runs) 0 metrics nil nil #{} nil 0 0 []))))
1317
+ (let [recycled (recycle-active-claims! id claimed-ids mv-claimed-tasks)
1318
+ metrics (update metrics :recycled + (count recycled))]
928
1319
  (println (format "[%s] Cycle %d/%d rejected" id cycle cycle-cap))
929
- (emit-cycle-log! swarm-id id cycle current-run cycle-start-ms new-session-id
930
- {:outcome :rejected
931
- :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))
932
- :recycled-tasks (when (pos? recycled) (vec mv-claimed-tasks))
1320
+ (emit!
1321
+ {:timing-ms cycle-timing
1322
+ :outcome :rejected
1323
+ :claimed-task-ids (vec active-claimed-ids)
1324
+ :recycled-tasks (seq recycled)
933
1325
  :review-rounds (or attempts 0)})
934
1326
  (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
935
- (recur (inc cycle) (inc completed-runs) 0 metrics nil nil #{} nil 0)))))
936
- (let [recycled (recycle-orphaned-tasks! id pre-current-ids)
937
- metrics (update metrics :recycled + recycled)]
1327
+ (recur (inc cycle) 1 (inc completed-runs) 0 metrics nil nil #{} nil 0 0 [])))))
1328
+ (let [recycled (recycle-active-claims! id claimed-ids mv-claimed-tasks)
1329
+ metrics (update metrics :recycled + (count recycled))]
938
1330
  (println (format "[%s] Merge signaled but no changes, skipping" id))
939
- (emit-cycle-log! swarm-id id cycle current-run cycle-start-ms new-session-id
940
- {:outcome :no-changes
941
- :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))
942
- :recycled-tasks (when (pos? recycled) (vec mv-claimed-tasks))})
1331
+ (emit!
1332
+ {:timing-ms cycle-timing
1333
+ :outcome :no-changes
1334
+ :claimed-task-ids (vec active-claimed-ids)
1335
+ :recycled-tasks (seq recycled)})
943
1336
  (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
944
- (recur (inc cycle) (inc completed-runs) 0 metrics nil nil #{} nil 0)))
1337
+ (recur (inc cycle) 1 (inc completed-runs) 0 metrics nil nil #{} nil 0 0 [])))
945
1338
 
946
1339
  done?
947
- (let [recycled (recycle-orphaned-tasks! id pre-current-ids)
948
- metrics (update metrics :recycled + recycled)]
949
- (println (format "[%s] __DONE__ signal, resetting session (cycle %d/%d)" id cycle cycle-cap))
950
- (emit-cycle-log! swarm-id id cycle current-run cycle-start-ms new-session-id
951
- {:outcome :executor-done
952
- :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))
953
- :recycled-tasks (when (pos? recycled) (vec mv-claimed-tasks))})
1340
+ (let [recycled (recycle-active-claims! id claimed-ids mv-claimed-tasks)
1341
+ metrics (-> metrics
1342
+ (update :recycled + (count recycled))
1343
+ (update :errors inc))]
1344
+ (println (format "[%s] Invalid __DONE__ signal from executor; stopping worker (cycle %d/%d)" id cycle cycle-cap))
1345
+ (emit!
1346
+ {:timing-ms cycle-timing
1347
+ :outcome :error
1348
+ :claimed-task-ids (vec active-claimed-ids)
1349
+ :recycled-tasks (seq recycled)
1350
+ :error-snippet "__DONE__ is not a valid executor signal; use CLAIM(...) or COMPLETE_AND_READY_FOR_MERGE"})
954
1351
  (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
955
- (recur (inc cycle) completed-runs 0 metrics nil nil #{} nil 0))
1352
+ (finish :error))
1353
+
1354
+ needs-followup?
1355
+ (let [summary (subs (or output "") 0 (min 240 (count (or output ""))))
1356
+ next-followups (inc needs-followups)]
1357
+ (emit!
1358
+ {:timing-ms cycle-timing
1359
+ :outcome :needs-followup
1360
+ :claimed-task-ids (vec active-claimed-ids)
1361
+ :error-snippet summary})
1362
+ (if (> next-followups max-needs-followups)
1363
+ (let [recycled (recycle-active-claims! id claimed-ids mv-claimed-tasks)
1364
+ metrics (-> metrics
1365
+ (update :recycled + (count recycled))
1366
+ (update :errors inc))]
1367
+ (println (format "[%s] NEEDS_FOLLOWUP exhausted (%d/%d); stopping worker" id next-followups max-needs-followups))
1368
+ (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
1369
+ (finish :error))
1370
+ (let [followup-prompt (build-needs-followup-prompt active-claimed-ids output)]
1371
+ (println (format "[%s] NEEDS_FOLLOWUP signal; continuing cycle with follow-up prompt (%d/%d)"
1372
+ id next-followups max-needs-followups))
1373
+ (recur cycle (inc attempt) completed-runs 0 metrics new-session-id wt-state
1374
+ active-claimed-ids followup-prompt 0 next-followups signals))))
956
1375
 
957
1376
  :else
958
1377
  (let [wr (inc working-resumes)
959
1378
  max-wr (:max-working-resumes worker)]
1379
+ (when parse-warning
1380
+ (if (str/includes? parse-warning "AUTH_REQUIRED:")
1381
+ (println (format "[%s] LOGIN ISSUE: %s"
1382
+ id
1383
+ (str/replace parse-warning #"^AUTH_REQUIRED:\s*" "")))
1384
+ (println (format "[%s] WARNING: %s" id parse-warning))))
1385
+ (when (and parse-warning (seq raw-snippet))
1386
+ (println (format "[%s] Raw output snippet: %s"
1387
+ id
1388
+ (snippet (str/replace raw-snippet #"\s+" " ") 240))))
1389
+ (when (seq stderr-snippet)
1390
+ (println (format "[%s] Agent stderr snippet: %s"
1391
+ id
1392
+ (snippet (str/replace stderr-snippet #"\s+" " ") 240))))
960
1393
  (cond
961
1394
  (> wr max-wr)
962
- (let [recycled (recycle-orphaned-tasks! id pre-current-ids)
963
- metrics (update metrics :recycled + recycled)]
1395
+ (let [recycled (recycle-active-claims! id claimed-ids mv-claimed-tasks)
1396
+ metrics (update metrics :recycled + (count recycled))]
964
1397
  (println (format "[%s] Stuck after %d working resumes + nudge, resetting session" id wr))
965
- (emit-cycle-log! swarm-id id cycle current-run cycle-start-ms new-session-id
966
- {:outcome :stuck
967
- :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))
968
- :recycled-tasks (when (pos? recycled) (vec mv-claimed-tasks))})
1398
+ (emit!
1399
+ {:timing-ms cycle-timing
1400
+ :outcome :stuck
1401
+ :claimed-task-ids (vec active-claimed-ids)
1402
+ :recycled-tasks (seq recycled)})
969
1403
  (cleanup-worktree! project-root (:dir wt-state) (:branch wt-state))
970
- (recur (inc cycle) (inc completed-runs) 0 metrics nil nil #{} nil 0))
1404
+ (recur (inc cycle) 1 (inc completed-runs) 0 metrics nil nil #{} nil 0 0 []))
971
1405
 
972
1406
  (= wr max-wr)
973
1407
  (do
974
1408
  (println (format "[%s] Working... %d/%d resumes, nudging agent to wrap up" id wr max-wr))
975
- (emit-cycle-log! swarm-id id cycle current-run cycle-start-ms new-session-id
976
- {:outcome :working
977
- :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))})
978
- (recur (inc cycle) completed-runs 0 metrics new-session-id wt-state
979
- claimed-ids nudge-prompt wr))
1409
+ (emit!
1410
+ {:timing-ms cycle-timing
1411
+ :outcome :working
1412
+ :claimed-task-ids (vec active-claimed-ids)})
1413
+ (recur cycle (inc attempt) completed-runs 0 metrics new-session-id wt-state
1414
+ active-claimed-ids nudge-prompt wr needs-followups signals))
980
1415
 
981
1416
  :else
982
1417
  (do
983
1418
  (println (format "[%s] Working... (will resume, %d/%d)" id wr max-wr))
984
- (emit-cycle-log! swarm-id id cycle current-run cycle-start-ms new-session-id
985
- {:outcome :working
986
- :claimed-task-ids (vec (into claimed-ids mv-claimed-tasks))})
987
- (recur (inc cycle) completed-runs 0 metrics new-session-id wt-state
988
- claimed-ids nil wr))))))))))))))
1419
+ (emit!
1420
+ {:timing-ms cycle-timing
1421
+ :outcome :working
1422
+ :claimed-task-ids (vec active-claimed-ids)})
1423
+ (recur cycle (inc attempt) completed-runs 0 metrics new-session-id wt-state
1424
+ active-claimed-ids nil wr needs-followups signals))))))))))))))
989
1425
 
990
1426
  ;; =============================================================================
991
1427
  ;; Multi-Worker Execution
@@ -1001,7 +1437,14 @@
1001
1437
  Returns seq of final worker states."
1002
1438
  [workers]
1003
1439
  (tasks/ensure-dirs!)
1004
- (let [swarm-id (-> workers first :swarm-id)]
1440
+ (let [swarm-id (-> workers first :swarm-id)
1441
+ stale-current (tasks/list-current)]
1442
+ (when (seq stale-current)
1443
+ (println (format "WARNING: %d task(s) already in current/ from a previous run. These may be stale claims."
1444
+ (count stale-current)))
1445
+ (doseq [t stale-current]
1446
+ (println (format " - %s: %s" (:id t) (:summary t))))
1447
+ (println " Run `oompa requeue` to move them back to pending/ if they are stale."))
1005
1448
  (println (format "Launching %d workers..." (count workers)))
1006
1449
 
1007
1450
  ;; Register JVM shutdown hook so SIGTERM/SIGINT triggers graceful stop.
@@ -1023,27 +1466,38 @@
1023
1466
  (map-indexed
1024
1467
  (fn [idx worker]
1025
1468
  (let [worker (assoc worker :id (or (:id worker) (str "w" idx)))]
1026
- (future (run-worker! worker))))
1469
+ (future
1470
+ (try
1471
+ (run-worker! worker)
1472
+ (catch Exception e
1473
+ (println (format "[%s] FATAL: %s" (:id worker) (.getMessage e)))
1474
+ (.printStackTrace e)
1475
+ (throw e))))))
1027
1476
  workers))]
1028
1477
 
1029
1478
  (println "All workers launched. Waiting for completion...")
1030
- (let [results (mapv deref futures)]
1479
+ (let [results (mapv (fn [f]
1480
+ (try
1481
+ (deref f)
1482
+ (catch Exception e
1483
+ (println (format "Worker future failed: %s" (.getMessage e)))
1484
+ {:status :fatal-error :error (.getMessage e)})))
1485
+ futures)]
1031
1486
  ;; Clean exit — tell shutdown hook not to write stopped.json
1032
1487
  (reset! shutdown-requested? false)
1033
1488
  ;; Remove the hook so it doesn't accumulate across calls
1034
1489
  (try (.removeShutdownHook (Runtime/getRuntime) hook) (catch Exception _))
1035
1490
  (println "\nAll workers complete.")
1036
- (doseq [w results]
1037
- (println (format " [%s] %s - %d completed, %d merges, %d claims, %d rejections, %d errors, %d recycled, %d review rounds"
1038
- (:id w)
1039
- (name (:status w))
1040
- (:completed w)
1041
- (or (:merges w) 0)
1042
- (or (:claims w) 0)
1043
- (or (:rejections w) 0)
1044
- (or (:errors w) 0)
1045
- (or (:recycled w) 0)
1046
- (or (:review-rounds-total w) 0))))
1491
+ (let [timing-by-worker (aggregate-cycle-timings-by-worker swarm-id)
1492
+ rows (mapv (fn [result]
1493
+ (let [row-id (or (:id result) "")
1494
+ totals (get timing-by-worker row-id empty-cycle-total)]
1495
+ (worker-summary-row result totals)))
1496
+ results)]
1497
+ (println "\nWorker Summary")
1498
+ (print-table [:Worker :Runs :Cycles :Status :Merges :Claims :Rejects :Errors :Recycled
1499
+ :ReviewRounds :ImplMs :ReviewMs :FixMs :HarnessMs :TotalMs]
1500
+ rows))
1047
1501
 
1048
1502
  ;; Write stopped event — all state derivable from cycle logs
1049
1503
  (when swarm-id
@@ -1091,16 +1545,12 @@
1091
1545
  tagged-prompt (str "[oompa:" swarm-id* ":planner] " prompt-text)
1092
1546
  abs-root (.getAbsolutePath (io/file project-root))
1093
1547
 
1094
- cmd (harness/build-cmd harness
1095
- {:cwd abs-root :model model :prompt tagged-prompt})
1096
-
1097
1548
  _ (println (format "[planner] Running (%s:%s, max_pending: %d, current: %d)"
1098
1549
  (name harness) (or model "default") max-pending pending-before))
1099
1550
 
1100
1551
  result (try
1101
- (process/sh cmd {:dir abs-root
1102
- :in (harness/process-stdin harness tagged-prompt)
1103
- :out :string :err :string})
1552
+ (harness/run-command! harness
1553
+ {:cwd abs-root :model model :prompt tagged-prompt})
1104
1554
  (catch Exception e
1105
1555
  (println (format "[planner] Agent exception: %s" (.getMessage e)))
1106
1556
  {:exit -1 :out "" :err (.getMessage e)}))