@yemi33/minions 0.1.2044 → 0.1.2045

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/engine/queries.js CHANGED
@@ -1772,8 +1772,17 @@ function _scheduleProjectGitStatusRefresh(localPath, key, configuredMainBranch)
1772
1772
  if (existing && existing.promise) return existing.promise;
1773
1773
  const entry = existing || { ts: 0, value: PROJECT_GIT_STATUS_PENDING, promise: null };
1774
1774
  const prevValue = entry.value;
1775
+ // Capture probe-start time BEFORE running git, not after. Used as the
1776
+ // baseline for `_projectGitRefsAdvancedSince` on the next call. If we
1777
+ // captured probe-END time, a file written just before the probe started
1778
+ // could end up with `mtimeMs >= entry.ts` on a fast filesystem (NTFS
1779
+ // mtime granularity vs millisecond-precise Date.now()), busting the
1780
+ // cache spuriously on the very next read. Probe-START is the safer
1781
+ // anchor — any file with `mtimeMs > probeStartTs` legitimately changed
1782
+ // at-or-after the probe, so re-probing is correct.
1783
+ const probeStartTs = Date.now();
1775
1784
  entry.promise = _probeProjectGitStatus(localPath, configuredMainBranch).then(value => {
1776
- entry.ts = Date.now();
1785
+ entry.ts = probeStartTs;
1777
1786
  entry.value = value;
1778
1787
  entry.promise = null;
1779
1788
  if (_onProjectGitStatusChanged && !_projectGitStatusEqual(prevValue, value)) {
@@ -1824,6 +1833,30 @@ function _resolveGitDir(localPath) {
1824
1833
  return null;
1825
1834
  }
1826
1835
 
1836
+ // For a per-worktree gitdir, resolve the COMMON gitdir shared across all
1837
+ // linked worktrees of the same repo. Git writes per-worktree state
1838
+ // (HEAD, logs/HEAD, index) into the worktree-specific gitdir, but state
1839
+ // that's logically per-repo (objects, refs/remotes/, FETCH_HEAD) into
1840
+ // the common gitdir. The common location is recorded in
1841
+ // `<per-worktree-gitdir>/commondir`, a file containing either an
1842
+ // absolute path or a path relative to the per-worktree gitdir
1843
+ // (typically `../..` — resolves from `<main>/.git/worktrees/<name>` →
1844
+ // `<main>/.git`). For the main worktree there is no commondir file and
1845
+ // `gitDir` already IS the common gitdir, so this function returns
1846
+ // `gitDir` unchanged. Callers should track FETCH_HEAD and
1847
+ // refs/remotes/origin/<comparator> against the COMMON gitdir; logs/HEAD
1848
+ // must stay on the per-worktree gitdir so per-worktree HEAD moves still
1849
+ // register.
1850
// gitDir: per-worktree gitdir as produced by `_resolveGitDir` (may be null).
// Returns null for a falsy gitDir; gitDir itself when `<gitDir>/commondir`
// is missing, unreadable, or empty; otherwise the commondir target as a
// normalized absolute path.
function _resolveCommonGitDir(gitDir) {
  if (!gitDir) return null;
  let raw;
  try {
    raw = fs.readFileSync(path.join(gitDir, 'commondir'), 'utf8').trim();
  } catch {
    // Deliberate best-effort: no readable commondir file means we treat
    // gitDir as already being the common gitdir.
    return gitDir;
  }
  if (!raw) return gitDir;
  // path.resolve ignores `gitDir` when `raw` is already absolute and resolves
  // against it otherwise, so one call covers both documented commondir
  // forms — and the result is normalized in both cases (no trailing
  // separators or leftover `..` segments).
  return path.resolve(gitDir, raw);
}
1859
+
1827
1860
  // Return true when any of the per-project git ref files (logs/HEAD,
1828
1861
  // FETCH_HEAD, refs/remotes/origin/<comparator>) have mtimeMs > cachedTs.
1829
1862
  // Lets `getProjectGitStatus` bypass its 15s TTL after `git pull`, `git
@@ -1836,13 +1869,17 @@ function _resolveGitDir(localPath) {
1836
1869
  function _projectGitRefsAdvancedSince(localPath, cachedTs, configuredMainBranch) {
1837
1870
  const gitDir = _resolveGitDir(localPath);
1838
1871
  if (!gitDir) return false;
1872
+ // logs/HEAD is per-worktree; FETCH_HEAD + refs/remotes/origin/* live in
1873
+ // the COMMON gitdir for linked worktrees. For the main worktree both
1874
+ // resolve to the same place, so this is a no-op there.
1875
+ const commonGitDir = _resolveCommonGitDir(gitDir);
1839
1876
  const candidates = [
1840
1877
  path.join(gitDir, 'logs', 'HEAD'),
1841
- path.join(gitDir, 'FETCH_HEAD'),
1878
+ path.join(commonGitDir, 'FETCH_HEAD'),
1842
1879
  ];
1843
1880
  const comparator = configuredMainBranch && String(configuredMainBranch).trim();
1844
1881
  if (comparator) {
1845
- candidates.push(path.join(gitDir, 'refs', 'remotes', 'origin', comparator));
1882
+ candidates.push(path.join(commonGitDir, 'refs', 'remotes', 'origin', comparator));
1846
1883
  }
1847
1884
  for (const file of candidates) {
1848
1885
  try {
@@ -2052,8 +2089,16 @@ function getStatusFastStateMtimePaths(config) {
2052
2089
  files.push(shared.projectPrPath(p));
2053
2090
  if (p && p.localPath) {
2054
2091
  const gitDir = _resolveGitDir(p.localPath) || path.join(p.localPath, '.git');
2092
+ // logs/HEAD is per-worktree (HEAD moves, commits, checkouts).
2093
+ // FETCH_HEAD lives in the COMMON gitdir — `git fetch` from a linked
2094
+ // worktree writes to `<main>/.git/FETCH_HEAD`, not to the
2095
+ // per-worktree subdir. Tracking only the per-worktree path here
2096
+ // would leave linked-worktree projects stuck after `git fetch`
2097
+ // (the file at `<main>/.git/worktrees/<name>/FETCH_HEAD` never
2098
+ // exists — verified empirically).
2099
+ const commonGitDir = _resolveCommonGitDir(gitDir);
2055
2100
  files.push(path.join(gitDir, 'logs', 'HEAD'));
2056
- files.push(path.join(gitDir, 'FETCH_HEAD'));
2101
+ files.push(path.join(commonGitDir, 'FETCH_HEAD'));
2057
2102
  }
2058
2103
  }
2059
2104
  return files;
@@ -2100,9 +2145,25 @@ function getStatusFastStateMtimePaths(config) {
2100
2145
  * Files intentionally NOT tracked here:
2101
2146
  * - version, autoMode, installId — change only on human/CLI edits, which
2102
2147
  * already pop the slow-state via reloadConfig + the 60 s TTL.
2103
- * - project git state — already invalidated via the
2104
- * `_setOnProjectGitStatusChanged` callback into `invalidateStatusCache`
2105
- * (W-mpgrk5cy fix); also tracked in fast-state via `.git/logs/HEAD`.
2148
+ *
2149
+ * Per-project `.git/logs/HEAD` + `.git/FETCH_HEAD` ARE tracked here in
2150
+ * addition to the fast-state tracker. The project payload (`projects:`
2151
+ * with ahead/behind counts) lives in the slow-state slice, and
2152
+ * `getProjectGitStatus` is called only from `_buildStatusSlowState`.
2153
+ * Without tracking the git refs here, a fresh `git pull` busts
2154
+ * fast-state but leaves slow-state cached for up to 60 s —
2155
+ * `getProjectGitStatus` is never called, so the background probe never
2156
+ * runs, so the `_setOnProjectGitStatusChanged` callback never fires,
2157
+ * so the cache stays stale. Tracking the refs in BOTH tiers means any
2158
+ * HEAD-moving / fetching git operation busts slow-state on the next
2159
+ * poll, the rebuild calls `getProjectGitStatus` which then schedules
2160
+ * the probe, and the probe's invalidation callback finishes the
2161
+ * round-trip within ~1 poll cycle. `logs/HEAD` is resolved via
2162
+ * `_resolveGitDir` (per-worktree); `FETCH_HEAD` is resolved via
2163
+ * `_resolveCommonGitDir` (shared across all linked worktrees of the
2164
+ * same repo — `git fetch` from a linked worktree writes to the main
2165
+ * gitdir, not the per-worktree subdir, so tracking the per-worktree
2166
+ * path would silently miss every fetch).
2106
2167
  *
2107
2168
  * NOTE: Detecting a change here busts the dashboard's slow-state cache, but
2108
2169
  * the inner per-source caches (`queries._skillsCache` 30 s, dashboard's
@@ -2170,6 +2231,22 @@ function getStatusSlowStateMtimePaths(config) {
2170
2231
  }
2171
2232
  }
2172
2233
 
2234
+ // Per-project git refs — see the "Per-project .git/logs/HEAD" note in
2235
+ // the header. Same pair the fast-state tracker watches; `logs/HEAD` is
2236
+ // per-worktree (resolved via `_resolveGitDir`) while `FETCH_HEAD` lives
2237
+ // in the COMMON gitdir (resolved via `_resolveCommonGitDir`) — git
2238
+ // fetches from a linked worktree write to `<main>/.git/FETCH_HEAD`,
2239
+ // not into the per-worktree subdir. For the main worktree both
2240
+ // resolvers return the same gitdir, so the behavior collapses to the
2241
+ // expected `<localPath>/.git/{logs/HEAD,FETCH_HEAD}` pair.
2242
+ for (const project of projects) {
2243
+ if (!project || !project.localPath) continue;
2244
+ const gitDir = _resolveGitDir(project.localPath) || path.join(project.localPath, '.git');
2245
+ const commonGitDir = _resolveCommonGitDir(gitDir);
2246
+ files.push(path.join(gitDir, 'logs', 'HEAD'));
2247
+ files.push(path.join(commonGitDir, 'FETCH_HEAD'));
2248
+ }
2249
+
2173
2250
  return files;
2174
2251
  }
2175
2252
 
package/engine/shared.js CHANGED
@@ -1779,6 +1779,7 @@ const ENGINE_DEFAULTS = {
1779
1779
  autoReReviewPrs: true, // auto-dispatch review agents after a PR fix is pushed
1780
1780
  autoFixReviewFeedback: true, // auto-dispatch fix agents for minions review changes-requested verdicts
1781
1781
  autoFixHumanComments: true, // auto-dispatch fix agents for actionable human PR comments
1782
+ autoConsolidateMemory: false, // opt-in: periodically spawn engine/kb-sweep-runner.js from the tick loop (4h cadence). Inbox→notes consolidation already runs every tick via consolidateInbox; this flag only controls the KB sweep.
1782
1783
  prNoOpFixPauseAttempts: 2, // pause one PR automation cause after repeated no-op fixes for unchanged evidence
1783
1784
  completionReportRetentionDays: 90, // retain completion report sidecars beyond capped dispatch history
1784
1785
  completionReportMaxFiles: 5000, // hard cap for completion report sidecars during cleanup
@@ -1787,6 +1788,7 @@ const ENGINE_DEFAULTS = {
1787
1788
  evalLoop: true, // enable review→fix loop after implementation completes
1788
1789
  evalMaxCost: null, // USD ceiling per work item across all eval iterations; null = no limit (gather baseline data first)
1789
1790
  maxRetries: 3, // max dispatch retries before marking work item as failed
1791
+ maxRetriesPerAgent: 2, // W-mpmwxn1j — per-agent retry cap. When the SAME agent fails the same WI this many times, the next retry MUST reassign to a different eligible agent (consults routing.md + agent availability). Falls back to the same agent only when no alternate is available. Counted separately from `maxRetries` (which caps total retries across all agents) and tracked on the WI as `_retriesByAgent: { agentId: count }`. Hard-pinned agents bypass reassignment (operator intent wins).
1790
1792
  maxPhantomRetries: 3, // max retries for "phantom completion" (runtime crashed before emitting type:"result"); tracked separately from _retryCount so phantom retries don't pollute the normal PR-attachment retry budget. See engine/lifecycle.markMissingPrAttachment + detectNonTerminalResultSummary.
1791
1793
  minRetryGapMs: 120000, // 2min — minimum gap between retry dispatches for the same work item; prevents tight retry loops when an idempotent agent (e.g. review bailing out on a duplicate) cannot produce the expected output (#1770)
1792
1794
  pipelineApiRetries: 2, // max attempts for pipeline API calls
@@ -4752,6 +4754,39 @@ function safeSlugComponent(text, maxLen = 80) {
4752
4754
  return `${base}-${hash}`.slice(0, maxLen);
4753
4755
  }
4754
4756
 
4757
+ // W-mpmwxn1j — Per-agent retry tracking. After the same agent has failed a WI
4758
+ // `ENGINE_DEFAULTS.maxRetriesPerAgent` times, the next dispatch must reassign
4759
+ // to a different eligible agent. The counter is stored on the WI itself
4760
+ // (`_retriesByAgent: { agentId: count }`) so engine restart preserves the
4761
+ // state. The counter is cleared on successful completion alongside
4762
+ // `_retryCount`. The caller passes a mutable WI object (inside a
4763
+ // `mutateWorkItems` / `mutateJsonFileLocked` callback) and the failed agent ID.
4764
+ // Anonymous failures (no resolvable agent) skip the bump — we'd corrupt the
4765
+ // shape with an `undefined` key. Returns the new per-agent count for logging.
4766
// wi: mutable work-item object; agentId: id of the agent that just failed.
// Returns the new per-agent failure count, or 0 when either argument is
// missing (nothing is written in that case).
function bumpAgentRetryCount(wi, agentId) {
  if (!wi || !agentId) return 0;
  let counts = wi._retriesByAgent;
  if (!counts || typeof counts !== 'object' || Array.isArray(counts)) {
    // Missing or malformed map — start over with a clean object.
    counts = {};
    wi._retriesByAgent = counts;
  }
  // Read own properties only so inherited Object.prototype members are never
  // mistaken for a stored counter.
  const hasOwn = Object.prototype.hasOwnProperty.call(counts, agentId);
  const next = ((hasOwn && Number(counts[agentId])) || 0) + 1;
  // defineProperty instead of plain assignment: `counts['__proto__'] = n`
  // is silently dropped by the prototype setter, which would pin that
  // agent's counter at 1 forever.
  Object.defineProperty(counts, agentId, {
    value: next,
    writable: true,
    enumerable: true,
    configurable: true,
  });
  return next;
}
4775
+
4776
// Read-only companion to the per-agent retry counter stored on a work item
// under `_retriesByAgent`. Yields 0 whenever the work item, the agent id, or
// a usable (non-array object) counter map is absent, or when the stored
// value is not a non-zero number.
function getAgentRetryCount(wi, agentId) {
  const counts = wi && agentId ? wi._retriesByAgent : null;
  const usable = Boolean(counts) && typeof counts === 'object' && !Array.isArray(counts);
  if (!usable) return 0;
  const stored = Number(counts[agentId]);
  return stored || 0;
}
4782
+
4783
// Resolve the effective per-agent retry cap from `config.engine`.
// Accepts a positive finite number or a numeric string; anything else
// (missing, zero, negative, NaN, boolean, array, object) falls back to
// ENGINE_DEFAULTS.maxRetriesPerAgent. Returns a positive integer when the
// configured value is used.
function resolveMaxRetriesPerAgent(config) {
  const raw = config?.engine?.maxRetriesPerAgent;
  // Gate on type first: Number(true) === 1 and Number([3]) === 3, so bare
  // coercion would accept malformed config as a valid cap.
  const isNumericString = typeof raw === 'string' && raw.trim() !== '';
  if (typeof raw === 'number' || isNumericString) {
    const val = Number(raw);
    // Failure counts are integers, so `count >= 1.5` already behaves like
    // `count >= 2`; rounding up keeps that behavior and keeps logs sane.
    if (Number.isFinite(val) && val > 0) return Math.ceil(val);
  }
  return ENGINE_DEFAULTS.maxRetriesPerAgent;
}
4789
+
4755
4790
  const PR_AUTOMATION_CAUSE_LIMIT = 50;
4756
4791
 
4757
4792
  function getPrAutomationCauses(pr) {
@@ -4912,6 +4947,7 @@ module.exports = {
4912
4947
  tailTextBytes,
4913
4948
  appendTextTail,
4914
4949
  writeToInbox, parseNoteId,
4950
+ bumpAgentRetryCount, getAgentRetryCount, resolveMaxRetriesPerAgent,
4915
4951
  exec,
4916
4952
  execAsync,
4917
4953
  execSilent,
package/engine/timeout.js CHANGED
@@ -587,6 +587,10 @@ function checkTimeouts(config) {
587
587
  log('info', `Reconcile: work item ${item.id} agent died — auto-retry ${retries + 1}/${maxRetries}`);
588
588
  item.status = WI_STATUS.PENDING;
589
589
  item._retryCount = retries + 1;
590
+ // W-mpmwxn1j — bump per-agent retry count BEFORE deleting
591
+ // dispatched_to so the next dispatch can reassign away from the
592
+ // agent that died. Anonymous (no dispatched_to) failures don't bump.
593
+ if (item.dispatched_to) shared.bumpAgentRetryCount(item, item.dispatched_to);
590
594
  delete item.dispatched_at;
591
595
  delete item.dispatched_to;
592
596
  delete item._pendingReason;
package/engine.js CHANGED
@@ -630,6 +630,26 @@ async function syncReusedWorktree(rootDir, worktreePath, branchName, gitOpts = {
630
630
  }
631
631
 
632
632
  // Find an existing worktree already checked out on a given branch
633
+ // Parse the holder path out of a `git worktree add` "already used" error so
634
+ // callers can give the operator something actionable instead of just re-
635
+ // throwing the raw git message. The two surface forms emitted by current git
636
+ // versions:
637
+ //
638
+ // fatal: 'work/W-X' is already used by worktree at 'D:/squad'
639
+ // fatal: 'work/W-X' is already checked out at 'D:/squad'
640
+ //
641
+ // Returns null on unparseable input so callers fall through to the generic
642
+ // re-throw path safely. Path is returned exactly as git printed it (forward
643
+ // or back slashes), since callers compare it against rootDir which is
644
+ // shaped the same way by upstream.
645
// errMsg: raw error text from a failed `git worktree add` (may be multi-line).
// Returns the holder path exactly as git printed it, or null when the text
// does not contain an "already used by worktree at" / "already checked out
// at" clause.
function _parseAlreadyUsedHolderPath(errMsg) {
  if (!errMsg) return null;
  const text = String(errMsg);
  // Preferred form: the quoted path runs to the end of its line and closes
  // with the same quote character that opened it. The capture is greedy so a
  // quote character INSIDE the path (e.g. /home/o'brien/repo) is kept
  // instead of truncating the path at the first apostrophe.
  const strict = text.match(/already (?:used by worktree|checked out) at (['"])(.+)\1[ \t\r]*$/m);
  if (strict) return strict[2];
  // Fallback for lines with trailing text after the closing quote: take
  // everything up to the first quote character after the opening one.
  const loose = text.match(/already (?:used by worktree|checked out) at ['"]([^'"]+)['"]/);
  return loose ? loose[1] : null;
}
652
+
633
653
  async function findExistingWorktree(repoDir, branchName) {
634
654
  try {
635
655
  const out = await shared.shellSafeGit(['worktree', 'list', '--porcelain'], { cwd: repoDir, timeout: 10000 });
@@ -658,6 +678,65 @@ function isWorktreeRetryableError(err) {
658
678
  || msg.includes('already exists');
659
679
  }
660
680
 
681
+ // Distinct from isWorktreeRetryableError above: that one covers worktree-add
682
+ // LOCAL contention (index lock, file busy, already exists). This one covers
683
+ // `git fetch` NETWORK-side transients where retrying once is cheap and often
684
+ // recovers — network blips, transient TLS errors, github.com 5xx during a
685
+ // fetch. We deliberately do NOT retry on auth failures, repo-not-found, or
686
+ // "couldn't find remote ref" — those won't change on a 1-2s retry, and
687
+ // re-running them just wastes the dispatch budget.
688
// Substrings that mark a `git fetch` failure as a network-side transient.
// 'timed out' also covers "Connection timed out" / "Operation timed out",
// so those longer variants are not listed separately.
const _TRANSIENT_GIT_NETWORK_MARKERS = Object.freeze([
  'ETIMEDOUT',
  'ECONNRESET',
  'ECONNREFUSED',
  'EAI_AGAIN',
  'Could not resolve host',
  'Connection reset',
  'timed out',
  '502 Bad Gateway',
  '503 Service Unavailable',
  '504 Gateway Timeout',
  'RPC failed',
  'HTTP/2 stream',
]);

// err: whatever the failed git call rejected with — an Error, a plain
// object carrying `message`, or a bare string. Returns true when the text
// contains one of the transient markers above, false otherwise (including
// null/undefined input).
function _isTransientGitNetworkError(err) {
  // Fall back to the value itself when there is no `.message`, so a thrown
  // string is still classified.
  const msg = String(err?.message || err || '');
  return _TRANSIENT_GIT_NETWORK_MARKERS.some((marker) => msg.includes(marker));
}
705
+
706
+ // Wraps shellSafeGit(['fetch', ...]) with a single retry on transient
707
+ // network errors. Preserves the pre-fix convention that fetch failures
708
+ // are NON-fatal here: we log + swallow the terminal error so the caller
709
+ // falls back to the local ref. The retry just gives transient errors one
710
+ // shot to recover before that fallback kicks in — empirically eliminates
711
+ // most "couldn't find remote ref" cascades downstream caused by a 0-second
712
+ // network blip during the initial fetch.
713
+ //
714
+ // `args` is the array passed to git AFTER 'fetch' (e.g. ['origin', branch]).
715
+ // `label` is a short string used in the log line for triage.
716
// args: array passed to git AFTER 'fetch' (e.g. ['origin', branch]).
// opts: options object forwarded unchanged to shared.shellSafeGit.
// label: short string used in the log lines for triage.
// retryDelayMs: pause before the single retry; optional, defaults to 1500.
// Resolves to true when the fetch succeeded (first try or retry) and false
// when it failed. Fetch failures are logged and swallowed, never thrown.
async function _fetchWithTransientRetry(args, opts, label, retryDelayMs = 1500) {
  const delayMs = Number.isFinite(retryDelayMs) && retryDelayMs >= 0 ? retryDelayMs : 1500;
  const runFetch = () => shared.shellSafeGit(['fetch', ...args], opts);
  // First line only keeps the log readable; `e?.message` keeps this safe
  // even if the rejection value is null/undefined or a bare string.
  const firstLine = (e) => String(e?.message || e).split('\n')[0];

  try {
    await runFetch();
    return true;
  } catch (e) {
    if (!_isTransientGitNetworkError(e)) {
      log('warn', `git fetch ${label}: ${firstLine(e)}`);
      return false; // swallow non-transient — caller falls back to local ref
    }
    log('warn', `git fetch ${label}: transient (${firstLine(e)}) — retrying once after ${delayMs / 1000}s`);
  }

  await new Promise((resolve) => setTimeout(resolve, delayMs));

  try {
    await runFetch();
    log('info', `git fetch ${label}: succeeded on retry`);
    return true;
  } catch (e2) {
    log('warn', `git fetch ${label}: retry also failed (${firstLine(e2)}) — falling back to local ref`);
    return false;
  }
}
739
+
661
740
  function removeStaleIndexLock(rootDir) {
662
741
  const lockFile = path.join(rootDir, '.git', 'index.lock');
663
742
  try {
@@ -1055,6 +1134,62 @@ async function spawnAgent(dispatchItem, config) {
1055
1134
  );
1056
1135
  cleanupTempAgent(agentId);
1057
1136
  };
1137
+ // Atomic read of the dispatch state to detect when another active
1138
+ // dispatch holds `branchName`. Returns the conflicting dispatch object
1139
+ // (or null when none). Uses mutateDispatch for the file-lock read even
1140
+ // though we don't mutate — the snapshot it gives back is consistent
1141
+ // with what a concurrent writer would see. Caller uses the return value
1142
+ // both as a boolean (truthy = conflict) and to extract the other
1143
+ // dispatch's id for the error message.
1144
// Looks through the `active` list handed to the mutateDispatch callback for
// an entry whose sanitized `meta.branch` equals the sanitized `branch` and
// whose id differs from `currentId`. Returns that entry, or null when there
// is none or when either argument is falsy. The dispatch state object is
// returned from the callback untouched.
const _isBranchActivelyUsedByOtherDispatch = (branch, currentId) => {
  if (!branch || !currentId) return null;
  const wanted = sanitizeBranch(branch);
  let holder = null;
  mutateDispatch((dp) => {
    // Start from a clean slate on every invocation of the callback.
    holder = null;
    for (const entry of dp.active || []) {
      const entryBranch = entry.meta?.branch ? sanitizeBranch(entry.meta.branch) : '';
      if (entryBranch === wanted && entry.id !== currentId) {
        holder = entry;
        break;
      }
    }
    return dp;
  });
  return holder;
};
1157
+ // Fail-fast handler for the "branch is checked out elsewhere AND prune
1158
+ // couldn't recover it" case. This is the bug class operators hit when the
1159
+ // project root itself (or a sibling worktree we can't see via
1160
+ // findExistingWorktree) is genuinely holding the branch — the existing
1161
+ // "throw eRemote" path retried every tick forever with no surface signal,
1162
+ // and findExistingWorktree deliberately filters out the project root so
1163
+ // the reuse path never fires for that case. Mark the dispatch non-
1164
+ // retryable so the storm stops, and put the holder path in the message
1165
+ // so the operator can see what to do (`git -C <holder> checkout master`
1166
+ // when the holder is the project root, or kill/finish the agent owning
1167
+ // the sibling worktree). Returns truthy when caller should `return null`.
1168
// branchName: branch the failed `git worktree add` targeted.
// holderPath: path of the checkout holding the branch, as parsed from git's
//   error text.
// rawErr: the original worktree-add error; only its first message line is
//   logged, at debug level.
// Completes the current dispatch as a non-agent-retryable
// WORKTREE_PREFLIGHT error and always returns true, so the caller knows it
// must stop and `return null`.
const _failBranchHeldByExternalWorktree = (branchName, holderPath, rawErr) => {
  // Compare canonical forms. On win32 the comparison is case-insensitive,
  // since the path git printed may differ from rootDir only in casing
  // (e.g. the drive letter).
  const canonical = (p) => {
    const resolved = path.resolve(p);
    return process.platform === 'win32' ? resolved.toLowerCase() : resolved;
  };
  const isProjectRoot = Boolean(holderPath && rootDir)
    && canonical(holderPath) === canonical(rootDir);
  const summary = isProjectRoot
    ? `Branch ${branchName} is held by the project root (${holderPath}). Spawning a worktree against it would create a nested checkout. Switch the root back to master to release the branch.`
    : `Branch ${branchName} is held by an external worktree at ${holderPath} that this engine can't reach (findExistingWorktree filters out paths at-or-inside the project root, and the holder isn't visible to our worktree list). Resolve the lock and retry.`;
  log('error', `spawnAgent: ${summary}`);
  _cleanupPromptFiles();
  completeDispatch(
    id,
    DISPATCH_RESULT.ERROR,
    summary.slice(0, 800),
    isProjectRoot
      ? 'Branch held by the project root checkout. Recovery: `git -C <projectRoot> checkout master` (or whichever branch your engine root should be on). Until the root releases the branch, every retry will fail identically.'
      : 'Branch held externally. Recovery: find the holding worktree (`git worktree list` from the project root), finish or remove it, then re-dispatch.',
    { failureClass: FAILURE_CLASS.WORKTREE_PREFLIGHT, agentRetryable: false },
  );
  cleanupTempAgent(agentId);
  // Preserve the raw git output in the log for debugging — kept as a side
  // log line rather than the summary so the summary stays actionable.
  if (rawErr && rawErr.message) {
    log('debug', `spawnAgent: raw worktree-add failure for ${id}: ${String(rawErr.message).split('\n')[0]}`);
  }
  return true;
};
1058
1193
  _phaseT.afterPrompt = Date.now();
1059
1194
 
1060
1195
  if (branchName && READ_ONLY_ROOT_TASK_TYPES.has(type)) {
@@ -1173,6 +1308,31 @@ async function spawnAgent(dispatchItem, config) {
1173
1308
  }
1174
1309
  }
1175
1310
 
1311
+ // Pre-add concurrency guard: refuse to attempt `git worktree add` for a
1312
+ // branch that's already in flight under another dispatch. The
1313
+ // post-failure activelyUsed check on the non-shared path further down
1314
+ // still fires for the legitimate-reuse-of-existing-worktree case, but
1315
+ // the pre-add check closes the race window where two concurrent
1316
+ // dispatches both call `git worktree add` on the same branch before
1317
+ // either notices the conflict — and it now also covers the
1318
+ // shared-branch path, which previously had no activelyUsed guard at
1319
+ // all (a human-triggered review on a shared-branch plan item could
1320
+ // blindly reuse the in-flight implement's worktree).
1321
+ const _activelyUsedByOther = _isBranchActivelyUsedByOtherDispatch(branchName, id);
1322
+ if (_activelyUsedByOther) {
1323
+ const summary = `branch ${branchName} is actively used by dispatch ${_activelyUsedByOther.id} — refusing to spawn a concurrent worktree against the same branch`;
1324
+ log('warn', `spawnAgent: ${summary}`);
1325
+ _cleanupPromptFiles();
1326
+ completeDispatch(
1327
+ id,
1328
+ DISPATCH_RESULT.ERROR,
1329
+ summary,
1330
+ 'Another dispatch holds this branch right now. This is retryable — the engine will try again on the next tick after the other dispatch completes.',
1331
+ );
1332
+ cleanupTempAgent(agentId);
1333
+ return null;
1334
+ }
1335
+
1176
1336
  try {
1177
1337
  if (!fs.existsSync(worktreePath)) {
1178
1338
  const isSharedBranch = meta?.branchStrategy === 'shared-branch' || meta?.useExistingBranch;
@@ -1183,7 +1343,7 @@ async function spawnAgent(dispatchItem, config) {
1183
1343
 
1184
1344
  if (isSharedBranch) {
1185
1345
  log('info', `Creating worktree for shared branch: ${worktreePath} on ${branchName}`);
1186
- try { await shared.shellSafeGit(['fetch', 'origin', branchName], { ..._gitOpts, cwd: rootDir }); } catch (e) { log('warn', 'git: ' + e.message); }
1346
+ await _fetchWithTransientRetry(['origin', branchName], { ..._gitOpts, cwd: rootDir }, `origin/${branchName} (shared-branch pre-create)`);
1187
1347
  try {
1188
1348
  await runWorktreeAdd(rootDir, worktreePath, [branchName], _worktreeGitOpts, worktreeCreateRetries);
1189
1349
  } catch (eShared) {
@@ -1205,7 +1365,16 @@ async function spawnAgent(dispatchItem, config) {
1205
1365
  if (pruned > 0) {
1206
1366
  log('info', `Pruned ${pruned} stale worktree entry(ies) for shared branch ${branchName}; retrying worktree add`);
1207
1367
  await runWorktreeAdd(rootDir, worktreePath, [branchName], _worktreeGitOpts, 0);
1208
- } else { throw eShared; }
1368
+ } else {
1369
+ // Prune returned 0 — the holder is real, not stale metadata.
1370
+ // findExistingWorktree returned null because the holder is
1371
+ // at-or-inside the project root (the deliberate filter), or
1372
+ // because git's view drifted from ours. Surface non-retryably
1373
+ // with the holder path so the operator can act.
1374
+ const holder = _parseAlreadyUsedHolderPath(eShared.message);
1375
+ if (holder) { _failBranchHeldByExternalWorktree(branchName, holder, eShared); return null; }
1376
+ throw eShared;
1377
+ }
1209
1378
  }
1210
1379
  } else if (eShared.message?.includes('invalid reference') || eShared.message?.includes('not a valid ref')) {
1211
1380
  // Branch doesn't exist yet (first item in plan) — create it from main
@@ -1235,7 +1404,7 @@ async function spawnAgent(dispatchItem, config) {
1235
1404
  if (_branchOnRemote) {
1236
1405
  // Mirror the shared-branch fetch+add sequence used earlier in this function.
1237
1406
  log('info', `origin/${branchName} exists — checking out remote branch instead of -b from ${mainRef}`);
1238
- try { await shared.shellSafeGit(['fetch', 'origin', branchName], { ..._gitOpts, cwd: rootDir, timeout: 30000 }); } catch (e) { log('warn', 'git: ' + e.message); }
1407
+ await _fetchWithTransientRetry(['origin', branchName], { ..._gitOpts, cwd: rootDir, timeout: 30000 }, `origin/${branchName} (non-shared pre-create)`);
1239
1408
  try {
1240
1409
  await runWorktreeAdd(rootDir, worktreePath, [branchName], _worktreeGitOpts, worktreeCreateRetries);
1241
1410
  } catch (eRemote) {
@@ -1267,7 +1436,14 @@ async function spawnAgent(dispatchItem, config) {
1267
1436
  if (pruned > 0) {
1268
1437
  log('info', `Pruned ${pruned} stale worktree entry(ies) for ${branchName}; retrying worktree add`);
1269
1438
  await runWorktreeAdd(rootDir, worktreePath, [branchName], _worktreeGitOpts, 0);
1270
- } else { throw eRemote; }
1439
+ } else {
1440
+ // Same recovery shape as the shared-branch path above:
1441
+ // prune was a no-op, so the holder is a real worktree.
1442
+ // Mark non-retryable with the holder path in the message.
1443
+ const holder = _parseAlreadyUsedHolderPath(eRemote.message);
1444
+ if (holder) { _failBranchHeldByExternalWorktree(branchName, holder, eRemote); return null; }
1445
+ throw eRemote;
1446
+ }
1271
1447
  }
1272
1448
  } else { throw eRemote; }
1273
1449
  }
@@ -1285,12 +1461,12 @@ async function spawnAgent(dispatchItem, config) {
1285
1461
  // the dep-merge phase's own fetch + the on-failure
1286
1462
  // `git reset --hard origin/<mainRef>` recovery remain as safety nets.
1287
1463
  let _freshCreateBase = mainRef;
1288
- try {
1289
- await shared.shellSafeGit(['fetch', 'origin', mainRef], { ..._gitOpts, cwd: rootDir, timeout: 30000 });
1290
- _freshCreateBase = `origin/${mainRef}`;
1291
- } catch (mainFetchErr) {
1292
- log('warn', `Failed to fetch origin/${mainRef} before fresh-create worktree for ${branchName}: ${mainFetchErr.message} — falling back to local ${mainRef}`);
1293
- }
1464
+ const _baseFetchOk = await _fetchWithTransientRetry(
1465
+ ['origin', mainRef],
1466
+ { ..._gitOpts, cwd: rootDir, timeout: 30000 },
1467
+ `origin/${mainRef} (fresh-create base for ${branchName})`
1468
+ );
1469
+ if (_baseFetchOk) _freshCreateBase = `origin/${mainRef}`;
1294
1470
  try {
1295
1471
  await runWorktreeAdd(rootDir, worktreePath, ['-b', branchName, _freshCreateBase], _worktreeGitOpts, worktreeCreateRetries);
1296
1472
  } catch (e1) {
@@ -1311,7 +1487,7 @@ async function spawnAgent(dispatchItem, config) {
1311
1487
  }
1312
1488
  } else {
1313
1489
  // Branch already exists — try checkout without -b
1314
- try { await shared.shellSafeGit(['fetch', 'origin', branchName], { ..._gitOpts, cwd: rootDir }); } catch (e) { log('warn', 'git: ' + e.message); }
1490
+ await _fetchWithTransientRetry(['origin', branchName], { ..._gitOpts, cwd: rootDir }, `origin/${branchName} (fresh-create fallback)`);
1315
1491
  try {
1316
1492
  await runWorktreeAdd(rootDir, worktreePath, [branchName], _worktreeGitOpts, worktreeCreateRetries);
1317
1493
  log('info', `Reusing existing branch: ${branchName}`);
@@ -5134,6 +5310,39 @@ function discoverFromWorkItems(config, project) {
5134
5310
  const hardPinnedAgent = routing.getHardPinnedAgent(item, config.agents || {});
5135
5311
  const hardPinRequested = !!hardPinnedAgent;
5136
5312
  let agentId = hardPinnedAgent || resolveAgent(workType, config, { agentHints });
5313
+ // W-mpmwxn1j — Per-agent retry threshold. When the same agent has failed
5314
+ // this WI maxRetriesPerAgent times, force-reassign to a different
5315
+ // eligible agent. Hard-pinned agents bypass reassignment (operator
5316
+ // intent wins). If no alternate is available we fall back to the same
5317
+ // agent + write a dated inbox note so the operator can intervene.
5318
+ if (agentId && !hardPinRequested) {
5319
+ const maxPerAgent = shared.resolveMaxRetriesPerAgent(config);
5320
+ const failsForAgent = shared.getAgentRetryCount(item, agentId);
5321
+ if (failsForAgent >= maxPerAgent) {
5322
+ const altAgent = resolveAgent(workType, config, { agentHints, excludeAgent: agentId });
5323
+ if (altAgent && altAgent !== agentId) {
5324
+ log('info', `Per-agent retry threshold reached: ${item.id} reassigning ${agentId} → ${altAgent} (${failsForAgent}/${maxPerAgent} failures by ${agentId})`);
5325
+ agentId = altAgent;
5326
+ } else {
5327
+ // No alternate available — log + inbox note (writeToInbox dedupes
5328
+ // per-day-per-slug, so re-runs in the same day stay quiet).
5329
+ log('warn', `Per-agent retry threshold reached for ${item.id} (${agentId}) but no alternate agent available for work type "${workType}" — falling back to same agent`);
5330
+ try {
5331
+ shared.writeToInbox('engine', `per-agent-retry-no-alternate-${item.id}`,
5332
+ `# Per-agent retry threshold — no alternate available\n\n` +
5333
+ `Work item: \`${item.id}\` — ${item.title || ''}\n\n` +
5334
+ `Agent **${agentId}** has failed this WI ${failsForAgent} times ` +
5335
+ `(threshold: ${maxPerAgent}). The engine attempted to reassign to a ` +
5336
+ `different eligible agent for work type **${workType}** but no alternate was found ` +
5337
+ `(routing.md preferred/fallback both excluded, no idle named agent, ` +
5338
+ `temp agents disabled or budget exhausted). Re-dispatching to **${agentId}** anyway ` +
5339
+ `to avoid deadlock.\n\n` +
5340
+ `Action: review routing.md, add another agent for this work type, or enable ` +
5341
+ `\`allowTempAgents\` so the engine has a fallback target.\n`);
5342
+ } catch (e) { log('warn', 'per-agent retry inbox write failed: ' + e.message); }
5343
+ }
5344
+ }
5345
+ }
5137
5346
  let reservedAgentId = agentId;
5138
5347
  const cfgAgents = config.agents || {};
5139
5348
  const budgetBlocked = Object.keys(cfgAgents).some(id => {
@@ -6339,6 +6548,26 @@ async function tickInner() {
6339
6548
  // 2. Consolidate inbox
6340
6549
  safe('consolidateInbox', () => consolidateInbox(config));
6341
6550
 
6551
+ // 2.1. Auto-consolidate memory — opt-in periodic KB sweep. Inbox→notes
6552
+ // already runs above every tick (threshold-gated); this phase only adds
6553
+ // the KB sweep that was previously dashboard-button-only. Gated by
6554
+ // engine.autoConsolidateMemory; 4h cadence enforced inside shouldAutoSweep().
6555
+ if (config.engine?.autoConsolidateMemory === true) {
6556
+ safe('autoSweepKb', () => {
6557
+ const { shouldAutoSweep, spawnSweepRunnerDetached } = require('./engine/kb-sweep');
6558
+ const decision = shouldAutoSweep();
6559
+ if (!decision.shouldSpawn) return;
6560
+ const result = spawnSweepRunnerDetached({
6561
+ log: (level, msg) => log(level === 'error' ? 'warn' : 'info', `auto-sweep: ${msg}`),
6562
+ });
6563
+ if (result.ok) {
6564
+ log('info', `auto-sweep: spawned KB sweep (reason=${decision.reason}, pid=${result.pid})`);
6565
+ } else {
6566
+ log('warn', `auto-sweep: spawn failed: ${result.error}`);
6567
+ }
6568
+ });
6569
+ }
6570
+
6342
6571
  // 2.5. Periodic cleanup + MCP sync (every 10 ticks = ~5 minutes)
6343
6572
  if (tickCount % 10 === 0) {
6344
6573
  try { await runCleanup(config); } catch (e) { log('warn', `runCleanup: ${e.message}`); }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@yemi33/minions",
3
- "version": "0.1.2044",
3
+ "version": "0.1.2045",
4
4
  "description": "Multi-agent AI dev team that runs from ~/.minions/ — five autonomous agents share a single engine, dashboard, and knowledge base",
5
5
  "bin": {
6
6
  "minions": "bin/minions.js"