@yemi33/minions 0.1.2044 → 0.1.2046

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/engine.js CHANGED
@@ -630,6 +630,26 @@ async function syncReusedWorktree(rootDir, worktreePath, branchName, gitOpts = {
630
630
  }
631
631
 
632
632
  // Find an existing worktree already checked out on a given branch
633
// Extract the holder path from a `git worktree add` "already used" error so
// callers can give the operator something actionable instead of re-throwing
// the raw git message. The two surface forms handled:
//
//   fatal: 'work/W-X' is already used by worktree at 'D:/squad'
//   fatal: 'work/W-X' is already checked out at 'D:/squad'
//
// @param {*} errMsg  Error text (coerced with String()); falsy input yields null.
// @returns {string|null}  The path exactly as printed (slashes untouched), or
//   null when the text does not contain either phrase, so callers can fall
//   through to their generic re-throw path.
//
// Matching strategy:
//   1. Prefer a match where the closing quote is the last thing on its line
//      and is the same quote character that opened the path. This keeps
//      paths that themselves contain an apostrophe (e.g. /home/o'brien/repo)
//      intact instead of truncating them at the first inner quote.
//   2. Otherwise fall back to "everything up to the next quote", which also
//      covers messages where extra text follows the path on the same line.
function _parseAlreadyUsedHolderPath(errMsg) {
  if (!errMsg) return null;
  const text = String(errMsg);

  // `m` flag: the fatal line is usually one line of a multi-line message.
  // `.` never crosses a line terminator, so the greedy `.+` stays on the line.
  const anchored = text.match(
    /already (?:used by worktree|checked out) at (['"])(.+)\1[ \t\r]*$/m,
  );
  if (anchored) return anchored[2];

  const loose = text.match(
    /already (?:used by worktree|checked out) at ['"]([^'"]+)['"]/,
  );
  return loose ? loose[1] : null;
}
652
+
633
653
  async function findExistingWorktree(repoDir, branchName) {
634
654
  try {
635
655
  const out = await shared.shellSafeGit(['worktree', 'list', '--porcelain'], { cwd: repoDir, timeout: 10000 });
@@ -658,6 +678,65 @@ function isWorktreeRetryableError(err) {
658
678
  || msg.includes('already exists');
659
679
  }
660
680
 
681
// Substrings that mark a `git fetch` failure as a NETWORK-side transient:
// network blips, transient TLS/HTTP transport errors, and 5xx responses.
// 'timed out' already covers the "Connection timed out" and
// "Operation timed out" variants, so those are not listed separately.
// Auth failures, repo-not-found and "couldn't find remote ref" are
// deliberately absent — they will not change on a 1-2s retry.
const _TRANSIENT_GIT_NETWORK_MARKERS = Object.freeze([
  'ETIMEDOUT',
  'ECONNRESET',
  'ECONNREFUSED',
  'EAI_AGAIN',
  'Could not resolve host',
  'Connection reset',
  'timed out',
  '502 Bad Gateway',
  '503 Service Unavailable',
  '504 Gateway Timeout',
  'RPC failed',
  'HTTP/2 stream',
]);

// Distinct from isWorktreeRetryableError: that one covers worktree-add LOCAL
// contention (index lock, file busy, already exists). This one classifies
// `git fetch` failures where a single cheap retry often recovers.
//
// @param {*} err  Anything a rejected fetch may carry: an Error, a bare
//   string, or null/undefined. When there is no non-empty `message`, the
//   value itself is stringified, so string rejections are classified the
//   same way the fetch wrapper logs them (`e.message || e`).
// @returns {boolean}  true when the text contains a transient marker.
function _isTransientGitNetworkError(err) {
  const msg = String(err?.message || err || '');
  return _TRANSIENT_GIT_NETWORK_MARKERS.some((marker) => msg.includes(marker));
}
705
+
706
// Run `git fetch <args>` through shared.shellSafeGit, giving transient
// network failures exactly one more attempt after a 1.5s pause.
//
// Fetch failures stay NON-fatal here: the terminal error is logged and
// swallowed so the caller can fall back to whatever local ref it has.
//
// @param {string[]} args   Arguments passed to git AFTER 'fetch'
//                          (e.g. ['origin', branch]).
// @param {object}   opts   Options forwarded unchanged to shellSafeGit.
// @param {string}   label  Short tag used in the log lines for triage.
// @returns {Promise<boolean>}  true when the first attempt or the retry
//   succeeded; false when the fetch failed (non-transient, or transient twice).
async function _fetchWithTransientRetry(args, opts, label) {
  const runFetch = async () => shared.shellSafeGit(['fetch', ...args], opts);
  // Only the first line of the error text goes into the log.
  const headline = (failure) => String(failure.message || failure).split('\n')[0];

  try {
    await runFetch();
    return true;
  } catch (firstErr) {
    const transient = _isTransientGitNetworkError(firstErr);
    const firstLine = headline(firstErr);
    if (!transient) {
      // Non-transient: retrying would not help, so give up immediately.
      log('warn', `git fetch ${label}: ${firstLine}`);
      return false;
    }
    log('warn', `git fetch ${label}: transient (${firstLine}) — retrying once after 1.5s`);
  }

  await new Promise((resolve) => {
    setTimeout(resolve, 1500);
  });

  try {
    await runFetch();
    log('info', `git fetch ${label}: succeeded on retry`);
    return true;
  } catch (retryErr) {
    log('warn', `git fetch ${label}: retry also failed (${headline(retryErr)}) — falling back to local ref`);
    return false;
  }
}
739
+
661
740
  function removeStaleIndexLock(rootDir) {
662
741
  const lockFile = path.join(rootDir, '.git', 'index.lock');
663
742
  try {
@@ -1055,6 +1134,62 @@ async function spawnAgent(dispatchItem, config) {
1055
1134
  );
1056
1135
  cleanupTempAgent(agentId);
1057
1136
  };
1137
// Look for another active dispatch that holds `branch`.
//
// The dispatch state is read inside a mutateDispatch callback that hands the
// state back unchanged; per the original author's note this is done to get a
// lock-consistent snapshot rather than to modify anything.
//
// Branch names are compared after sanitizeBranch() on both sides. An active
// entry without meta.branch is compared as the empty string.
//
// Returns the first conflicting dispatch object (any entry on the same
// branch whose id differs from `currentId`), or null when there is none or
// when either argument is falsy. Callers use the result both as a boolean
// and to read the other dispatch's id for the error message.
const _isBranchActivelyUsedByOtherDispatch = (branch, currentId) => {
  if (!branch || !currentId) return null;

  const wantedBranch = sanitizeBranch(branch);
  let holder = null;

  mutateDispatch((dp) => {
    holder = null;
    for (const entry of dp.active || []) {
      const entryBranch = entry.meta?.branch ? sanitizeBranch(entry.meta.branch) : '';
      if (entryBranch === wantedBranch && entry.id !== currentId) {
        holder = entry;
        break;
      }
    }
    return dp;
  });

  return holder;
};
1157
// Fail-fast handler for the "branch is checked out elsewhere AND prune
// couldn't recover it" case: the project root itself, or a worktree that
// findExistingWorktree did not report, is genuinely holding the branch.
// Re-throwing the raw git error gave the operator no surface signal, so this
// completes the dispatch as a non-retryable WORKTREE_PREFLIGHT error and puts
// the holder path in the message.
//
// Steps, in order: log the summary, clean up prompt files, complete the
// dispatch with recovery instructions, clean up the temp agent, then log the
// first line of the raw git error at debug level so the summary itself stays
// actionable.
//
// @param {string} branchName  Branch that could not be checked out.
// @param {string} holderPath  Path git reported as holding the branch.
// @param {Error}  [rawErr]    Original worktree-add error, logged for debugging.
// @returns {boolean}  Always true, meaning "the dispatch has been completed;
//   the caller should `return null`". The explicit return makes the
//   documented truthy contract hold; callers that ignore it are unaffected.
const _failBranchHeldByExternalWorktree = (branchName, holderPath, rawErr) => {
  // path.resolve on both sides so slash style and trailing separators do not
  // defeat the comparison.
  const isProjectRoot = holderPath && rootDir
    && path.resolve(holderPath) === path.resolve(rootDir);
  const summary = isProjectRoot
    ? `Branch ${branchName} is held by the project root (${holderPath}). Spawning a worktree against it would create a nested checkout. Switch the root back to master to release the branch.`
    : `Branch ${branchName} is held by an external worktree at ${holderPath} that this engine can't reach (findExistingWorktree filters out paths at-or-inside the project root, and the holder isn't visible to our worktree list). Resolve the lock and retry.`;
  log('error', `spawnAgent: ${summary}`);
  _cleanupPromptFiles();
  completeDispatch(
    id,
    DISPATCH_RESULT.ERROR,
    summary.slice(0, 800),
    isProjectRoot
      ? 'Branch held by the project root checkout. Recovery: `git -C <projectRoot> checkout master` (or whichever branch your engine root should be on). Until the root releases the branch, every retry will fail identically.'
      : 'Branch held externally. Recovery: find the holding worktree (`git worktree list` from the project root), finish or remove it, then re-dispatch.',
    // agentRetryable: false stops the engine from re-dispatching every tick.
    { failureClass: FAILURE_CLASS.WORKTREE_PREFLIGHT, agentRetryable: false },
  );
  cleanupTempAgent(agentId);
  // Preserve the raw git output for debugging as a side log line rather than
  // in the summary.
  if (rawErr && rawErr.message) {
    log('debug', `spawnAgent: raw worktree-add failure for ${id}: ${String(rawErr.message).split('\n')[0]}`);
  }
  return true;
};
1058
1193
  _phaseT.afterPrompt = Date.now();
1059
1194
 
1060
1195
  if (branchName && READ_ONLY_ROOT_TASK_TYPES.has(type)) {
@@ -1173,6 +1308,31 @@ async function spawnAgent(dispatchItem, config) {
1173
1308
  }
1174
1309
  }
1175
1310
 
1311
+ // Pre-add concurrency guard: refuse to attempt `git worktree add` for a
1312
+ // branch that's already in flight under another dispatch. The
1313
+ // post-failure activelyUsed check on the non-shared path further down
1314
+ // still fires for the legitimate-reuse-of-existing-worktree case, but
1315
+ // the pre-add check closes the race window where two concurrent
1316
+ // dispatches both call `git worktree add` on the same branch before
1317
+ // either notices the conflict — and it now also covers the
1318
+ // shared-branch path, which previously had no activelyUsed guard at
1319
+ // all (a human-triggered review on a shared-branch plan item could
1320
+ // blindly reuse the in-flight implement's worktree).
1321
+ const _activelyUsedByOther = _isBranchActivelyUsedByOtherDispatch(branchName, id);
1322
+ if (_activelyUsedByOther) {
1323
+ const summary = `branch ${branchName} is actively used by dispatch ${_activelyUsedByOther.id} — refusing to spawn a concurrent worktree against the same branch`;
1324
+ log('warn', `spawnAgent: ${summary}`);
1325
+ _cleanupPromptFiles();
1326
+ completeDispatch(
1327
+ id,
1328
+ DISPATCH_RESULT.ERROR,
1329
+ summary,
1330
+ 'Another dispatch holds this branch right now. This is retryable — the engine will try again on the next tick after the other dispatch completes.',
1331
+ );
1332
+ cleanupTempAgent(agentId);
1333
+ return null;
1334
+ }
1335
+
1176
1336
  try {
1177
1337
  if (!fs.existsSync(worktreePath)) {
1178
1338
  const isSharedBranch = meta?.branchStrategy === 'shared-branch' || meta?.useExistingBranch;
@@ -1183,7 +1343,7 @@ async function spawnAgent(dispatchItem, config) {
1183
1343
 
1184
1344
  if (isSharedBranch) {
1185
1345
  log('info', `Creating worktree for shared branch: ${worktreePath} on ${branchName}`);
1186
- try { await shared.shellSafeGit(['fetch', 'origin', branchName], { ..._gitOpts, cwd: rootDir }); } catch (e) { log('warn', 'git: ' + e.message); }
1346
+ await _fetchWithTransientRetry(['origin', branchName], { ..._gitOpts, cwd: rootDir }, `origin/${branchName} (shared-branch pre-create)`);
1187
1347
  try {
1188
1348
  await runWorktreeAdd(rootDir, worktreePath, [branchName], _worktreeGitOpts, worktreeCreateRetries);
1189
1349
  } catch (eShared) {
@@ -1205,7 +1365,16 @@ async function spawnAgent(dispatchItem, config) {
1205
1365
  if (pruned > 0) {
1206
1366
  log('info', `Pruned ${pruned} stale worktree entry(ies) for shared branch ${branchName}; retrying worktree add`);
1207
1367
  await runWorktreeAdd(rootDir, worktreePath, [branchName], _worktreeGitOpts, 0);
1208
- } else { throw eShared; }
1368
+ } else {
1369
+ // Prune returned 0 — the holder is real, not stale metadata.
1370
+ // findExistingWorktree returned null because the holder is
1371
+ // at-or-inside the project root (the deliberate filter), or
1372
+ // because git's view drifted from ours. Surface non-retryably
1373
+ // with the holder path so the operator can act.
1374
+ const holder = _parseAlreadyUsedHolderPath(eShared.message);
1375
+ if (holder) { _failBranchHeldByExternalWorktree(branchName, holder, eShared); return null; }
1376
+ throw eShared;
1377
+ }
1209
1378
  }
1210
1379
  } else if (eShared.message?.includes('invalid reference') || eShared.message?.includes('not a valid ref')) {
1211
1380
  // Branch doesn't exist yet (first item in plan) — create it from main
@@ -1235,7 +1404,7 @@ async function spawnAgent(dispatchItem, config) {
1235
1404
  if (_branchOnRemote) {
1236
1405
  // Mirror shared-branch fetch+add (~line 1157-1159).
1237
1406
  log('info', `origin/${branchName} exists — checking out remote branch instead of -b from ${mainRef}`);
1238
- try { await shared.shellSafeGit(['fetch', 'origin', branchName], { ..._gitOpts, cwd: rootDir, timeout: 30000 }); } catch (e) { log('warn', 'git: ' + e.message); }
1407
+ await _fetchWithTransientRetry(['origin', branchName], { ..._gitOpts, cwd: rootDir, timeout: 30000 }, `origin/${branchName} (non-shared pre-create)`);
1239
1408
  try {
1240
1409
  await runWorktreeAdd(rootDir, worktreePath, [branchName], _worktreeGitOpts, worktreeCreateRetries);
1241
1410
  } catch (eRemote) {
@@ -1267,7 +1436,14 @@ async function spawnAgent(dispatchItem, config) {
1267
1436
  if (pruned > 0) {
1268
1437
  log('info', `Pruned ${pruned} stale worktree entry(ies) for ${branchName}; retrying worktree add`);
1269
1438
  await runWorktreeAdd(rootDir, worktreePath, [branchName], _worktreeGitOpts, 0);
1270
- } else { throw eRemote; }
1439
+ } else {
1440
+ // Same recovery shape as the shared-branch path above:
1441
+ // prune was a no-op, so the holder is a real worktree.
1442
+ // Mark non-retryable with the holder path in the message.
1443
+ const holder = _parseAlreadyUsedHolderPath(eRemote.message);
1444
+ if (holder) { _failBranchHeldByExternalWorktree(branchName, holder, eRemote); return null; }
1445
+ throw eRemote;
1446
+ }
1271
1447
  }
1272
1448
  } else { throw eRemote; }
1273
1449
  }
@@ -1285,12 +1461,12 @@ async function spawnAgent(dispatchItem, config) {
1285
1461
  // the dep-merge phase's own fetch + the on-failure
1286
1462
  // `git reset --hard origin/<mainRef>` recovery remain as safety nets.
1287
1463
  let _freshCreateBase = mainRef;
1288
- try {
1289
- await shared.shellSafeGit(['fetch', 'origin', mainRef], { ..._gitOpts, cwd: rootDir, timeout: 30000 });
1290
- _freshCreateBase = `origin/${mainRef}`;
1291
- } catch (mainFetchErr) {
1292
- log('warn', `Failed to fetch origin/${mainRef} before fresh-create worktree for ${branchName}: ${mainFetchErr.message} — falling back to local ${mainRef}`);
1293
- }
1464
+ const _baseFetchOk = await _fetchWithTransientRetry(
1465
+ ['origin', mainRef],
1466
+ { ..._gitOpts, cwd: rootDir, timeout: 30000 },
1467
+ `origin/${mainRef} (fresh-create base for ${branchName})`
1468
+ );
1469
+ if (_baseFetchOk) _freshCreateBase = `origin/${mainRef}`;
1294
1470
  try {
1295
1471
  await runWorktreeAdd(rootDir, worktreePath, ['-b', branchName, _freshCreateBase], _worktreeGitOpts, worktreeCreateRetries);
1296
1472
  } catch (e1) {
@@ -1311,7 +1487,7 @@ async function spawnAgent(dispatchItem, config) {
1311
1487
  }
1312
1488
  } else {
1313
1489
  // Branch already exists — try checkout without -b
1314
- try { await shared.shellSafeGit(['fetch', 'origin', branchName], { ..._gitOpts, cwd: rootDir }); } catch (e) { log('warn', 'git: ' + e.message); }
1490
+ await _fetchWithTransientRetry(['origin', branchName], { ..._gitOpts, cwd: rootDir }, `origin/${branchName} (fresh-create fallback)`);
1315
1491
  try {
1316
1492
  await runWorktreeAdd(rootDir, worktreePath, [branchName], _worktreeGitOpts, worktreeCreateRetries);
1317
1493
  log('info', `Reusing existing branch: ${branchName}`);
@@ -5134,6 +5310,39 @@ function discoverFromWorkItems(config, project) {
5134
5310
  const hardPinnedAgent = routing.getHardPinnedAgent(item, config.agents || {});
5135
5311
  const hardPinRequested = !!hardPinnedAgent;
5136
5312
  let agentId = hardPinnedAgent || resolveAgent(workType, config, { agentHints });
5313
+ // W-mpmwxn1j — Per-agent retry threshold. When the same agent has failed
5314
+ // this WI maxRetriesPerAgent times, force-reassign to a different
5315
+ // eligible agent. Hard-pinned agents bypass reassignment (operator
5316
+ // intent wins). If no alternate is available we fall back to the same
5317
+ // agent + write a dated inbox note so the operator can intervene.
5318
+ if (agentId && !hardPinRequested) {
5319
+ const maxPerAgent = shared.resolveMaxRetriesPerAgent(config);
5320
+ const failsForAgent = shared.getAgentRetryCount(item, agentId);
5321
+ if (failsForAgent >= maxPerAgent) {
5322
+ const altAgent = resolveAgent(workType, config, { agentHints, excludeAgent: agentId });
5323
+ if (altAgent && altAgent !== agentId) {
5324
+ log('info', `Per-agent retry threshold reached: ${item.id} reassigning ${agentId} → ${altAgent} (${failsForAgent}/${maxPerAgent} failures by ${agentId})`);
5325
+ agentId = altAgent;
5326
+ } else {
5327
+ // No alternate available — log + inbox note (writeToInbox dedupes
5328
+ // per-day-per-slug, so re-runs in the same day stay quiet).
5329
+ log('warn', `Per-agent retry threshold reached for ${item.id} (${agentId}) but no alternate agent available for work type "${workType}" — falling back to same agent`);
5330
+ try {
5331
+ shared.writeToInbox('engine', `per-agent-retry-no-alternate-${item.id}`,
5332
+ `# Per-agent retry threshold — no alternate available\n\n` +
5333
+ `Work item: \`${item.id}\` — ${item.title || ''}\n\n` +
5334
+ `Agent **${agentId}** has failed this WI ${failsForAgent} times ` +
5335
+ `(threshold: ${maxPerAgent}). The engine attempted to reassign to a ` +
5336
+ `different eligible agent for work type **${workType}** but no alternate was found ` +
5337
+ `(routing.md preferred/fallback both excluded, no idle named agent, ` +
5338
+ `temp agents disabled or budget exhausted). Re-dispatching to **${agentId}** anyway ` +
5339
+ `to avoid deadlock.\n\n` +
5340
+ `Action: review routing.md, add another agent for this work type, or enable ` +
5341
+ `\`allowTempAgents\` so the engine has a fallback target.\n`);
5342
+ } catch (e) { log('warn', 'per-agent retry inbox write failed: ' + e.message); }
5343
+ }
5344
+ }
5345
+ }
5137
5346
  let reservedAgentId = agentId;
5138
5347
  const cfgAgents = config.agents || {};
5139
5348
  const budgetBlocked = Object.keys(cfgAgents).some(id => {
@@ -6339,6 +6548,26 @@ async function tickInner() {
6339
6548
  // 2. Consolidate inbox
6340
6549
  safe('consolidateInbox', () => consolidateInbox(config));
6341
6550
 
6551
+ // 2.1. Auto-consolidate memory — opt-in periodic KB sweep. Inbox→notes
6552
+ // already runs above every tick (threshold-gated); this phase only adds
6553
+ // the KB sweep that was previously dashboard-button-only. Gated by
6554
+ // engine.autoConsolidateMemory; 4h cadence enforced inside shouldAutoSweep().
6555
+ if (config.engine?.autoConsolidateMemory === true) {
6556
+ safe('autoSweepKb', () => {
6557
+ const { shouldAutoSweep, spawnSweepRunnerDetached } = require('./engine/kb-sweep');
6558
+ const decision = shouldAutoSweep();
6559
+ if (!decision.shouldSpawn) return;
6560
+ const result = spawnSweepRunnerDetached({
6561
+ log: (level, msg) => log(level === 'error' ? 'warn' : 'info', `auto-sweep: ${msg}`),
6562
+ });
6563
+ if (result.ok) {
6564
+ log('info', `auto-sweep: spawned KB sweep (reason=${decision.reason}, pid=${result.pid})`);
6565
+ } else {
6566
+ log('warn', `auto-sweep: spawn failed: ${result.error}`);
6567
+ }
6568
+ });
6569
+ }
6570
+
6342
6571
  // 2.5. Periodic cleanup + MCP sync (every 10 ticks = ~5 minutes)
6343
6572
  if (tickCount % 10 === 0) {
6344
6573
  try { await runCleanup(config); } catch (e) { log('warn', `runCleanup: ${e.message}`); }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@yemi33/minions",
3
- "version": "0.1.2044",
3
+ "version": "0.1.2046",
4
4
  "description": "Multi-agent AI dev team that runs from ~/.minions/ — five autonomous agents share a single engine, dashboard, and knowledge base",
5
5
  "bin": {
6
6
  "minions": "bin/minions.js"