@yemi33/minions 0.1.2044 → 0.1.2045
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dashboard/js/command-center.js +64 -7
- package/dashboard/js/refresh.js +143 -2
- package/dashboard/js/render-prs.js +43 -9
- package/dashboard/js/settings.js +4 -0
- package/dashboard/styles.css +21 -0
- package/dashboard.js +21 -79
- package/docs/auto-discovery.md +3 -1
- package/docs/qa-runbook-lifecycle.md +71 -0
- package/docs/qa-runbooks.md +6 -5
- package/docs/runtime-adapters.md +1 -1
- package/docs/security.md +2 -1
- package/docs/watches.md +19 -19
- package/engine/cleanup.js +84 -2
- package/engine/dispatch.js +6 -0
- package/engine/kb-sweep.js +127 -0
- package/engine/lifecycle.js +18 -0
- package/engine/queries.js +84 -7
- package/engine/shared.js +36 -0
- package/engine/timeout.js +4 -0
- package/engine.js +240 -11
- package/package.json +1 -1
package/engine/queries.js
CHANGED
|
@@ -1772,8 +1772,17 @@ function _scheduleProjectGitStatusRefresh(localPath, key, configuredMainBranch)
|
|
|
1772
1772
|
if (existing && existing.promise) return existing.promise;
|
|
1773
1773
|
const entry = existing || { ts: 0, value: PROJECT_GIT_STATUS_PENDING, promise: null };
|
|
1774
1774
|
const prevValue = entry.value;
|
|
1775
|
+
// Capture probe-start time BEFORE running git, not after. Used as the
|
|
1776
|
+
// baseline for `_projectGitRefsAdvancedSince` on the next call. If we
|
|
1777
|
+
// captured probe-END time, a file written just before the probe started
|
|
1778
|
+
// could end up with `mtimeMs >= entry.ts` on a fast filesystem (NTFS
|
|
1779
|
+
// mtime granularity vs millisecond-precise Date.now()), busting the
|
|
1780
|
+
// cache spuriously on the very next read. Probe-START is the safer
|
|
1781
|
+
// anchor — any file with `mtimeMs > probeStartTs` legitimately changed
|
|
1782
|
+
// at-or-after the probe, so re-probing is correct.
|
|
1783
|
+
const probeStartTs = Date.now();
|
|
1775
1784
|
entry.promise = _probeProjectGitStatus(localPath, configuredMainBranch).then(value => {
|
|
1776
|
-
entry.ts =
|
|
1785
|
+
entry.ts = probeStartTs;
|
|
1777
1786
|
entry.value = value;
|
|
1778
1787
|
entry.promise = null;
|
|
1779
1788
|
if (_onProjectGitStatusChanged && !_projectGitStatusEqual(prevValue, value)) {
|
|
@@ -1824,6 +1833,30 @@ function _resolveGitDir(localPath) {
|
|
|
1824
1833
|
return null;
|
|
1825
1834
|
}
|
|
1826
1835
|
|
|
1836
|
+
// For a per-worktree gitdir, resolve the COMMON gitdir shared across all
|
|
1837
|
+
// linked worktrees of the same repo. Git writes per-worktree state
|
|
1838
|
+
// (HEAD, logs/HEAD, index) into the worktree-specific gitdir, but state
|
|
1839
|
+
// that's logically per-repo (objects, refs/remotes/, FETCH_HEAD) into
|
|
1840
|
+
// the common gitdir. The common location is recorded in
|
|
1841
|
+
// `<per-worktree-gitdir>/commondir`, a file containing either an
|
|
1842
|
+
// absolute path or a path relative to the per-worktree gitdir
|
|
1843
|
+
// (typically `../..` — resolves from `<main>/.git/worktrees/<name>` →
|
|
1844
|
+
// `<main>/.git`). For the main worktree there is no commondir file and
|
|
1845
|
+
// `gitDir` already IS the common gitdir, so this function returns
|
|
1846
|
+
// `gitDir` unchanged. Callers should track FETCH_HEAD and
|
|
1847
|
+
// refs/remotes/origin/<comparator> against the COMMON gitdir; logs/HEAD
|
|
1848
|
+
// must stay on the per-worktree gitdir so per-worktree HEAD moves still
|
|
1849
|
+
// register.
|
|
1850
|
+
/**
 * Map a gitdir to the common gitdir it points at via its `commondir` file.
 *
 * @param {string} gitDir - Gitdir to inspect (per-worktree or main).
 * @returns {string|null} null for a falsy `gitDir`; `gitDir` itself when the
 *   `commondir` file is unreadable or blank; otherwise the recorded location
 *   (absolute as written, or resolved against `gitDir` when relative).
 */
function _resolveCommonGitDir(gitDir) {
  if (!gitDir) return null;

  // Built outside the try so a malformed `gitDir` argument still surfaces as
  // an exception instead of being mistaken for "no commondir file".
  const pointerFile = path.join(gitDir, 'commondir');

  let recorded = '';
  try {
    const contents = fs.readFileSync(pointerFile, { encoding: 'utf8', flag: 'r' });
    recorded = contents.trim();
  } catch {
    // Unreadable/missing pointer file: treat gitDir as the common gitdir.
    return gitDir;
  }

  if (!recorded) return gitDir;
  if (path.isAbsolute(recorded)) return recorded;
  return path.resolve(gitDir, recorded);
}
|
|
1859
|
+
|
|
1827
1860
|
// Return true when any of the per-project git ref files (logs/HEAD,
|
|
1828
1861
|
// FETCH_HEAD, refs/remotes/origin/<comparator>) have mtimeMs > cachedTs.
|
|
1829
1862
|
// Lets `getProjectGitStatus` bypass its 15s TTL after `git pull`, `git
|
|
@@ -1836,13 +1869,17 @@ function _resolveGitDir(localPath) {
|
|
|
1836
1869
|
function _projectGitRefsAdvancedSince(localPath, cachedTs, configuredMainBranch) {
|
|
1837
1870
|
const gitDir = _resolveGitDir(localPath);
|
|
1838
1871
|
if (!gitDir) return false;
|
|
1872
|
+
// logs/HEAD is per-worktree; FETCH_HEAD + refs/remotes/origin/* live in
|
|
1873
|
+
// the COMMON gitdir for linked worktrees. For the main worktree both
|
|
1874
|
+
// resolve to the same place, so this is a no-op there.
|
|
1875
|
+
const commonGitDir = _resolveCommonGitDir(gitDir);
|
|
1839
1876
|
const candidates = [
|
|
1840
1877
|
path.join(gitDir, 'logs', 'HEAD'),
|
|
1841
|
-
path.join(
|
|
1878
|
+
path.join(commonGitDir, 'FETCH_HEAD'),
|
|
1842
1879
|
];
|
|
1843
1880
|
const comparator = configuredMainBranch && String(configuredMainBranch).trim();
|
|
1844
1881
|
if (comparator) {
|
|
1845
|
-
candidates.push(path.join(
|
|
1882
|
+
candidates.push(path.join(commonGitDir, 'refs', 'remotes', 'origin', comparator));
|
|
1846
1883
|
}
|
|
1847
1884
|
for (const file of candidates) {
|
|
1848
1885
|
try {
|
|
@@ -2052,8 +2089,16 @@ function getStatusFastStateMtimePaths(config) {
|
|
|
2052
2089
|
files.push(shared.projectPrPath(p));
|
|
2053
2090
|
if (p && p.localPath) {
|
|
2054
2091
|
const gitDir = _resolveGitDir(p.localPath) || path.join(p.localPath, '.git');
|
|
2092
|
+
// logs/HEAD is per-worktree (HEAD moves, commits, checkouts).
|
|
2093
|
+
// FETCH_HEAD lives in the COMMON gitdir — `git fetch` from a linked
|
|
2094
|
+
// worktree writes to `<main>/.git/FETCH_HEAD`, not to the
|
|
2095
|
+
// per-worktree subdir. Tracking only the per-worktree path here
|
|
2096
|
+
// would leave linked-worktree projects stuck after `git fetch`
|
|
2097
|
+
// (the file at `<main>/.git/worktrees/<name>/FETCH_HEAD` never
|
|
2098
|
+
// exists — verified empirically).
|
|
2099
|
+
const commonGitDir = _resolveCommonGitDir(gitDir);
|
|
2055
2100
|
files.push(path.join(gitDir, 'logs', 'HEAD'));
|
|
2056
|
-
files.push(path.join(
|
|
2101
|
+
files.push(path.join(commonGitDir, 'FETCH_HEAD'));
|
|
2057
2102
|
}
|
|
2058
2103
|
}
|
|
2059
2104
|
return files;
|
|
@@ -2100,9 +2145,25 @@ function getStatusFastStateMtimePaths(config) {
|
|
|
2100
2145
|
* Files intentionally NOT tracked here:
|
|
2101
2146
|
* - version, autoMode, installId — change only on human/CLI edits, which
|
|
2102
2147
|
* already pop the slow-state via reloadConfig + the 60 s TTL.
|
|
2103
|
-
*
|
|
2104
|
-
*
|
|
2105
|
-
*
|
|
2148
|
+
*
|
|
2149
|
+
* Per-project `.git/logs/HEAD` + `.git/FETCH_HEAD` ARE tracked here in
|
|
2150
|
+
* addition to the fast-state tracker. The project payload (`projects:`
|
|
2151
|
+
* with ahead/behind counts) lives in the slow-state slice, and
|
|
2152
|
+
* `getProjectGitStatus` is called only from `_buildStatusSlowState`.
|
|
2153
|
+
* Without tracking the git refs here, a fresh `git pull` busts
|
|
2154
|
+
* fast-state but leaves slow-state cached for up to 60 s —
|
|
2155
|
+
* `getProjectGitStatus` is never called, so the background probe never
|
|
2156
|
+
* runs, so the `_setOnProjectGitStatusChanged` callback never fires,
|
|
2157
|
+
* so the cache stays stale. Tracking the refs in BOTH tiers means any
|
|
2158
|
+
* HEAD-moving / fetching git operation busts slow-state on the next
|
|
2159
|
+
* poll, the rebuild calls `getProjectGitStatus` which then schedules
|
|
2160
|
+
* the probe, and the probe's invalidation callback finishes the
|
|
2161
|
+
* round-trip within ~1 poll cycle. `logs/HEAD` is resolved via
|
|
2162
|
+
* `_resolveGitDir` (per-worktree); `FETCH_HEAD` is resolved via
|
|
2163
|
+
* `_resolveCommonGitDir` (shared across all linked worktrees of the
|
|
2164
|
+
* same repo — `git fetch` from a linked worktree writes to the main
|
|
2165
|
+
* gitdir, not the per-worktree subdir, so tracking the per-worktree
|
|
2166
|
+
* path would silently miss every fetch).
|
|
2106
2167
|
*
|
|
2107
2168
|
* NOTE: Detecting a change here busts the dashboard's slow-state cache, but
|
|
2108
2169
|
* the inner per-source caches (`queries._skillsCache` 30 s, dashboard's
|
|
@@ -2170,6 +2231,22 @@ function getStatusSlowStateMtimePaths(config) {
|
|
|
2170
2231
|
}
|
|
2171
2232
|
}
|
|
2172
2233
|
|
|
2234
|
+
// Per-project git refs — see the "Per-project .git/logs/HEAD" note in
|
|
2235
|
+
// the header. Same pair the fast-state tracker watches; `logs/HEAD` is
|
|
2236
|
+
// per-worktree (resolved via `_resolveGitDir`) while `FETCH_HEAD` lives
|
|
2237
|
+
// in the COMMON gitdir (resolved via `_resolveCommonGitDir`) — git
|
|
2238
|
+
// fetches from a linked worktree write to `<main>/.git/FETCH_HEAD`,
|
|
2239
|
+
// not into the per-worktree subdir. For the main worktree both
|
|
2240
|
+
// resolvers return the same gitdir, so the behavior collapses to the
|
|
2241
|
+
// expected `<localPath>/.git/{logs/HEAD,FETCH_HEAD}` pair.
|
|
2242
|
+
for (const project of projects) {
|
|
2243
|
+
if (!project || !project.localPath) continue;
|
|
2244
|
+
const gitDir = _resolveGitDir(project.localPath) || path.join(project.localPath, '.git');
|
|
2245
|
+
const commonGitDir = _resolveCommonGitDir(gitDir);
|
|
2246
|
+
files.push(path.join(gitDir, 'logs', 'HEAD'));
|
|
2247
|
+
files.push(path.join(commonGitDir, 'FETCH_HEAD'));
|
|
2248
|
+
}
|
|
2249
|
+
|
|
2173
2250
|
return files;
|
|
2174
2251
|
}
|
|
2175
2252
|
|
package/engine/shared.js
CHANGED
|
@@ -1779,6 +1779,7 @@ const ENGINE_DEFAULTS = {
|
|
|
1779
1779
|
autoReReviewPrs: true, // auto-dispatch review agents after a PR fix is pushed
|
|
1780
1780
|
autoFixReviewFeedback: true, // auto-dispatch fix agents for minions review changes-requested verdicts
|
|
1781
1781
|
autoFixHumanComments: true, // auto-dispatch fix agents for actionable human PR comments
|
|
1782
|
+
autoConsolidateMemory: false, // opt-in: periodically spawn engine/kb-sweep-runner.js from the tick loop (4h cadence). Inbox→notes consolidation already runs every tick via consolidateInbox; this flag only controls the KB sweep.
|
|
1782
1783
|
prNoOpFixPauseAttempts: 2, // pause one PR automation cause after repeated no-op fixes for unchanged evidence
|
|
1783
1784
|
completionReportRetentionDays: 90, // retain completion report sidecars beyond capped dispatch history
|
|
1784
1785
|
completionReportMaxFiles: 5000, // hard cap for completion report sidecars during cleanup
|
|
@@ -1787,6 +1788,7 @@ const ENGINE_DEFAULTS = {
|
|
|
1787
1788
|
evalLoop: true, // enable review→fix loop after implementation completes
|
|
1788
1789
|
evalMaxCost: null, // USD ceiling per work item across all eval iterations; null = no limit (gather baseline data first)
|
|
1789
1790
|
maxRetries: 3, // max dispatch retries before marking work item as failed
|
|
1791
|
+
maxRetriesPerAgent: 2, // W-mpmwxn1j — per-agent retry cap. When the SAME agent fails the same WI this many times, the next retry MUST reassign to a different eligible agent (consults routing.md + agent availability). Falls back to the same agent only when no alternate is available. Counted separately from `maxRetries` (which caps total retries across all agents) and tracked on the WI as `_retriesByAgent: { agentId: count }`. Hard-pinned agents bypass reassignment (operator intent wins).
|
|
1790
1792
|
maxPhantomRetries: 3, // max retries for "phantom completion" (runtime crashed before emitting type:"result"); tracked separately from _retryCount so phantom retries don't pollute the normal PR-attachment retry budget. See engine/lifecycle.markMissingPrAttachment + detectNonTerminalResultSummary.
|
|
1791
1793
|
minRetryGapMs: 120000, // 2min — minimum gap between retry dispatches for the same work item; prevents tight retry loops when an idempotent agent (e.g. review bailing out on a duplicate) cannot produce the expected output (#1770)
|
|
1792
1794
|
pipelineApiRetries: 2, // max attempts for pipeline API calls
|
|
@@ -4752,6 +4754,39 @@ function safeSlugComponent(text, maxLen = 80) {
|
|
|
4752
4754
|
return `${base}-${hash}`.slice(0, maxLen);
|
|
4753
4755
|
}
|
|
4754
4756
|
|
|
4757
|
+
// W-mpmwxn1j — Per-agent retry tracking. After the same agent has failed a WI
|
|
4758
|
+
// `ENGINE_DEFAULTS.maxRetriesPerAgent` times, the next dispatch must reassign
|
|
4759
|
+
// to a different eligible agent. The counter is stored on the WI itself
|
|
4760
|
+
// (`_retriesByAgent: { agentId: count }`) so engine restart preserves the
|
|
4761
|
+
// state. The counter is cleared on successful completion alongside
|
|
4762
|
+
// `_retryCount`. The caller passes a mutable WI object (inside a
|
|
4763
|
+
// `mutateWorkItems` / `mutateJsonFileLocked` callback) and the failed agent ID.
|
|
4764
|
+
// Anonymous failures (no resolvable agent) skip the bump — we'd corrupt the
|
|
4765
|
+
// shape with an `undefined` key. Returns the new per-agent count for logging.
|
|
4766
|
+
/**
 * Increment the failure counter kept on a work item for one agent.
 *
 * @param {object} wi - Mutable work item; `_retriesByAgent` is created or
 *   replaced when it is missing, not an object, or an array.
 * @param {string} agentId - Agent whose counter is bumped.
 * @returns {number} The new count, or 0 when `wi` or `agentId` is falsy
 *   (nothing is written in that case).
 */
function bumpAgentRetryCount(wi, agentId) {
  if (!wi || !agentId) return 0;

  const existing = wi._retriesByAgent;
  const isUsableMap = existing && typeof existing === 'object' && !Array.isArray(existing);
  if (!isUsableMap) {
    wi._retriesByAgent = {};
  }

  const counts = wi._retriesByAgent;
  // Non-numeric stored values count as zero prior failures.
  const previous = Number(counts[agentId]) || 0;
  const updated = previous + 1;
  counts[agentId] = updated;
  return updated;
}
|
|
4775
|
+
|
|
4776
|
+
/**
 * Read the failure counter kept on a work item for one agent.
 *
 * @param {object} wi - Work item that may carry `_retriesByAgent`.
 * @param {string} agentId - Agent to look up.
 * @returns {number} The stored count coerced to a number; 0 when either
 *   argument is falsy, the map is missing/not an object/an array, or the
 *   stored value is not numeric.
 */
function getAgentRetryCount(wi, agentId) {
  if (!wi || !agentId) return 0;

  const counts = wi._retriesByAgent;
  const isUsableMap = Boolean(counts) && typeof counts === 'object' && !Array.isArray(counts);
  if (!isUsableMap) return 0;

  return Number(counts[agentId]) || 0;
}
|
|
4782
|
+
|
|
4783
|
+
/**
 * Resolve the per-agent retry cap from config.
 *
 * @param {object} [config] - Config object; `config.engine.maxRetriesPerAgent`
 *   is read when present.
 * @returns {number} The configured value when it coerces to a finite number
 *   greater than zero; otherwise `ENGINE_DEFAULTS.maxRetriesPerAgent`.
 */
function resolveMaxRetriesPerAgent(config) {
  const configured = Number(config?.engine?.maxRetriesPerAgent);
  const isUsable = Number.isFinite(configured) && configured > 0;
  return isUsable ? configured : ENGINE_DEFAULTS.maxRetriesPerAgent;
}
|
|
4789
|
+
|
|
4755
4790
|
const PR_AUTOMATION_CAUSE_LIMIT = 50;
|
|
4756
4791
|
|
|
4757
4792
|
function getPrAutomationCauses(pr) {
|
|
@@ -4912,6 +4947,7 @@ module.exports = {
|
|
|
4912
4947
|
tailTextBytes,
|
|
4913
4948
|
appendTextTail,
|
|
4914
4949
|
writeToInbox, parseNoteId,
|
|
4950
|
+
bumpAgentRetryCount, getAgentRetryCount, resolveMaxRetriesPerAgent,
|
|
4915
4951
|
exec,
|
|
4916
4952
|
execAsync,
|
|
4917
4953
|
execSilent,
|
package/engine/timeout.js
CHANGED
|
@@ -587,6 +587,10 @@ function checkTimeouts(config) {
|
|
|
587
587
|
log('info', `Reconcile: work item ${item.id} agent died — auto-retry ${retries + 1}/${maxRetries}`);
|
|
588
588
|
item.status = WI_STATUS.PENDING;
|
|
589
589
|
item._retryCount = retries + 1;
|
|
590
|
+
// W-mpmwxn1j — bump per-agent retry count BEFORE deleting
|
|
591
|
+
// dispatched_to so the next dispatch can reassign away from the
|
|
592
|
+
// agent that died. Anonymous (no dispatched_to) failures don't bump.
|
|
593
|
+
if (item.dispatched_to) shared.bumpAgentRetryCount(item, item.dispatched_to);
|
|
590
594
|
delete item.dispatched_at;
|
|
591
595
|
delete item.dispatched_to;
|
|
592
596
|
delete item._pendingReason;
|
package/engine.js
CHANGED
|
@@ -630,6 +630,26 @@ async function syncReusedWorktree(rootDir, worktreePath, branchName, gitOpts = {
|
|
|
630
630
|
}
|
|
631
631
|
|
|
632
632
|
// Find an existing worktree already checked out on a given branch
|
|
633
|
+
// Parse the holder path out of a `git worktree add` "already used" error so
|
|
634
|
+
// callers can give the operator something actionable instead of just re-
|
|
635
|
+
// throwing the raw git message. The two surface forms emitted by current git
|
|
636
|
+
// versions:
|
|
637
|
+
//
|
|
638
|
+
// fatal: 'work/W-X' is already used by worktree at 'D:/squad'
|
|
639
|
+
// fatal: 'work/W-X' is already checked out at 'D:/squad'
|
|
640
|
+
//
|
|
641
|
+
// Returns null on unparseable input so callers fall through to the generic
|
|
642
|
+
// re-throw path safely. Path is returned exactly as git printed it (forward
|
|
643
|
+
// or back slashes), since callers compare it against rootDir which is
|
|
644
|
+
// shaped the same way by upstream.
|
|
645
|
+
/**
 * Extract the holder path from a `git worktree add` "already used by worktree
 * at '<path>'" / "already checked out at '<path>'" failure message.
 *
 * @param {unknown} errMsg - Raw error text (coerced with String()).
 * @returns {string|null} The path as printed in the message, or null when the
 *   input is falsy or no holder clause is found.
 */
function _parseAlreadyUsedHolderPath(errMsg) {
  if (!errMsg) return null;
  const text = String(errMsg);

  // Preferred form: the quoted path runs to the end of its line. The greedy
  // capture plus the backreference to the opening quote keeps quote characters
  // that are part of the path itself (e.g. 'C:/Users/O'Brien/repo'), which a
  // "stop at the first quote" pattern would truncate. `[ \t\r]*` tolerates
  // trailing blanks and CRLF line endings; the `m` flag lets `$` match at the
  // end of any line of a multi-line error.
  const anchored = text.match(
    /already (?:used by worktree|checked out) at (['"])(.+)\1[ \t\r]*$/m
  );
  if (anchored) return anchored[2];

  // Fallback: text follows the closing quote on the same line (or the quotes
  // are mismatched) — take everything up to the first quote character.
  const loose = text.match(/already (?:used by worktree|checked out) at ['"]([^'"]+)['"]/);
  return loose ? loose[1] : null;
}
|
|
652
|
+
|
|
633
653
|
async function findExistingWorktree(repoDir, branchName) {
|
|
634
654
|
try {
|
|
635
655
|
const out = await shared.shellSafeGit(['worktree', 'list', '--porcelain'], { cwd: repoDir, timeout: 10000 });
|
|
@@ -658,6 +678,65 @@ function isWorktreeRetryableError(err) {
|
|
|
658
678
|
|| msg.includes('already exists');
|
|
659
679
|
}
|
|
660
680
|
|
|
681
|
+
// Distinct from isWorktreeRetryableError above: that one covers worktree-add
|
|
682
|
+
// LOCAL contention (index lock, file busy, already exists). This one covers
|
|
683
|
+
// `git fetch` NETWORK-side transients where retrying once is cheap and often
|
|
684
|
+
// recovers — network blips, transient TLS errors, github.com 5xx during a
|
|
685
|
+
// fetch. We deliberately do NOT retry on auth failures, repo-not-found, or
|
|
686
|
+
// "couldn't find remote ref" — those won't change on a 1-2s retry, and
|
|
687
|
+
// re-running them just wastes the dispatch budget.
|
|
688
|
+
/**
 * Decide whether a `git fetch` failure looks like a transient network-side
 * error (worth one retry) by scanning its text for known markers.
 *
 * @param {unknown} err - Whatever the fetch attempt threw: an Error, a bare
 *   string, or nullish.
 * @returns {boolean} true when the error text contains a transient marker.
 */
function _isTransientGitNetworkError(err) {
  // Fall back to the thrown value itself when it has no `.message`, matching
  // the `e.message || e` formatting _fetchWithTransientRetry uses for its log
  // lines; otherwise a thrown string could never be classified as transient.
  const msg = String(err?.message || err || '');

  // 'timed out' also covers "Connection timed out" and "Operation timed out",
  // so those longer variants need no entries of their own.
  const transientMarkers = [
    'ETIMEDOUT',
    'ECONNRESET',
    'ECONNREFUSED',
    'EAI_AGAIN',
    'Could not resolve host',
    'Connection reset',
    'timed out',
    '502 Bad Gateway',
    '503 Service Unavailable',
    '504 Gateway Timeout',
    'RPC failed',
    'HTTP/2 stream',
  ];
  return transientMarkers.some((marker) => msg.includes(marker));
}
|
|
705
|
+
|
|
706
|
+
// Wraps shellSafeGit(['fetch', ...]) with a single retry on transient
|
|
707
|
+
// network errors. Preserves the pre-fix convention that fetch failures
|
|
708
|
+
// are NON-fatal here: we log + swallow the terminal error so the caller
|
|
709
|
+
// falls back to the local ref. The retry just gives transient errors one
|
|
710
|
+
// shot to recover before that fallback kicks in — empirically eliminates
|
|
711
|
+
// most "couldn't find remote ref" cascades downstream caused by a 0-second
|
|
712
|
+
// network blip during the initial fetch.
|
|
713
|
+
//
|
|
714
|
+
// `args` is the array passed to git AFTER 'fetch' (e.g. ['origin', branch]).
|
|
715
|
+
// `label` is a short string used in the log line for triage.
|
|
716
|
+
/**
 * Run `git fetch <args>` and retry once, after 1.5 s, when the first failure
 * is classified as transient by `_isTransientGitNetworkError`. Failures are
 * logged and reported through the return value; the fetch error itself is
 * never re-thrown.
 *
 * @param {string[]} args - Arguments placed after `fetch`.
 * @param {object} opts - Options forwarded to `shared.shellSafeGit`.
 * @param {string} label - Short tag used in the log lines.
 * @returns {Promise<boolean>} true when a fetch attempt succeeded, else false.
 */
async function _fetchWithTransientRetry(args, opts, label) {
  const firstLineOf = (err) => String(err.message || err).split('\n')[0];
  const runFetch = async () => shared.shellSafeGit(['fetch', ...args], opts);

  let initialError;
  try {
    await runFetch();
    return true;
  } catch (e) {
    initialError = e;
  }

  if (!_isTransientGitNetworkError(initialError)) {
    log('warn', `git fetch ${label}: ${firstLineOf(initialError)}`);
    return false; // non-transient: give up immediately, caller uses its fallback
  }

  log('warn', `git fetch ${label}: transient (${firstLineOf(initialError)}) — retrying once after 1.5s`);
  await new Promise((resolve) => setTimeout(resolve, 1500));

  try {
    await runFetch();
  } catch (retryError) {
    log('warn', `git fetch ${label}: retry also failed (${firstLineOf(retryError)}) — falling back to local ref`);
    return false;
  }
  log('info', `git fetch ${label}: succeeded on retry`);
  return true;
}
|
|
739
|
+
|
|
661
740
|
function removeStaleIndexLock(rootDir) {
|
|
662
741
|
const lockFile = path.join(rootDir, '.git', 'index.lock');
|
|
663
742
|
try {
|
|
@@ -1055,6 +1134,62 @@ async function spawnAgent(dispatchItem, config) {
|
|
|
1055
1134
|
);
|
|
1056
1135
|
cleanupTempAgent(agentId);
|
|
1057
1136
|
};
|
|
1137
|
+
// Atomic read of the dispatch state to detect when another active
|
|
1138
|
+
// dispatch holds `branchName`. Returns the conflicting dispatch object
|
|
1139
|
+
// (or null when none). Uses mutateDispatch for the file-lock read even
|
|
1140
|
+
// though we don't mutate — the snapshot it gives back is consistent
|
|
1141
|
+
// with what a concurrent writer would see. Caller uses the return value
|
|
1142
|
+
// both as a boolean (truthy = conflict) and to extract the other
|
|
1143
|
+
// dispatch's id for the error message.
|
|
1144
|
+
// Look through the active dispatch list for an entry, other than `currentId`,
// whose sanitized `meta.branch` equals the sanitized `branch`.
// Returns that dispatch object, or null when there is none or when either
// argument is falsy. The dispatch state is handed back to mutateDispatch
// unmodified.
const _isBranchActivelyUsedByOtherDispatch = (branch, currentId) => {
  if (!branch || !currentId) return null;

  const wantedBranch = sanitizeBranch(branch);
  const holdsWantedBranch = (dispatch) => {
    // Entries without a recorded branch compare as the empty string.
    const heldBranch = dispatch.meta?.branch ? sanitizeBranch(dispatch.meta.branch) : '';
    return heldBranch === wantedBranch && dispatch.id !== currentId;
  };

  let holder = null;
  mutateDispatch((dp) => {
    const activeDispatches = dp.active || [];
    holder = activeDispatches.find(holdsWantedBranch) || null;
    return dp;
  });
  return holder;
};
|
|
1157
|
+
// Fail-fast handler for the "branch is checked out elsewhere AND prune
|
|
1158
|
+
// couldn't recover it" case. This is the bug class operators hit when the
|
|
1159
|
+
// project root itself (or a sibling worktree we can't see via
|
|
1160
|
+
// findExistingWorktree) is genuinely holding the branch — the existing
|
|
1161
|
+
// "throw eRemote" path retried every tick forever with no surface signal,
|
|
1162
|
+
// and findExistingWorktree deliberately filters out the project root so
|
|
1163
|
+
// the reuse path never fires for that case. Mark the dispatch non-
|
|
1164
|
+
// retryable so the storm stops, and put the holder path in the message
|
|
1165
|
+
// so the operator can see what to do (`git -C <holder> checkout master`
|
|
1166
|
+
// when the holder is the project root, or kill/finish the agent owning
|
|
1167
|
+
// the sibling worktree). Returns nothing — callers `return null` themselves right after calling it.
|
|
1168
|
+
// Complete the current dispatch as a non-agent-retryable WORKTREE_PREFLIGHT
// error because `branchName` is held by the checkout at `holderPath`.
// The summary and recovery hint differ depending on whether the holder
// resolves to the same path as `rootDir`. `rawErr`, when it carries a message,
// is logged at debug level (first line only). No return value.
const _failBranchHeldByExternalWorktree = (branchName, holderPath, rawErr) => {
  let heldByProjectRoot = false;
  if (holderPath && rootDir) {
    heldByProjectRoot = path.resolve(holderPath) === path.resolve(rootDir);
  }

  let summary;
  let recoveryHint;
  if (heldByProjectRoot) {
    summary = `Branch ${branchName} is held by the project root (${holderPath}). Spawning a worktree against it would create a nested checkout. Switch the root back to master to release the branch.`;
    recoveryHint = 'Branch held by the project root checkout. Recovery: `git -C <projectRoot> checkout master` (or whichever branch your engine root should be on). Until the root releases the branch, every retry will fail identically.';
  } else {
    summary = `Branch ${branchName} is held by an external worktree at ${holderPath} that this engine can't reach (findExistingWorktree filters out paths at-or-inside the project root, and the holder isn't visible to our worktree list). Resolve the lock and retry.`;
    recoveryHint = 'Branch held externally. Recovery: find the holding worktree (`git worktree list` from the project root), finish or remove it, then re-dispatch.';
  }

  log('error', `spawnAgent: ${summary}`);
  _cleanupPromptFiles();
  completeDispatch(
    id,
    DISPATCH_RESULT.ERROR,
    summary.slice(0, 800), // the stored summary is capped at 800 characters
    recoveryHint,
    { failureClass: FAILURE_CLASS.WORKTREE_PREFLIGHT, agentRetryable: false },
  );
  cleanupTempAgent(agentId);

  // The raw git output goes to a separate debug line so the summary above
  // stays focused on what the operator should do.
  if (rawErr && rawErr.message) {
    const rawFirstLine = String(rawErr.message).split('\n')[0];
    log('debug', `spawnAgent: raw worktree-add failure for ${id}: ${rawFirstLine}`);
  }
};
|
|
1058
1193
|
_phaseT.afterPrompt = Date.now();
|
|
1059
1194
|
|
|
1060
1195
|
if (branchName && READ_ONLY_ROOT_TASK_TYPES.has(type)) {
|
|
@@ -1173,6 +1308,31 @@ async function spawnAgent(dispatchItem, config) {
|
|
|
1173
1308
|
}
|
|
1174
1309
|
}
|
|
1175
1310
|
|
|
1311
|
+
// Pre-add concurrency guard: refuse to attempt `git worktree add` for a
|
|
1312
|
+
// branch that's already in flight under another dispatch. The
|
|
1313
|
+
// post-failure activelyUsed check on the non-shared path further down
|
|
1314
|
+
// still fires for the legitimate-reuse-of-existing-worktree case, but
|
|
1315
|
+
// the pre-add check closes the race window where two concurrent
|
|
1316
|
+
// dispatches both call `git worktree add` on the same branch before
|
|
1317
|
+
// either notices the conflict — and it now also covers the
|
|
1318
|
+
// shared-branch path, which previously had no activelyUsed guard at
|
|
1319
|
+
// all (a human-triggered review on a shared-branch plan item could
|
|
1320
|
+
// blindly reuse the in-flight implement's worktree).
|
|
1321
|
+
const _activelyUsedByOther = _isBranchActivelyUsedByOtherDispatch(branchName, id);
|
|
1322
|
+
if (_activelyUsedByOther) {
|
|
1323
|
+
const summary = `branch ${branchName} is actively used by dispatch ${_activelyUsedByOther.id} — refusing to spawn a concurrent worktree against the same branch`;
|
|
1324
|
+
log('warn', `spawnAgent: ${summary}`);
|
|
1325
|
+
_cleanupPromptFiles();
|
|
1326
|
+
completeDispatch(
|
|
1327
|
+
id,
|
|
1328
|
+
DISPATCH_RESULT.ERROR,
|
|
1329
|
+
summary,
|
|
1330
|
+
'Another dispatch holds this branch right now. This is retryable — the engine will try again on the next tick after the other dispatch completes.',
|
|
1331
|
+
);
|
|
1332
|
+
cleanupTempAgent(agentId);
|
|
1333
|
+
return null;
|
|
1334
|
+
}
|
|
1335
|
+
|
|
1176
1336
|
try {
|
|
1177
1337
|
if (!fs.existsSync(worktreePath)) {
|
|
1178
1338
|
const isSharedBranch = meta?.branchStrategy === 'shared-branch' || meta?.useExistingBranch;
|
|
@@ -1183,7 +1343,7 @@ async function spawnAgent(dispatchItem, config) {
|
|
|
1183
1343
|
|
|
1184
1344
|
if (isSharedBranch) {
|
|
1185
1345
|
log('info', `Creating worktree for shared branch: ${worktreePath} on ${branchName}`);
|
|
1186
|
-
|
|
1346
|
+
await _fetchWithTransientRetry(['origin', branchName], { ..._gitOpts, cwd: rootDir }, `origin/${branchName} (shared-branch pre-create)`);
|
|
1187
1347
|
try {
|
|
1188
1348
|
await runWorktreeAdd(rootDir, worktreePath, [branchName], _worktreeGitOpts, worktreeCreateRetries);
|
|
1189
1349
|
} catch (eShared) {
|
|
@@ -1205,7 +1365,16 @@ async function spawnAgent(dispatchItem, config) {
|
|
|
1205
1365
|
if (pruned > 0) {
|
|
1206
1366
|
log('info', `Pruned ${pruned} stale worktree entry(ies) for shared branch ${branchName}; retrying worktree add`);
|
|
1207
1367
|
await runWorktreeAdd(rootDir, worktreePath, [branchName], _worktreeGitOpts, 0);
|
|
1208
|
-
} else {
|
|
1368
|
+
} else {
|
|
1369
|
+
// Prune returned 0 — the holder is real, not stale metadata.
|
|
1370
|
+
// findExistingWorktree returned null because the holder is
|
|
1371
|
+
// at-or-inside the project root (the deliberate filter), or
|
|
1372
|
+
// because git's view drifted from ours. Surface non-retryably
|
|
1373
|
+
// with the holder path so the operator can act.
|
|
1374
|
+
const holder = _parseAlreadyUsedHolderPath(eShared.message);
|
|
1375
|
+
if (holder) { _failBranchHeldByExternalWorktree(branchName, holder, eShared); return null; }
|
|
1376
|
+
throw eShared;
|
|
1377
|
+
}
|
|
1209
1378
|
}
|
|
1210
1379
|
} else if (eShared.message?.includes('invalid reference') || eShared.message?.includes('not a valid ref')) {
|
|
1211
1380
|
// Branch doesn't exist yet (first item in plan) — create it from main
|
|
@@ -1235,7 +1404,7 @@ async function spawnAgent(dispatchItem, config) {
|
|
|
1235
1404
|
if (_branchOnRemote) {
|
|
1236
1405
|
// Mirror shared-branch fetch+add (~line 1157-1159).
|
|
1237
1406
|
log('info', `origin/${branchName} exists — checking out remote branch instead of -b from ${mainRef}`);
|
|
1238
|
-
|
|
1407
|
+
await _fetchWithTransientRetry(['origin', branchName], { ..._gitOpts, cwd: rootDir, timeout: 30000 }, `origin/${branchName} (non-shared pre-create)`);
|
|
1239
1408
|
try {
|
|
1240
1409
|
await runWorktreeAdd(rootDir, worktreePath, [branchName], _worktreeGitOpts, worktreeCreateRetries);
|
|
1241
1410
|
} catch (eRemote) {
|
|
@@ -1267,7 +1436,14 @@ async function spawnAgent(dispatchItem, config) {
|
|
|
1267
1436
|
if (pruned > 0) {
|
|
1268
1437
|
log('info', `Pruned ${pruned} stale worktree entry(ies) for ${branchName}; retrying worktree add`);
|
|
1269
1438
|
await runWorktreeAdd(rootDir, worktreePath, [branchName], _worktreeGitOpts, 0);
|
|
1270
|
-
} else {
|
|
1439
|
+
} else {
|
|
1440
|
+
// Same recovery shape as the shared-branch path above:
|
|
1441
|
+
// prune was a no-op, so the holder is a real worktree.
|
|
1442
|
+
// Mark non-retryable with the holder path in the message.
|
|
1443
|
+
const holder = _parseAlreadyUsedHolderPath(eRemote.message);
|
|
1444
|
+
if (holder) { _failBranchHeldByExternalWorktree(branchName, holder, eRemote); return null; }
|
|
1445
|
+
throw eRemote;
|
|
1446
|
+
}
|
|
1271
1447
|
}
|
|
1272
1448
|
} else { throw eRemote; }
|
|
1273
1449
|
}
|
|
@@ -1285,12 +1461,12 @@ async function spawnAgent(dispatchItem, config) {
|
|
|
1285
1461
|
// the dep-merge phase's own fetch + the on-failure
|
|
1286
1462
|
// `git reset --hard origin/<mainRef>` recovery remain as safety nets.
|
|
1287
1463
|
let _freshCreateBase = mainRef;
|
|
1288
|
-
|
|
1289
|
-
|
|
1290
|
-
|
|
1291
|
-
|
|
1292
|
-
|
|
1293
|
-
}
|
|
1464
|
+
const _baseFetchOk = await _fetchWithTransientRetry(
|
|
1465
|
+
['origin', mainRef],
|
|
1466
|
+
{ ..._gitOpts, cwd: rootDir, timeout: 30000 },
|
|
1467
|
+
`origin/${mainRef} (fresh-create base for ${branchName})`
|
|
1468
|
+
);
|
|
1469
|
+
if (_baseFetchOk) _freshCreateBase = `origin/${mainRef}`;
|
|
1294
1470
|
try {
|
|
1295
1471
|
await runWorktreeAdd(rootDir, worktreePath, ['-b', branchName, _freshCreateBase], _worktreeGitOpts, worktreeCreateRetries);
|
|
1296
1472
|
} catch (e1) {
|
|
@@ -1311,7 +1487,7 @@ async function spawnAgent(dispatchItem, config) {
|
|
|
1311
1487
|
}
|
|
1312
1488
|
} else {
|
|
1313
1489
|
// Branch already exists — try checkout without -b
|
|
1314
|
-
|
|
1490
|
+
await _fetchWithTransientRetry(['origin', branchName], { ..._gitOpts, cwd: rootDir }, `origin/${branchName} (fresh-create fallback)`);
|
|
1315
1491
|
try {
|
|
1316
1492
|
await runWorktreeAdd(rootDir, worktreePath, [branchName], _worktreeGitOpts, worktreeCreateRetries);
|
|
1317
1493
|
log('info', `Reusing existing branch: ${branchName}`);
|
|
@@ -5134,6 +5310,39 @@ function discoverFromWorkItems(config, project) {
|
|
|
5134
5310
|
const hardPinnedAgent = routing.getHardPinnedAgent(item, config.agents || {});
|
|
5135
5311
|
const hardPinRequested = !!hardPinnedAgent;
|
|
5136
5312
|
let agentId = hardPinnedAgent || resolveAgent(workType, config, { agentHints });
|
|
5313
|
+
// W-mpmwxn1j — Per-agent retry threshold. When the same agent has failed
|
|
5314
|
+
// this WI maxRetriesPerAgent times, force-reassign to a different
|
|
5315
|
+
// eligible agent. Hard-pinned agents bypass reassignment (operator
|
|
5316
|
+
// intent wins). If no alternate is available we fall back to the same
|
|
5317
|
+
// agent + write a dated inbox note so the operator can intervene.
|
|
5318
|
+
if (agentId && !hardPinRequested) {
|
|
5319
|
+
const maxPerAgent = shared.resolveMaxRetriesPerAgent(config);
|
|
5320
|
+
const failsForAgent = shared.getAgentRetryCount(item, agentId);
|
|
5321
|
+
if (failsForAgent >= maxPerAgent) {
|
|
5322
|
+
const altAgent = resolveAgent(workType, config, { agentHints, excludeAgent: agentId });
|
|
5323
|
+
if (altAgent && altAgent !== agentId) {
|
|
5324
|
+
log('info', `Per-agent retry threshold reached: ${item.id} reassigning ${agentId} → ${altAgent} (${failsForAgent}/${maxPerAgent} failures by ${agentId})`);
|
|
5325
|
+
agentId = altAgent;
|
|
5326
|
+
} else {
|
|
5327
|
+
// No alternate available — log + inbox note (writeToInbox dedupes
|
|
5328
|
+
// per-day-per-slug, so re-runs in the same day stay quiet).
|
|
5329
|
+
log('warn', `Per-agent retry threshold reached for ${item.id} (${agentId}) but no alternate agent available for work type "${workType}" — falling back to same agent`);
|
|
5330
|
+
try {
|
|
5331
|
+
shared.writeToInbox('engine', `per-agent-retry-no-alternate-${item.id}`,
|
|
5332
|
+
`# Per-agent retry threshold — no alternate available\n\n` +
|
|
5333
|
+
`Work item: \`${item.id}\` — ${item.title || ''}\n\n` +
|
|
5334
|
+
`Agent **${agentId}** has failed this WI ${failsForAgent} times ` +
|
|
5335
|
+
`(threshold: ${maxPerAgent}). The engine attempted to reassign to a ` +
|
|
5336
|
+
`different eligible agent for work type **${workType}** but no alternate was found ` +
|
|
5337
|
+
`(routing.md preferred/fallback both excluded, no idle named agent, ` +
|
|
5338
|
+
`temp agents disabled or budget exhausted). Re-dispatching to **${agentId}** anyway ` +
|
|
5339
|
+
`to avoid deadlock.\n\n` +
|
|
5340
|
+
`Action: review routing.md, add another agent for this work type, or enable ` +
|
|
5341
|
+
`\`allowTempAgents\` so the engine has a fallback target.\n`);
|
|
5342
|
+
} catch (e) { log('warn', 'per-agent retry inbox write failed: ' + e.message); }
|
|
5343
|
+
}
|
|
5344
|
+
}
|
|
5345
|
+
}
|
|
5137
5346
|
let reservedAgentId = agentId;
|
|
5138
5347
|
const cfgAgents = config.agents || {};
|
|
5139
5348
|
const budgetBlocked = Object.keys(cfgAgents).some(id => {
|
|
@@ -6339,6 +6548,26 @@ async function tickInner() {
|
|
|
6339
6548
|
// 2. Consolidate inbox
|
|
6340
6549
|
safe('consolidateInbox', () => consolidateInbox(config));
|
|
6341
6550
|
|
|
6551
|
+
// 2.1. Auto-consolidate memory — opt-in periodic KB sweep. Inbox→notes
|
|
6552
|
+
// already runs above every tick (threshold-gated); this phase only adds
|
|
6553
|
+
// the KB sweep that was previously dashboard-button-only. Gated by
|
|
6554
|
+
// engine.autoConsolidateMemory; 4h cadence enforced inside shouldAutoSweep().
|
|
6555
|
+
if (config.engine?.autoConsolidateMemory === true) {
|
|
6556
|
+
safe('autoSweepKb', () => {
|
|
6557
|
+
const { shouldAutoSweep, spawnSweepRunnerDetached } = require('./engine/kb-sweep');
|
|
6558
|
+
const decision = shouldAutoSweep();
|
|
6559
|
+
if (!decision.shouldSpawn) return;
|
|
6560
|
+
const result = spawnSweepRunnerDetached({
|
|
6561
|
+
log: (level, msg) => log(level === 'error' ? 'warn' : 'info', `auto-sweep: ${msg}`),
|
|
6562
|
+
});
|
|
6563
|
+
if (result.ok) {
|
|
6564
|
+
log('info', `auto-sweep: spawned KB sweep (reason=${decision.reason}, pid=${result.pid})`);
|
|
6565
|
+
} else {
|
|
6566
|
+
log('warn', `auto-sweep: spawn failed: ${result.error}`);
|
|
6567
|
+
}
|
|
6568
|
+
});
|
|
6569
|
+
}
|
|
6570
|
+
|
|
6342
6571
|
// 2.5. Periodic cleanup + MCP sync (every 10 ticks = ~5 minutes)
|
|
6343
6572
|
if (tickCount % 10 === 0) {
|
|
6344
6573
|
try { await runCleanup(config); } catch (e) { log('warn', `runCleanup: ${e.message}`); }
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@yemi33/minions",
|
|
3
|
-
"version": "0.1.2044",
|
|
3
|
+
"version": "0.1.2045",
|
|
4
4
|
"description": "Multi-agent AI dev team that runs from ~/.minions/ — five autonomous agents share a single engine, dashboard, and knowledge base",
|
|
5
5
|
"bin": {
|
|
6
6
|
"minions": "bin/minions.js"
|