@ikunin/sprintpilot 2.0.7 → 2.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -64,8 +64,25 @@ function parseLayer(raw) {
64
64
  }
65
65
 
66
66
  function planLayer({ keys, maxParallel, projectRoot, branchPrefix, baseBranch }) {
67
- const effectiveParallel = Math.max(1, Math.min(maxParallel | 0, keys.length));
68
- const worktrees = keys.map((key) => ({
67
+ // Dedupe story keys — a duplicated key in --layer would otherwise
68
+ // produce two entries pointing at the same worktree path and same
69
+ // branch name, racing on `git worktree add`.
70
+ const seen = new Set();
71
+ const dedupedKeys = [];
72
+ for (const k of keys) {
73
+ if (seen.has(k)) continue;
74
+ seen.add(k);
75
+ dedupedKeys.push(k);
76
+ }
77
+ const effectiveParallel = Math.max(1, Math.min(maxParallel | 0, dedupedKeys.length));
78
+ // CAP: only dispatch the first `effectiveParallel` stories. The
79
+ // remaining keys are deferred — the autopilot loop will pick them up
80
+ // in the next iteration after this batch completes. Pre-2.0.8 the
81
+ // script created worktrees for ALL keys regardless of the cap, then
82
+ // the workflow spawned N agents anyway, fully ignoring --max-parallel.
83
+ const dispatchedKeys = dedupedKeys.slice(0, effectiveParallel);
84
+ const deferredKeys = dedupedKeys.slice(effectiveParallel);
85
+ const worktrees = dispatchedKeys.map((key) => ({
69
86
  story: key,
70
87
  worktree: path.join(projectRoot, '.worktrees', key),
71
88
  branch: `${branchPrefix}${key}`,
@@ -77,6 +94,7 @@ function planLayer({ keys, maxParallel, projectRoot, branchPrefix, baseBranch })
77
94
  effective_parallel: effectiveParallel,
78
95
  max_parallel: maxParallel,
79
96
  stories: worktrees,
97
+ deferred: deferredKeys,
80
98
  };
81
99
  }
82
100
 
@@ -90,8 +108,14 @@ function writePlan(projectRoot, plan) {
90
108
  return file;
91
109
  }
92
110
 
111
+ // Match git's "branch already exists" diagnostic. We retry without -b
112
+ // only when the FIRST attempt failed for this specific reason —
113
+ // pre-2.0.8 the bare retry fired on ANY first-attempt failure and
114
+ // silently checked out whatever stale branch happened to exist at the
115
+ // requested name (e.g. last week's commits from an abandoned story).
116
+ const BRANCH_EXISTS_RE = /a branch named .* already exists/i;
117
+
93
118
  function createWorktree({ projectRoot, worktree, branch, baseBranch }) {
94
- // Try -b first, fall back to checkout-existing-branch if already present.
95
119
  const args = ['worktree', 'add', worktree, '-b', branch];
96
120
  if (baseBranch) args.push(baseBranch);
97
121
  const first = spawnSync('git', ['-C', projectRoot, ...args], {
@@ -99,7 +123,16 @@ function createWorktree({ projectRoot, worktree, branch, baseBranch }) {
99
123
  stdio: ['ignore', 'pipe', 'pipe'],
100
124
  });
101
125
  if (first.status === 0) return { created: true, retried: false, stderr: first.stderr || '' };
102
- // Retry without -b (branch exists).
126
+ // Only retry without -b if git specifically reported the branch
127
+ // already exists. Any other error (path collision, missing base
128
+ // branch, dirty index, etc.) is propagated rather than masked.
129
+ if (!BRANCH_EXISTS_RE.test(first.stderr || '')) {
130
+ return {
131
+ created: false,
132
+ retried: false,
133
+ stderr: first.stderr || '',
134
+ };
135
+ }
103
136
  const second = spawnSync(
104
137
  'git',
105
138
  ['-C', projectRoot, 'worktree', 'add', worktree, branch],
@@ -112,26 +145,45 @@ function createWorktree({ projectRoot, worktree, branch, baseBranch }) {
112
145
  };
113
146
  }
114
147
 
148
/**
 * Turn off automatic garbage collection for a freshly created worktree.
 *
 * Runs `git config --local gc.auto 0` inside `worktree`. The sequential
 * path in workflow.md already does this; the parallel path skipped it
 * before 2.0.8, so concurrent sub-agents in heavy repos could trigger gc
 * on each worktree mid-dispatch.
 *
 * Best-effort: the spawn result is deliberately ignored so that a failed
 * config write never blocks dispatch.
 *
 * @param {string} worktree - Path of the worktree to configure.
 * @returns {void}
 */
function disableGcAutoOnWorktree(worktree) {
  const gitArgs = ['-C', worktree, 'config', '--local', 'gc.auto', '0'];
  const spawnOptions = {
    encoding: 'utf8',
    stdio: ['ignore', 'pipe', 'pipe'],
  };
  spawnSync('git', gitArgs, spawnOptions);
}
159
+
160
/**
 * Remove worktrees that were created earlier in a dispatch that later
 * failed, so no orphaned worktrees stay on disk.
 *
 * Each entry is removed with `git worktree remove --force`. Rollback is
 * best-effort: a failed removal is logged as a warning and the loop moves
 * on to the next entry; nothing is thrown and nothing is returned.
 *
 * @param {string} projectRoot - Repository root passed to `git -C`.
 * @param {Array<{worktree: string}>} created - Entries whose worktrees
 *   were successfully created and must now be removed.
 * @returns {void}
 */
function rollbackWorktrees(projectRoot, created) {
  for (const entry of created) {
    const r = spawnSync(
      'git',
      ['-C', projectRoot, 'worktree', 'remove', '--force', entry.worktree],
      { encoding: 'utf8', stdio: ['ignore', 'pipe', 'pipe'] },
    );
    if (r.status !== 0) {
      // When the process could not be spawned at all (e.g. git missing),
      // `status` is null and stderr is empty; the only diagnostic is
      // `r.error`, so fall back to it before giving up with "unknown".
      const detail =
        (r.stderr || '').trim() || (r.error && r.error.message) || 'unknown';
      log.warn(`failed to roll back worktree ${entry.worktree}: ${detail}`);
    }
  }
}
176
+
115
177
  function dispatch({ keys, maxParallel, projectRoot, branchPrefix, baseBranch, dryRun }) {
116
178
  const plan = planLayer({ keys, maxParallel, projectRoot, branchPrefix, baseBranch });
117
179
  const results = {
118
180
  plan_file: null,
119
181
  effective_parallel: plan.effective_parallel,
120
182
  stories: [],
183
+ deferred: plan.deferred,
121
184
  dry_run: !!dryRun,
122
185
  };
123
- if (!dryRun) {
124
- for (const entry of plan.stories) {
125
- const out = createWorktree({
126
- projectRoot,
127
- worktree: entry.worktree,
128
- branch: entry.branch,
129
- baseBranch: entry.base_branch,
130
- });
131
- results.stories.push({ story: entry.story, worktree: entry.worktree, branch: entry.branch, ...out });
132
- }
133
- results.plan_file = writePlan(projectRoot, plan);
134
- } else {
186
+ if (dryRun) {
135
187
  results.stories = plan.stories.map((e) => ({
136
188
  story: e.story,
137
189
  worktree: e.worktree,
@@ -140,7 +192,60 @@ function dispatch({ keys, maxParallel, projectRoot, branchPrefix, baseBranch, dr
140
192
  retried: false,
141
193
  stderr: '(dry-run)',
142
194
  }));
195
+ return results;
196
+ }
197
+ // Real dispatch. Track successful creates so we can roll them back if
198
+ // a later create fails — leaving an orphan worktree + a plan file
199
+ // claiming it succeeded was the v2.0.7 partial-failure bug.
200
+ const succeeded = [];
201
+ let failureIndex = -1;
202
+ for (let i = 0; i < plan.stories.length; i++) {
203
+ const entry = plan.stories[i];
204
+ const out = createWorktree({
205
+ projectRoot,
206
+ worktree: entry.worktree,
207
+ branch: entry.branch,
208
+ baseBranch: entry.base_branch,
209
+ });
210
+ results.stories.push({
211
+ story: entry.story,
212
+ worktree: entry.worktree,
213
+ branch: entry.branch,
214
+ ...out,
215
+ });
216
+ if (out.created) {
217
+ disableGcAutoOnWorktree(entry.worktree);
218
+ succeeded.push(entry);
219
+ } else {
220
+ failureIndex = i;
221
+ break; // stop creating; remaining keys are not attempted
222
+ }
223
+ }
224
+ if (failureIndex !== -1) {
225
+ rollbackWorktrees(projectRoot, succeeded);
226
+ // Mark the previously-succeeded entries as rolled back so the
227
+ // workflow doesn't think their worktrees still exist on disk.
228
+ for (let i = 0; i < failureIndex; i++) {
229
+ results.stories[i].rolled_back = true;
230
+ results.stories[i].created = false;
231
+ }
232
+ // Mark untried-after-failure stories (the keys past failureIndex
233
+ // that we never attempted) so the workflow can see what's missing.
234
+ for (let i = failureIndex + 1; i < plan.stories.length; i++) {
235
+ results.stories.push({
236
+ story: plan.stories[i].story,
237
+ worktree: plan.stories[i].worktree,
238
+ branch: plan.stories[i].branch,
239
+ created: false,
240
+ retried: false,
241
+ stderr: '(skipped — earlier dispatch failed)',
242
+ });
243
+ }
244
+ // Do NOT write the plan file on partial failure — workflow.md
245
+ // should never read a plan describing worktrees that don't exist.
246
+ return results;
143
247
  }
248
+ results.plan_file = writePlan(projectRoot, plan);
144
249
  return results;
145
250
  }
146
251
 
@@ -53,11 +53,15 @@ const VALID_ACTIONS = ['start', 'end', 'once', 'mark'];
53
53
  // paths.
54
54
  //
55
55
  // Sanity ceiling for a single duration record. Phase durations longer
56
- // than this are treated as overflow (likely a forgotten _end across
57
- // sessions or a long-paused autopilot run) and clamped to 0 with
58
- // `over_threshold: true` stamped. 7 days chosen so legitimate
59
- // weekend-spanning sprint-level phases (sprint, dispatch.layer-X) are
60
- // preserved; only genuinely stale markers get clamped.
56
+ // than this are treated as overflow (likely a stale marker from an
57
+ // abandoned session) and clamped to 0 with `over_threshold: true`
58
+ // stamped. 7 days chosen so legitimate weekend-spanning sprint-level
59
+ // phases (sprint, dispatch.layer-X) are preserved; only genuinely
60
+ // stale markers get clamped.
61
+ //
62
+ // Negative deltas (wall-clock backsteps) are an orthogonal anomaly,
63
+ // flagged with `clock_skew: true` instead. The two flags are mutually
64
+ // exclusive — see the JSDoc on `markPhase`.
61
65
  const MAX_PLAUSIBLE_DURATION_MS = 7 * 24 * 60 * 60 * 1000;
62
66
 
63
67
  function help() {
@@ -315,10 +319,16 @@ function clearMarker(projectRoot, story) {
315
319
  * duration record but the next mark will read the new marker (not the
316
320
  * stale prev) and won't double-count.
317
321
  *
318
- * Wall-clock skew: durations are clamped to [0, MAX_PLAUSIBLE_DURATION_MS]
319
- * with a `clock_skew: true` flag in the entry so aggregators don't get
320
- * poisoned by NTP backsteps, DST transitions, or container clock skips
321
- * forward of unrealistic magnitudes (e.g. "this skill ran for 7 hours").
322
+ * Two anomaly classes are flagged separately so consumers can treat
323
+ * them differently. Both clamp `duration_ms` to 0:
324
+ * - `clock_skew: true` — wall-clock went backwards (NTP backstep,
325
+ * DST transition, manual clock change). The flag is reliable as a
326
+ * "the clock did something weird" signal.
327
+ * - `over_threshold: true` — elapsed time exceeded
328
+ * `MAX_PLAUSIBLE_DURATION_MS` (7 days). Almost always a stale
329
+ * marker from an abandoned session, not a real measurement.
330
+ * The flags are mutually exclusive (a single rawDelta can be either
331
+ * negative OR exceed the ceiling, never both).
322
332
  *
323
333
  * Returns { duration_ms, prev_phase } so callers can log/inspect.
324
334
  */
@@ -51,8 +51,100 @@ function help() {
51
51
  );
52
52
  }
53
53
 
54
/**
 * Read BMad's `output_folder` setting from `_bmad/bmm/config.yaml`, so
 * projects with a non-default output directory stay in sync with sibling
 * scripts.
 *
 * Only a single-line scalar is recognised: a double-quoted value, a
 * single-quoted value, or an unquoted token ending at the first
 * whitespace. The value must sit on the same line as the key.
 *
 * NOTE(review): the value is returned verbatim; if configs can contain
 * placeholders (e.g. a project-root token) they are not expanded here —
 * confirm against the config format.
 *
 * @param {string} projectRoot - Project root containing `_bmad/`.
 * @returns {string|null} The configured folder, or null when the file is
 *   missing, unreadable, has no `output_folder` key, or the value is empty.
 */
function readOutputFolder(projectRoot) {
  const cfg = path.join(projectRoot, '_bmad', 'bmm', 'config.yaml');
  let body;
  try {
    body = fs.readFileSync(cfg, 'utf8');
  } catch {
    // Missing or unreadable config: caller falls back to the default.
    return null;
  }
  // `[ \t]*` (not `\s*`) keeps the match on the key's own line; `\s*`
  // would skip a newline after an empty value and capture the NEXT key.
  // Quoted alternatives come first so values with spaces survive intact.
  const m = body.match(
    /^output_folder[ \t]*:[ \t]*(?:"([^"\r\n]*)"|'([^'\r\n]*)'|(\S+))/m,
  );
  if (!m) return null;
  const raw = [m[1], m[2], m[3]].find((v) => v !== undefined) || '';
  const value = raw.trim();
  return value === '' ? null : value;
}
69
+
54
70
  function implArtifactsDir(projectRoot) {
55
- return path.join(projectRoot, '_bmad-output', 'implementation-artifacts');
71
+ const folder = readOutputFolder(projectRoot) || '_bmad-output';
72
+ return path.join(projectRoot, folder, 'implementation-artifacts');
73
+ }
74
+
75
+ // ──────────────────────────────────────────────────────────────────
76
+ // Cross-process merge lock
77
+ // ──────────────────────────────────────────────────────────────────
78
+ //
79
+ // Pre-2.0.8 two concurrent merge invocations would each compute the
80
+ // merge in-memory then both rename their tmp file over autopilot-state
81
+ // .yaml. Tmp filenames are unique, so the renames never collided on
82
+ // the source — but the LAST rename wins on the destination, and the
83
+ // earlier merge (potentially with newer shard data) was clobbered.
84
+ // Combined with the archive race below, the loser's archived shards
85
+ // were also gone — silent state rewind.
86
+ //
87
+ // The fix: a sibling lock file. If another invocation holds the lock,
88
+ // either wait briefly + retry, or fail with a clear message naming
89
+ // the holder's pid and start time so the operator can diagnose.
90
+
91
+ const MERGE_LOCK_FILE = '.merge-shards.lock';
92
+ const STALE_LOCK_AGE_MS = 5 * 60 * 1000; // 5 minutes — merges are fast
93
+
94
/**
 * Compute the location of the merge lock file for a project.
 *
 * @param {string} projectRoot - Project root directory.
 * @returns {string} Path of the lock file inside the implementation
 *   artifacts directory.
 */
function lockPath(projectRoot) {
  const artifactsDir = implArtifactsDir(projectRoot);
  return path.join(artifactsDir, MERGE_LOCK_FILE);
}
97
+
98
/**
 * Acquire the cross-process merge lock by exclusively creating the lock
 * file and stamping it with this process's pid and an ISO timestamp.
 *
 * At most two attempts are made. If the first attempt finds an existing
 * lock whose mtime is older than `STALE_LOCK_AGE_MS`, the lock is removed
 * and creation is retried once. A lock that is not stale, or that is
 * still present on the second attempt, results in an error naming the
 * holder payload.
 *
 * @param {string} projectRoot - Project root directory.
 * @returns {string} Path of the lock file now held by this process.
 * @throws {Error} When another invocation holds the lock, or when the
 *   lock file cannot be created or written for any other reason.
 */
function acquireMergeLock(projectRoot) {
  const file = lockPath(projectRoot);
  fs.mkdirSync(path.dirname(file), { recursive: true });
  for (let attempt = 0; attempt < 2; attempt++) {
    let fd;
    try {
      // 'wx' fails with EEXIST if the file exists, making create atomic.
      fd = fs.openSync(file, 'wx');
    } catch (e) {
      if (e.code !== 'EEXIST') throw e;
      // Try stale-recovery once.
      if (attempt === 0) {
        try {
          const st = fs.statSync(file);
          if (Date.now() - st.mtimeMs > STALE_LOCK_AGE_MS) {
            log.warn(`merge-shards: removing stale lock ${file} (older than ${STALE_LOCK_AGE_MS}ms)`);
            fs.unlinkSync(file);
            continue;
          }
        } catch {
          /* lock vanished between EEXIST and stat — retry */
          continue;
        }
      }
      let holder = '';
      try {
        holder = fs.readFileSync(file, 'utf8');
      } catch {
        /* ignore — holder info is diagnostic only */
      }
      throw new Error(
        `merge-shards: another invocation holds ${file} (${holder}); ` +
          'wait for it to finish or remove the lock manually if known stale',
      );
    }

    // We own the lock file from here on. Always close the descriptor, and
    // if stamping fails remove the file so an unstamped lock does not
    // block later merges until the stale timeout expires.
    let stamped = false;
    try {
      const payload = JSON.stringify({
        pid: process.pid,
        ts: new Date().toISOString(),
      });
      fs.writeSync(fd, payload);
      stamped = true;
    } finally {
      fs.closeSync(fd);
      if (!stamped) {
        try {
          fs.unlinkSync(file);
        } catch {
          /* best-effort cleanup; the original write error still propagates */
        }
      }
    }
    return file;
  }
  throw new Error(`merge-shards: failed to acquire lock at ${file}`);
}
141
+
142
/**
 * Release the merge lock by deleting the lock file.
 *
 * Idempotent: any failure to unlink (for example because the file is
 * already gone) is swallowed, so the function never throws.
 *
 * @param {string} file - Path of the lock file to remove.
 * @returns {void}
 */
function releaseMergeLock(file) {
  try {
    fs.unlinkSync(file);
  } catch {
    // Intentionally ignored so repeated releases are harmless.
  }
}
57
149
 
58
150
  function readShardFile(file) {
@@ -91,16 +183,49 @@ function compareStamps(a, b) {
91
183
  return 0;
92
184
  }
93
185
 
186
/**
 * Capture a shard file's identity at read time so it can be verified as
 * unchanged before archiving. Without this check, a worker writing a
 * fresh shard between merge-read and archive-rename would have its shard
 * archived without ever being merged.
 *
 * @param {string} file - Path of the shard file.
 * @returns {{mtime: number, size: number, ino: number}|null} The file's
 *   modification time (ms), size and inode, or null if it cannot be
 *   stat'ed.
 */
function snapshotShard(file) {
  let stats;
  try {
    stats = fs.statSync(file);
  } catch {
    return null;
  }
  const { mtimeMs: mtime, size, ino } = stats;
  return { mtime, size, ino };
}
199
+
200
/**
 * Check whether a shard file still matches a previously taken snapshot.
 *
 * @param {string} file - Path of the shard file.
 * @param {{mtime: number, size: number, ino: number}|null} snapshot -
 *   Snapshot taken earlier; a falsy snapshot never matches.
 * @returns {boolean} True only when the file can be stat'ed and its
 *   mtime, size and inode all strictly equal the snapshot's values.
 */
function shardUnchanged(file, snapshot) {
  if (!snapshot) return false;
  let current;
  try {
    current = fs.statSync(file);
  } catch {
    return false;
  }
  const comparisons = [
    [current.mtimeMs, snapshot.mtime],
    [current.size, snapshot.size],
    [current.ino, snapshot.ino],
  ];
  return comparisons.every(([now, before]) => now === before);
}
213
+
94
214
  function mergeStateShards(projectRoot) {
95
- // Returns { byStory: { [storyKey]: shard }, corrupt: [...], invalid: [...] }
215
+ // Returns { byStory: { [storyKey]: shard }, snapshots: { [storyKey]: stat },
216
+ // corrupt: [...], invalid: [...] }
96
217
  const dir = path.join(implArtifactsDir(projectRoot), KIND_DIR.state);
97
- if (!fs.existsSync(dir)) return { byStory: {}, corrupt: [], invalid: [] };
218
+ if (!fs.existsSync(dir)) return { byStory: {}, snapshots: {}, corrupt: [], invalid: [] };
98
219
  const stories = listShardStories(projectRoot, 'state');
99
220
  const byStory = {};
221
+ const snapshots = {};
100
222
  const corrupt = [];
101
223
  const invalid = [];
102
224
  for (const story of stories) {
103
225
  const file = path.join(dir, `${story}.yaml`);
226
+ // Snapshot BEFORE reading so a writer that touches the file during
227
+ // read still produces a stat mismatch later.
228
+ const snap = snapshotShard(file);
104
229
  let shard;
105
230
  try {
106
231
  shard = readShardFile(file);
@@ -113,19 +238,33 @@ function mergeStateShards(projectRoot) {
113
238
  continue;
114
239
  }
115
240
  byStory[story] = shard;
241
+ snapshots[story] = snap;
116
242
  }
117
- return { byStory, corrupt, invalid };
243
+ return { byStory, snapshots, corrupt, invalid };
244
+ }
245
+
246
/**
 * Convert a timestamp to epoch milliseconds defensively.
 *
 * Malformed or missing input yields 0 rather than NaN, because NaN
 * makes sort comparators inconsistent and leaves entries in undefined
 * order.
 *
 * `Date` instances are read with `getTime()` instead of going through
 * `Date.parse`, which would stringify them first and drop milliseconds.
 * NOTE(review): whether the shard reader ever yields `Date` objects
 * depends on the YAML parser in use — confirm; strings behave as before.
 *
 * @param {string|Date|null|undefined} ts - Timestamp to convert.
 * @returns {number} Epoch milliseconds, or 0 when `ts` is falsy or
 *   cannot be parsed.
 */
function tsToMs(ts) {
  if (!ts) return 0;
  const v = ts instanceof Date ? ts.getTime() : Date.parse(ts);
  return Number.isFinite(v) ? v : 0;
}
119
256
 
120
257
  function mergeDecisionShards(projectRoot) {
121
258
  const dir = path.join(implArtifactsDir(projectRoot), KIND_DIR['decision-log']);
122
- if (!fs.existsSync(dir)) return { entries: [], corrupt: [], invalid: [] };
123
- const stories = listShardStories(projectRoot, 'decision-log');
259
+ if (!fs.existsSync(dir)) return { entries: [], snapshots: {}, corrupt: [], invalid: [] };
260
+ const stories = listShardStories(projectRoot, 'decision-log').sort();
124
261
  const entries = [];
262
+ const snapshots = {};
125
263
  const corrupt = [];
126
264
  const invalid = [];
127
265
  for (const story of stories) {
128
266
  const file = path.join(dir, `${story}.yaml`);
267
+ const snap = snapshotShard(file);
129
268
  let shard;
130
269
  try {
131
270
  shard = readShardFile(file);
@@ -142,29 +281,50 @@ function mergeDecisionShards(projectRoot) {
142
281
  if (!item || typeof item !== 'object') continue;
143
282
  entries.push({ ...item, _story: story });
144
283
  }
284
+ snapshots[story] = snap;
145
285
  }
146
- // Dedupe by id (if present), otherwise keep all. Sort by ts ascending.
286
+ // Deterministic dedup: sort by (id asc, ts DESC) first, then keep the
287
+ // first entry for each id — that's the latest-by-ts. Pre-2.0.8 the
288
+ // dedup was iteration-order-dependent (filesystem readdir order is
289
+ // unspecified), so identical inputs produced different outputs across
290
+ // OSes. Idempotency claim was filesystem-dependent.
291
+ entries.sort((a, b) => {
292
+ const ai = a.id !== undefined && a.id !== null ? String(a.id) : '';
293
+ const bi = b.id !== undefined && b.id !== null ? String(b.id) : '';
294
+ if (ai !== bi) return ai < bi ? -1 : 1;
295
+ // Within same id: latest ts wins (desc).
296
+ const aw = tsToMs(a.ts);
297
+ const bw = tsToMs(b.ts);
298
+ return bw - aw;
299
+ });
147
300
  const seen = new Set();
148
301
  const deduped = [];
149
302
  for (const e of entries) {
150
- if (e.id !== undefined && e.id !== null && seen.has(String(e.id))) continue;
151
- if (e.id !== undefined && e.id !== null) seen.add(String(e.id));
303
+ if (e.id !== undefined && e.id !== null) {
304
+ if (seen.has(String(e.id))) continue;
305
+ seen.add(String(e.id));
306
+ }
152
307
  deduped.push(e);
153
308
  }
309
+ // Final sort for output: ts ascending, with deterministic tiebreaks.
154
310
  deduped.sort((a, b) => {
155
- const aw = a.ts ? Date.parse(a.ts) : 0;
156
- const bw = b.ts ? Date.parse(b.ts) : 0;
311
+ const aw = tsToMs(a.ts);
312
+ const bw = tsToMs(b.ts);
157
313
  if (aw !== bw) return aw - bw;
158
- // Tiebreak alphabetically by id then story for determinism.
159
314
  const ai = a.id !== undefined ? String(a.id) : '';
160
315
  const bi = b.id !== undefined ? String(b.id) : '';
161
316
  if (ai !== bi) return ai < bi ? -1 : 1;
162
317
  return (a._story || '').localeCompare(b._story || '');
163
318
  });
164
- return { entries: deduped.map((e) => {
165
- const { _story, ...rest } = e;
166
- return rest;
167
- }), corrupt, invalid };
319
+ return {
320
+ entries: deduped.map((e) => {
321
+ const { _story, ...rest } = e;
322
+ return rest;
323
+ }),
324
+ snapshots,
325
+ corrupt,
326
+ invalid,
327
+ };
168
328
  }
169
329
 
170
330
  function archiveCorrupt(projectRoot, kind, story, file, reason) {
@@ -186,18 +346,38 @@ function archiveCorrupt(projectRoot, kind, story, file, reason) {
186
346
  return { archived: dest, reason };
187
347
  }
188
348
 
189
- function archiveShardsToLayer(projectRoot, layerId, storyKeys) {
190
- const ts = layerId || new Date().toISOString().replace(/[:.]/g, '-');
349
+ function archiveShardsToLayer(projectRoot, layerId, snapshotsByKind) {
350
+ // Default layerId includes pid + hrtime to avoid collision when two
351
+ // archive operations land in the same millisecond on fast CI. Pre-
352
+ // 2.0.8 the bare ISO timestamp could collide and the second archive
353
+ // would race-clobber the first.
354
+ const ts =
355
+ layerId ||
356
+ `${new Date().toISOString().replace(/[:.]/g, '-')}-${process.pid}-${process.hrtime.bigint().toString(36)}`;
191
357
  const base = path.join(implArtifactsDir(projectRoot), '.archive', `layer-${ts}`);
192
358
  fs.mkdirSync(base, { recursive: true });
359
+ // Snapshot-verify each shard before moving — if a writer touched the
360
+ // file after merge-read but before archive, the stat won't match and
361
+ // we must NOT move it (otherwise the fresh shard's contents are lost
362
+ // without ever being folded into the merged YAML). Skip + log so the
363
+ // shard stays on disk for the next merge to pick up.
364
+ const skipped = [];
193
365
  for (const kind of ['state', 'decision-log']) {
194
366
  const src = path.join(implArtifactsDir(projectRoot), KIND_DIR[kind]);
195
367
  if (!fs.existsSync(src)) continue;
196
368
  const destDir = path.join(base, KIND_DIR[kind]);
197
369
  fs.mkdirSync(destDir, { recursive: true });
198
- for (const story of storyKeys) {
370
+ const snapshots = (snapshotsByKind && snapshotsByKind[kind]) || {};
371
+ for (const story of Object.keys(snapshots)) {
199
372
  const file = path.join(src, `${story}.yaml`);
200
373
  if (!fs.existsSync(file)) continue;
374
+ if (!shardUnchanged(file, snapshots[story])) {
375
+ log.warn(
376
+ `merge-shards: shard ${file} changed during merge; not archiving (will be folded into next merge)`,
377
+ );
378
+ skipped.push({ kind, story, file, reason: 'changed during merge' });
379
+ continue;
380
+ }
201
381
  const dest = path.join(destDir, `${story}.yaml`);
202
382
  try {
203
383
  fs.renameSync(file, dest);
@@ -207,7 +387,7 @@ function archiveShardsToLayer(projectRoot, layerId, storyKeys) {
207
387
  }
208
388
  }
209
389
  }
210
- return base;
390
+ return { dir: base, skipped };
211
391
  }
212
392
 
213
393
  function writeAuthoritative(projectRoot, filename, body, { dryRun } = {}) {
@@ -263,43 +443,63 @@ function composeDecisionYaml(decisionMerge) {
263
443
  }
264
444
 
265
445
  function merge(projectRoot, { layerId, archive, dryRun } = {}) {
266
- const state = mergeStateShards(projectRoot);
267
- const decisions = mergeDecisionShards(projectRoot);
268
-
269
- // Archive corrupt shards before writing merged files so subsequent
270
- // merges don't re-surface the same errors.
271
- const archivedCorrupt = [];
272
- if (!dryRun) {
273
- for (const c of state.corrupt.concat(state.invalid)) {
274
- const arch = archiveCorrupt(projectRoot, 'state', c.story, c.file, c.error || c.reason);
275
- archivedCorrupt.push({ kind: 'state', story: c.story, ...arch });
276
- }
277
- for (const c of decisions.corrupt.concat(decisions.invalid)) {
278
- const arch = archiveCorrupt(projectRoot, 'decision-log', c.story, c.file, c.error || c.reason);
279
- archivedCorrupt.push({ kind: 'decision-log', story: c.story, ...arch });
446
+ // Acquire cross-process lock. Even dry-run takes the lock so a real
447
+ // merge in progress doesn't have its shard reads disturbed by a
448
+ // concurrent dry-run that might (e.g.) tail the same files.
449
+ const lockFile = acquireMergeLock(projectRoot);
450
+ try {
451
+ const state = mergeStateShards(projectRoot);
452
+ const decisions = mergeDecisionShards(projectRoot);
453
+
454
+ // Archive corrupt shards before writing merged files so subsequent
455
+ // merges don't re-surface the same errors.
456
+ const archivedCorrupt = [];
457
+ if (!dryRun) {
458
+ for (const c of state.corrupt.concat(state.invalid)) {
459
+ const arch = archiveCorrupt(projectRoot, 'state', c.story, c.file, c.error || c.reason);
460
+ archivedCorrupt.push({ kind: 'state', story: c.story, ...arch });
461
+ }
462
+ for (const c of decisions.corrupt.concat(decisions.invalid)) {
463
+ const arch = archiveCorrupt(projectRoot, 'decision-log', c.story, c.file, c.error || c.reason);
464
+ archivedCorrupt.push({ kind: 'decision-log', story: c.story, ...arch });
465
+ }
280
466
  }
281
- }
282
467
 
283
- const stateBody = composeStateYaml(state);
284
- const decisionBody = composeDecisionYaml(decisions);
468
+ const stateBody = composeStateYaml(state);
469
+ const decisionBody = composeDecisionYaml(decisions);
285
470
 
286
- const stateWrite = writeAuthoritative(projectRoot, 'autopilot-state.yaml', stateBody, { dryRun });
287
- const decisionWrite = writeAuthoritative(projectRoot, 'decision-log.yaml', decisionBody, { dryRun });
471
+ const stateWrite = writeAuthoritative(projectRoot, 'autopilot-state.yaml', stateBody, { dryRun });
472
+ const decisionWrite = writeAuthoritative(projectRoot, 'decision-log.yaml', decisionBody, { dryRun });
288
473
 
289
- let archiveDir = null;
290
- if (archive && !dryRun) {
291
- const storyKeys = Object.keys(state.byStory);
292
- archiveDir = archiveShardsToLayer(projectRoot, layerId, storyKeys);
293
- }
474
+ let archiveDir = null;
475
+ let archiveSkipped = [];
476
+ if (archive && !dryRun) {
477
+ const archResult = archiveShardsToLayer(projectRoot, layerId, {
478
+ state: state.snapshots,
479
+ 'decision-log': decisions.snapshots,
480
+ });
481
+ archiveDir = archResult.dir;
482
+ archiveSkipped = archResult.skipped;
483
+ }
294
484
 
295
- return {
296
- state: { stories: Object.keys(state.byStory).length, problems: state.corrupt.length + state.invalid.length },
297
- decisions: { entries: decisions.entries.length, problems: decisions.corrupt.length + decisions.invalid.length },
298
- files: { state: stateWrite.file, decisions: decisionWrite.file },
299
- archived_corrupt: archivedCorrupt,
300
- archive_dir: archiveDir,
301
- dry_run: !!dryRun,
302
- };
485
+ return {
486
+ state: {
487
+ stories: Object.keys(state.byStory).length,
488
+ problems: state.corrupt.length + state.invalid.length,
489
+ },
490
+ decisions: {
491
+ entries: decisions.entries.length,
492
+ problems: decisions.corrupt.length + decisions.invalid.length,
493
+ },
494
+ files: { state: stateWrite.file, decisions: decisionWrite.file },
495
+ archived_corrupt: archivedCorrupt,
496
+ archive_dir: archiveDir,
497
+ archive_skipped: archiveSkipped,
498
+ dry_run: !!dryRun,
499
+ };
500
+ } finally {
501
+ releaseMergeLock(lockFile);
502
+ }
303
503
  }
304
504
 
305
505
  function main() {