akm-cli 0.9.0-beta.5 → 0.9.0-beta.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,8 +19,9 @@ import { info, warn } from "../../core/warn.js";
19
19
  import { closeDatabase, getAllEntries, getEntryCount, getRetrievalCounts, getUtilityScoresByIds, getZeroResultSearches, openDatabase, openExistingDatabase, } from "../../indexer/db/db.js";
20
20
  import { ensureIndex } from "../../indexer/ensure-index.js";
21
21
  import { runGraphExtractionPass } from "../../indexer/graph/graph-extraction.js";
22
+ import { withIndexWriterLease } from "../../indexer/index-writer-lock.js";
22
23
  import { akmIndex } from "../../indexer/indexer.js";
23
- import { runMemoryInferencePass } from "../../indexer/passes/memory-inference.js";
24
+ import { collectPendingMemories, runMemoryInferencePass, } from "../../indexer/passes/memory-inference.js";
24
25
  import { runStalenessDetectionPass } from "../../indexer/passes/staleness-detect.js";
25
26
  import { getWritableStashDirs, resolveSourceEntries } from "../../indexer/search/search-source.js";
26
27
  import { countUsageEventsByType } from "../../indexer/usage/usage-events.js";
@@ -46,7 +47,105 @@ import { makeGateConfig, resolveExtractConfidence, runAutoAcceptGate } from "./i
46
47
  import { isProfileFilteredForAllPasses, resolveImproveProfile, resolveProcessEnabled, shouldSkipRef, } from "./improve-profiles.js";
47
48
  import { detectAndWriteContradictions } from "./memory/memory-contradiction-detect.js";
48
49
  import { analyzeMemoryCleanup, applyMemoryCleanup } from "./memory/memory-improve.js";
50
+ import { DEFAULT_DUE_DAYS, DEFAULT_MAX_PER_RUN, selectProactiveMaintenanceRefs } from "./proactive-maintenance.js";
49
51
  import { akmReflect } from "./reflect.js";
52
+ // #607 Lock Decomposition: fine-grained per-process locks replace the single
53
+ // `improve.lock`. Three independent locks allow concurrent improve runs when
54
+ // they touch different subsystems (e.g. quick-shredder consolidate can run
55
+ // alongside daily reflect+distill).
56
+ //
57
+ // consolidate.lock — protects consolidate + memoryInference (both write index.db)
58
+ // reflect-distill.lock — protects reflect + distill (both write state.db proposals)
59
+ // triage.lock — protects triage (writes proposal promotions)
60
+ //
61
+ // Stale timeouts are per-lock, tuned to the expected runtime of the protected
62
+ // processes: consolidate is disk-bound (1h), reflect+distill is GPU-bound (2h),
63
+ // triage is fast (30min).
64
+ const PROCESS_LOCK_DEFS = {
65
+ consolidate: { fileName: "consolidate.lock", staleAfterMs: 60 * 60 * 1000 },
66
+ reflectDistill: { fileName: "reflect-distill.lock", staleAfterMs: 2 * 60 * 60 * 1000 },
67
+ triage: { fileName: "triage.lock", staleAfterMs: 30 * 60 * 1000 },
68
+ };
69
+ const heldProcessLocks = new Set();
70
+ export function resetHeldProcessLocks() {
71
+ heldProcessLocks.clear();
72
+ }
73
+ function processLockPath(lockBaseDir, lockName) {
74
+ return path.join(lockBaseDir, PROCESS_LOCK_DEFS[lockName].fileName);
75
+ }
76
+ function tryAcquireProcessLock(lockPath, staleAfterMs, skipIfLocked, lockLabel) {
77
+ fs.mkdirSync(path.dirname(lockPath), { recursive: true });
78
+ const lockPayload = () => JSON.stringify({ pid: process.pid, startedAt: new Date().toISOString() });
79
+ if (tryAcquireLockSync(lockPath, lockPayload())) {
80
+ heldProcessLocks.add(lockPath);
81
+ return "acquired";
82
+ }
83
+ const probe = probeLock(lockPath, { staleAfterMs });
84
+ const rawContent = probe.state === "absent" ? undefined : probe.rawContent;
85
+ const lock = rawContent
86
+ ? (() => {
87
+ try {
88
+ return JSON.parse(rawContent);
89
+ }
90
+ catch {
91
+ return null;
92
+ }
93
+ })()
94
+ : null;
95
+ if (probe.state === "stale") {
96
+ try {
97
+ appendEvent({
98
+ eventType: "improve_lock_recovered",
99
+ metadata: {
100
+ lockName: lockLabel,
101
+ stalePid: lock?.pid ?? null,
102
+ lockedAt: lock?.startedAt ?? null,
103
+ recoveredAt: new Date().toISOString(),
104
+ lockAgeMs: probe.ageMs ?? null,
105
+ reason: probe.reason === "pid_dead" ? "pid_not_alive" : probe.reason,
106
+ },
107
+ });
108
+ }
109
+ catch {
110
+ /* event emission is best-effort; never block lock recovery */
111
+ }
112
+ releaseLock(lockPath);
113
+ if (tryAcquireLockSync(lockPath, lockPayload())) {
114
+ heldProcessLocks.add(lockPath);
115
+ return "acquired";
116
+ }
117
+ if (skipIfLocked) {
118
+ warn(`[improve] ${lockLabel} lock acquired by another run during stale recovery; skipping (--skip-if-locked)`);
119
+ return "skipped";
120
+ }
121
+ throw new ConfigError(`akm improve ${lockLabel} is already running. Delete ${lockPath} to force.`, "INVALID_CONFIG_FILE");
122
+ }
123
+ if (skipIfLocked) {
124
+ warn(`[improve] ${lockLabel} lock held by another run (PID ${lock?.pid}, started ${lock?.startedAt}); skipping (--skip-if-locked)`);
125
+ return "skipped";
126
+ }
127
+ throw new ConfigError(`akm improve ${lockLabel} is already running (PID ${lock?.pid}, started ${lock?.startedAt}). Delete ${lockPath} to force.`, "INVALID_CONFIG_FILE");
128
+ }
129
+ function releaseProcessLock(lockPath) {
130
+ try {
131
+ fs.unlinkSync(lockPath);
132
+ }
133
+ catch {
134
+ // ignore
135
+ }
136
+ heldProcessLocks.delete(lockPath);
137
+ }
138
+ function releaseAllProcessLocks() {
139
+ for (const p of heldProcessLocks) {
140
+ try {
141
+ fs.unlinkSync(p);
142
+ }
143
+ catch {
144
+ // ignore
145
+ }
146
+ }
147
+ heldProcessLocks.clear();
148
+ }
50
149
  function resolveImproveScope(scope) {
51
150
  const trimmed = scope?.trim();
52
151
  if (!trimmed)
@@ -102,6 +201,22 @@ export function renderSyncCommitMessage(template, result, nowMs) {
102
201
  };
103
202
  return template.replace(/\{(\w+)\}/g, (match, key) => (Object.hasOwn(tokens, key) ? tokens[key] : match));
104
203
  }
204
+ /**
205
+ * Dedupe a list of eligible refs by `ref`, preserving first-seen order. Used to
206
+ * merge the three eligibility sources (feedback-signal, P0-A high-retrieval,
207
+ * Layer-2 proactive-maintenance) without admitting a ref into the loop twice.
208
+ */
209
+ function dedupeRefs(refs) {
210
+ const seen = new Set();
211
+ const out = [];
212
+ for (const r of refs) {
213
+ if (seen.has(r.ref))
214
+ continue;
215
+ seen.add(r.ref);
216
+ out.push(r);
217
+ }
218
+ return out;
219
+ }
105
220
  async function collectEligibleRefs(scope, stashDir, improveProfile) {
106
221
  if (scope.mode === "ref" && scope.value) {
107
222
  const parsed = parseAssetRef(scope.value);
@@ -471,7 +586,9 @@ export async function akmImprove(options = {}) {
471
586
  options = {
472
587
  ...options,
473
588
  autoAccept: options.autoAccept ?? improveProfile.autoAccept,
474
- limit: options.limit ?? improveProfile.limit,
589
+ // Profile-level limit, then process-level reflect.limit as fallback.
590
+ // CLI --limit takes precedence over both.
591
+ limit: options.limit ?? improveProfile?.processes?.reflect?.limit ?? improveProfile.limit,
475
592
  };
476
593
  let primaryStashDir;
477
594
  try {
@@ -489,103 +606,16 @@ export async function akmImprove(options = {}) {
489
606
  // timeout root cause). Because beforeEach runs synchronously, env is still the
490
607
  // calling test's own at this point; we capture it before yielding the loop.
491
608
  const resolvedStateDbPath = getStateDbPathInDataDir();
492
- // Phase 4 lock hoist (§7): the `improve.lock` setup is hoisted ABOVE
493
- // ensureIndex/collectEligibleRefs so the triage pre-pass (and improve's own
494
- // queue writes) run fully serialized under the lock. The dry-run early-return
495
- // below still skips the lock and triage (the lock+triage block is gated on
496
- // `!options.dryRun`); contradiction-detection and memory-cleanup analysis,
497
- // which previously ran before the lock, now sit after it for free.
498
- const resolvedLockPath = primaryStashDir
499
- ? path.join(primaryStashDir, ".akm", "improve.lock")
500
- : path.join(options.stashDir ?? ".", ".akm", "improve.lock");
501
- const MAX_LOCK_AGE_MS = 4 * 60 * 60 * 1000; // 4 hours
502
- const acquireLock = () => {
503
- fs.mkdirSync(path.dirname(resolvedLockPath), { recursive: true });
504
- const lockPayload = () => JSON.stringify({ pid: process.pid, startedAt: new Date().toISOString() });
505
- if (tryAcquireLockSync(resolvedLockPath, lockPayload()))
506
- return "acquired";
507
- // Lock file already exists — probe to determine whether it's still held
508
- // or whether the prior run died without cleaning up.
509
- const probe = probeLock(resolvedLockPath, { staleAfterMs: MAX_LOCK_AGE_MS });
510
- const rawContent = probe.state === "absent" ? undefined : probe.rawContent;
511
- const lock = rawContent
512
- ? (() => {
513
- try {
514
- return JSON.parse(rawContent);
515
- }
516
- catch {
517
- return null;
518
- }
519
- })()
520
- : null;
521
- if (probe.state === "stale") {
522
- // O-7 / #394: Emit improve_lock_recovered event before recovery so the
523
- // audit trail records the abnormal prior-run exit (Temporal/Airflow pattern).
524
- try {
525
- appendEvent({
526
- eventType: "improve_lock_recovered",
527
- metadata: {
528
- stalePid: lock?.pid ?? null,
529
- lockedAt: lock?.startedAt ?? null,
530
- recoveredAt: new Date().toISOString(),
531
- lockAgeMs: probe.ageMs ?? null,
532
- reason: probe.reason === "pid_dead" ? "pid_not_alive" : probe.reason,
533
- },
534
- });
535
- }
536
- catch {
537
- /* event emission is best-effort; never block lock recovery */
538
- }
539
- releaseLock(resolvedLockPath);
540
- if (tryAcquireLockSync(resolvedLockPath, lockPayload()))
541
- return "acquired";
542
- // Lost the race to another run that grabbed the freed stale lock.
543
- if (options.skipIfLocked) {
544
- warn("[improve] another run acquired the lock during stale recovery; skipping (--skip-if-locked)");
545
- return "skipped";
546
- }
547
- throw new ConfigError(`akm improve is already running. Delete ${resolvedLockPath} to force.`, "INVALID_CONFIG_FILE");
548
- }
549
- // Lock is held by a live run within the staleness window.
550
- if (options.skipIfLocked) {
551
- warn(`[improve] another improve run holds the lock (PID ${lock?.pid}, started ${lock?.startedAt}); skipping (--skip-if-locked)`);
552
- return "skipped";
553
- }
554
- throw new ConfigError(`akm improve is already running (PID ${lock?.pid}, started ${lock?.startedAt}). Delete ${resolvedLockPath} to force.`, "INVALID_CONFIG_FILE");
555
- };
556
- // Phase 4 lock-leak guard (§7 ordering hazard): hoisting `improve.lock` above
557
- // the pre-index region (so the triage pre-pass runs under it) means the lock is
558
- // held while ensureIndex / collectEligibleRefs / contradiction-detection /
559
- // memory-cleanup analysis run — but the main protecting `try { … } finally {
560
- // unlinkSync(resolvedLockPath) }` does not begin until after them. A throw in
561
- // any of those steps would leak the lock. We close that window by wrapping the
562
- // whole region in a try whose catch releases the lock (when held) and
563
- // re-throws. The values this region computes are declared in the outer scope so
564
- // they remain visible to the main run below. The dry-run path never sets
565
- // `lockAcquired`, so its early return releases nothing.
566
- let lockAcquired = false;
567
- const releaseLockOnError = () => {
568
- if (!lockAcquired)
569
- return;
570
- try {
571
- fs.unlinkSync(resolvedLockPath);
572
- }
573
- catch {
574
- // best-effort release on the error path
575
- }
576
- lockAcquired = false;
577
- };
578
- // Signal-safe lock release. The SIGTERM/SIGINT/SIGHUP handler in improve-cli.ts
579
- // calls `process.exit()`, which does NOT run the `finally` below that owns lock
580
- // release — so a cron-timeout SIGTERM leaked `improve.lock` every run.
581
- // `process.exit()` DOES fire `'exit'` listeners, so we release the lock from
582
- // one. `releaseLockIfOwned` only unlinks a lock still owned by this PID, so it
583
- // is safe even if a later run re-acquired it. The listener is removed in the
584
- // `finally` so the normal path stays single-release and repeated in-process
585
- // `akmImprove` calls (tests) do not accumulate listeners.
586
- const releaseLockOnExit = () => {
587
- releaseLockIfOwned(resolvedLockPath, process.pid);
588
- };
609
+ // #607 Lock decomposition: three per-process locks replace the single
610
+ // `improve.lock`. Each process acquires only the lock(s) it needs, so
611
+ // quick-shredder consolidate can run alongside daily reflect+distill.
612
+ //
613
+ // consolidate.lock — protects consolidate + memoryInference + graphExtraction (index.db writers)
614
+ // reflect-distill.lock — protects reflect + distill (state.db proposal writers)
615
+ // triage.lock — protects triage pre-pass (state.db proposal promotions)
616
+ //
617
+ // Lock base directory — same `.akm/` under the primary stash dir.
618
+ const lockBaseDir = primaryStashDir ? path.join(primaryStashDir, ".akm") : path.join(options.stashDir ?? ".", ".akm");
589
619
  const preEnsureCleanupWarnings = [];
590
620
  let plannedRefs;
591
621
  let memorySummary;
@@ -594,65 +624,59 @@ export async function akmImprove(options = {}) {
594
624
  let guidance;
595
625
  let triageDrain;
596
626
  try {
597
- // Acquire the lock and run the triage pre-pass for non-dry-run executions.
598
- // The dry-run branch below produces plannedRefs/memorySummary WITHOUT the lock
599
- // or triage (decision: dry-run never mutates the queue).
627
+ // #607: Per-process lock acquisition. Each process acquires only the lock(s)
628
+ // it needs. The dry-run branch produces plannedRefs/memorySummary WITHOUT any
629
+ // locks (decision: dry-run never mutates the queue).
600
630
  if (!options.dryRun) {
601
- if (acquireLock() === "skipped") {
602
- // Another improve holds the lock and the caller asked to skip rather
603
- // than fail. Return a clean no-op result (exit 0) before any index/DB
604
- // work — never registered the exit listener, never set lockAcquired,
605
- // so we release nothing belonging to the run that owns the lock.
606
- return {
607
- schemaVersion: 1,
608
- ok: true,
609
- scope,
610
- dryRun: false,
611
- skipped: { reason: "lock-held" },
612
- memorySummary: { eligible: 0, derived: 0 },
613
- plannedRefs: [],
614
- };
615
- }
616
- lockAcquired = true;
617
631
  // Backstop release on process.exit() (signal handler / budget watchdog),
618
632
  // which skips the finally below. Removed in that finally on the normal path.
619
- process.on("exit", releaseLockOnExit);
620
- // Phase 4 triage pre-pass (§7, §13): drain the standing pending backlog
621
- // BEFORE ensureIndex so improve generates fresh proposals against a cleared
622
- // queue (no `duplicate_pending` collisions) and ensureIndex absorbs triage's
623
- // promotions for free. Gated on the triage process being enabled (opt-in,
624
- // defaults off) and on a whole-stash / type-scoped run — a single-ref
625
- // `akm improve skill:x` must never drain the whole queue. Best-effort: a
626
- // triage failure is a non-fatal warning, never an abort (mirrors the
627
- // contradiction-detection pass below).
633
+ const releaseAllOnExit = () => {
634
+ for (const p of heldProcessLocks) {
635
+ releaseLockIfOwned(p, process.pid);
636
+ }
637
+ };
638
+ process.on("exit", releaseAllOnExit);
639
+ // #607 triage pre-pass: acquire triage.lock, drain the standing pending
640
+ // backlog BEFORE ensureIndex so improve generates fresh proposals against
641
+ // a cleared queue (no `duplicate_pending` collisions) and ensureIndex
642
+ // absorbs triage's promotions for free. Release immediately after —
643
+ // triage.lock is not needed again until the next improve run.
628
644
  if (primaryStashDir && resolveProcessEnabled("triage", improveProfile)) {
629
645
  if (scope.mode === "ref") {
630
646
  warn("[improve] triage pre-pass skipped (single-ref scope never drains the whole queue)");
631
647
  }
632
648
  else {
633
- try {
634
- const triageConfig = improveProfile.processes?.triage;
635
- const policy = resolveDrainPolicy(triageConfig?.policy);
636
- const applyMode = triageConfig?.applyMode ?? "queue";
637
- const maxAccepts = triageConfig?.maxAcceptsPerRun ?? 25;
638
- const judgment = triageConfig?.judgment
639
- ? resolveTriageJudgmentRunner(triageConfig.judgment, _earlyConfig)
640
- : null;
641
- triageDrain = await drainProposalsFn({
642
- stashDir: primaryStashDir,
643
- policy,
644
- applyMode,
645
- maxAccepts,
646
- dryRun: false,
647
- // No fresh ids exist yet — triage runs before improve generates any.
648
- excludeIds: new Set(),
649
- ...(triageConfig?.maxDiffLines !== undefined ? { maxDiffLines: triageConfig.maxDiffLines } : {}),
650
- judgment,
651
- });
649
+ const triageLPath = processLockPath(lockBaseDir, "triage");
650
+ const triageResult = tryAcquireProcessLock(triageLPath, PROCESS_LOCK_DEFS.triage.staleAfterMs, options.skipIfLocked, "triage");
651
+ if (triageResult === "skipped") {
652
+ triageDrain = undefined;
652
653
  }
653
- catch (err) {
654
- // Non-fatal: triage is a best-effort pre-pass and must never abort improve.
655
- warn(`[improve] triage pre-pass failed (non-fatal): ${err instanceof Error ? err.message : String(err)}`);
654
+ else {
655
+ try {
656
+ const triageConfig = improveProfile.processes?.triage;
657
+ const policy = resolveDrainPolicy(triageConfig?.policy);
658
+ const applyMode = triageConfig?.applyMode ?? "queue";
659
+ const maxAccepts = triageConfig?.maxAcceptsPerRun ?? 25;
660
+ const judgment = triageConfig?.judgment
661
+ ? resolveTriageJudgmentRunner(triageConfig.judgment, _earlyConfig)
662
+ : null;
663
+ triageDrain = await drainProposalsFn({
664
+ stashDir: primaryStashDir,
665
+ policy,
666
+ applyMode,
667
+ maxAccepts,
668
+ dryRun: false,
669
+ excludeIds: new Set(),
670
+ ...(triageConfig?.maxDiffLines !== undefined ? { maxDiffLines: triageConfig.maxDiffLines } : {}),
671
+ judgment,
672
+ });
673
+ }
674
+ catch (err) {
675
+ warn(`[improve] triage pre-pass failed (non-fatal): ${err instanceof Error ? err.message : String(err)}`);
676
+ }
677
+ finally {
678
+ releaseProcessLock(triageLPath);
679
+ }
656
680
  }
657
681
  }
658
682
  }
@@ -684,7 +708,7 @@ export async function akmImprove(options = {}) {
684
708
  // best-effort; leave preEnsureEntryCount undefined
685
709
  }
686
710
  try {
687
- await ensureIndexFn(primaryStashDir);
711
+ await ensureIndexFn(primaryStashDir, { mode: "blocking" });
688
712
  }
689
713
  catch (err) {
690
714
  preEnsureCleanupWarnings.push(`ensureIndex failed: ${err instanceof Error ? err.message : String(err)}`);
@@ -752,17 +776,14 @@ export async function akmImprove(options = {}) {
752
776
  }
753
777
  }
754
778
  catch (err) {
755
- releaseLockOnError();
779
+ releaseAllProcessLocks();
756
780
  throw err;
757
781
  }
758
- // FIX 2 (lock-leak window): everything from here on runs UNDER the lock that
759
- // `acquireLock()` just took. The single `try { } finally { unlinkSync(lock) }`
760
- // below now spans the budget-timer setup, `openStateDatabase()`, and the
761
- // `profileFilteredRefs` audit-event loop too — regions that previously sat in
762
- // the gap between the lock-acquire catch (above) and the main try. A throw in
763
- // any of them used to leak the lock (blocking the next improve up to 4h);
764
- // now the finally releases it exactly once. The dry-run path already returned
765
- // above without acquiring the lock, so it never reaches this finally; the
782
+ // #607: per-process locks are acquired/released around each stage below.
783
+ // The triage pre-pass already ran under triage.lock (released). The
784
+ // preparation stage runs under consolidate.lock, the loop stage under
785
+ // reflect-distill.lock, and the post-loop stage under consolidate.lock again.
786
+ // Each stage acquires its lock just before starting and releases it right after the stage returns (the outer finally is the error-path backstop).
766
787
  // best-effort `unlinkSync` is a no-op when no lock file exists.
767
788
  const startMs = Date.now();
768
789
  const budgetMs = options.timeoutMs ?? 2 * 60 * 60 * 1000; // default 2 hours
@@ -826,6 +847,10 @@ export async function akmImprove(options = {}) {
826
847
  },
827
848
  }, eventsCtx);
828
849
  }
850
+ // #607: acquire consolidate.lock for the preparation stage (consolidate,
851
+ // ensureIndex, extract all write index.db). Released immediately after.
852
+ const consolidateLPath = processLockPath(lockBaseDir, "consolidate");
853
+ const consolidatePrepAcquired = tryAcquireProcessLock(consolidateLPath, PROCESS_LOCK_DEFS.consolidate.staleAfterMs, options.skipIfLocked, "consolidate") === "acquired";
829
854
  const preparation = await runImprovePreparationStage({
830
855
  scope,
831
856
  options,
@@ -840,6 +865,8 @@ export async function akmImprove(options = {}) {
840
865
  initialCleanupWarnings: preEnsureCleanupWarnings,
841
866
  improveProfile,
842
867
  });
868
+ if (consolidatePrepAcquired)
869
+ releaseProcessLock(consolidateLPath);
843
870
  // D6: pre-load all proposal_rejected events from the last 30 days once,
844
871
  // so the per-asset loop can use a Map lookup instead of N DB round trips.
845
872
  const REJECTED_PROPOSAL_WINDOW_MS = daysToMs(30);
@@ -851,6 +878,10 @@ export async function akmImprove(options = {}) {
851
878
  rejectedProposalsByRef.set(e.ref, e);
852
879
  }
853
880
  }
881
+ // #607: acquire reflect-distill.lock for the loop stage (reflect + distill
882
+ // both write proposals to state.db). Released immediately after.
883
+ const reflectDistillLPath = processLockPath(lockBaseDir, "reflectDistill");
884
+ const reflectDistillAcquired = tryAcquireProcessLock(reflectDistillLPath, PROCESS_LOCK_DEFS.reflectDistill.staleAfterMs, options.skipIfLocked, "reflect-distill") === "acquired";
854
885
  const { reflectsWithErrorContext, memoryRefsForInference, gateAutoAcceptedCount: loopGateCount, gateAutoAcceptFailedCount: loopGateFailedCount, } = await runImproveLoopStage({
855
886
  scope,
856
887
  options,
@@ -870,9 +901,15 @@ export async function akmImprove(options = {}) {
870
901
  eventsCtx,
871
902
  improveProfile,
872
903
  });
904
+ if (reflectDistillAcquired)
905
+ releaseProcessLock(reflectDistillLPath);
873
906
  // #551: consolidation now runs in the preparation stage (before extract);
874
907
  // its result and run-flag are read from `preparation`, not the post-loop.
875
908
  const consolidation = preparation.consolidation;
909
+ // #607: acquire consolidate.lock for the post-loop stage (memoryInference +
910
+ // graphExtraction both write index.db). Released immediately after.
911
+ const consolidatePostLPath = processLockPath(lockBaseDir, "consolidate");
912
+ const consolidatePostAcquired = tryAcquireProcessLock(consolidatePostLPath, PROCESS_LOCK_DEFS.consolidate.staleAfterMs, options.skipIfLocked, "consolidate") === "acquired";
876
913
  const { allWarnings, deadUrls, memoryInference, graphExtraction, stalenessDetection, maintenanceActions, memoryInferenceDurationMs, graphExtractionDurationMs, orphansPurged, proposalsExpired, gateAutoAcceptedCount: postLoopGateCount, gateAutoAcceptFailedCount: postLoopGateFailedCount, } = await runImprovePostLoopStage({
877
914
  scope,
878
915
  options,
@@ -883,11 +920,12 @@ export async function akmImprove(options = {}) {
883
920
  memoryRefsForInference,
884
921
  reindexFn,
885
922
  eventsCtx,
886
- // O-1 (#364): propagate wall-clock budget signal to post-loop maintenance.
887
923
  budgetSignal: budgetAbortController.signal,
888
924
  improveProfile,
889
925
  consolidationRan: preparation.consolidationRan,
890
926
  });
927
+ if (consolidatePostAcquired)
928
+ releaseProcessLock(consolidatePostLPath);
891
929
  const finalActions = maintenanceActions && maintenanceActions.length > 0
892
930
  ? [...preparation.actions, ...maintenanceActions]
893
931
  : preparation.actions;
@@ -972,6 +1010,7 @@ export async function akmImprove(options = {}) {
972
1010
  },
973
1011
  }
974
1012
  : {}),
1013
+ ...(preparation.proactiveMaintenance ? { proactiveMaintenance: preparation.proactiveMaintenance } : {}),
975
1014
  ...(options.runId !== undefined ? { runId: options.runId } : {}),
976
1015
  };
977
1016
  if (!result.dryRun)
@@ -1054,15 +1093,12 @@ export async function akmImprove(options = {}) {
1054
1093
  // O-1 (#364): Clear the budget abort timer so it does not keep the event
1055
1094
  // loop alive after the run completes.
1056
1095
  clearBudgetTimer();
1057
- try {
1058
- fs.unlinkSync(resolvedLockPath);
1059
- }
1060
- catch {
1061
- // ignore
1062
- }
1063
- // The normal path released the lock above; drop the process.exit backstop so
1064
- // it does not fire later (or accumulate across repeated in-process calls).
1065
- process.removeListener("exit", releaseLockOnExit);
1096
+ // #607: release any per-process locks still held (backstop for error paths;
1097
+ // the normal path already released each lock after its stage completed).
1098
+ releaseAllProcessLocks();
1099
+ // Drop the process.exit backstop so it does not fire later (or accumulate
1100
+ // across repeated in-process calls).
1101
+ process.removeAllListeners("exit");
1066
1102
  // I1: close the long-lived state.db connection opened at the top of the run.
1067
1103
  try {
1068
1104
  eventsDb?.close();
@@ -1175,6 +1211,11 @@ function emitImproveCompletedEvent(result, durations, eventsCtx) {
1175
1211
  memoryInferenceDurationMs: durations.memoryInferenceDurationMs,
1176
1212
  graphExtractionExtractedFiles: result.graphExtraction?.quality.extractedFiles ?? 0,
1177
1213
  graphExtractionDurationMs: durations.graphExtractionDurationMs,
1214
+ // Layer-2 proactive-maintenance coverage (0 when the process is disabled
1215
+ // or the run was ref-scoped) so a scheduled sweep's reach is trackable.
1216
+ proactiveSelected: result.proactiveMaintenance?.selected ?? 0,
1217
+ proactiveDueTotal: result.proactiveMaintenance?.dueTotal ?? 0,
1218
+ proactiveNeverReflected: result.proactiveMaintenance?.neverReflected ?? 0,
1178
1219
  // New metrics for tuning the improve loop.
1179
1220
  ...(durations.totalDurationMs !== undefined ? { durationMs: durations.totalDurationMs } : {}),
1180
1221
  ...(durations.warningCount !== undefined ? { warningCount: durations.warningCount } : {}),
@@ -1385,13 +1426,13 @@ async function runConsolidationPass(args) {
1385
1426
  // Tie consolidate proposals back to this improve invocation so
1386
1427
  // accept-rate-per-run aggregation works. Mirrors reflect/propose/extract.
1387
1428
  sourceRun: `consolidate-${Date.now()}`,
1388
- // Full-pool sweep: consolidation only runs on the nightly default-profile
1389
- // pass (quick/frequent disable it), so a complete re-cluster is correct and
1390
- // affordable here. Do NOT pass incrementalSince — the time-window narrowing
1391
- // it triggers permanently excludes stale-but-unmerged duplicate clusters,
1392
- // starving merge recall and letting the pool grow unbounded. (The narrowing
1393
- // was a band-aid for an every-30-min consolidation cadence that the profile
1394
- // split has since eliminated.) lastConsolidateTs still gates whether we run.
1429
+ // Pass profile-configured options. incrementalSince narrows the pool to
1430
+ // recently-changed memories + graph neighbours — use this for frequent
1431
+ // passes (quick-shredder). Leave absent in the nightly default profile for
1432
+ // a full-pool sweep that catches stale-but-unmerged duplicates.
1433
+ incrementalSince: improveProfile?.processes?.consolidate?.incrementalSince,
1434
+ limit: improveProfile?.processes?.consolidate?.limit,
1435
+ neighborsPerChanged: improveProfile?.processes?.consolidate?.neighborsPerChanged,
1395
1436
  maxChunkSize: improveProfile?.processes?.consolidate?.maxChunkSize,
1396
1437
  // Honor profile.autoAccept (already merged into options.autoAccept at the
1397
1438
  // top of akmImprove). The CLI parser always supplies 90 when --auto-accept
@@ -1420,7 +1461,14 @@ async function runConsolidationPass(args) {
1420
1461
  appendEvent({
1421
1462
  eventType: "consolidate_completed",
1422
1463
  ref: "memory:_consolidation",
1423
- metadata: { processed: consolidation.processed, merged: consolidation.merged },
1464
+ metadata: {
1465
+ processed: consolidation.processed,
1466
+ merged: consolidation.merged,
1467
+ deleted: consolidation.deleted,
1468
+ contradicted: consolidation.contradicted,
1469
+ failedChunks: consolidation.failedChunks ?? 0,
1470
+ durationMs: consolidation.durationMs,
1471
+ },
1424
1472
  }, eventsCtx);
1425
1473
  }
1426
1474
  }
@@ -1791,10 +1839,19 @@ async function runImprovePreparationStage(args) {
1791
1839
  // refs that fail the distill signal-delta gate).
1792
1840
  // distillOnlyRefs — reflect blocked but distill signal-delta passes
1793
1841
  // AND ref is a distill candidate.
1794
- // fullySkippedCount — neither gate passes → synthetic skip action
1795
- // + improve_skipped event, excluded from sort.
1842
+ // noFeedbackPool — neither signal-delta gate passes *and* the ref has
1843
+ // no recent feedback signal at all. These are NOT
1844
+ // skipped here: they are handed to the high-retrieval
1845
+ // fallback (P0-A) below so frequently-retrieved but
1846
+ // never-rated assets can still be improved. Only refs
1847
+ // that P0-A declines are ultimately fully skipped.
1848
+ // fullySkippedCount — has stale feedback but no signal delta → genuine
1849
+ // skip (counted, aggregated event emitted post-loop),
1850
+ // excluded from sort.
1796
1851
  const eligibleRefs = [];
1797
1852
  const distillOnlyRefs = [];
1853
+ // Zero-(recent-)feedback refs deferred to the P0-A high-retrieval fallback.
1854
+ const noFeedbackPool = [];
1798
1855
  let fullySkippedCount = 0;
1799
1856
  // O-2 (#365): explicit --scope <ref> bypasses every gate (user intent wins).
1800
1857
  const scopeRefBypass = scope.mode === "ref";
@@ -1832,22 +1889,59 @@ async function runImprovePreparationStage(args) {
1832
1889
  // Reflect blocked but distill passes → distill-only bucket.
1833
1890
  distillOnlyRefs.push(r);
1834
1891
  }
1892
+ else if (!latestFeedbackTs.has(r.ref)) {
1893
+ // Neither signal-delta gate passes AND there is no recent feedback signal
1894
+ // at all. Rather than skip outright, defer to the high-retrieval fallback
1895
+ // (P0-A) below: a never-rated-but-frequently-retrieved asset is exactly
1896
+ // what that path is meant to rescue. Refs P0-A declines are skipped there.
1897
+ noFeedbackPool.push(r);
1898
+ }
1835
1899
  else {
1836
- // Neither gate passes → fully skipped.
1900
+ // Has feedback on record but no signal delta since the last proposal —
1901
+ // genuinely fully skipped. Counted here; a single aggregated
1902
+ // improve_skipped event is emitted after the loop (mirrors
1903
+ // profile_filtered_all_passes) instead of one event per ref.
1837
1904
  fullySkippedCount++;
1838
1905
  actions.push({
1839
1906
  ref: r.ref,
1840
1907
  mode: "distill-skipped",
1841
1908
  result: { ok: true, reason: "no new signal since last proposal" },
1842
1909
  });
1843
- appendEvent({ eventType: "improve_skipped", ref: r.ref, metadata: { reason: "no_new_signal" } }, eventsCtx);
1844
1910
  }
1845
1911
  }
1912
+ // Emit ONE aggregated skip event for the fully-skipped bucket rather than one
1913
+ // improve_skipped event per ref (#592 pattern, mirrors
1914
+ // profile_filtered_all_passes above). The per-ref loop previously produced
1915
+ // ~11K state.db writes per run on a large stash, the dominant contributor to
1916
+ // 900 s timeouts. The in-memory `actions` log keeps the per-ref detail for the
1917
+ // run summary; no downstream consumer needs a per-ref DB audit trail (health's
1918
+ // skip histogram reads the `no_new_signal` counter from the count field).
1919
+ if (fullySkippedCount > 0) {
1920
+ appendEvent({
1921
+ eventType: "improve_skipped",
1922
+ ref: undefined,
1923
+ metadata: {
1924
+ reason: "no_new_signal",
1925
+ count: fullySkippedCount,
1926
+ },
1927
+ }, eventsCtx);
1928
+ }
1846
1929
  // ── Phase 4: signal/feedback/utility/sort on the reduced set ──────────────
1847
- // Everything from here works only on (eligibleRefs ∪ distillOnlyRefs). The
1848
- // fully-skipped bucket has already been routed and emitted; we deliberately
1849
- // avoid spending DB/CPU on refs that cannot enter the loop.
1930
+ // Everything from here works on (eligibleRefs ∪ distillOnlyRefs) plus the
1931
+ // deferred noFeedbackPool that may be rescued by the high-retrieval fallback
1932
+ // (P0-A). The fully-skipped bucket has already been routed and its aggregated
1933
+ // event emitted; we deliberately avoid spending DB/CPU on refs that the
1934
+ // signal-delta gate rejected with feedback already on record.
1850
1935
  const processableRefs = [...eligibleRefs, ...distillOnlyRefs];
1936
+ // Refs eligible for the high-retrieval fallback (P0-A): the signal-delta
1937
+ // partition above could not place these in a reflect/distill bucket, but they
1938
+ // may still qualify if they have been retrieved often enough. Two disjoint
1939
+ // sources feed this set:
1940
+ // 1. noFeedbackPool — refs with no recent feedback that the partition loop
1941
+ // deliberately deferred here (otherwise they would never reach P0-A).
1942
+ // 2. processableRefs entries that turn out to carry no recent feedback
1943
+ // *signal* once feedbackSummary is computed below.
1944
+ // (1) is added here; (2) is folded in after feedbackSummary is built.
1851
1945
  // Gap 6: only surface feedback signals from the last 30 days so that
1852
1946
  // ancient one-off feedback events don't permanently lock an asset into
1853
1947
  // every improve run. Assets with only stale signals fall through to the
@@ -1857,8 +1951,12 @@ async function runImprovePreparationStage(args) {
1857
1951
  // Pre-compute feedback summary per ref in a single pass so we don't issue
1858
1952
  // two readEvents({type:"feedback", ref}) per asset (one for signal filtering,
1859
1953
  // one for ratio computation).
1954
+ // Cover processableRefs *and* the deferred noFeedbackPool so utility/feedback
1955
+ // ratios are available for any noFeedbackPool ref that P0-A rescues below.
1860
1956
  const feedbackSummary = new Map();
1861
- for (const candidate of processableRefs) {
1957
+ for (const candidate of [...processableRefs, ...noFeedbackPool]) {
1958
+ if (feedbackSummary.has(candidate.ref))
1959
+ continue;
1862
1960
  const { events } = readEvents({ type: "feedback", ref: candidate.ref });
1863
1961
  let hasSignal = false;
1864
1962
  let positive = 0;
@@ -1881,8 +1979,21 @@ async function runImprovePreparationStage(args) {
1881
1979
  // P0-A: also surface zero-feedback assets that have been retrieved many times.
1882
1980
  const RETRIEVAL_COUNT_THRESHOLD = options.minRetrievalCount ?? 5;
1883
1981
  const signalBearingSet = new Set(signalFiltered.map((r) => r.ref));
1884
- const noFeedbackCandidates = processableRefs.filter((r) => !signalBearingSet.has(r.ref));
1982
+ // Zero-feedback candidates for P0-A: processableRefs without a recent signal,
1983
+ // plus the deferred noFeedbackPool. Dedupe by ref (the two sources are
1984
+ // disjoint by construction, but guard against overlap defensively).
1985
+ const noFeedbackSeen = new Set();
1986
+ const noFeedbackCandidates = [];
1987
+ for (const r of [...processableRefs.filter((r) => !signalBearingSet.has(r.ref)), ...noFeedbackPool]) {
1988
+ if (noFeedbackSeen.has(r.ref))
1989
+ continue;
1990
+ noFeedbackSeen.add(r.ref);
1991
+ noFeedbackCandidates.push(r);
1992
+ }
1885
1993
  let highRetrievalRefs = [];
1994
+ // Retrieval counts for the zero-feedback pool, hoisted so the Layer-2
1995
+ // proactive-maintenance selector below can reuse them without a second DB pass.
1996
+ let retrievalCounts = new Map();
1886
1997
  let dbForRetrieval;
1887
1998
  try {
1888
1999
  dbForRetrieval = openExistingDatabase();
@@ -1890,15 +2001,21 @@ async function runImprovePreparationStage(args) {
1890
2001
  if (showEventCount === 0) {
1891
2002
  warn("Warning: show events not yet in usage_events — zero-feedback fallback will match only search-retrieved assets.");
1892
2003
  }
1893
- const retrievalCounts = getRetrievalCounts(dbForRetrieval, noFeedbackCandidates.map((r) => r.ref));
2004
+ retrievalCounts = getRetrievalCounts(dbForRetrieval, noFeedbackCandidates.map((r) => r.ref));
1894
2005
  // High-retrieval signal-delta (simplified rule, 0.8.0): a no-feedback
1895
- // ref qualifies exactly once — when retrievalCount ≥ threshold AND no
1896
- // prior reflect proposal exists for it. Once a reflect proposal is on
1897
- // record, subsequent re-eligibility requires explicit feedback (which
1898
- // flows through the normal signal-delta gate above). Tracking growth in
1899
- // retrieval count would require persisting the count in proposal
1900
- // metadata; deferred to a follow-up.
1901
- highRetrievalRefs = noFeedbackCandidates.filter((r) => (retrievalCounts.get(r.ref) ?? 0) >= RETRIEVAL_COUNT_THRESHOLD && !lastReflectProposalTs.has(r.ref));
2006
+ // ref qualifies exactly once — when it has actually been retrieved
2007
+ // (retrievalCount ≥ 1) AND retrievalCount ≥ threshold AND no prior reflect
2008
+ // proposal exists for it. Once a reflect proposal is on record, subsequent
2009
+ // re-eligibility requires explicit feedback (which flows through the normal
2010
+ // signal-delta gate above). The explicit `> 0` guard keeps a threshold of 0
2011
+ // from rescuing genuinely never-retrieved assets — the fallback is for
2012
+ // *retrieved* assets, not silent ones. Tracking growth in retrieval count
2013
+ // would require persisting the count in proposal metadata; deferred to a
2014
+ // follow-up.
2015
+ highRetrievalRefs = noFeedbackCandidates.filter((r) => {
2016
+ const count = retrievalCounts.get(r.ref) ?? 0;
2017
+ return count > 0 && count >= RETRIEVAL_COUNT_THRESHOLD && !lastReflectProposalTs.has(r.ref);
2018
+ });
1902
2019
  }
1903
2020
  catch (err) {
1904
2021
  rethrowIfTestIsolationError(err);
@@ -1908,6 +2025,91 @@ async function runImprovePreparationStage(args) {
1908
2025
  if (dbForRetrieval)
1909
2026
  closeDatabase(dbForRetrieval);
1910
2027
  }
2028
+ // ── Layer 2: PROACTIVE MAINTENANCE SELECTOR (third eligibility source) ─────
2029
+ // The signal-delta gate and P0-A only surface assets with fresh feedback or a
2030
+ // raw-retrieval spike. Neither revisits a stable, high-value asset on a
2031
+ // schedule, so on a quiet stash useful assets drift stale and are never
2032
+ // refreshed. When the `proactiveMaintenance` process is enabled (DEFAULT OFF)
2033
+ // and the run is whole-stash / type scope, this selector ranks the eligible
2034
+ // population by a composite maintenance priority, gates on staleness ("due"),
2035
+ // bounds to top-N, and folds the winners into the SAME candidate set the other
2036
+ // two sources feed — so they flow through the existing #580 empty-diff /
2037
+ // cosmetic suppression and additive-distill gates. It adds no new mutation
2038
+ // logic of its own. The due gate doubles as the rotation cooldown: a freshly
2039
+ // reflected asset is excluded until it ages back past `dueDays`, so successive
2040
+ // runs rotate through the due pool rather than re-selecting the same heads.
2041
+ let proactiveRefs = [];
2042
+ let proactiveMaintenanceSummary;
2043
+ const proactiveEnabled = scope.mode !== "ref" && resolveProcessEnabled("proactiveMaintenance", improveProfile);
2044
+ if (proactiveEnabled) {
2045
+ const pmCfg = improveProfile.processes?.proactiveMaintenance;
2046
+ const dueDays = pmCfg?.dueDays ?? DEFAULT_DUE_DAYS;
2047
+ const maxPerRun = pmCfg?.maxPerRun ?? pmCfg?.limit ?? DEFAULT_MAX_PER_RUN;
2048
+ const importanceWeights = pmCfg?.importanceWeights;
2049
+ // Candidate population: the zero-feedback / non-signal pool — exactly the
2050
+ // assets the other two sources would NOT pick this run. Exclude any P0-A
2051
+ // rescued this run so we never double-select the same ref.
2052
+ const alreadySelected = new Set(highRetrievalRefs.map((r) => r.ref));
2053
+ const pmCandidates = noFeedbackCandidates.filter((r) => !alreadySelected.has(r.ref));
2054
+ const selection = selectProactiveMaintenanceRefs({
2055
+ candidates: pmCandidates,
2056
+ lastReflectTs: lastReflectProposalTs,
2057
+ lastDistillTs: lastDistillProposalTs,
2058
+ retrievalCounts,
2059
+ sizeBytesOf: (r) => {
2060
+ const fp = r.filePath;
2061
+ if (!fp)
2062
+ return undefined;
2063
+ try {
2064
+ return fs.statSync(fp).size;
2065
+ }
2066
+ catch {
2067
+ return undefined;
2068
+ }
2069
+ },
2070
+ dueDays,
2071
+ maxPerRun,
2072
+ importanceWeights,
2073
+ });
2074
+ proactiveRefs = selection.selected;
2075
+ proactiveMaintenanceSummary = {
2076
+ selected: selection.selected.length,
2077
+ dueTotal: selection.dueTotal,
2078
+ neverReflected: selection.neverReflected,
2079
+ };
2080
+ // Aggregated observability event (never per-ref — avoids the event flood the
2081
+ // Layer-1 work eliminated). Mirrors the `no_new_signal` aggregation pattern.
2082
+ appendEvent({
2083
+ eventType: "proactive_selected",
2084
+ ref: undefined,
2085
+ metadata: {
2086
+ count: selection.selected.length,
2087
+ dueTotal: selection.dueTotal,
2088
+ neverReflected: selection.neverReflected,
2089
+ },
2090
+ }, eventsCtx);
2091
+ if (selection.selected.length > 0) {
2092
+ info(`[improve] proactive maintenance selected ${selection.selected.length}/${selection.dueTotal} due refs ` +
2093
+ `(${selection.neverReflected} never reflected, dueDays=${dueDays}, maxPerRun=${maxPerRun})`);
2094
+ }
2095
+ }
2096
+ // Record an in-memory skip action for every zero-feedback ref that the
2097
+ // partition loop deferred to P0-A but P0-A then declined (retrievalCount below
2098
+ // threshold, or a prior reflect proposal already on record). These never make
2099
+ // it into mergedRefs, so without this they would silently vanish from the run
2100
+ // summary. No DB event is written here — these refs carry no signal at all, so
2101
+ // there is nothing for the skip histogram to aggregate; the action log alone
2102
+ // preserves the per-ref audit trail (mirrors the fully-skipped action above).
2103
+ const rescuedSet = new Set([...highRetrievalRefs, ...proactiveRefs].map((r) => r.ref));
2104
+ for (const r of noFeedbackPool) {
2105
+ if (rescuedSet.has(r.ref))
2106
+ continue;
2107
+ actions.push({
2108
+ ref: r.ref,
2109
+ mode: "distill-skipped",
2110
+ result: { ok: true, reason: "no new signal since last proposal" },
2111
+ });
2112
+ }
1911
2113
  // If the user explicitly scoped to a single ref, always act on it —
1912
2114
  // skip the signal/retrieval filter entirely. The filter exists to avoid
1913
2115
  // noisy "improve everything" runs; it should not gate an intentional
@@ -1917,8 +2119,48 @@ async function runImprovePreparationStage(args) {
1917
2119
  // or sufficient retrievals). A stash with no signals has 0 eligible refs —
1918
2120
  // usage is the gate. Run `akm feedback <ref> --positive` or retrieve assets
1919
2121
  // to bring them into the eligible pool.
1920
- const signalAndRetrievalRefs = [...signalFiltered, ...highRetrievalRefs];
2122
+ // Layer-2 proactive refs join the eligible set alongside feedback-signal and
2123
+ // high-retrieval (P0-A) refs. The three sources are disjoint by construction
2124
+ // (proactive draws from noFeedbackCandidates with the P0-A picks removed), but
2125
+ // dedupe defensively so a ref can never enter the loop twice. `requireFeedbackSignal`
2126
+ // still suppresses both fallback sources for callers that want feedback-only runs.
2127
+ const signalAndRetrievalRefs = dedupeRefs([...signalFiltered, ...highRetrievalRefs, ...proactiveRefs]);
1921
2128
  const mergedRefs = scope.mode === "ref" ? processableRefs : options.requireFeedbackSignal ? signalFiltered : signalAndRetrievalRefs;
2129
+ // ── Attribution tagging: stamp each ref with the eligibility lane that
2130
+ // selected it ──────────────────────────────────────────────────────────────
2131
+ // Every reflect/distill proposal must record WHICH lane chose its source asset
2132
+ // so downstream accept/reject/revert/retrieval outcomes can be sliced by lane
2133
+ // (does the PROACTIVE lane produce value vs the reactive lanes?). We build the
2134
+ // lane map here — the one place all four lanes are known — and stamp it onto
2135
+ // each ImproveEligibleRef object. Because the ref objects are shared by
2136
+ // reference across buckets, the stamp travels with the ref through the sort,
2137
+ // disk-check, and loop stages down to the reflect/distill event emit sites and
2138
+ // createProposal calls. See EligibilitySource for the lane vocabulary.
2139
+ //
2140
+ // Precedence (prefer the most specific reactive signal):
2141
+ // scope > signal-delta > high-retrieval > proactive
2142
+ // A ref with real feedback is attributed to feedback even if it was also due
2143
+ // for proactive maintenance. We apply lanes weakest-first so the strongest
2144
+ // overwrites; the explicit --scope <ref> bypass wins outright (user intent).
2145
+ const eligibilitySourceByRef = new Map();
2146
+ for (const r of proactiveRefs)
2147
+ eligibilitySourceByRef.set(r.ref, "proactive");
2148
+ for (const r of highRetrievalRefs)
2149
+ eligibilitySourceByRef.set(r.ref, "high-retrieval");
2150
+ for (const r of signalFiltered)
2151
+ eligibilitySourceByRef.set(r.ref, "signal-delta");
2152
+ if (scope.mode === "ref") {
2153
+ // O-2 (#365): explicit --scope <ref> bypass — every ref in processableRefs
2154
+ // arrived via the scopeRefBypass branch, so attribute the whole set to scope.
2155
+ for (const r of processableRefs)
2156
+ eligibilitySourceByRef.set(r.ref, "scope");
2157
+ }
2158
+ for (const r of mergedRefs) {
2159
+ // "unknown" is a genuine fallback, never a silent alias for signal-delta:
2160
+ // only refs we truly cannot attribute land here (none in practice, since
2161
+ // mergedRefs is always a subset of the four lanes above).
2162
+ r.eligibilitySource = eligibilitySourceByRef.get(r.ref) ?? "unknown";
2163
+ }
1922
2164
  const utilityMap = buildUtilityMap(mergedRefs);
1923
2165
  // Load feedback ratio per ref from the pre-computed summary (no extra DB pass).
1924
2166
  const feedbackRatios = new Map();
@@ -2059,6 +2301,7 @@ async function runImprovePreparationStage(args) {
2059
2301
  gateAutoAcceptFailedCount,
2060
2302
  consolidation: consolidationPass.consolidation,
2061
2303
  consolidationRan: consolidationPass.consolidationRan,
2304
+ ...(proactiveMaintenanceSummary ? { proactiveMaintenance: proactiveMaintenanceSummary } : {}),
2062
2305
  };
2063
2306
  }
2064
2307
  async function runImproveLoopStage(args) {
@@ -2067,6 +2310,14 @@ async function runImproveLoopStage(args) {
2067
2310
  // receives only its fair share of the wall-clock budget.
2068
2311
  const remainingBudgetMs = () => Math.max(0, budgetMs - (Date.now() - startMs));
2069
2312
  const RECENT_ERRORS_CAP = 3;
2313
+ // requirePlannedRefs guard: when the distill profile sets this flag, skip
2314
+ // distill for distill-only refs if the reflect phase produced no planned refs.
2315
+ // Prevents the distill loop from generating hundreds of distill-skipped events
2316
+ // on quiet passes (all refs on reflect cooldown, no new signal to distill).
2317
+ const requirePlannedRefs = improveProfile?.processes?.distill?.requirePlannedRefs === true;
2318
+ const _distillOnlyRefNames = new Set(distillOnlyRefs.map((r) => r.ref));
2319
+ const hasReflectEligibleRefs = loopRefs.some((r) => !_distillOnlyRefNames.has(r.ref));
2320
+ const skipDistillDueToRequirePlannedRefs = requirePlannedRefs && !hasReflectEligibleRefs;
2070
2321
  // R-2 / #389: Self-Consistency multi-sample voting helpers.
2071
2322
  // Wang et al. arXiv:2203.11171 — N=3 samples beat single-shot on reasoning tasks.
2072
2323
  const SC_THRESHOLD = options.selfConsistencyThreshold ?? 0.7;
@@ -2227,6 +2478,9 @@ async function runImproveLoopStage(args) {
2227
2478
  eventSource: "improve",
2228
2479
  ...(reflectBudgetMs > 0 ? { timeoutMs: reflectBudgetMs } : {}),
2229
2480
  ...(reflectProfileRunner ? { runner: reflectProfileRunner } : {}),
2481
+ // Attribution: carry the eligibility lane so reflect stamps it on
2482
+ // the reflect_invoked event and the persisted proposal.
2483
+ ...(planned.eligibilitySource ? { eligibilitySource: planned.eligibilitySource } : {}),
2230
2484
  };
2231
2485
  // R-2 / #389: Self-consistency multi-sample voting for high-utility refs.
2232
2486
  // Self-Consistency arXiv:2203.11171 — N=3 samples beat single-shot quality.
@@ -2251,6 +2505,9 @@ async function runImproveLoopStage(args) {
2251
2505
  source: "reflect",
2252
2506
  sourceRun: `reflect-sc-${Date.now()}`,
2253
2507
  payload: winner.proposal.payload,
2508
+ // Attribution: the self-consistency path persists the winner here
2509
+ // (draftMode skips reflect's own createProposal), so stamp the lane.
2510
+ ...(planned.eligibilitySource ? { eligibilitySource: planned.eligibilitySource } : {}),
2254
2511
  });
2255
2512
  reflectResult = isProposalSkipped(persistResult)
2256
2513
  ? {
@@ -2364,6 +2621,18 @@ async function runImproveLoopStage(args) {
2364
2621
  info(`[improve] ${completedCount}/${loopRefs.length} ${planned.ref}`);
2365
2622
  continue;
2366
2623
  }
2624
+ // requirePlannedRefs guard: skip distill for distill-only refs when no
2625
+ // reflect-eligible refs were planned this run, preventing mass skip events.
2626
+ if (skipDistillDueToRequirePlannedRefs && isDistillOnly) {
2627
+ actions.push({
2628
+ ref: planned.ref,
2629
+ mode: "distill-skipped",
2630
+ result: { ok: true, reason: "require_planned_refs" },
2631
+ });
2632
+ completedCount++;
2633
+ info(`[improve] ${completedCount}/${loopRefs.length} ${planned.ref}`);
2634
+ continue;
2635
+ }
2367
2636
  // See `isDistillCandidateRef` — excludes `lesson:*` (and anything else in
2368
2637
  // DISTILL_REFUSED_INPUT_TYPES) so distill never gets queued for an input
2369
2638
  // it will refuse.
@@ -2437,6 +2706,9 @@ async function runImproveLoopStage(args) {
2437
2706
  ref: planned.ref,
2438
2707
  ...(parsedPlannedRef.type === "memory" ? { proposalKind: "auto" } : {}),
2439
2708
  ...(options.stashDir ? { stashDir: options.stashDir } : {}),
2709
+ // Attribution: carry the eligibility lane so distill stamps it on the
2710
+ // distill_invoked event and the persisted proposal.
2711
+ ...(planned.eligibilitySource ? { eligibilitySource: planned.eligibilitySource } : {}),
2440
2712
  }));
2441
2713
  actions.push({ ref: planned.ref, mode: "distill", result: distillResult });
2442
2714
  if (distillResult.outcome === "queued" && distillResult.proposal) {
@@ -2618,309 +2890,325 @@ export async function runImproveMaintenancePasses(args) {
2618
2890
  db = openIndexDb();
2619
2891
  }
2620
2892
  };
2621
- try {
2622
- db = openIndexDb();
2623
- // Memory inference candidate-discovery (post-Item 9 fix from
2624
- // memory:akm-improve-critical-review-2026-05-20). Previously this pass
2625
- // was gated on memoryRefsForInference.size > 0 AND passed those refs as a
2626
- // candidateRefs filter. But memoryRefsForInference is populated from refs
2627
- // distilled THIS RUN — by the time that happens, those parents are
2628
- // already split (`inferenceProcessed: true`) and `isPendingMemory` excludes
2629
- // them. The genuinely-pending parents in the stash never entered the
2630
- // filter. Result: 0/0/0 for 25 consecutive runs.
2631
- //
2632
- // Fix: always run the pass when the feature is enabled; let the pass's
2633
- // own `collectPendingMemories` + `isPendingMemory` predicate find
2634
- // candidates from the filesystem-of-truth. The this-run set is still
2635
- // logged as a hint but no longer used as a filter.
2636
- const memoryInferenceDisabledByProfile = improveProfile?.processes?.memoryInference?.enabled === false;
2637
- if (memoryInferenceDisabledByProfile) {
2638
- info("[improve] memory inference skipped (disabled by improve profile)");
2639
- }
2640
- else {
2641
- const hintRefs = memoryRefsForInference.size;
2642
- info(hintRefs > 0
2643
- ? `[improve] memory inference starting (${hintRefs} hint refs touched this run; pass discovers all pending)`
2644
- : "[improve] memory inference starting (discovering pending parents)");
2645
- const inferenceStart = Date.now();
2646
- try {
2647
- // O-1 (#364): pass budget signal so a hung inference call is cancelled.
2648
- memoryInference = await withLlmStage("memory-inference", () => memoryInferenceFn({
2649
- config,
2650
- sources,
2651
- signal: budgetSignal,
2652
- db,
2653
- reEnrich: false,
2654
- onProgress: (event) => {
2655
- const current = event.currentRef ? ` ${event.currentRef}` : "";
2656
- info(`[improve] memory inference ${event.processed}/${event.total}${current} (written ${event.writtenFacts}, skipped ${event.skippedNoFacts})`);
2657
- },
2658
- }));
2659
- memoryInferenceDurationMs = Date.now() - inferenceStart;
2660
- actions.push({ ref: "memory:_inference", mode: "memory-inference", result: memoryInference });
2661
- info(`[improve] memory inference complete (${memoryInference.writtenFacts} facts written from ${memoryInference.splitParents} parents)`);
2662
- }
2663
- catch (err) {
2664
- memoryInferenceDurationMs = Date.now() - inferenceStart;
2665
- allWarnings.push(`memory inference failed: ${err instanceof Error ? err.message : String(err)}`);
2666
- }
2667
- }
2668
- if (memoryInference && (memoryInference.splitParents > 0 || memoryInference.writtenFacts > 0)) {
2669
- info("[improve] reindexing after memory inference writes");
2670
- try {
2671
- await reindexWithIndexDbReleased(primaryStashDir);
2672
- reindexedAfterInference = true;
2673
- info("[improve] reindex after memory inference complete");
2893
+ await withIndexWriterLease({ purpose: "improve-maintenance", signal: budgetSignal }, async () => {
2894
+ try {
2895
+ db = openIndexDb();
2896
+ // Memory inference candidate-discovery (post-Item 9 fix from
2897
+ // memory:akm-improve-critical-review-2026-05-20). Previously this pass
2898
+ // was gated on memoryRefsForInference.size > 0 AND passed those refs as a
2899
+ // candidateRefs filter. But memoryRefsForInference is populated from refs
2900
+ // distilled THIS RUN — by the time that happens, those parents are
2901
+ // already split (`inferenceProcessed: true`) and `isPendingMemory` excludes
2902
+ // them. The genuinely-pending parents in the stash never entered the
2903
+ // filter. Result: 0/0/0 for 25 consecutive runs.
2904
+ //
2905
+ // Fix: always run the pass when the feature is enabled; let the pass's
2906
+ // own `collectPendingMemories` + `isPendingMemory` predicate find
2907
+ // candidates from the filesystem-of-truth. The this-run set is still
2908
+ // logged as a hint but no longer used as a filter.
2909
+ const memoryInferenceDisabledByProfile = improveProfile?.processes?.memoryInference?.enabled === false;
2910
+ const minPendingCount = improveProfile?.processes?.memoryInference?.minPendingCount;
2911
+ const pendingBelowMinCount = (() => {
2912
+ if (!primaryStashDir || minPendingCount === undefined || minPendingCount <= 0)
2913
+ return false;
2914
+ const pending = collectPendingMemories(primaryStashDir).length;
2915
+ if (pending < minPendingCount) {
2916
+ info(`[improve] memory inference skipped (${pending} pending < minPendingCount ${minPendingCount})`);
2917
+ return true;
2918
+ }
2919
+ return false;
2920
+ })();
2921
+ if (memoryInferenceDisabledByProfile) {
2922
+ info("[improve] memory inference skipped (disabled by improve profile)");
2674
2923
  }
2675
- catch (err) {
2676
- allWarnings.push(`reindex after memory inference failed: ${err instanceof Error ? err.message : String(err)}`);
2924
+ else if (pendingBelowMinCount) {
2925
+ // skipped message already emitted above
2677
2926
  }
2678
- }
2679
- const graphEnabled = isProcessEnabled("index", "graph_extraction", config);
2680
- const graphExtractionDisabledByProfile = improveProfile?.processes?.graphExtraction?.enabled === false;
2681
- const graphExtractionFullScan = improveProfile?.processes?.graphExtraction?.fullScan === true;
2682
- // Build the set of refs actually touched this run.
2683
- const touchedRefs = new Set();
2684
- for (const r of args.actionableRefs)
2685
- touchedRefs.add(r.ref);
2686
- for (const r of memoryRefsForInference)
2687
- touchedRefs.add(r);
2688
- // INVARIANT: graph extraction normally runs only on files touched by
2689
- // actionable refs (candidatePaths). Full-corpus scans are opt-in via
2690
- // profile.processes.graphExtraction.fullScan = true (used by the
2691
- // `graph-refresh` built-in profile and its weekly scheduled task).
2692
- // The empty-Set fallback is intentional when no refs were touched —
2693
- // the extractor's filter rejects every file and returns empty, keeping
2694
- // the pass invoked so the action is recorded and tests stay exercised.
2695
- if (graphExtractionDisabledByProfile) {
2696
- info("[improve] graph extraction skipped (disabled by improve profile)");
2697
- }
2698
- else if (sources.length > 0 && graphEnabled) {
2699
- info(`[improve] graph extraction starting${graphExtractionFullScan ? " (full-corpus scan)" : ""}`);
2700
- const extractionStart = Date.now();
2701
- try {
2702
- // D9: if consolidation ran but memory inference did not reindex, force a reindex
2703
- // so graph extraction sees current DB state after consolidation writes.
2704
- if (consolidationRan && !reindexedAfterInference) {
2705
- info("[improve] reindexing after consolidation (graph extraction needs current state)");
2706
- try {
2707
- await reindexWithIndexDbReleased(primaryStashDir);
2708
- reindexedAfterInference = true;
2709
- info("[improve] reindex after consolidation complete");
2710
- }
2711
- catch (err) {
2712
- allWarnings.push(`reindex after consolidation failed: ${err instanceof Error ? err.message : String(err)}`);
2713
- }
2927
+ else {
2928
+ const hintRefs = memoryRefsForInference.size;
2929
+ info(hintRefs > 0
2930
+ ? `[improve] memory inference starting (${hintRefs} hint refs touched this run; pass discovers all pending)`
2931
+ : "[improve] memory inference starting (discovering pending parents)");
2932
+ const inferenceStart = Date.now();
2933
+ try {
2934
+ // O-1 (#364): pass budget signal so a hung inference call is cancelled.
2935
+ memoryInference = await withLlmStage("memory-inference", () => memoryInferenceFn({
2936
+ config,
2937
+ sources,
2938
+ signal: budgetSignal,
2939
+ db,
2940
+ reEnrich: false,
2941
+ onProgress: (event) => {
2942
+ const current = event.currentRef ? ` ${event.currentRef}` : "";
2943
+ info(`[improve] memory inference ${event.processed}/${event.total}${current} (written ${event.writtenFacts}, skipped ${event.skippedNoFacts})`);
2944
+ },
2945
+ }));
2946
+ memoryInferenceDurationMs = Date.now() - inferenceStart;
2947
+ actions.push({ ref: "memory:_inference", mode: "memory-inference", result: memoryInference });
2948
+ info(`[improve] memory inference complete (${memoryInference.writtenFacts} facts written from ${memoryInference.splitParents} parents)`);
2714
2949
  }
2715
- // #584: no close/reopen needed here — reindexWithIndexDbReleased
2716
- // already swapped in a fresh post-reindex handle.
2717
- // Resolve touched refs to absolute file paths. Skipped for fullScan
2718
- // (candidatePaths stays undefined → extractor processes all files).
2719
- let candidatePaths;
2720
- if (!graphExtractionFullScan) {
2721
- candidatePaths = new Set();
2722
- if (primaryStashDir && touchedRefs.size > 0) {
2723
- const writableDirSet = new Set(getWritableStashDirs(primaryStashDir).map((d) => path.resolve(d)));
2724
- const resolved = await Promise.all([...touchedRefs].map((ref) => findAssetFilePath(ref, primaryStashDir, writableDirSet).catch(() => null)));
2725
- for (const p of resolved) {
2726
- if (typeof p === "string" && p.length > 0)
2727
- candidatePaths.add(p);
2728
- }
2729
- }
2950
+ catch (err) {
2951
+ memoryInferenceDurationMs = Date.now() - inferenceStart;
2952
+ allWarnings.push(`memory inference failed: ${err instanceof Error ? err.message : String(err)}`);
2730
2953
  }
2731
- const progressHandler = (event) => {
2732
- const current = event.currentPath ? ` ${path.basename(event.currentPath)}` : "";
2733
- info(`[improve] graph extraction ${event.processed}/${event.total}${current} (extracted ${event.extracted}, entities ${event.totalEntities}, relations ${event.totalRelations})`);
2734
- };
2735
- // O-1 (#364): pass budget signal so a hung graph extraction call is cancelled.
2736
- graphExtraction = await withLlmStage("graph-extraction", () => graphExtractionFn({
2737
- config,
2738
- sources,
2739
- signal: budgetSignal,
2740
- db,
2741
- reEnrich: false,
2742
- onProgress: progressHandler,
2743
- options: { candidatePaths },
2744
- }));
2745
- graphExtractionDurationMs = Date.now() - extractionStart;
2746
- actions.push({ ref: "graph:_artifact", mode: "graph-extraction", result: graphExtraction });
2747
- info(`[improve] graph extraction complete (${graphExtraction.quality.extractedFiles} files, ${graphExtraction.quality.entityCount} entities, ${graphExtraction.quality.relationCount} relations)`);
2748
- }
2749
- catch (err) {
2750
- graphExtractionDurationMs = Date.now() - extractionStart;
2751
- allWarnings.push(`graph extraction failed: ${err instanceof Error ? err.message : String(err)}`);
2752
2954
  }
2753
- }
2754
- else if (sources.length > 0 && !graphEnabled) {
2755
- info("[improve] graph extraction skipped (features.index.graph_extraction is disabled)");
2756
- }
2757
- // Orphan proposal purge — reject pending reflect proposals whose target
2758
- // asset no longer exists on disk. Runs after graph extraction so newly
2759
- // promoted assets from accept flows during this run are already present.
2760
- if (primaryStashDir) {
2761
- try {
2762
- const purgeResult = purgeOrphanProposals(primaryStashDir, sources.map((s) => s.path));
2763
- orphansPurged = purgeResult.rejected;
2764
- if (purgeResult.rejected > 0) {
2765
- info(`[improve] orphan purge: ${purgeResult.rejected}/${purgeResult.checked} orphaned proposals rejected (${purgeResult.durationMs}ms)`);
2955
+ if (memoryInference && (memoryInference.splitParents > 0 || memoryInference.writtenFacts > 0)) {
2956
+ info("[improve] reindexing after memory inference writes");
2957
+ try {
2958
+ await reindexWithIndexDbReleased(primaryStashDir);
2959
+ reindexedAfterInference = true;
2960
+ info("[improve] reindex after memory inference complete");
2961
+ }
2962
+ catch (err) {
2963
+ allWarnings.push(`reindex after memory inference failed: ${err instanceof Error ? err.message : String(err)}`);
2766
2964
  }
2767
- appendEvent({
2768
- eventType: "proposal_orphan_purge",
2769
- ref: "proposals:_orphan-purge",
2770
- metadata: {
2771
- checked: purgeResult.checked,
2772
- rejected: purgeResult.rejected,
2773
- durationMs: purgeResult.durationMs,
2774
- byType: purgeResult.byType,
2775
- orphans: purgeResult.orphans.map((o) => o.ref),
2776
- },
2777
- }, eventsCtx);
2778
2965
  }
2779
- catch (err) {
2780
- allWarnings.push(`orphan purge failed: ${err instanceof Error ? err.message : String(err)}`);
2966
+ const graphEnabled = isProcessEnabled("index", "graph_extraction", config);
2967
+ const graphExtractionDisabledByProfile = improveProfile?.processes?.graphExtraction?.enabled === false;
2968
+ const graphExtractionFullScan = improveProfile?.processes?.graphExtraction?.fullScan === true;
2969
+ // Build the set of refs actually touched this run.
2970
+ const touchedRefs = new Set();
2971
+ for (const r of args.actionableRefs)
2972
+ touchedRefs.add(r.ref);
2973
+ for (const r of memoryRefsForInference)
2974
+ touchedRefs.add(r);
2975
+ // INVARIANT: graph extraction normally runs only on files touched by
2976
+ // actionable refs (candidatePaths). Full-corpus scans are opt-in via
2977
+ // profile.processes.graphExtraction.fullScan = true (used by the
2978
+ // `graph-refresh` built-in profile and its weekly scheduled task).
2979
+ // The empty-Set fallback is intentional when no refs were touched —
2980
+ // the extractor's filter rejects every file and returns empty, keeping
2981
+ // the pass invoked so the action is recorded and tests stay exercised.
2982
+ if (graphExtractionDisabledByProfile) {
2983
+ info("[improve] graph extraction skipped (disabled by improve profile)");
2781
2984
  }
2782
- // Phase 6B (Advantage D6b): expire pending proposals that have aged past
2783
- // the retention window. Runs AFTER orphan purge so we never double-archive
2784
- // a proposal that orphan-purge already moved. `expireStaleProposals` emits
2785
- // its own per-proposal `proposal_expired` events; we additionally emit a
2786
- // single roll-up event here for parity with the orphan-purge surface.
2787
- try {
2788
- const expireResult = expireStaleProposals(primaryStashDir, config);
2789
- proposalsExpired = expireResult.expired;
2790
- if (expireResult.expired > 0) {
2791
- info(`[improve] expiration: ${expireResult.expired}/${expireResult.checked} pending proposals expired ` +
2792
- `(retention=${expireResult.retentionDays}d, ${expireResult.durationMs}ms)`);
2985
+ else if (sources.length > 0 && graphEnabled) {
2986
+ info(`[improve] graph extraction starting${graphExtractionFullScan ? " (full-corpus scan)" : ""}`);
2987
+ const extractionStart = Date.now();
2988
+ try {
2989
+ // D9: if consolidation ran but memory inference did not reindex, force a reindex
2990
+ // so graph extraction sees current DB state after consolidation writes.
2991
+ if (consolidationRan && !reindexedAfterInference) {
2992
+ info("[improve] reindexing after consolidation (graph extraction needs current state)");
2993
+ try {
2994
+ await reindexWithIndexDbReleased(primaryStashDir);
2995
+ reindexedAfterInference = true;
2996
+ info("[improve] reindex after consolidation complete");
2997
+ }
2998
+ catch (err) {
2999
+ allWarnings.push(`reindex after consolidation failed: ${err instanceof Error ? err.message : String(err)}`);
3000
+ }
3001
+ }
3002
+ // #584: no close/reopen needed here — reindexWithIndexDbReleased
3003
+ // already swapped in a fresh post-reindex handle.
3004
+ // Resolve touched refs to absolute file paths. Skipped for fullScan
3005
+ // (candidatePaths stays undefined → extractor processes all files).
3006
+ let candidatePaths;
3007
+ if (!graphExtractionFullScan) {
3008
+ candidatePaths = new Set();
3009
+ if (primaryStashDir && touchedRefs.size > 0) {
3010
+ const writableDirSet = new Set(getWritableStashDirs(primaryStashDir).map((d) => path.resolve(d)));
3011
+ const resolved = await Promise.all([...touchedRefs].map((ref) => findAssetFilePath(ref, primaryStashDir, writableDirSet).catch(() => null)));
3012
+ for (const p of resolved) {
3013
+ if (typeof p === "string" && p.length > 0)
3014
+ candidatePaths.add(p);
3015
+ }
3016
+ }
3017
+ }
3018
+ const progressHandler = (event) => {
3019
+ const current = event.currentPath ? ` ${path.basename(event.currentPath)}` : "";
3020
+ info(`[improve] graph extraction ${event.processed}/${event.total}${current} (extracted ${event.extracted}, entities ${event.totalEntities}, relations ${event.totalRelations})`);
3021
+ };
3022
+ // O-1 (#364): pass budget signal so a hung graph extraction call is cancelled.
3023
+ graphExtraction = await withLlmStage("graph-extraction", () => graphExtractionFn({
3024
+ config,
3025
+ sources,
3026
+ signal: budgetSignal,
3027
+ db,
3028
+ reEnrich: false,
3029
+ onProgress: progressHandler,
3030
+ options: { candidatePaths },
3031
+ }));
3032
+ graphExtractionDurationMs = Date.now() - extractionStart;
3033
+ actions.push({ ref: "graph:_artifact", mode: "graph-extraction", result: graphExtraction });
3034
+ info(`[improve] graph extraction complete (${graphExtraction.quality.extractedFiles} files, ${graphExtraction.quality.entityCount} entities, ${graphExtraction.quality.relationCount} relations)`);
3035
+ }
3036
+ catch (err) {
3037
+ graphExtractionDurationMs = Date.now() - extractionStart;
3038
+ allWarnings.push(`graph extraction failed: ${err instanceof Error ? err.message : String(err)}`);
2793
3039
  }
2794
- appendEvent({
2795
- eventType: "proposal_expiration_pass",
2796
- ref: "proposals:_expiration",
2797
- metadata: {
2798
- checked: expireResult.checked,
2799
- expired: expireResult.expired,
2800
- durationMs: expireResult.durationMs,
2801
- retentionDays: expireResult.retentionDays,
2802
- expiredProposals: expireResult.expiredProposals,
2803
- },
2804
- }, eventsCtx);
2805
3040
  }
2806
- catch (err) {
2807
- allWarnings.push(`proposal expiration failed: ${err instanceof Error ? err.message : String(err)}`);
3041
+ else if (sources.length > 0 && !graphEnabled) {
3042
+ info("[improve] graph extraction skipped (features.index.graph_extraction is disabled)");
2808
3043
  }
2809
- }
2810
- // Fix #2 (observability 0.8.0): trim the events table in state.db so it
2811
- // doesn't grow unbounded. `akm health` writes a `health_probe` row on every
2812
- // invocation, and every command surface emits at least one event besides —
2813
- // without this trim, state.db is a permanent append-only log. Config key
2814
- // `improve.eventRetentionDays` (default 90, set 0 to disable) controls the
2815
- // window. The purge runs against state.db (a different SQLite file from
2816
- // the index `db` above).
2817
- {
2818
- const retentionDays = typeof config.improve?.eventRetentionDays === "number" ? config.improve.eventRetentionDays : 90;
2819
- if (retentionDays > 0) {
2820
- // #585: reuse the long-lived eventsCtx.db connection when akmImprove
2821
- // opened one — opening a second state.db write connection while
2822
- // eventsDb is still live made two simultaneous writers contend on the
2823
- // same WAL file ("database is locked"). Only the eventsCtx.dbPath
2824
- // fallback path (state.db failed to open up-front) opens — and then
2825
- // owns and closes — its own handle. C2 still holds: the fallback uses
2826
- // the boundary-pinned path, never a live `process.env` re-read.
2827
- const ownsStateDb = !eventsCtx?.db;
2828
- let stateDb;
3044
+ // Orphan proposal purge — reject pending reflect proposals whose target
3045
+ // asset no longer exists on disk. Runs after graph extraction so newly
3046
+ // promoted assets from accept flows during this run are already present.
3047
+ if (primaryStashDir) {
2829
3048
  try {
2830
- stateDb = eventsCtx?.db ?? openStateDatabase(eventsCtx?.dbPath);
2831
- const purgedCount = purgeOldEvents(stateDb, retentionDays);
2832
- if (purgedCount > 0) {
2833
- info(`[improve] events purge: ${purgedCount} event(s) older than ${retentionDays}d removed from state.db`);
2834
- }
2835
- appendEvent({
2836
- eventType: "events_purged",
2837
- ref: "events:_purge",
2838
- metadata: { purgedCount, retentionDays },
2839
- }, eventsCtx);
2840
- // improve_runs uses the same retention window as events — both are
2841
- // observability/audit data, both grow append-only, both have a
2842
- // dedicated purge helper. Mirroring the events purge here means a
2843
- // single retention knob (improve.eventRetentionDays) governs both.
2844
- const improveRunsPurged = purgeOldImproveRuns(stateDb, retentionDays);
2845
- if (improveRunsPurged > 0) {
2846
- info(`[improve] improve_runs purge: ${improveRunsPurged} run(s) older than ${retentionDays}d removed from state.db`);
3049
+ const purgeResult = purgeOrphanProposals(primaryStashDir, sources.map((s) => s.path));
3050
+ orphansPurged = purgeResult.rejected;
3051
+ if (purgeResult.rejected > 0) {
3052
+ info(`[improve] orphan purge: ${purgeResult.rejected}/${purgeResult.checked} orphaned proposals rejected (${purgeResult.durationMs}ms)`);
2847
3053
  }
2848
3054
  appendEvent({
2849
- eventType: "improve_runs_purged",
2850
- ref: "improve_runs:_purge",
2851
- metadata: { purgedCount: improveRunsPurged, retentionDays },
3055
+ eventType: "proposal_orphan_purge",
3056
+ ref: "proposals:_orphan-purge",
3057
+ metadata: {
3058
+ checked: purgeResult.checked,
3059
+ rejected: purgeResult.rejected,
3060
+ durationMs: purgeResult.durationMs,
3061
+ byType: purgeResult.byType,
3062
+ orphans: purgeResult.orphans.map((o) => o.ref),
3063
+ },
2852
3064
  }, eventsCtx);
2853
3065
  }
2854
3066
  catch (err) {
2855
- allWarnings.push(`events purge failed: ${err instanceof Error ? err.message : String(err)}`);
3067
+ allWarnings.push(`orphan purge failed: ${err instanceof Error ? err.message : String(err)}`);
2856
3068
  }
2857
- finally {
2858
- if (ownsStateDb && stateDb) {
2859
- try {
2860
- stateDb.close();
2861
- }
2862
- catch {
2863
- // best-effort
2864
- }
2865
- }
2866
- }
2867
- // task_logs in logs.db (#579) shares the same retention window as
2868
- // events/improve_runs — all three are observability data governed by
2869
- // the single improve.eventRetentionDays knob. Separate try/finally
2870
- // because logs.db is a different file: a locked/missing logs.db must
2871
- // not block the state.db purges above.
2872
- let logsDb;
3069
+ // Phase 6B (Advantage D6b): expire pending proposals that have aged past
3070
+ // the retention window. Runs AFTER orphan purge so we never double-archive
3071
+ // a proposal that orphan-purge already moved. `expireStaleProposals` emits
3072
+ // its own per-proposal `proposal_expired` events; we additionally emit a
3073
+ // single roll-up event here for parity with the orphan-purge surface.
2873
3074
  try {
2874
- logsDb = openLogsDatabase();
2875
- const taskLogsPurged = purgeOldTaskLogs(logsDb, retentionDays);
2876
- if (taskLogsPurged > 0) {
2877
- info(`[improve] task_logs purge: ${taskLogsPurged} log line(s) older than ${retentionDays}d removed from logs.db`);
3075
+ const expireResult = expireStaleProposals(primaryStashDir, config);
3076
+ proposalsExpired = expireResult.expired;
3077
+ if (expireResult.expired > 0) {
3078
+ info(`[improve] expiration: ${expireResult.expired}/${expireResult.checked} pending proposals expired ` +
3079
+ `(retention=${expireResult.retentionDays}d, ${expireResult.durationMs}ms)`);
2878
3080
  }
2879
3081
  appendEvent({
2880
- eventType: "task_logs_purged",
2881
- ref: "task_logs:_purge",
2882
- metadata: { purgedCount: taskLogsPurged, retentionDays },
3082
+ eventType: "proposal_expiration_pass",
3083
+ ref: "proposals:_expiration",
3084
+ metadata: {
3085
+ checked: expireResult.checked,
3086
+ expired: expireResult.expired,
3087
+ durationMs: expireResult.durationMs,
3088
+ retentionDays: expireResult.retentionDays,
3089
+ expiredProposals: expireResult.expiredProposals,
3090
+ },
2883
3091
  }, eventsCtx);
2884
3092
  }
2885
3093
  catch (err) {
2886
- allWarnings.push(`task_logs purge failed: ${err instanceof Error ? err.message : String(err)}`);
3094
+ allWarnings.push(`proposal expiration failed: ${err instanceof Error ? err.message : String(err)}`);
2887
3095
  }
2888
- finally {
2889
- if (logsDb) {
2890
- try {
2891
- logsDb.close();
3096
+ }
3097
+ // Fix #2 (observability 0.8.0): trim the events table in state.db so it
3098
+ // doesn't grow unbounded. `akm health` writes a `health_probe` row on every
3099
+ // invocation, and every command surface emits at least one event besides —
3100
+ // without this trim, state.db is a permanent append-only log. Config key
3101
+ // `improve.eventRetentionDays` (default 90, set 0 to disable) controls the
3102
+ // window. The purge runs against state.db (a different SQLite file from
3103
+ // the index `db` above).
3104
+ {
3105
+ const retentionDays = typeof config.improve?.eventRetentionDays === "number" ? config.improve.eventRetentionDays : 90;
3106
+ if (retentionDays > 0) {
3107
+ // #585: reuse the long-lived eventsCtx.db connection when akmImprove
3108
+ // opened one — opening a second state.db write connection while
3109
+ // eventsDb is still live made two simultaneous writers contend on the
3110
+ // same WAL file ("database is locked"). Only the eventsCtx.dbPath
3111
+ // fallback path (state.db failed to open up-front) opens — and then
3112
+ // owns and closes — its own handle. C2 still holds: the fallback uses
3113
+ // the boundary-pinned path, never a live `process.env` re-read.
3114
+ const ownsStateDb = !eventsCtx?.db;
3115
+ let stateDb;
3116
+ try {
3117
+ stateDb = eventsCtx?.db ?? openStateDatabase(eventsCtx?.dbPath);
3118
+ const purgedCount = purgeOldEvents(stateDb, retentionDays);
3119
+ if (purgedCount > 0) {
3120
+ info(`[improve] events purge: ${purgedCount} event(s) older than ${retentionDays}d removed from state.db`);
2892
3121
  }
2893
- catch {
2894
- // best-effort
3122
+ appendEvent({
3123
+ eventType: "events_purged",
3124
+ ref: "events:_purge",
3125
+ metadata: { purgedCount, retentionDays },
3126
+ }, eventsCtx);
3127
+ // improve_runs uses the same retention window as events — both are
3128
+ // observability/audit data, both grow append-only, both have a
3129
+ // dedicated purge helper. Mirroring the events purge here means a
3130
+ // single retention knob (improve.eventRetentionDays) governs both.
3131
+ const improveRunsPurged = purgeOldImproveRuns(stateDb, retentionDays);
3132
+ if (improveRunsPurged > 0) {
3133
+ info(`[improve] improve_runs purge: ${improveRunsPurged} run(s) older than ${retentionDays}d removed from state.db`);
3134
+ }
3135
+ appendEvent({
3136
+ eventType: "improve_runs_purged",
3137
+ ref: "improve_runs:_purge",
3138
+ metadata: { purgedCount: improveRunsPurged, retentionDays },
3139
+ }, eventsCtx);
3140
+ }
3141
+ catch (err) {
3142
+ allWarnings.push(`events purge failed: ${err instanceof Error ? err.message : String(err)}`);
3143
+ }
3144
+ finally {
3145
+ if (ownsStateDb && stateDb) {
3146
+ try {
3147
+ stateDb.close();
3148
+ }
3149
+ catch {
3150
+ // best-effort
3151
+ }
3152
+ }
3153
+ }
3154
+ // task_logs in logs.db (#579) shares the same retention window as
3155
+ // events/improve_runs — all three are observability data governed by
3156
+ // the single improve.eventRetentionDays knob. Separate try/finally
3157
+ // because logs.db is a different file: a locked/missing logs.db must
3158
+ // not block the state.db purges above.
3159
+ let logsDb;
3160
+ try {
3161
+ logsDb = openLogsDatabase();
3162
+ const taskLogsPurged = purgeOldTaskLogs(logsDb, retentionDays);
3163
+ if (taskLogsPurged > 0) {
3164
+ info(`[improve] task_logs purge: ${taskLogsPurged} log line(s) older than ${retentionDays}d removed from logs.db`);
3165
+ }
3166
+ appendEvent({
3167
+ eventType: "task_logs_purged",
3168
+ ref: "task_logs:_purge",
3169
+ metadata: { purgedCount: taskLogsPurged, retentionDays },
3170
+ }, eventsCtx);
3171
+ }
3172
+ catch (err) {
3173
+ allWarnings.push(`task_logs purge failed: ${err instanceof Error ? err.message : String(err)}`);
3174
+ }
3175
+ finally {
3176
+ if (logsDb) {
3177
+ try {
3178
+ logsDb.close();
3179
+ }
3180
+ catch {
3181
+ // best-effort
3182
+ }
2895
3183
  }
2896
3184
  }
2897
3185
  }
2898
3186
  }
2899
- }
2900
- // Phase 4A (staleness detection). Activates the `deprecated` belief-state
2901
- // machinery shipped in Phase 1A. Default OFF gated by
2902
- // `features.index.staleness_detection.enabled`. Runs after orphan purge
2903
- // and before the URL check (which lives in the outer caller).
2904
- if (sources.length > 0) {
2905
- try {
2906
- stalenessDetection = await withLlmStage("staleness-detection", () => stalenessDetectionFn({ config, sources, signal: budgetSignal, db }));
2907
- if (stalenessDetection.considered > 0) {
2908
- info(`[improve] staleness detection complete (considered ${stalenessDetection.considered}, ` +
2909
- `deprecated ${stalenessDetection.deprecated}, confirmed ${stalenessDetection.confirmed}, ` +
2910
- `skipped ${stalenessDetection.skipped}, ${stalenessDetection.durationMs}ms)`);
3187
+ // Phase 4A (staleness detection). Activates the `deprecated` belief-state
3188
+ // machinery shipped in Phase 1A. Default OFF gated by
3189
+ // `features.index.staleness_detection.enabled`. Runs after orphan purge
3190
+ // and before the URL check (which lives in the outer caller).
3191
+ if (sources.length > 0) {
3192
+ try {
3193
+ stalenessDetection = await withLlmStage("staleness-detection", () => stalenessDetectionFn({ config, sources, signal: budgetSignal, db }));
3194
+ if (stalenessDetection.considered > 0) {
3195
+ info(`[improve] staleness detection complete (considered ${stalenessDetection.considered}, ` +
3196
+ `deprecated ${stalenessDetection.deprecated}, confirmed ${stalenessDetection.confirmed}, ` +
3197
+ `skipped ${stalenessDetection.skipped}, ${stalenessDetection.durationMs}ms)`);
3198
+ }
3199
+ for (const w of stalenessDetection.warnings)
3200
+ allWarnings.push(`[improve] staleness detection: ${w}`);
3201
+ }
3202
+ catch (err) {
3203
+ allWarnings.push(`staleness detection failed: ${err instanceof Error ? err.message : String(err)}`);
2911
3204
  }
2912
- for (const w of stalenessDetection.warnings)
2913
- allWarnings.push(`[improve] staleness detection: ${w}`);
2914
- }
2915
- catch (err) {
2916
- allWarnings.push(`staleness detection failed: ${err instanceof Error ? err.message : String(err)}`);
2917
3205
  }
2918
3206
  }
2919
- }
2920
- finally {
2921
- if (db)
2922
- closeDatabase(db);
2923
- }
3207
+ finally {
3208
+ if (db)
3209
+ closeDatabase(db);
3210
+ }
3211
+ });
2924
3212
  return {
2925
3213
  ...(memoryInference ? { memoryInference } : {}),
2926
3214
  ...(graphExtraction ? { graphExtraction } : {}),