akm-cli 0.9.0-beta.6 → 0.9.0-beta.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,6 +19,7 @@ import { info, warn } from "../../core/warn.js";
19
19
  import { closeDatabase, getAllEntries, getEntryCount, getRetrievalCounts, getUtilityScoresByIds, getZeroResultSearches, openDatabase, openExistingDatabase, } from "../../indexer/db/db.js";
20
20
  import { ensureIndex } from "../../indexer/ensure-index.js";
21
21
  import { runGraphExtractionPass } from "../../indexer/graph/graph-extraction.js";
22
+ import { withIndexWriterLease } from "../../indexer/index-writer-lock.js";
22
23
  import { akmIndex } from "../../indexer/indexer.js";
23
24
  import { collectPendingMemories, runMemoryInferencePass, } from "../../indexer/passes/memory-inference.js";
24
25
  import { runStalenessDetectionPass } from "../../indexer/passes/staleness-detect.js";
@@ -46,7 +47,105 @@ import { makeGateConfig, resolveExtractConfidence, runAutoAcceptGate } from "./i
46
47
  import { isProfileFilteredForAllPasses, resolveImproveProfile, resolveProcessEnabled, shouldSkipRef, } from "./improve-profiles.js";
47
48
  import { detectAndWriteContradictions } from "./memory/memory-contradiction-detect.js";
48
49
  import { analyzeMemoryCleanup, applyMemoryCleanup } from "./memory/memory-improve.js";
50
+ import { DEFAULT_DUE_DAYS, DEFAULT_MAX_PER_RUN, selectProactiveMaintenanceRefs } from "./proactive-maintenance.js";
49
51
  import { akmReflect } from "./reflect.js";
52
// #607 Lock Decomposition: fine-grained per-process locks replace the single
// `improve.lock`. Three independent locks allow concurrent improve runs when
// they touch different subsystems (e.g. quick-shredder consolidate can run
// alongside daily reflect+distill).
//
//   consolidate.lock     — protects consolidate + memoryInference +
//                          graphExtraction (all write index.db)
//   reflect-distill.lock — protects reflect + distill (both write state.db proposals)
//   triage.lock          — protects triage (writes proposal promotions)
//
// Stale timeouts are per-lock, tuned to the expected runtime of the protected
// processes: consolidate is disk-bound (1h), reflect+distill is GPU-bound (2h),
// triage is fast (30min).
//
// The table and each entry are frozen: this is shared configuration that is
// only ever read, so an accidental write should fail loudly instead of
// silently changing another stage's lock file name or timeout.
const PROCESS_LOCK_DEFS = Object.freeze({
    consolidate: Object.freeze({ fileName: "consolidate.lock", staleAfterMs: 60 * 60 * 1000 }),
    reflectDistill: Object.freeze({ fileName: "reflect-distill.lock", staleAfterMs: 2 * 60 * 60 * 1000 }),
    triage: Object.freeze({ fileName: "triage.lock", staleAfterMs: 30 * 60 * 1000 }),
});
// Paths of the lock files this process has acquired and not yet released.
// Entries are added on successful acquisition and removed on release.
const heldProcessLocks = new Set();
70
/**
 * Forget every lock path recorded in `heldProcessLocks`.
 *
 * Only the in-memory bookkeeping is cleared; no lock file is touched on disk.
 * NOTE(review): presumably a reset hook for repeated in-process runs (tests) —
 * confirm against callers.
 *
 * @returns {void}
 */
export function resetHeldProcessLocks() {
    heldProcessLocks.clear();
}
73
/**
 * Resolve the on-disk path of one of the named per-process lock files.
 *
 * @param {string} lockBaseDir - Directory that contains the lock files.
 * @param {string} lockName - An own key of `PROCESS_LOCK_DEFS`.
 * @returns {string} `lockBaseDir` joined with the lock's `fileName`.
 * @throws {TypeError} When `lockName` is not an own key of `PROCESS_LOCK_DEFS`.
 */
function processLockPath(lockBaseDir, lockName) {
    // Own-key check: a typo or an inherited key such as "constructor" would
    // otherwise surface as an opaque property-access / path.join TypeError.
    if (!Object.hasOwn(PROCESS_LOCK_DEFS, lockName)) {
        const known = Object.keys(PROCESS_LOCK_DEFS).join(", ");
        throw new TypeError(`Unknown process lock "${String(lockName)}"; expected one of: ${known}`);
    }
    return path.join(lockBaseDir, PROCESS_LOCK_DEFS[lockName].fileName);
}
76
/**
 * Try to take one per-process improve lock, recovering it when the existing
 * lock file is reported stale.
 *
 * Flow:
 *   1. Ensure the lock directory exists and attempt to create the lock.
 *   2. On failure, probe the existing lock. If it has vanished in the meantime
 *      (`absent`), retry once instead of reporting a holder that no longer
 *      exists.
 *   3. If the probe says `stale`, emit a best-effort `improve_lock_recovered`
 *      event, release the stale lock and retry once.
 *   4. Otherwise the lock is contended: return "skipped" when `skipIfLocked`
 *      is truthy, else throw.
 *
 * Every successful acquisition records `lockPath` in `heldProcessLocks`.
 *
 * @param {string} lockPath - Full path of the lock file.
 * @param {number} staleAfterMs - Staleness threshold forwarded to `probeLock`.
 * @param {boolean} [skipIfLocked] - Return "skipped" instead of throwing when
 *   the lock cannot be taken.
 * @param {string} lockLabel - Human-readable lock name used in messages and
 *   in the recovery event metadata.
 * @returns {"acquired" | "skipped"}
 * @throws {ConfigError} Code `INVALID_CONFIG_FILE` when the lock cannot be
 *   taken and `skipIfLocked` is falsy.
 */
function tryAcquireProcessLock(lockPath, staleAfterMs, skipIfLocked, lockLabel) {
    fs.mkdirSync(path.dirname(lockPath), { recursive: true });
    // Payload is rebuilt per attempt so `startedAt` reflects the actual claim time.
    const lockPayload = () => JSON.stringify({ pid: process.pid, startedAt: new Date().toISOString() });
    // One acquisition attempt; records the path on success.
    const claim = () => {
        if (!tryAcquireLockSync(lockPath, lockPayload())) {
            return false;
        }
        heldProcessLocks.add(lockPath);
        return true;
    };
    if (claim()) {
        return "acquired";
    }
    const probe = probeLock(lockPath, { staleAfterMs });
    // The holder may have released between our failed attempt and the probe.
    // Retry once; if that also fails another run won the race and we fall
    // through to the contended path with no holder details.
    if (probe.state === "absent" && claim()) {
        return "acquired";
    }
    const rawContent = probe.state === "absent" ? undefined : probe.rawContent;
    let lock = null;
    if (rawContent) {
        try {
            lock = JSON.parse(rawContent);
        }
        catch {
            // Unparseable payload: continue without holder details.
            lock = null;
        }
    }
    if (probe.state === "stale") {
        try {
            appendEvent({
                eventType: "improve_lock_recovered",
                metadata: {
                    lockName: lockLabel,
                    stalePid: lock?.pid ?? null,
                    lockedAt: lock?.startedAt ?? null,
                    recoveredAt: new Date().toISOString(),
                    lockAgeMs: probe.ageMs ?? null,
                    // The probe's "pid_dead" is reported as "pid_not_alive" in the event.
                    reason: probe.reason === "pid_dead" ? "pid_not_alive" : probe.reason,
                },
            });
        }
        catch {
            /* event emission is best-effort; never block lock recovery */
        }
        releaseLock(lockPath);
        if (claim()) {
            return "acquired";
        }
        // Lost the race to another run that grabbed the freed stale lock.
        if (skipIfLocked) {
            warn(`[improve] ${lockLabel} lock acquired by another run during stale recovery; skipping (--skip-if-locked)`);
            return "skipped";
        }
        throw new ConfigError(`akm improve ${lockLabel} is already running. Delete ${lockPath} to force.`, "INVALID_CONFIG_FILE");
    }
    // Contended lock. Fall back to "unknown" so messages never print the
    // literal "undefined" when the payload was missing or unparseable.
    const holderPid = lock?.pid ?? "unknown";
    const holderStartedAt = lock?.startedAt ?? "unknown";
    if (skipIfLocked) {
        warn(`[improve] ${lockLabel} lock held by another run (PID ${holderPid}, started ${holderStartedAt}); skipping (--skip-if-locked)`);
        return "skipped";
    }
    throw new ConfigError(`akm improve ${lockLabel} is already running (PID ${holderPid}, started ${holderStartedAt}). Delete ${lockPath} to force.`, "INVALID_CONFIG_FILE");
}
129
/**
 * Release one per-process lock and drop it from `heldProcessLocks`.
 *
 * The file is removed through `releaseLockIfOwned` with this process's PID,
 * matching the exit-time backstop. If the lock was judged stale while this
 * process was still working and another run took it over, that run's lock
 * file is left alone; an unconditional unlink would delete it.
 *
 * Best-effort: a failure to release never throws, and the path is always
 * removed from the in-memory set.
 *
 * @param {string} lockPath - Full path of the lock file to release.
 * @returns {void}
 */
function releaseProcessLock(lockPath) {
    try {
        releaseLockIfOwned(lockPath, process.pid);
    }
    catch {
        // Deliberately ignored: release is best-effort and must not mask the
        // outcome of the stage that just finished.
    }
    heldProcessLocks.delete(lockPath);
}
138
/**
 * Release every lock recorded in `heldProcessLocks`, then empty the set.
 *
 * Each file is removed through `releaseLockIfOwned` with this process's PID,
 * so a lock that another run took over after stale recovery is not deleted
 * from under it. Failures are swallowed per lock so that one bad path cannot
 * prevent the remaining locks from being released.
 *
 * @returns {void}
 */
function releaseAllProcessLocks() {
    for (const heldPath of heldProcessLocks) {
        try {
            releaseLockIfOwned(heldPath, process.pid);
        }
        catch {
            // Deliberately ignored: best-effort backstop release.
        }
    }
    heldProcessLocks.clear();
}
50
149
  function resolveImproveScope(scope) {
51
150
  const trimmed = scope?.trim();
52
151
  if (!trimmed)
@@ -102,6 +201,22 @@ export function renderSyncCommitMessage(template, result, nowMs) {
102
201
  };
103
202
  return template.replace(/\{(\w+)\}/g, (match, key) => (Object.hasOwn(tokens, key) ? tokens[key] : match));
104
203
  }
204
/**
 * Dedupe a list of eligible refs by `ref`, preserving first-seen order. Used to
 * merge the three eligibility sources (feedback-signal, P0-A high-retrieval,
 * Layer-2 proactive-maintenance) without admitting a ref into the loop twice.
 *
 * When two entries share a `ref`, the first one encountered is kept and later
 * ones are dropped. A nullish list is treated as empty.
 *
 * @param {Iterable<{ref: *}> | null | undefined} refs - Entries to dedupe.
 * @returns {Array<{ref: *}>} New array holding the first entry seen per `ref`.
 */
function dedupeRefs(refs) {
    // Map keeps insertion order, so values() yields first-seen order.
    const firstByRef = new Map();
    for (const entry of refs ?? []) {
        if (!firstByRef.has(entry.ref)) {
            firstByRef.set(entry.ref, entry);
        }
    }
    return [...firstByRef.values()];
}
105
220
  async function collectEligibleRefs(scope, stashDir, improveProfile) {
106
221
  if (scope.mode === "ref" && scope.value) {
107
222
  const parsed = parseAssetRef(scope.value);
@@ -491,103 +606,16 @@ export async function akmImprove(options = {}) {
491
606
  // timeout root cause). Because beforeEach runs synchronously, env is still the
492
607
  // calling test's own at this point; we capture it before yielding the loop.
493
608
  const resolvedStateDbPath = getStateDbPathInDataDir();
494
- // Phase 4 lock hoist (§7): the `improve.lock` setup is hoisted ABOVE
495
- // ensureIndex/collectEligibleRefs so the triage pre-pass (and improve's own
496
- // queue writes) run fully serialized under the lock. The dry-run early-return
497
- // below still skips the lock and triage (the lock+triage block is gated on
498
- // `!options.dryRun`); contradiction-detection and memory-cleanup analysis,
499
- // which previously ran before the lock, now sit after it for free.
500
- const resolvedLockPath = primaryStashDir
501
- ? path.join(primaryStashDir, ".akm", "improve.lock")
502
- : path.join(options.stashDir ?? ".", ".akm", "improve.lock");
503
- const MAX_LOCK_AGE_MS = 4 * 60 * 60 * 1000; // 4 hours
504
- const acquireLock = () => {
505
- fs.mkdirSync(path.dirname(resolvedLockPath), { recursive: true });
506
- const lockPayload = () => JSON.stringify({ pid: process.pid, startedAt: new Date().toISOString() });
507
- if (tryAcquireLockSync(resolvedLockPath, lockPayload()))
508
- return "acquired";
509
- // Lock file already exists — probe to determine whether it's still held
510
- // or whether the prior run died without cleaning up.
511
- const probe = probeLock(resolvedLockPath, { staleAfterMs: MAX_LOCK_AGE_MS });
512
- const rawContent = probe.state === "absent" ? undefined : probe.rawContent;
513
- const lock = rawContent
514
- ? (() => {
515
- try {
516
- return JSON.parse(rawContent);
517
- }
518
- catch {
519
- return null;
520
- }
521
- })()
522
- : null;
523
- if (probe.state === "stale") {
524
- // O-7 / #394: Emit improve_lock_recovered event before recovery so the
525
- // audit trail records the abnormal prior-run exit (Temporal/Airflow pattern).
526
- try {
527
- appendEvent({
528
- eventType: "improve_lock_recovered",
529
- metadata: {
530
- stalePid: lock?.pid ?? null,
531
- lockedAt: lock?.startedAt ?? null,
532
- recoveredAt: new Date().toISOString(),
533
- lockAgeMs: probe.ageMs ?? null,
534
- reason: probe.reason === "pid_dead" ? "pid_not_alive" : probe.reason,
535
- },
536
- });
537
- }
538
- catch {
539
- /* event emission is best-effort; never block lock recovery */
540
- }
541
- releaseLock(resolvedLockPath);
542
- if (tryAcquireLockSync(resolvedLockPath, lockPayload()))
543
- return "acquired";
544
- // Lost the race to another run that grabbed the freed stale lock.
545
- if (options.skipIfLocked) {
546
- warn("[improve] another run acquired the lock during stale recovery; skipping (--skip-if-locked)");
547
- return "skipped";
548
- }
549
- throw new ConfigError(`akm improve is already running. Delete ${resolvedLockPath} to force.`, "INVALID_CONFIG_FILE");
550
- }
551
- // Lock is held by a live run within the staleness window.
552
- if (options.skipIfLocked) {
553
- warn(`[improve] another improve run holds the lock (PID ${lock?.pid}, started ${lock?.startedAt}); skipping (--skip-if-locked)`);
554
- return "skipped";
555
- }
556
- throw new ConfigError(`akm improve is already running (PID ${lock?.pid}, started ${lock?.startedAt}). Delete ${resolvedLockPath} to force.`, "INVALID_CONFIG_FILE");
557
- };
558
- // Phase 4 lock-leak guard (§7 ordering hazard): hoisting `improve.lock` above
559
- // the pre-index region (so the triage pre-pass runs under it) means the lock is
560
- // held while ensureIndex / collectEligibleRefs / contradiction-detection /
561
- // memory-cleanup analysis run — but the main protecting `try { … } finally {
562
- // unlinkSync(resolvedLockPath) }` does not begin until after them. A throw in
563
- // any of those steps would leak the lock. We close that window by wrapping the
564
- // whole region in a try whose catch releases the lock (when held) and
565
- // re-throws. The values this region computes are declared in the outer scope so
566
- // they remain visible to the main run below. The dry-run path never sets
567
- // `lockAcquired`, so its early return releases nothing.
568
- let lockAcquired = false;
569
- const releaseLockOnError = () => {
570
- if (!lockAcquired)
571
- return;
572
- try {
573
- fs.unlinkSync(resolvedLockPath);
574
- }
575
- catch {
576
- // best-effort release on the error path
577
- }
578
- lockAcquired = false;
579
- };
580
- // Signal-safe lock release. The SIGTERM/SIGINT/SIGHUP handler in improve-cli.ts
581
- // calls `process.exit()`, which does NOT run the `finally` below that owns lock
582
- // release — so a cron-timeout SIGTERM leaked `improve.lock` every run.
583
- // `process.exit()` DOES fire `'exit'` listeners, so we release the lock from
584
- // one. `releaseLockIfOwned` only unlinks a lock still owned by this PID, so it
585
- // is safe even if a later run re-acquired it. The listener is removed in the
586
- // `finally` so the normal path stays single-release and repeated in-process
587
- // `akmImprove` calls (tests) do not accumulate listeners.
588
- const releaseLockOnExit = () => {
589
- releaseLockIfOwned(resolvedLockPath, process.pid);
590
- };
609
+ // #607 Lock decomposition: three per-process locks replace the single
610
+ // `improve.lock`. Each process acquires only the lock(s) it needs, so
611
+ // quick-shredder consolidate can run alongside daily reflect+distill.
612
+ //
613
+ // consolidate.lock — protects consolidate + memoryInference + graphExtraction (index.db writers)
614
+ // reflect-distill.lock — protects reflect + distill (state.db proposal writers)
615
+ // triage.lock — protects triage pre-pass (state.db proposal promotions)
616
+ //
617
+ // Lock base directory — same `.akm/` under the primary stash dir.
618
+ const lockBaseDir = primaryStashDir ? path.join(primaryStashDir, ".akm") : path.join(options.stashDir ?? ".", ".akm");
591
619
  const preEnsureCleanupWarnings = [];
592
620
  let plannedRefs;
593
621
  let memorySummary;
@@ -596,65 +624,59 @@ export async function akmImprove(options = {}) {
596
624
  let guidance;
597
625
  let triageDrain;
598
626
  try {
599
- // Acquire the lock and run the triage pre-pass for non-dry-run executions.
600
- // The dry-run branch below produces plannedRefs/memorySummary WITHOUT the lock
601
- // or triage (decision: dry-run never mutates the queue).
627
+ // #607: Per-process lock acquisition. Each process acquires only the lock(s)
628
+ // it needs. The dry-run branch produces plannedRefs/memorySummary WITHOUT any
629
+ // locks (decision: dry-run never mutates the queue).
602
630
  if (!options.dryRun) {
603
- if (acquireLock() === "skipped") {
604
- // Another improve holds the lock and the caller asked to skip rather
605
- // than fail. Return a clean no-op result (exit 0) before any index/DB
606
- // work — never registered the exit listener, never set lockAcquired,
607
- // so we release nothing belonging to the run that owns the lock.
608
- return {
609
- schemaVersion: 1,
610
- ok: true,
611
- scope,
612
- dryRun: false,
613
- skipped: { reason: "lock-held" },
614
- memorySummary: { eligible: 0, derived: 0 },
615
- plannedRefs: [],
616
- };
617
- }
618
- lockAcquired = true;
619
631
  // Backstop release on process.exit() (signal handler / budget watchdog),
620
632
  // which skips the finally below. Removed in that finally on the normal path.
621
- process.on("exit", releaseLockOnExit);
622
- // Phase 4 triage pre-pass (§7, §13): drain the standing pending backlog
623
- // BEFORE ensureIndex so improve generates fresh proposals against a cleared
624
- // queue (no `duplicate_pending` collisions) and ensureIndex absorbs triage's
625
- // promotions for free. Gated on the triage process being enabled (opt-in,
626
- // defaults off) and on a whole-stash / type-scoped run — a single-ref
627
- // `akm improve skill:x` must never drain the whole queue. Best-effort: a
628
- // triage failure is a non-fatal warning, never an abort (mirrors the
629
- // contradiction-detection pass below).
633
+ const releaseAllOnExit = () => {
634
+ for (const p of heldProcessLocks) {
635
+ releaseLockIfOwned(p, process.pid);
636
+ }
637
+ };
638
+ process.on("exit", releaseAllOnExit);
639
+ // #607 triage pre-pass: acquire triage.lock, drain the standing pending
640
+ // backlog BEFORE ensureIndex so improve generates fresh proposals against
641
+ // a cleared queue (no `duplicate_pending` collisions) and ensureIndex
642
+ // absorbs triage's promotions for free. Release immediately after —
643
+ // triage.lock is not needed again until the next improve run.
630
644
  if (primaryStashDir && resolveProcessEnabled("triage", improveProfile)) {
631
645
  if (scope.mode === "ref") {
632
646
  warn("[improve] triage pre-pass skipped (single-ref scope never drains the whole queue)");
633
647
  }
634
648
  else {
635
- try {
636
- const triageConfig = improveProfile.processes?.triage;
637
- const policy = resolveDrainPolicy(triageConfig?.policy);
638
- const applyMode = triageConfig?.applyMode ?? "queue";
639
- const maxAccepts = triageConfig?.maxAcceptsPerRun ?? 25;
640
- const judgment = triageConfig?.judgment
641
- ? resolveTriageJudgmentRunner(triageConfig.judgment, _earlyConfig)
642
- : null;
643
- triageDrain = await drainProposalsFn({
644
- stashDir: primaryStashDir,
645
- policy,
646
- applyMode,
647
- maxAccepts,
648
- dryRun: false,
649
- // No fresh ids exist yet — triage runs before improve generates any.
650
- excludeIds: new Set(),
651
- ...(triageConfig?.maxDiffLines !== undefined ? { maxDiffLines: triageConfig.maxDiffLines } : {}),
652
- judgment,
653
- });
649
+ const triageLPath = processLockPath(lockBaseDir, "triage");
650
+ const triageResult = tryAcquireProcessLock(triageLPath, PROCESS_LOCK_DEFS.triage.staleAfterMs, options.skipIfLocked, "triage");
651
+ if (triageResult === "skipped") {
652
+ triageDrain = undefined;
654
653
  }
655
- catch (err) {
656
- // Non-fatal: triage is a best-effort pre-pass and must never abort improve.
657
- warn(`[improve] triage pre-pass failed (non-fatal): ${err instanceof Error ? err.message : String(err)}`);
654
+ else {
655
+ try {
656
+ const triageConfig = improveProfile.processes?.triage;
657
+ const policy = resolveDrainPolicy(triageConfig?.policy);
658
+ const applyMode = triageConfig?.applyMode ?? "queue";
659
+ const maxAccepts = triageConfig?.maxAcceptsPerRun ?? 25;
660
+ const judgment = triageConfig?.judgment
661
+ ? resolveTriageJudgmentRunner(triageConfig.judgment, _earlyConfig)
662
+ : null;
663
+ triageDrain = await drainProposalsFn({
664
+ stashDir: primaryStashDir,
665
+ policy,
666
+ applyMode,
667
+ maxAccepts,
668
+ dryRun: false,
669
+ excludeIds: new Set(),
670
+ ...(triageConfig?.maxDiffLines !== undefined ? { maxDiffLines: triageConfig.maxDiffLines } : {}),
671
+ judgment,
672
+ });
673
+ }
674
+ catch (err) {
675
+ warn(`[improve] triage pre-pass failed (non-fatal): ${err instanceof Error ? err.message : String(err)}`);
676
+ }
677
+ finally {
678
+ releaseProcessLock(triageLPath);
679
+ }
658
680
  }
659
681
  }
660
682
  }
@@ -686,7 +708,7 @@ export async function akmImprove(options = {}) {
686
708
  // best-effort; leave preEnsureEntryCount undefined
687
709
  }
688
710
  try {
689
- await ensureIndexFn(primaryStashDir);
711
+ await ensureIndexFn(primaryStashDir, { mode: "blocking" });
690
712
  }
691
713
  catch (err) {
692
714
  preEnsureCleanupWarnings.push(`ensureIndex failed: ${err instanceof Error ? err.message : String(err)}`);
@@ -754,17 +776,14 @@ export async function akmImprove(options = {}) {
754
776
  }
755
777
  }
756
778
  catch (err) {
757
- releaseLockOnError();
779
+ releaseAllProcessLocks();
758
780
  throw err;
759
781
  }
760
- // FIX 2 (lock-leak window): everything from here on runs UNDER the lock that
761
- // `acquireLock()` just took. The single `try { } finally { unlinkSync(lock) }`
762
- // below now spans the budget-timer setup, `openStateDatabase()`, and the
763
- // `profileFilteredRefs` audit-event loop too — regions that previously sat in
764
- // the gap between the lock-acquire catch (above) and the main try. A throw in
765
- // any of them used to leak the lock (blocking the next improve up to 4h);
766
- // now the finally releases it exactly once. The dry-run path already returned
767
- // above without acquiring the lock, so it never reaches this finally; the
782
+ // #607: per-process locks are acquired/released around each stage below.
783
+ // The triage pre-pass already ran under triage.lock (released). The
784
+ // preparation stage runs under consolidate.lock, the loop stage under
785
+ // reflect-distill.lock, and the post-loop stage under consolidate.lock again.
786
+ // Each stage acquires its lock just before starting and releases in finally.
768
787
  // The backstop release in the `finally` below is best-effort and a no-op when no lock file exists.
769
788
  const startMs = Date.now();
770
789
  const budgetMs = options.timeoutMs ?? 2 * 60 * 60 * 1000; // default 2 hours
@@ -828,6 +847,10 @@ export async function akmImprove(options = {}) {
828
847
  },
829
848
  }, eventsCtx);
830
849
  }
850
+ // #607: acquire consolidate.lock for the preparation stage (consolidate,
851
+ // ensureIndex, extract all write index.db). Released immediately after.
852
+ const consolidateLPath = processLockPath(lockBaseDir, "consolidate");
853
+ const consolidatePrepAcquired = tryAcquireProcessLock(consolidateLPath, PROCESS_LOCK_DEFS.consolidate.staleAfterMs, options.skipIfLocked, "consolidate") === "acquired";
831
854
  const preparation = await runImprovePreparationStage({
832
855
  scope,
833
856
  options,
@@ -842,6 +865,8 @@ export async function akmImprove(options = {}) {
842
865
  initialCleanupWarnings: preEnsureCleanupWarnings,
843
866
  improveProfile,
844
867
  });
868
+ if (consolidatePrepAcquired)
869
+ releaseProcessLock(consolidateLPath);
845
870
  // D6: pre-load all proposal_rejected events from the last 30 days once,
846
871
  // so the per-asset loop can use a Map lookup instead of N DB round trips.
847
872
  const REJECTED_PROPOSAL_WINDOW_MS = daysToMs(30);
@@ -853,6 +878,10 @@ export async function akmImprove(options = {}) {
853
878
  rejectedProposalsByRef.set(e.ref, e);
854
879
  }
855
880
  }
881
+ // #607: acquire reflect-distill.lock for the loop stage (reflect + distill
882
+ // both write proposals to state.db). Released immediately after.
883
+ const reflectDistillLPath = processLockPath(lockBaseDir, "reflectDistill");
884
+ const reflectDistillAcquired = tryAcquireProcessLock(reflectDistillLPath, PROCESS_LOCK_DEFS.reflectDistill.staleAfterMs, options.skipIfLocked, "reflect-distill") === "acquired";
856
885
  const { reflectsWithErrorContext, memoryRefsForInference, gateAutoAcceptedCount: loopGateCount, gateAutoAcceptFailedCount: loopGateFailedCount, } = await runImproveLoopStage({
857
886
  scope,
858
887
  options,
@@ -872,9 +901,15 @@ export async function akmImprove(options = {}) {
872
901
  eventsCtx,
873
902
  improveProfile,
874
903
  });
904
+ if (reflectDistillAcquired)
905
+ releaseProcessLock(reflectDistillLPath);
875
906
  // #551: consolidation now runs in the preparation stage (before extract);
876
907
  // its result and run-flag are read from `preparation`, not the post-loop.
877
908
  const consolidation = preparation.consolidation;
909
+ // #607: acquire consolidate.lock for the post-loop stage (memoryInference +
910
+ // graphExtraction both write index.db). Released immediately after.
911
+ const consolidatePostLPath = processLockPath(lockBaseDir, "consolidate");
912
+ const consolidatePostAcquired = tryAcquireProcessLock(consolidatePostLPath, PROCESS_LOCK_DEFS.consolidate.staleAfterMs, options.skipIfLocked, "consolidate") === "acquired";
878
913
  const { allWarnings, deadUrls, memoryInference, graphExtraction, stalenessDetection, maintenanceActions, memoryInferenceDurationMs, graphExtractionDurationMs, orphansPurged, proposalsExpired, gateAutoAcceptedCount: postLoopGateCount, gateAutoAcceptFailedCount: postLoopGateFailedCount, } = await runImprovePostLoopStage({
879
914
  scope,
880
915
  options,
@@ -885,11 +920,12 @@ export async function akmImprove(options = {}) {
885
920
  memoryRefsForInference,
886
921
  reindexFn,
887
922
  eventsCtx,
888
- // O-1 (#364): propagate wall-clock budget signal to post-loop maintenance.
889
923
  budgetSignal: budgetAbortController.signal,
890
924
  improveProfile,
891
925
  consolidationRan: preparation.consolidationRan,
892
926
  });
927
+ if (consolidatePostAcquired)
928
+ releaseProcessLock(consolidatePostLPath);
893
929
  const finalActions = maintenanceActions && maintenanceActions.length > 0
894
930
  ? [...preparation.actions, ...maintenanceActions]
895
931
  : preparation.actions;
@@ -974,6 +1010,7 @@ export async function akmImprove(options = {}) {
974
1010
  },
975
1011
  }
976
1012
  : {}),
1013
+ ...(preparation.proactiveMaintenance ? { proactiveMaintenance: preparation.proactiveMaintenance } : {}),
977
1014
  ...(options.runId !== undefined ? { runId: options.runId } : {}),
978
1015
  };
979
1016
  if (!result.dryRun)
@@ -1056,15 +1093,12 @@ export async function akmImprove(options = {}) {
1056
1093
  // O-1 (#364): Clear the budget abort timer so it does not keep the event
1057
1094
  // loop alive after the run completes.
1058
1095
  clearBudgetTimer();
1059
- try {
1060
- fs.unlinkSync(resolvedLockPath);
1061
- }
1062
- catch {
1063
- // ignore
1064
- }
1065
- // The normal path released the lock above; drop the process.exit backstop so
1066
- // it does not fire later (or accumulate across repeated in-process calls).
1067
- process.removeListener("exit", releaseLockOnExit);
1096
+ // #607: release any per-process locks still held (backstop for error paths;
1097
+ // the normal path already released each lock after its stage completed).
1098
+ releaseAllProcessLocks();
1099
+ // Drop the process.exit backstop so it does not fire later (or accumulate
1100
+ // across repeated in-process calls).
1101
+ process.removeAllListeners("exit");
1068
1102
  // I1: close the long-lived state.db connection opened at the top of the run.
1069
1103
  try {
1070
1104
  eventsDb?.close();
@@ -1177,6 +1211,11 @@ function emitImproveCompletedEvent(result, durations, eventsCtx) {
1177
1211
  memoryInferenceDurationMs: durations.memoryInferenceDurationMs,
1178
1212
  graphExtractionExtractedFiles: result.graphExtraction?.quality.extractedFiles ?? 0,
1179
1213
  graphExtractionDurationMs: durations.graphExtractionDurationMs,
1214
+ // Layer-2 proactive-maintenance coverage (0 when the process is disabled
1215
+ // or the run was ref-scoped) so a scheduled sweep's reach is trackable.
1216
+ proactiveSelected: result.proactiveMaintenance?.selected ?? 0,
1217
+ proactiveDueTotal: result.proactiveMaintenance?.dueTotal ?? 0,
1218
+ proactiveNeverReflected: result.proactiveMaintenance?.neverReflected ?? 0,
1180
1219
  // New metrics for tuning the improve loop.
1181
1220
  ...(durations.totalDurationMs !== undefined ? { durationMs: durations.totalDurationMs } : {}),
1182
1221
  ...(durations.warningCount !== undefined ? { warningCount: durations.warningCount } : {}),
@@ -1422,7 +1461,14 @@ async function runConsolidationPass(args) {
1422
1461
  appendEvent({
1423
1462
  eventType: "consolidate_completed",
1424
1463
  ref: "memory:_consolidation",
1425
- metadata: { processed: consolidation.processed, merged: consolidation.merged },
1464
+ metadata: {
1465
+ processed: consolidation.processed,
1466
+ merged: consolidation.merged,
1467
+ deleted: consolidation.deleted,
1468
+ contradicted: consolidation.contradicted,
1469
+ failedChunks: consolidation.failedChunks ?? 0,
1470
+ durationMs: consolidation.durationMs,
1471
+ },
1426
1472
  }, eventsCtx);
1427
1473
  }
1428
1474
  }
@@ -1793,10 +1839,19 @@ async function runImprovePreparationStage(args) {
1793
1839
  // refs that fail the distill signal-delta gate).
1794
1840
  // distillOnlyRefs — reflect blocked but distill signal-delta passes
1795
1841
  // AND ref is a distill candidate.
1796
- // fullySkippedCount — neither gate passes → synthetic skip action
1797
- // + improve_skipped event, excluded from sort.
1842
+ // noFeedbackPool — neither signal-delta gate passes *and* the ref has
1843
+ // no recent feedback signal at all. These are NOT
1844
+ // skipped here: they are handed to the high-retrieval
1845
+ // fallback (P0-A) below so frequently-retrieved but
1846
+ // never-rated assets can still be improved. Only refs
1847
+ // that P0-A declines are ultimately fully skipped.
1848
+ // fullySkippedCount — has stale feedback but no signal delta → genuine
1849
+ // skip (counted, aggregated event emitted post-loop),
1850
+ // excluded from sort.
1798
1851
  const eligibleRefs = [];
1799
1852
  const distillOnlyRefs = [];
1853
+ // Zero-(recent-)feedback refs deferred to the P0-A high-retrieval fallback.
1854
+ const noFeedbackPool = [];
1800
1855
  let fullySkippedCount = 0;
1801
1856
  // O-2 (#365): explicit --scope <ref> bypasses every gate (user intent wins).
1802
1857
  const scopeRefBypass = scope.mode === "ref";
@@ -1834,22 +1889,59 @@ async function runImprovePreparationStage(args) {
1834
1889
  // Reflect blocked but distill passes → distill-only bucket.
1835
1890
  distillOnlyRefs.push(r);
1836
1891
  }
1892
+ else if (!latestFeedbackTs.has(r.ref)) {
1893
+ // Neither signal-delta gate passes AND there is no recent feedback signal
1894
+ // at all. Rather than skip outright, defer to the high-retrieval fallback
1895
+ // (P0-A) below: a never-rated-but-frequently-retrieved asset is exactly
1896
+ // what that path is meant to rescue. Refs P0-A declines are skipped there.
1897
+ noFeedbackPool.push(r);
1898
+ }
1837
1899
  else {
1838
- // Neither gate passes → fully skipped.
1900
+ // Has feedback on record but no signal delta since the last proposal —
1901
+ // genuinely fully skipped. Counted here; a single aggregated
1902
+ // improve_skipped event is emitted after the loop (mirrors
1903
+ // profile_filtered_all_passes) instead of one event per ref.
1839
1904
  fullySkippedCount++;
1840
1905
  actions.push({
1841
1906
  ref: r.ref,
1842
1907
  mode: "distill-skipped",
1843
1908
  result: { ok: true, reason: "no new signal since last proposal" },
1844
1909
  });
1845
- appendEvent({ eventType: "improve_skipped", ref: r.ref, metadata: { reason: "no_new_signal" } }, eventsCtx);
1846
1910
  }
1847
1911
  }
1912
+ // Emit ONE aggregated skip event for the fully-skipped bucket rather than one
1913
+ // improve_skipped event per ref (#592 pattern, mirrors
1914
+ // profile_filtered_all_passes above). The per-ref loop previously produced
1915
+ // ~11K state.db writes per run on a large stash, the dominant contributor to
1916
+ // 900 s timeouts. The in-memory `actions` log keeps the per-ref detail for the
1917
+ // run summary; no downstream consumer needs a per-ref DB audit trail (health's
1918
+ // skip histogram reads the `no_new_signal` counter from the count field).
1919
+ if (fullySkippedCount > 0) {
1920
+ appendEvent({
1921
+ eventType: "improve_skipped",
1922
+ ref: undefined,
1923
+ metadata: {
1924
+ reason: "no_new_signal",
1925
+ count: fullySkippedCount,
1926
+ },
1927
+ }, eventsCtx);
1928
+ }
1848
1929
  // ── Phase 4: signal/feedback/utility/sort on the reduced set ──────────────
1849
- // Everything from here works only on (eligibleRefs ∪ distillOnlyRefs). The
1850
- // fully-skipped bucket has already been routed and emitted; we deliberately
1851
- // avoid spending DB/CPU on refs that cannot enter the loop.
1930
+ // Everything from here works on (eligibleRefs ∪ distillOnlyRefs) plus the
1931
+ // deferred noFeedbackPool that may be rescued by the high-retrieval fallback
1932
+ // (P0-A). The fully-skipped bucket has already been routed and its aggregated
1933
+ // event emitted; we deliberately avoid spending DB/CPU on refs that the
1934
+ // signal-delta gate rejected with feedback already on record.
1852
1935
  const processableRefs = [...eligibleRefs, ...distillOnlyRefs];
1936
+ // Refs eligible for the high-retrieval fallback (P0-A): the signal-delta
1937
+ // partition above could not place these in a reflect/distill bucket, but they
1938
+ // may still qualify if they have been retrieved often enough. Two disjoint
1939
+ // sources feed this set:
1940
+ // 1. noFeedbackPool — refs with no recent feedback that the partition loop
1941
+ // deliberately deferred here (otherwise they would never reach P0-A).
1942
+ // 2. processableRefs entries that turn out to carry no recent feedback
1943
+ // *signal* once feedbackSummary is computed below.
1944
+ // (1) is added here; (2) is folded in after feedbackSummary is built.
1853
1945
  // Gap 6: only surface feedback signals from the last 30 days so that
1854
1946
  // ancient one-off feedback events don't permanently lock an asset into
1855
1947
  // every improve run. Assets with only stale signals fall through to the
@@ -1859,8 +1951,12 @@ async function runImprovePreparationStage(args) {
1859
1951
  // Pre-compute feedback summary per ref in a single pass so we don't issue
1860
1952
  // two readEvents({type:"feedback", ref}) per asset (one for signal filtering,
1861
1953
  // one for ratio computation).
1954
+ // Cover processableRefs *and* the deferred noFeedbackPool so utility/feedback
1955
+ // ratios are available for any noFeedbackPool ref that P0-A rescues below.
1862
1956
  const feedbackSummary = new Map();
1863
- for (const candidate of processableRefs) {
1957
+ for (const candidate of [...processableRefs, ...noFeedbackPool]) {
1958
+ if (feedbackSummary.has(candidate.ref))
1959
+ continue;
1864
1960
  const { events } = readEvents({ type: "feedback", ref: candidate.ref });
1865
1961
  let hasSignal = false;
1866
1962
  let positive = 0;
@@ -1883,8 +1979,21 @@ async function runImprovePreparationStage(args) {
1883
1979
  // P0-A: also surface zero-feedback assets that have been retrieved many times.
1884
1980
  const RETRIEVAL_COUNT_THRESHOLD = options.minRetrievalCount ?? 5;
1885
1981
  const signalBearingSet = new Set(signalFiltered.map((r) => r.ref));
1886
- const noFeedbackCandidates = processableRefs.filter((r) => !signalBearingSet.has(r.ref));
1982
+ // Zero-feedback candidates for P0-A: processableRefs without a recent signal,
1983
+ // plus the deferred noFeedbackPool. Dedupe by ref (the two sources are
1984
+ // disjoint by construction, but guard against overlap defensively).
1985
+ const noFeedbackSeen = new Set();
1986
+ const noFeedbackCandidates = [];
1987
+ for (const r of [...processableRefs.filter((r) => !signalBearingSet.has(r.ref)), ...noFeedbackPool]) {
1988
+ if (noFeedbackSeen.has(r.ref))
1989
+ continue;
1990
+ noFeedbackSeen.add(r.ref);
1991
+ noFeedbackCandidates.push(r);
1992
+ }
1887
1993
  let highRetrievalRefs = [];
1994
+ // Retrieval counts for the zero-feedback pool, hoisted so the Layer-2
1995
+ // proactive-maintenance selector below can reuse them without a second DB pass.
1996
+ let retrievalCounts = new Map();
1888
1997
  let dbForRetrieval;
1889
1998
  try {
1890
1999
  dbForRetrieval = openExistingDatabase();
@@ -1892,15 +2001,21 @@ async function runImprovePreparationStage(args) {
1892
2001
  if (showEventCount === 0) {
1893
2002
  warn("Warning: show events not yet in usage_events — zero-feedback fallback will match only search-retrieved assets.");
1894
2003
  }
1895
- const retrievalCounts = getRetrievalCounts(dbForRetrieval, noFeedbackCandidates.map((r) => r.ref));
2004
+ retrievalCounts = getRetrievalCounts(dbForRetrieval, noFeedbackCandidates.map((r) => r.ref));
1896
2005
  // High-retrieval signal-delta (simplified rule, 0.8.0): a no-feedback
1897
- // ref qualifies exactly once — when retrievalCount ≥ threshold AND no
1898
- // prior reflect proposal exists for it. Once a reflect proposal is on
1899
- // record, subsequent re-eligibility requires explicit feedback (which
1900
- // flows through the normal signal-delta gate above). Tracking growth in
1901
- // retrieval count would require persisting the count in proposal
1902
- // metadata; deferred to a follow-up.
1903
- highRetrievalRefs = noFeedbackCandidates.filter((r) => (retrievalCounts.get(r.ref) ?? 0) >= RETRIEVAL_COUNT_THRESHOLD && !lastReflectProposalTs.has(r.ref));
2006
+ // ref qualifies exactly once — when it has actually been retrieved
2007
+ // (retrievalCount ≥ 1) AND retrievalCount ≥ threshold AND no prior reflect
2008
+ // proposal exists for it. Once a reflect proposal is on record, subsequent
2009
+ // re-eligibility requires explicit feedback (which flows through the normal
2010
+ // signal-delta gate above). The explicit `> 0` guard keeps a threshold of 0
2011
+ // from rescuing genuinely never-retrieved assets — the fallback is for
2012
+ // *retrieved* assets, not silent ones. Tracking growth in retrieval count
2013
+ // would require persisting the count in proposal metadata; deferred to a
2014
+ // follow-up.
2015
+ highRetrievalRefs = noFeedbackCandidates.filter((r) => {
2016
+ const count = retrievalCounts.get(r.ref) ?? 0;
2017
+ return count > 0 && count >= RETRIEVAL_COUNT_THRESHOLD && !lastReflectProposalTs.has(r.ref);
2018
+ });
1904
2019
  }
1905
2020
  catch (err) {
1906
2021
  rethrowIfTestIsolationError(err);
@@ -1910,6 +2025,91 @@ async function runImprovePreparationStage(args) {
1910
2025
  if (dbForRetrieval)
1911
2026
  closeDatabase(dbForRetrieval);
1912
2027
  }
2028
+ // ── Layer 2: PROACTIVE MAINTENANCE SELECTOR (third eligibility source) ─────
2029
+ // The signal-delta gate and P0-A only surface assets with fresh feedback or a
2030
+ // raw-retrieval spike. Neither revisits a stable, high-value asset on a
2031
+ // schedule, so on a quiet stash useful assets drift stale and are never
2032
+ // refreshed. When the `proactiveMaintenance` process is enabled (DEFAULT OFF)
2033
+ // and the run is whole-stash / type scope, this selector ranks the eligible
2034
+ // population by a composite maintenance priority, gates on staleness ("due"),
2035
+ // bounds to top-N, and folds the winners into the SAME candidate set the other
2036
+ // two sources feed — so they flow through the existing #580 empty-diff /
2037
+ // cosmetic suppression and additive-distill gates. It adds no new mutation
2038
+ // logic of its own. The due gate doubles as the rotation cooldown: a freshly
2039
+ // reflected asset is excluded until it ages back past `dueDays`, so successive
2040
+ // runs rotate through the due pool rather than re-selecting the same heads.
2041
+ let proactiveRefs = [];
2042
+ let proactiveMaintenanceSummary;
2043
+ const proactiveEnabled = scope.mode !== "ref" && resolveProcessEnabled("proactiveMaintenance", improveProfile);
2044
+ if (proactiveEnabled) {
2045
+ const pmCfg = improveProfile.processes?.proactiveMaintenance;
2046
+ const dueDays = pmCfg?.dueDays ?? DEFAULT_DUE_DAYS;
2047
+ const maxPerRun = pmCfg?.maxPerRun ?? pmCfg?.limit ?? DEFAULT_MAX_PER_RUN;
2048
+ const importanceWeights = pmCfg?.importanceWeights;
2049
+ // Candidate population: the zero-feedback / non-signal pool — exactly the
2050
+ // assets the other two sources would NOT pick this run. Exclude any P0-A
2051
+ // rescued this run so we never double-select the same ref.
2052
+ const alreadySelected = new Set(highRetrievalRefs.map((r) => r.ref));
2053
+ const pmCandidates = noFeedbackCandidates.filter((r) => !alreadySelected.has(r.ref));
2054
+ const selection = selectProactiveMaintenanceRefs({
2055
+ candidates: pmCandidates,
2056
+ lastReflectTs: lastReflectProposalTs,
2057
+ lastDistillTs: lastDistillProposalTs,
2058
+ retrievalCounts,
2059
+ sizeBytesOf: (r) => {
2060
+ const fp = r.filePath;
2061
+ if (!fp)
2062
+ return undefined;
2063
+ try {
2064
+ return fs.statSync(fp).size;
2065
+ }
2066
+ catch {
2067
+ return undefined;
2068
+ }
2069
+ },
2070
+ dueDays,
2071
+ maxPerRun,
2072
+ importanceWeights,
2073
+ });
2074
+ proactiveRefs = selection.selected;
2075
+ proactiveMaintenanceSummary = {
2076
+ selected: selection.selected.length,
2077
+ dueTotal: selection.dueTotal,
2078
+ neverReflected: selection.neverReflected,
2079
+ };
2080
+ // Aggregated observability event (never per-ref — avoids the event flood the
2081
+ // Layer-1 work eliminated). Mirrors the `no_new_signal` aggregation pattern.
2082
+ appendEvent({
2083
+ eventType: "proactive_selected",
2084
+ ref: undefined,
2085
+ metadata: {
2086
+ count: selection.selected.length,
2087
+ dueTotal: selection.dueTotal,
2088
+ neverReflected: selection.neverReflected,
2089
+ },
2090
+ }, eventsCtx);
2091
+ if (selection.selected.length > 0) {
2092
+ info(`[improve] proactive maintenance selected ${selection.selected.length}/${selection.dueTotal} due refs ` +
2093
+ `(${selection.neverReflected} never reflected, dueDays=${dueDays}, maxPerRun=${maxPerRun})`);
2094
+ }
2095
+ }
2096
+ // Record an in-memory skip action for every zero-feedback ref that the
2097
+ // partition loop deferred to P0-A but P0-A then declined (retrievalCount below
2098
+ // threshold, or a prior reflect proposal already on record). These never make
2099
+ // it into mergedRefs, so without this they would silently vanish from the run
2100
+ // summary. No DB event is written here — these refs carry no signal at all, so
2101
+ // there is nothing for the skip histogram to aggregate; the action log alone
2102
+ // preserves the per-ref audit trail (mirrors the fully-skipped action above).
2103
+ const rescuedSet = new Set([...highRetrievalRefs, ...proactiveRefs].map((r) => r.ref));
2104
+ for (const r of noFeedbackPool) {
2105
+ if (rescuedSet.has(r.ref))
2106
+ continue;
2107
+ actions.push({
2108
+ ref: r.ref,
2109
+ mode: "distill-skipped",
2110
+ result: { ok: true, reason: "no new signal since last proposal" },
2111
+ });
2112
+ }
1913
2113
  // If the user explicitly scoped to a single ref, always act on it —
1914
2114
  // skip the signal/retrieval filter entirely. The filter exists to avoid
1915
2115
  // noisy "improve everything" runs; it should not gate an intentional
@@ -1919,8 +2119,48 @@ async function runImprovePreparationStage(args) {
1919
2119
  // or sufficient retrievals). A stash with no signals has 0 eligible refs —
1920
2120
  // usage is the gate. Run `akm feedback <ref> --positive` or retrieve assets
1921
2121
  // to bring them into the eligible pool.
1922
- const signalAndRetrievalRefs = [...signalFiltered, ...highRetrievalRefs];
2122
+ // Layer-2 proactive refs join the eligible set alongside feedback-signal and
2123
+ // high-retrieval (P0-A) refs. The three sources are disjoint by construction
2124
+ // (proactive draws from noFeedbackCandidates with the P0-A picks removed), but
2125
+ // dedupe defensively so a ref can never enter the loop twice. `requireFeedbackSignal`
2126
+ // still suppresses both fallback sources for callers that want feedback-only runs.
2127
+ const signalAndRetrievalRefs = dedupeRefs([...signalFiltered, ...highRetrievalRefs, ...proactiveRefs]);
1923
2128
  const mergedRefs = scope.mode === "ref" ? processableRefs : options.requireFeedbackSignal ? signalFiltered : signalAndRetrievalRefs;
2129
+ // ── Attribution tagging: stamp each ref with the eligibility lane that
2130
+ // selected it ──────────────────────────────────────────────────────────────
2131
+ // Every reflect/distill proposal must record WHICH lane chose its source asset
2132
+ // so downstream accept/reject/revert/retrieval outcomes can be sliced by lane
2133
+ // (does the PROACTIVE lane produce value vs the reactive lanes?). We build the
2134
+ // lane map here — the one place all four lanes are known — and stamp it onto
2135
+ // each ImproveEligibleRef object. Because the ref objects are shared by
2136
+ // reference across buckets, the stamp travels with the ref through the sort,
2137
+ // disk-check, and loop stages down to the reflect/distill event emit sites and
2138
+ // createProposal calls. See EligibilitySource for the lane vocabulary.
2139
+ //
2140
+ // Precedence (prefer the most specific reactive signal):
2141
+ // scope > signal-delta > high-retrieval > proactive
2142
+ // A ref with real feedback is attributed to feedback even if it was also due
2143
+ // for proactive maintenance. We apply lanes weakest-first so the strongest
2144
+ // overwrites; the explicit --scope <ref> bypass wins outright (user intent).
2145
+ const eligibilitySourceByRef = new Map();
2146
+ for (const r of proactiveRefs)
2147
+ eligibilitySourceByRef.set(r.ref, "proactive");
2148
+ for (const r of highRetrievalRefs)
2149
+ eligibilitySourceByRef.set(r.ref, "high-retrieval");
2150
+ for (const r of signalFiltered)
2151
+ eligibilitySourceByRef.set(r.ref, "signal-delta");
2152
+ if (scope.mode === "ref") {
2153
+ // O-2 (#365): explicit --scope <ref> bypass — every ref in processableRefs
2154
+ // arrived via the scopeRefBypass branch, so attribute the whole set to scope.
2155
+ for (const r of processableRefs)
2156
+ eligibilitySourceByRef.set(r.ref, "scope");
2157
+ }
2158
+ for (const r of mergedRefs) {
2159
+ // "unknown" is a genuine fallback, never a silent alias for signal-delta:
2160
+ // only refs we truly cannot attribute land here (none in practice, since
2161
+ // mergedRefs is always a subset of the four lanes above).
2162
+ r.eligibilitySource = eligibilitySourceByRef.get(r.ref) ?? "unknown";
2163
+ }
1924
2164
  const utilityMap = buildUtilityMap(mergedRefs);
1925
2165
  // Load feedback ratio per ref from the pre-computed summary (no extra DB pass).
1926
2166
  const feedbackRatios = new Map();
@@ -2061,6 +2301,7 @@ async function runImprovePreparationStage(args) {
2061
2301
  gateAutoAcceptFailedCount,
2062
2302
  consolidation: consolidationPass.consolidation,
2063
2303
  consolidationRan: consolidationPass.consolidationRan,
2304
+ ...(proactiveMaintenanceSummary ? { proactiveMaintenance: proactiveMaintenanceSummary } : {}),
2064
2305
  };
2065
2306
  }
2066
2307
  async function runImproveLoopStage(args) {
@@ -2237,6 +2478,9 @@ async function runImproveLoopStage(args) {
2237
2478
  eventSource: "improve",
2238
2479
  ...(reflectBudgetMs > 0 ? { timeoutMs: reflectBudgetMs } : {}),
2239
2480
  ...(reflectProfileRunner ? { runner: reflectProfileRunner } : {}),
2481
+ // Attribution: carry the eligibility lane so reflect stamps it on
2482
+ // the reflect_invoked event and the persisted proposal.
2483
+ ...(planned.eligibilitySource ? { eligibilitySource: planned.eligibilitySource } : {}),
2240
2484
  };
2241
2485
  // R-2 / #389: Self-consistency multi-sample voting for high-utility refs.
2242
2486
  // Self-Consistency arXiv:2203.11171 — N=3 samples beat single-shot quality.
@@ -2261,6 +2505,9 @@ async function runImproveLoopStage(args) {
2261
2505
  source: "reflect",
2262
2506
  sourceRun: `reflect-sc-${Date.now()}`,
2263
2507
  payload: winner.proposal.payload,
2508
+ // Attribution: the self-consistency path persists the winner here
2509
+ // (draftMode skips reflect's own createProposal), so stamp the lane.
2510
+ ...(planned.eligibilitySource ? { eligibilitySource: planned.eligibilitySource } : {}),
2264
2511
  });
2265
2512
  reflectResult = isProposalSkipped(persistResult)
2266
2513
  ? {
@@ -2459,6 +2706,9 @@ async function runImproveLoopStage(args) {
2459
2706
  ref: planned.ref,
2460
2707
  ...(parsedPlannedRef.type === "memory" ? { proposalKind: "auto" } : {}),
2461
2708
  ...(options.stashDir ? { stashDir: options.stashDir } : {}),
2709
+ // Attribution: carry the eligibility lane so distill stamps it on the
2710
+ // distill_invoked event and the persisted proposal.
2711
+ ...(planned.eligibilitySource ? { eligibilitySource: planned.eligibilitySource } : {}),
2462
2712
  }));
2463
2713
  actions.push({ ref: planned.ref, mode: "distill", result: distillResult });
2464
2714
  if (distillResult.outcome === "queued" && distillResult.proposal) {
@@ -2640,323 +2890,325 @@ export async function runImproveMaintenancePasses(args) {
2640
2890
  db = openIndexDb();
2641
2891
  }
2642
2892
  };
2643
- try {
2644
- db = openIndexDb();
2645
- // Memory inference candidate-discovery (post-Item 9 fix from
2646
- // memory:akm-improve-critical-review-2026-05-20). Previously this pass
2647
- // was gated on memoryRefsForInference.size > 0 AND passed those refs as a
2648
- // candidateRefs filter. But memoryRefsForInference is populated from refs
2649
- // distilled THIS RUN — by the time that happens, those parents are
2650
- // already split (`inferenceProcessed: true`) and `isPendingMemory` excludes
2651
- // them. The genuinely-pending parents in the stash never entered the
2652
- // filter. Result: 0/0/0 for 25 consecutive runs.
2653
- //
2654
- // Fix: always run the pass when the feature is enabled; let the pass's
2655
- // own `collectPendingMemories` + `isPendingMemory` predicate find
2656
- // candidates from the filesystem-of-truth. The this-run set is still
2657
- // logged as a hint but no longer used as a filter.
2658
- const memoryInferenceDisabledByProfile = improveProfile?.processes?.memoryInference?.enabled === false;
2659
- const minPendingCount = improveProfile?.processes?.memoryInference?.minPendingCount;
2660
- const pendingBelowMinCount = (() => {
2661
- if (!primaryStashDir || minPendingCount === undefined || minPendingCount <= 0)
2893
+ await withIndexWriterLease({ purpose: "improve-maintenance", signal: budgetSignal }, async () => {
2894
+ try {
2895
+ db = openIndexDb();
2896
+ // Memory inference candidate-discovery (post-Item 9 fix from
2897
+ // memory:akm-improve-critical-review-2026-05-20). Previously this pass
2898
+ // was gated on memoryRefsForInference.size > 0 AND passed those refs as a
2899
+ // candidateRefs filter. But memoryRefsForInference is populated from refs
2900
+ // distilled THIS RUN — by the time that happens, those parents are
2901
+ // already split (`inferenceProcessed: true`) and `isPendingMemory` excludes
2902
+ // them. The genuinely-pending parents in the stash never entered the
2903
+ // filter. Result: 0/0/0 for 25 consecutive runs.
2904
+ //
2905
+ // Fix: always run the pass when the feature is enabled; let the pass's
2906
+ // own `collectPendingMemories` + `isPendingMemory` predicate find
2907
+ // candidates from the filesystem-of-truth. The this-run set is still
2908
+ // logged as a hint but no longer used as a filter.
2909
+ const memoryInferenceDisabledByProfile = improveProfile?.processes?.memoryInference?.enabled === false;
2910
+ const minPendingCount = improveProfile?.processes?.memoryInference?.minPendingCount;
2911
+ const pendingBelowMinCount = (() => {
2912
+ if (!primaryStashDir || minPendingCount === undefined || minPendingCount <= 0)
2913
+ return false;
2914
+ const pending = collectPendingMemories(primaryStashDir).length;
2915
+ if (pending < minPendingCount) {
2916
+ info(`[improve] memory inference skipped (${pending} pending < minPendingCount ${minPendingCount})`);
2917
+ return true;
2918
+ }
2662
2919
  return false;
2663
- const pending = collectPendingMemories(primaryStashDir).length;
2664
- if (pending < minPendingCount) {
2665
- info(`[improve] memory inference skipped (${pending} pending < minPendingCount ${minPendingCount})`);
2666
- return true;
2920
+ })();
2921
+ if (memoryInferenceDisabledByProfile) {
2922
+ info("[improve] memory inference skipped (disabled by improve profile)");
2667
2923
  }
2668
- return false;
2669
- })();
2670
- if (memoryInferenceDisabledByProfile) {
2671
- info("[improve] memory inference skipped (disabled by improve profile)");
2672
- }
2673
- else if (pendingBelowMinCount) {
2674
- // skipped — message already emitted above
2675
- }
2676
- else {
2677
- const hintRefs = memoryRefsForInference.size;
2678
- info(hintRefs > 0
2679
- ? `[improve] memory inference starting (${hintRefs} hint refs touched this run; pass discovers all pending)`
2680
- : "[improve] memory inference starting (discovering pending parents)");
2681
- const inferenceStart = Date.now();
2682
- try {
2683
- // O-1 (#364): pass budget signal so a hung inference call is cancelled.
2684
- memoryInference = await withLlmStage("memory-inference", () => memoryInferenceFn({
2685
- config,
2686
- sources,
2687
- signal: budgetSignal,
2688
- db,
2689
- reEnrich: false,
2690
- onProgress: (event) => {
2691
- const current = event.currentRef ? ` ${event.currentRef}` : "";
2692
- info(`[improve] memory inference ${event.processed}/${event.total}${current} (written ${event.writtenFacts}, skipped ${event.skippedNoFacts})`);
2693
- },
2694
- }));
2695
- memoryInferenceDurationMs = Date.now() - inferenceStart;
2696
- actions.push({ ref: "memory:_inference", mode: "memory-inference", result: memoryInference });
2697
- info(`[improve] memory inference complete (${memoryInference.writtenFacts} facts written from ${memoryInference.splitParents} parents)`);
2924
+ else if (pendingBelowMinCount) {
2925
+ // skipped — message already emitted above
2698
2926
  }
2699
- catch (err) {
2700
- memoryInferenceDurationMs = Date.now() - inferenceStart;
2701
- allWarnings.push(`memory inference failed: ${err instanceof Error ? err.message : String(err)}`);
2927
+ else {
2928
+ const hintRefs = memoryRefsForInference.size;
2929
+ info(hintRefs > 0
2930
+ ? `[improve] memory inference starting (${hintRefs} hint refs touched this run; pass discovers all pending)`
2931
+ : "[improve] memory inference starting (discovering pending parents)");
2932
+ const inferenceStart = Date.now();
2933
+ try {
2934
+ // O-1 (#364): pass budget signal so a hung inference call is cancelled.
2935
+ memoryInference = await withLlmStage("memory-inference", () => memoryInferenceFn({
2936
+ config,
2937
+ sources,
2938
+ signal: budgetSignal,
2939
+ db,
2940
+ reEnrich: false,
2941
+ onProgress: (event) => {
2942
+ const current = event.currentRef ? ` ${event.currentRef}` : "";
2943
+ info(`[improve] memory inference ${event.processed}/${event.total}${current} (written ${event.writtenFacts}, skipped ${event.skippedNoFacts})`);
2944
+ },
2945
+ }));
2946
+ memoryInferenceDurationMs = Date.now() - inferenceStart;
2947
+ actions.push({ ref: "memory:_inference", mode: "memory-inference", result: memoryInference });
2948
+ info(`[improve] memory inference complete (${memoryInference.writtenFacts} facts written from ${memoryInference.splitParents} parents)`);
2949
+ }
2950
+ catch (err) {
2951
+ memoryInferenceDurationMs = Date.now() - inferenceStart;
2952
+ allWarnings.push(`memory inference failed: ${err instanceof Error ? err.message : String(err)}`);
2953
+ }
2702
2954
  }
2703
- }
2704
- if (memoryInference && (memoryInference.splitParents > 0 || memoryInference.writtenFacts > 0)) {
2705
- info("[improve] reindexing after memory inference writes");
2706
- try {
2707
- await reindexWithIndexDbReleased(primaryStashDir);
2708
- reindexedAfterInference = true;
2709
- info("[improve] reindex after memory inference complete");
2955
+ if (memoryInference && (memoryInference.splitParents > 0 || memoryInference.writtenFacts > 0)) {
2956
+ info("[improve] reindexing after memory inference writes");
2957
+ try {
2958
+ await reindexWithIndexDbReleased(primaryStashDir);
2959
+ reindexedAfterInference = true;
2960
+ info("[improve] reindex after memory inference complete");
2961
+ }
2962
+ catch (err) {
2963
+ allWarnings.push(`reindex after memory inference failed: ${err instanceof Error ? err.message : String(err)}`);
2964
+ }
2710
2965
  }
2711
- catch (err) {
2712
- allWarnings.push(`reindex after memory inference failed: ${err instanceof Error ? err.message : String(err)}`);
2966
+ const graphEnabled = isProcessEnabled("index", "graph_extraction", config);
2967
+ const graphExtractionDisabledByProfile = improveProfile?.processes?.graphExtraction?.enabled === false;
2968
+ const graphExtractionFullScan = improveProfile?.processes?.graphExtraction?.fullScan === true;
2969
+ // Build the set of refs actually touched this run.
2970
+ const touchedRefs = new Set();
2971
+ for (const r of args.actionableRefs)
2972
+ touchedRefs.add(r.ref);
2973
+ for (const r of memoryRefsForInference)
2974
+ touchedRefs.add(r);
2975
+ // INVARIANT: graph extraction normally runs only on files touched by
2976
+ // actionable refs (candidatePaths). Full-corpus scans are opt-in via
2977
+ // profile.processes.graphExtraction.fullScan = true (used by the
2978
+ // `graph-refresh` built-in profile and its weekly scheduled task).
2979
+ // The empty-Set fallback is intentional when no refs were touched —
2980
+ // the extractor's filter rejects every file and returns empty, keeping
2981
+ // the pass invoked so the action is recorded and tests stay exercised.
2982
+ if (graphExtractionDisabledByProfile) {
2983
+ info("[improve] graph extraction skipped (disabled by improve profile)");
2713
2984
  }
2714
- }
2715
- const graphEnabled = isProcessEnabled("index", "graph_extraction", config);
2716
- const graphExtractionDisabledByProfile = improveProfile?.processes?.graphExtraction?.enabled === false;
2717
- const graphExtractionFullScan = improveProfile?.processes?.graphExtraction?.fullScan === true;
2718
- // Build the set of refs actually touched this run.
2719
- const touchedRefs = new Set();
2720
- for (const r of args.actionableRefs)
2721
- touchedRefs.add(r.ref);
2722
- for (const r of memoryRefsForInference)
2723
- touchedRefs.add(r);
2724
- // INVARIANT: graph extraction normally runs only on files touched by
2725
- // actionable refs (candidatePaths). Full-corpus scans are opt-in via
2726
- // profile.processes.graphExtraction.fullScan = true (used by the
2727
- // `graph-refresh` built-in profile and its weekly scheduled task).
2728
- // The empty-Set fallback is intentional when no refs were touched —
2729
- // the extractor's filter rejects every file and returns empty, keeping
2730
- // the pass invoked so the action is recorded and tests stay exercised.
2731
- if (graphExtractionDisabledByProfile) {
2732
- info("[improve] graph extraction skipped (disabled by improve profile)");
2733
- }
2734
- else if (sources.length > 0 && graphEnabled) {
2735
- info(`[improve] graph extraction starting${graphExtractionFullScan ? " (full-corpus scan)" : ""}`);
2736
- const extractionStart = Date.now();
2737
- try {
2738
- // D9: if consolidation ran but memory inference did not reindex, force a reindex
2739
- // so graph extraction sees current DB state after consolidation writes.
2740
- if (consolidationRan && !reindexedAfterInference) {
2741
- info("[improve] reindexing after consolidation (graph extraction needs current state)");
2742
- try {
2743
- await reindexWithIndexDbReleased(primaryStashDir);
2744
- reindexedAfterInference = true;
2745
- info("[improve] reindex after consolidation complete");
2746
- }
2747
- catch (err) {
2748
- allWarnings.push(`reindex after consolidation failed: ${err instanceof Error ? err.message : String(err)}`);
2985
+ else if (sources.length > 0 && graphEnabled) {
2986
+ info(`[improve] graph extraction starting${graphExtractionFullScan ? " (full-corpus scan)" : ""}`);
2987
+ const extractionStart = Date.now();
2988
+ try {
2989
+ // D9: if consolidation ran but memory inference did not reindex, force a reindex
2990
+ // so graph extraction sees current DB state after consolidation writes.
2991
+ if (consolidationRan && !reindexedAfterInference) {
2992
+ info("[improve] reindexing after consolidation (graph extraction needs current state)");
2993
+ try {
2994
+ await reindexWithIndexDbReleased(primaryStashDir);
2995
+ reindexedAfterInference = true;
2996
+ info("[improve] reindex after consolidation complete");
2997
+ }
2998
+ catch (err) {
2999
+ allWarnings.push(`reindex after consolidation failed: ${err instanceof Error ? err.message : String(err)}`);
3000
+ }
2749
3001
  }
2750
- }
2751
- // #584: no close/reopen needed here — reindexWithIndexDbReleased
2752
- // already swapped in a fresh post-reindex handle.
2753
- // Resolve touched refs to absolute file paths. Skipped for fullScan
2754
- // (candidatePaths stays undefined → extractor processes all files).
2755
- let candidatePaths;
2756
- if (!graphExtractionFullScan) {
2757
- candidatePaths = new Set();
2758
- if (primaryStashDir && touchedRefs.size > 0) {
2759
- const writableDirSet = new Set(getWritableStashDirs(primaryStashDir).map((d) => path.resolve(d)));
2760
- const resolved = await Promise.all([...touchedRefs].map((ref) => findAssetFilePath(ref, primaryStashDir, writableDirSet).catch(() => null)));
2761
- for (const p of resolved) {
2762
- if (typeof p === "string" && p.length > 0)
2763
- candidatePaths.add(p);
3002
+ // #584: no close/reopen needed here — reindexWithIndexDbReleased
3003
+ // already swapped in a fresh post-reindex handle.
3004
+ // Resolve touched refs to absolute file paths. Skipped for fullScan
3005
+ // (candidatePaths stays undefined → extractor processes all files).
3006
+ let candidatePaths;
3007
+ if (!graphExtractionFullScan) {
3008
+ candidatePaths = new Set();
3009
+ if (primaryStashDir && touchedRefs.size > 0) {
3010
+ const writableDirSet = new Set(getWritableStashDirs(primaryStashDir).map((d) => path.resolve(d)));
3011
+ const resolved = await Promise.all([...touchedRefs].map((ref) => findAssetFilePath(ref, primaryStashDir, writableDirSet).catch(() => null)));
3012
+ for (const p of resolved) {
3013
+ if (typeof p === "string" && p.length > 0)
3014
+ candidatePaths.add(p);
3015
+ }
2764
3016
  }
2765
3017
  }
3018
+ const progressHandler = (event) => {
3019
+ const current = event.currentPath ? ` ${path.basename(event.currentPath)}` : "";
3020
+ info(`[improve] graph extraction ${event.processed}/${event.total}${current} (extracted ${event.extracted}, entities ${event.totalEntities}, relations ${event.totalRelations})`);
3021
+ };
3022
+ // O-1 (#364): pass budget signal so a hung graph extraction call is cancelled.
3023
+ graphExtraction = await withLlmStage("graph-extraction", () => graphExtractionFn({
3024
+ config,
3025
+ sources,
3026
+ signal: budgetSignal,
3027
+ db,
3028
+ reEnrich: false,
3029
+ onProgress: progressHandler,
3030
+ options: { candidatePaths },
3031
+ }));
3032
+ graphExtractionDurationMs = Date.now() - extractionStart;
3033
+ actions.push({ ref: "graph:_artifact", mode: "graph-extraction", result: graphExtraction });
3034
+ info(`[improve] graph extraction complete (${graphExtraction.quality.extractedFiles} files, ${graphExtraction.quality.entityCount} entities, ${graphExtraction.quality.relationCount} relations)`);
2766
3035
  }
2767
- const progressHandler = (event) => {
2768
- const current = event.currentPath ? ` ${path.basename(event.currentPath)}` : "";
2769
- info(`[improve] graph extraction ${event.processed}/${event.total}${current} (extracted ${event.extracted}, entities ${event.totalEntities}, relations ${event.totalRelations})`);
2770
- };
2771
- // O-1 (#364): pass budget signal so a hung graph extraction call is cancelled.
2772
- graphExtraction = await withLlmStage("graph-extraction", () => graphExtractionFn({
2773
- config,
2774
- sources,
2775
- signal: budgetSignal,
2776
- db,
2777
- reEnrich: false,
2778
- onProgress: progressHandler,
2779
- options: { candidatePaths },
2780
- }));
2781
- graphExtractionDurationMs = Date.now() - extractionStart;
2782
- actions.push({ ref: "graph:_artifact", mode: "graph-extraction", result: graphExtraction });
2783
- info(`[improve] graph extraction complete (${graphExtraction.quality.extractedFiles} files, ${graphExtraction.quality.entityCount} entities, ${graphExtraction.quality.relationCount} relations)`);
2784
- }
2785
- catch (err) {
2786
- graphExtractionDurationMs = Date.now() - extractionStart;
2787
- allWarnings.push(`graph extraction failed: ${err instanceof Error ? err.message : String(err)}`);
2788
- }
2789
- }
2790
- else if (sources.length > 0 && !graphEnabled) {
2791
- info("[improve] graph extraction skipped (features.index.graph_extraction is disabled)");
2792
- }
2793
- // Orphan proposal purge — reject pending reflect proposals whose target
2794
- // asset no longer exists on disk. Runs after graph extraction so newly
2795
- // promoted assets from accept flows during this run are already present.
2796
- if (primaryStashDir) {
2797
- try {
2798
- const purgeResult = purgeOrphanProposals(primaryStashDir, sources.map((s) => s.path));
2799
- orphansPurged = purgeResult.rejected;
2800
- if (purgeResult.rejected > 0) {
2801
- info(`[improve] orphan purge: ${purgeResult.rejected}/${purgeResult.checked} orphaned proposals rejected (${purgeResult.durationMs}ms)`);
2802
- }
2803
- appendEvent({
2804
- eventType: "proposal_orphan_purge",
2805
- ref: "proposals:_orphan-purge",
2806
- metadata: {
2807
- checked: purgeResult.checked,
2808
- rejected: purgeResult.rejected,
2809
- durationMs: purgeResult.durationMs,
2810
- byType: purgeResult.byType,
2811
- orphans: purgeResult.orphans.map((o) => o.ref),
2812
- },
2813
- }, eventsCtx);
2814
- }
2815
- catch (err) {
2816
- allWarnings.push(`orphan purge failed: ${err instanceof Error ? err.message : String(err)}`);
2817
- }
2818
- // Phase 6B (Advantage D6b): expire pending proposals that have aged past
2819
- // the retention window. Runs AFTER orphan purge so we never double-archive
2820
- // a proposal that orphan-purge already moved. `expireStaleProposals` emits
2821
- // its own per-proposal `proposal_expired` events; we additionally emit a
2822
- // single roll-up event here for parity with the orphan-purge surface.
2823
- try {
2824
- const expireResult = expireStaleProposals(primaryStashDir, config);
2825
- proposalsExpired = expireResult.expired;
2826
- if (expireResult.expired > 0) {
2827
- info(`[improve] expiration: ${expireResult.expired}/${expireResult.checked} pending proposals expired ` +
2828
- `(retention=${expireResult.retentionDays}d, ${expireResult.durationMs}ms)`);
3036
+ catch (err) {
3037
+ graphExtractionDurationMs = Date.now() - extractionStart;
3038
+ allWarnings.push(`graph extraction failed: ${err instanceof Error ? err.message : String(err)}`);
2829
3039
  }
2830
- appendEvent({
2831
- eventType: "proposal_expiration_pass",
2832
- ref: "proposals:_expiration",
2833
- metadata: {
2834
- checked: expireResult.checked,
2835
- expired: expireResult.expired,
2836
- durationMs: expireResult.durationMs,
2837
- retentionDays: expireResult.retentionDays,
2838
- expiredProposals: expireResult.expiredProposals,
2839
- },
2840
- }, eventsCtx);
2841
3040
  }
2842
- catch (err) {
2843
- allWarnings.push(`proposal expiration failed: ${err instanceof Error ? err.message : String(err)}`);
3041
+ else if (sources.length > 0 && !graphEnabled) {
3042
+ info("[improve] graph extraction skipped (features.index.graph_extraction is disabled)");
2844
3043
  }
2845
- }
2846
- // Fix #2 (observability 0.8.0): trim the events table in state.db so it
2847
- // doesn't grow unbounded. `akm health` writes a `health_probe` row on every
2848
- // invocation, and every command surface emits at least one event besides —
2849
- // without this trim, state.db is a permanent append-only log. Config key
2850
- // `improve.eventRetentionDays` (default 90, set 0 to disable) controls the
2851
- // window. The purge runs against state.db (a different SQLite file from
2852
- // the index `db` above).
2853
- {
2854
- const retentionDays = typeof config.improve?.eventRetentionDays === "number" ? config.improve.eventRetentionDays : 90;
2855
- if (retentionDays > 0) {
2856
- // #585: reuse the long-lived eventsCtx.db connection when akmImprove
2857
- // opened one — opening a second state.db write connection while
2858
- // eventsDb is still live made two simultaneous writers contend on the
2859
- // same WAL file ("database is locked"). Only the eventsCtx.dbPath
2860
- // fallback path (state.db failed to open up-front) opens — and then
2861
- // owns and closes — its own handle. C2 still holds: the fallback uses
2862
- // the boundary-pinned path, never a live `process.env` re-read.
2863
- const ownsStateDb = !eventsCtx?.db;
2864
- let stateDb;
3044
+ // Orphan proposal purge — reject pending reflect proposals whose target
3045
+ // asset no longer exists on disk. Runs after graph extraction so newly
3046
+ // promoted assets from accept flows during this run are already present.
3047
+ if (primaryStashDir) {
2865
3048
  try {
2866
- stateDb = eventsCtx?.db ?? openStateDatabase(eventsCtx?.dbPath);
2867
- const purgedCount = purgeOldEvents(stateDb, retentionDays);
2868
- if (purgedCount > 0) {
2869
- info(`[improve] events purge: ${purgedCount} event(s) older than ${retentionDays}d removed from state.db`);
2870
- }
2871
- appendEvent({
2872
- eventType: "events_purged",
2873
- ref: "events:_purge",
2874
- metadata: { purgedCount, retentionDays },
2875
- }, eventsCtx);
2876
- // improve_runs uses the same retention window as events — both are
2877
- // observability/audit data, both grow append-only, both have a
2878
- // dedicated purge helper. Mirroring the events purge here means a
2879
- // single retention knob (improve.eventRetentionDays) governs both.
2880
- const improveRunsPurged = purgeOldImproveRuns(stateDb, retentionDays);
2881
- if (improveRunsPurged > 0) {
2882
- info(`[improve] improve_runs purge: ${improveRunsPurged} run(s) older than ${retentionDays}d removed from state.db`);
3049
+ const purgeResult = purgeOrphanProposals(primaryStashDir, sources.map((s) => s.path));
3050
+ orphansPurged = purgeResult.rejected;
3051
+ if (purgeResult.rejected > 0) {
3052
+ info(`[improve] orphan purge: ${purgeResult.rejected}/${purgeResult.checked} orphaned proposals rejected (${purgeResult.durationMs}ms)`);
2883
3053
  }
2884
3054
  appendEvent({
2885
- eventType: "improve_runs_purged",
2886
- ref: "improve_runs:_purge",
2887
- metadata: { purgedCount: improveRunsPurged, retentionDays },
3055
+ eventType: "proposal_orphan_purge",
3056
+ ref: "proposals:_orphan-purge",
3057
+ metadata: {
3058
+ checked: purgeResult.checked,
3059
+ rejected: purgeResult.rejected,
3060
+ durationMs: purgeResult.durationMs,
3061
+ byType: purgeResult.byType,
3062
+ orphans: purgeResult.orphans.map((o) => o.ref),
3063
+ },
2888
3064
  }, eventsCtx);
2889
3065
  }
2890
3066
  catch (err) {
2891
- allWarnings.push(`events purge failed: ${err instanceof Error ? err.message : String(err)}`);
2892
- }
2893
- finally {
2894
- if (ownsStateDb && stateDb) {
2895
- try {
2896
- stateDb.close();
2897
- }
2898
- catch {
2899
- // best-effort
2900
- }
2901
- }
3067
+ allWarnings.push(`orphan purge failed: ${err instanceof Error ? err.message : String(err)}`);
2902
3068
  }
2903
- // task_logs in logs.db (#579) shares the same retention window as
2904
- // events/improve_runs — all three are observability data governed by
2905
- // the single improve.eventRetentionDays knob. Separate try/finally
2906
- // because logs.db is a different file: a locked/missing logs.db must
2907
- // not block the state.db purges above.
2908
- let logsDb;
3069
+ // Phase 6B (Advantage D6b): expire pending proposals that have aged past
3070
+ // the retention window. Runs AFTER orphan purge so we never double-archive
3071
+ // a proposal that orphan-purge already moved. `expireStaleProposals` emits
3072
+ // its own per-proposal `proposal_expired` events; we additionally emit a
3073
+ // single roll-up event here for parity with the orphan-purge surface.
2909
3074
  try {
2910
- logsDb = openLogsDatabase();
2911
- const taskLogsPurged = purgeOldTaskLogs(logsDb, retentionDays);
2912
- if (taskLogsPurged > 0) {
2913
- info(`[improve] task_logs purge: ${taskLogsPurged} log line(s) older than ${retentionDays}d removed from logs.db`);
3075
+ const expireResult = expireStaleProposals(primaryStashDir, config);
3076
+ proposalsExpired = expireResult.expired;
3077
+ if (expireResult.expired > 0) {
3078
+ info(`[improve] expiration: ${expireResult.expired}/${expireResult.checked} pending proposals expired ` +
3079
+ `(retention=${expireResult.retentionDays}d, ${expireResult.durationMs}ms)`);
2914
3080
  }
2915
3081
  appendEvent({
2916
- eventType: "task_logs_purged",
2917
- ref: "task_logs:_purge",
2918
- metadata: { purgedCount: taskLogsPurged, retentionDays },
3082
+ eventType: "proposal_expiration_pass",
3083
+ ref: "proposals:_expiration",
3084
+ metadata: {
3085
+ checked: expireResult.checked,
3086
+ expired: expireResult.expired,
3087
+ durationMs: expireResult.durationMs,
3088
+ retentionDays: expireResult.retentionDays,
3089
+ expiredProposals: expireResult.expiredProposals,
3090
+ },
2919
3091
  }, eventsCtx);
2920
3092
  }
2921
3093
  catch (err) {
2922
- allWarnings.push(`task_logs purge failed: ${err instanceof Error ? err.message : String(err)}`);
3094
+ allWarnings.push(`proposal expiration failed: ${err instanceof Error ? err.message : String(err)}`);
2923
3095
  }
2924
- finally {
2925
- if (logsDb) {
2926
- try {
2927
- logsDb.close();
3096
+ }
3097
+ // Fix #2 (observability 0.8.0): trim the events table in state.db so it
3098
+ // doesn't grow unbounded. `akm health` writes a `health_probe` row on every
3099
+ // invocation, and every command surface emits at least one event besides —
3100
+ // without this trim, state.db is a permanent append-only log. Config key
3101
+ // `improve.eventRetentionDays` (default 90, set 0 to disable) controls the
3102
+ // window. The purge runs against state.db (a different SQLite file from
3103
+ // the index `db` above).
3104
+ {
3105
+ const retentionDays = typeof config.improve?.eventRetentionDays === "number" ? config.improve.eventRetentionDays : 90;
3106
+ if (retentionDays > 0) {
3107
+ // #585: reuse the long-lived eventsCtx.db connection when akmImprove
3108
+ // opened one — opening a second state.db write connection while
3109
+ // eventsDb is still live made two simultaneous writers contend on the
3110
+ // same WAL file ("database is locked"). Only the eventsCtx.dbPath
3111
+ // fallback path (state.db failed to open up-front) opens — and then
3112
+ // owns and closes — its own handle. C2 still holds: the fallback uses
3113
+ // the boundary-pinned path, never a live `process.env` re-read.
3114
+ const ownsStateDb = !eventsCtx?.db;
3115
+ let stateDb;
3116
+ try {
3117
+ stateDb = eventsCtx?.db ?? openStateDatabase(eventsCtx?.dbPath);
3118
+ const purgedCount = purgeOldEvents(stateDb, retentionDays);
3119
+ if (purgedCount > 0) {
3120
+ info(`[improve] events purge: ${purgedCount} event(s) older than ${retentionDays}d removed from state.db`);
2928
3121
  }
2929
- catch {
2930
- // best-effort
3122
+ appendEvent({
3123
+ eventType: "events_purged",
3124
+ ref: "events:_purge",
3125
+ metadata: { purgedCount, retentionDays },
3126
+ }, eventsCtx);
3127
+ // improve_runs uses the same retention window as events — both are
3128
+ // observability/audit data, both grow append-only, both have a
3129
+ // dedicated purge helper. Mirroring the events purge here means a
3130
+ // single retention knob (improve.eventRetentionDays) governs both.
3131
+ const improveRunsPurged = purgeOldImproveRuns(stateDb, retentionDays);
3132
+ if (improveRunsPurged > 0) {
3133
+ info(`[improve] improve_runs purge: ${improveRunsPurged} run(s) older than ${retentionDays}d removed from state.db`);
3134
+ }
3135
+ appendEvent({
3136
+ eventType: "improve_runs_purged",
3137
+ ref: "improve_runs:_purge",
3138
+ metadata: { purgedCount: improveRunsPurged, retentionDays },
3139
+ }, eventsCtx);
3140
+ }
3141
+ catch (err) {
3142
+ allWarnings.push(`events purge failed: ${err instanceof Error ? err.message : String(err)}`);
3143
+ }
3144
+ finally {
3145
+ if (ownsStateDb && stateDb) {
3146
+ try {
3147
+ stateDb.close();
3148
+ }
3149
+ catch {
3150
+ // best-effort
3151
+ }
3152
+ }
3153
+ }
3154
+ // task_logs in logs.db (#579) shares the same retention window as
3155
+ // events/improve_runs — all three are observability data governed by
3156
+ // the single improve.eventRetentionDays knob. Separate try/finally
3157
+ // because logs.db is a different file: a locked/missing logs.db must
3158
+ // not block the state.db purges above.
3159
+ let logsDb;
3160
+ try {
3161
+ logsDb = openLogsDatabase();
3162
+ const taskLogsPurged = purgeOldTaskLogs(logsDb, retentionDays);
3163
+ if (taskLogsPurged > 0) {
3164
+ info(`[improve] task_logs purge: ${taskLogsPurged} log line(s) older than ${retentionDays}d removed from logs.db`);
3165
+ }
3166
+ appendEvent({
3167
+ eventType: "task_logs_purged",
3168
+ ref: "task_logs:_purge",
3169
+ metadata: { purgedCount: taskLogsPurged, retentionDays },
3170
+ }, eventsCtx);
3171
+ }
3172
+ catch (err) {
3173
+ allWarnings.push(`task_logs purge failed: ${err instanceof Error ? err.message : String(err)}`);
3174
+ }
3175
+ finally {
3176
+ if (logsDb) {
3177
+ try {
3178
+ logsDb.close();
3179
+ }
3180
+ catch {
3181
+ // best-effort
3182
+ }
2931
3183
  }
2932
3184
  }
2933
3185
  }
2934
3186
  }
2935
- }
2936
- // Phase 4A (staleness detection). Activates the `deprecated` belief-state
2937
- // machinery shipped in Phase 1A. Default OFF gated by
2938
- // `features.index.staleness_detection.enabled`. Runs after orphan purge
2939
- // and before the URL check (which lives in the outer caller).
2940
- if (sources.length > 0) {
2941
- try {
2942
- stalenessDetection = await withLlmStage("staleness-detection", () => stalenessDetectionFn({ config, sources, signal: budgetSignal, db }));
2943
- if (stalenessDetection.considered > 0) {
2944
- info(`[improve] staleness detection complete (considered ${stalenessDetection.considered}, ` +
2945
- `deprecated ${stalenessDetection.deprecated}, confirmed ${stalenessDetection.confirmed}, ` +
2946
- `skipped ${stalenessDetection.skipped}, ${stalenessDetection.durationMs}ms)`);
3187
+ // Phase 4A (staleness detection). Activates the `deprecated` belief-state
3188
+ // machinery shipped in Phase 1A. Default OFF gated by
3189
+ // `features.index.staleness_detection.enabled`. Runs after orphan purge
3190
+ // and before the URL check (which lives in the outer caller).
3191
+ if (sources.length > 0) {
3192
+ try {
3193
+ stalenessDetection = await withLlmStage("staleness-detection", () => stalenessDetectionFn({ config, sources, signal: budgetSignal, db }));
3194
+ if (stalenessDetection.considered > 0) {
3195
+ info(`[improve] staleness detection complete (considered ${stalenessDetection.considered}, ` +
3196
+ `deprecated ${stalenessDetection.deprecated}, confirmed ${stalenessDetection.confirmed}, ` +
3197
+ `skipped ${stalenessDetection.skipped}, ${stalenessDetection.durationMs}ms)`);
3198
+ }
3199
+ for (const w of stalenessDetection.warnings)
3200
+ allWarnings.push(`[improve] staleness detection: ${w}`);
3201
+ }
3202
+ catch (err) {
3203
+ allWarnings.push(`staleness detection failed: ${err instanceof Error ? err.message : String(err)}`);
2947
3204
  }
2948
- for (const w of stalenessDetection.warnings)
2949
- allWarnings.push(`[improve] staleness detection: ${w}`);
2950
- }
2951
- catch (err) {
2952
- allWarnings.push(`staleness detection failed: ${err instanceof Error ? err.message : String(err)}`);
2953
3205
  }
2954
3206
  }
2955
- }
2956
- finally {
2957
- if (db)
2958
- closeDatabase(db);
2959
- }
3207
+ finally {
3208
+ if (db)
3209
+ closeDatabase(db);
3210
+ }
3211
+ });
2960
3212
  return {
2961
3213
  ...(memoryInference ? { memoryInference } : {}),
2962
3214
  ...(graphExtraction ? { graphExtraction } : {}),