akm-cli 0.9.0-beta.2 → 0.9.0-beta.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. package/CHANGELOG.md +87 -0
  2. package/dist/assets/templates/html/default.html +78 -0
  3. package/dist/assets/templates/html/health.html +560 -0
  4. package/dist/assets/templates/html/vendor/echarts.min.js +45 -0
  5. package/dist/cli/shared.js +21 -5
  6. package/dist/cli.js +36 -5
  7. package/dist/commands/health/html-report.js +448 -0
  8. package/dist/commands/health.js +97 -6
  9. package/dist/commands/improve/extract.js +38 -2
  10. package/dist/commands/improve/improve-auto-accept.js +27 -1
  11. package/dist/commands/improve/improve.js +167 -53
  12. package/dist/commands/improve/reflect-noise.js +0 -0
  13. package/dist/commands/improve/reflect.js +25 -0
  14. package/dist/commands/proposal/drain.js +73 -6
  15. package/dist/commands/proposal/proposal-cli.js +22 -10
  16. package/dist/commands/proposal/proposal.js +12 -1
  17. package/dist/commands/proposal/validators/proposals.js +361 -338
  18. package/dist/commands/remember.js +6 -2
  19. package/dist/core/config/config-schema.js +5 -0
  20. package/dist/core/logs-db.js +304 -0
  21. package/dist/core/state-db.js +107 -14
  22. package/dist/indexer/db/db.js +2 -2
  23. package/dist/indexer/passes/memory-inference.js +61 -22
  24. package/dist/integrations/harnesses/claude/session-log.js +16 -4
  25. package/dist/llm/client.js +15 -0
  26. package/dist/llm/usage-persist.js +77 -0
  27. package/dist/llm/usage-telemetry.js +103 -0
  28. package/dist/output/context.js +3 -2
  29. package/dist/output/html-render.js +73 -0
  30. package/dist/output/shapes/helpers.js +17 -1
  31. package/dist/output/text/helpers.js +69 -1
  32. package/dist/scripts/migrate-storage.js +65 -14
  33. package/dist/scripts/migrations/import-fs-improve-runs-to-db.js +14 -2
  34. package/dist/tasks/runner.js +99 -16
  35. package/dist/workflows/db.js +4 -0
  36. package/package.json +1 -1
@@ -4,11 +4,13 @@
4
4
  import fs from "node:fs";
5
5
  import { ConfigError, UsageError } from "../core/errors.js";
6
6
  import { appendEvent, readEvents } from "../core/events.js";
7
+ import { buildTaskRunId, getLoggedRunIds, openLogsDatabase } from "../core/logs-db.js";
7
8
  import { getStateDbPathInDataDir } from "../core/paths.js";
8
9
  import { listExistingTableNames, openStateDatabase, queryCompletedTaskIntervals, queryImproveRuns, queryTaskHistory, } from "../core/state-db.js";
9
10
  import { parseSinceToIso } from "../core/time.js";
10
11
  import { readSemanticStatus } from "../indexer/search/semantic-status.js";
11
12
  import { getExecutionLogCandidates } from "../integrations/session-logs/index.js";
13
+ import { LLM_USAGE_EVENT } from "../llm/usage-persist.js";
12
14
  import { HEALTH_CHECKS } from "./health/checks.js";
13
15
  const DEFAULT_SINCE_MS = 24 * 60 * 60 * 1000;
14
16
  const IMPROVE_COMPLETED_EVENT = "improve_completed";
@@ -882,14 +884,84 @@ function computeDeltas(first, last) {
882
884
  }
883
885
  return out;
884
886
  }
885
- function buildWindowMetrics(db, stateDbPath, since, until, now = () => Date.now()) {
887
/**
 * Split task_history rows into "should have a log" and "log is actually backed".
 *
 * A row should have a log when its `log_path` is not null. Such a row counts as
 * backed when logs.db reports its run id (#579 — the DB is the primary record)
 * or, failing that, when the file at `log_path` exists on disk (the
 * transitional check for rows written before logs.db existed).
 *
 * @param {Array<object>} taskRows Rows exposing `task_id`, `started_at` and `log_path`.
 * @param {object} [logsDb] Open logs.db handle. When falsy (logs.db could not
 *   be opened) only the on-disk file check runs.
 * @returns {{withLogs: Array<object>, backed: Array<object>}} `withLogs` holds
 *   rows with a non-null `log_path`; `backed` is the subset whose log was found.
 */
function partitionLogBackedRows(taskRows, logsDb) {
    const withLogs = taskRows.filter((row) => row.log_path !== null);
    if (!logsDb) {
        // Without logs.db nothing can be DB-backed, so run ids are never needed.
        return { withLogs, backed: withLogs.filter((row) => fs.existsSync(row.log_path)) };
    }
    // Build each run id exactly once and reuse it for both the lookup request
    // and the membership test (index-aligned with `withLogs`).
    const runIds = withLogs.map((row) => buildTaskRunId(row.task_id, row.started_at));
    const loggedRunIds = getLoggedRunIds(logsDb, runIds);
    const backed = withLogs.filter((row, index) => loggedRunIds.has(runIds[index]) || fs.existsSync(row.log_path));
    return { withLogs, backed };
}
903
/**
 * Bucket name under which `llm_usage` events that carry no stage of their own
 * are grouped in the per-stage breakdown.
 */
const UNATTRIBUTED_STAGE = "unattributed";
905
/**
 * Create a fresh, zeroed set of LLM usage counters for a single stage.
 *
 * @returns {{calls: number, totalDurationMs: number, promptTokens: number,
 *   completionTokens: number, totalTokens: number, reasoningTokens: number}}
 *   A new object on every call, with every counter set to 0.
 */
function emptyLlmUsageStageAggregate() {
    const counterNames = [
        "calls",
        "totalDurationMs",
        "promptTokens",
        "completionTokens",
        "totalTokens",
        "reasoningTokens",
    ];
    return Object.fromEntries(counterNames.map((name) => [name, 0]));
}
915
/**
 * Create a fresh, zeroed window-level LLM usage aggregate: the same counters
 * as a single stage plus an empty `byStage` map for the per-stage breakdown.
 *
 * @returns {object} A new aggregate on every call.
 */
function emptyLlmUsageAggregate() {
    const totals = emptyLlmUsageStageAggregate();
    return Object.assign({}, totals, { byStage: {} });
}
918
/**
 * Aggregate `llm_usage` events (#576) into a window total plus a per-stage
 * breakdown of call count, wall-time, and token usage.
 *
 * Every numeric field is passed through `toFiniteNumber` before being summed.
 * Events whose `metadata.stage` is missing, empty, or not a string are grouped
 * under {@link UNATTRIBUTED_STAGE}.
 *
 * Stage names come from event metadata, so they are only ever looked up and
 * stored as OWN properties of `byStage`. A stage called `constructor`,
 * `toString` or `__proto__` therefore gets its own bucket instead of resolving
 * to (and mutating) an inherited `Object.prototype` member.
 *
 * @param {Iterable<object>} events Events with an optional `metadata` object.
 * @returns {object} Window totals with a `byStage` map of per-stage totals.
 */
function summarizeLlmUsage(events) {
    const aggregate = emptyLlmUsageAggregate();
    const hasOwn = Object.prototype.hasOwnProperty;
    for (const event of events) {
        const meta = event.metadata ?? {};
        const stageKey = typeof meta.stage === "string" && meta.stage ? meta.stage : UNATTRIBUTED_STAGE;
        let stage = hasOwn.call(aggregate.byStage, stageKey) ? aggregate.byStage[stageKey] : undefined;
        if (!stage) {
            stage = emptyLlmUsageStageAggregate();
            // defineProperty (not assignment) so that a "__proto__" key becomes
            // a regular enumerable entry rather than replacing the prototype.
            Object.defineProperty(aggregate.byStage, stageKey, {
                value: stage,
                enumerable: true,
                writable: true,
                configurable: true,
            });
        }
        const durationMs = toFiniteNumber(meta.durationMs);
        const promptTokens = toFiniteNumber(meta.promptTokens);
        const completionTokens = toFiniteNumber(meta.completionTokens);
        const totalTokens = toFiniteNumber(meta.totalTokens);
        const reasoningTokens = toFiniteNumber(meta.reasoningTokens);
        // Each event counts once toward the window total and once toward its stage.
        for (const target of [aggregate, stage]) {
            target.calls += 1;
            target.totalDurationMs += durationMs;
            target.promptTokens += promptTokens;
            target.completionTokens += completionTokens;
            target.totalTokens += totalTokens;
            target.reasoningTokens += reasoningTokens;
        }
    }
    return aggregate;
}
950
/**
 * Read `llm_usage` events from the state database and summarize them for the
 * window `[since, until)`.
 *
 * `since` is handed to `readEvents` as the lower bound. `until` is applied
 * here as an exclusive upper bound on each event's `ts`; an event without a
 * `ts` is treated as if it happened at `since`. When `until` is omitted or
 * cannot be parsed as a date the window has no upper bound — the same rule
 * `buildWindowMetrics` applies to task rows.
 *
 * @param {string} stateDbPath Path passed to `readEvents` as `dbPath`.
 * @param {string} since Lower bound of the window.
 * @param {string} [until] Exclusive upper bound of the window.
 * @returns {object} The result of `summarizeLlmUsage` over the matching events.
 */
function readLlmUsageAggregate(stateDbPath, since, until) {
    const { events } = readEvents({ since, type: LLM_USAGE_EVENT }, { dbPath: stateDbPath });
    if (until === undefined)
        return summarizeLlmUsage(events);
    // Parse the bound once instead of once per event.
    const untilMs = new Date(until).getTime();
    if (!Number.isFinite(untilMs))
        return summarizeLlmUsage(events);
    return summarizeLlmUsage(events.filter((event) => new Date(event.ts ?? since).getTime() < untilMs));
}
958
+ function buildWindowMetrics(db, stateDbPath, since, until, now = () => Date.now(), logsDb) {
886
959
  const taskRows = queryTaskHistory(db, { since }).filter((row) => {
887
960
  const startMs = new Date(row.started_at).getTime();
888
961
  const untilMs = new Date(until).getTime();
889
962
  return !Number.isFinite(untilMs) || startMs < untilMs;
890
963
  });
891
- const taskRowsWithLogs = taskRows.filter((row) => row.log_path !== null);
892
- const existingLogRows = taskRowsWithLogs.filter((row) => row.log_path && fs.existsSync(row.log_path));
964
+ const { withLogs: taskRowsWithLogs, backed: existingLogRows } = partitionLogBackedRows(taskRows, logsDb);
893
965
  const failedTaskRows = taskRows.filter((row) => row.status === "failed");
894
966
  const activeRows = taskRows.filter((row) => row.status === "active");
895
967
  const stuckActiveRuns = activeRows.filter((row) => now() - new Date(row.started_at).getTime() > ACTIVE_RUN_WARN_MS).length;
@@ -923,6 +995,7 @@ function buildWindowMetrics(db, stateDbPath, since, until, now = () => Date.now(
923
995
  stuckActiveRuns,
924
996
  logBackingRate: roundRate(logBackingRate),
925
997
  probeRoundTripMs: null,
998
+ llmUsage: readLlmUsageAggregate(stateDbPath, since, until),
926
999
  };
927
1000
  return { improve: improveSummary, metrics, runs: runCount };
928
1001
  }
@@ -961,6 +1034,16 @@ export function akmHealth(options = {}) {
961
1034
  catch (error) {
962
1035
  throw new ConfigError(`Unable to open state.db: ${error instanceof Error ? error.message : String(error)}`, "INVALID_CONFIG_FILE");
963
1036
  }
1037
+ // logs.db backs the log-backing metric (#579). Best-effort: when it cannot
1038
+ // be opened, partitionLogBackedRows falls back to the on-disk file check, so
1039
+ // health never hard-fails on a missing/locked logs database.
1040
+ let logsDb;
1041
+ try {
1042
+ logsDb = openLogsDatabase(options.logsDbPath);
1043
+ }
1044
+ catch {
1045
+ logsDb = undefined;
1046
+ }
964
1047
  try {
965
1048
  const tables = listExistingTableNames(db, ["events", "task_history", "proposals", "schema_migrations"]);
966
1049
  const tableNames = tables.map((row) => row.name).sort();
@@ -968,8 +1051,7 @@ export function akmHealth(options = {}) {
968
1051
  const missingTables = requiredTables.filter((name) => !tableNames.includes(name));
969
1052
  const probe = probeStateDbRoundTrip(stateDbPath);
970
1053
  const taskRows = queryTaskHistory(db, { since });
971
- const taskRowsWithLogs = taskRows.filter((row) => row.log_path !== null);
972
- const existingLogRows = taskRowsWithLogs.filter((row) => row.log_path && fs.existsSync(row.log_path));
1054
+ const { withLogs: taskRowsWithLogs, backed: existingLogRows } = partitionLogBackedRows(taskRows, logsDb);
973
1055
  const failedTaskRows = taskRows.filter((row) => row.status === "failed");
974
1056
  const activeRows = taskRows.filter((row) => row.status === "active");
975
1057
  const stuckActiveRuns = activeRows.filter((row) => now() - new Date(row.started_at).getTime() > ACTIVE_RUN_WARN_MS).length;
@@ -1041,6 +1123,7 @@ export function akmHealth(options = {}) {
1041
1123
  stuckActiveRuns,
1042
1124
  logBackingRate: roundRate(logBackingRate),
1043
1125
  probeRoundTripMs: probe.durationMs,
1126
+ llmUsage: readLlmUsageAggregate(stateDbPath, since),
1044
1127
  };
1045
1128
  const hardFailure = hardChecks.some((check) => check.status === "fail");
1046
1129
  const deterministicWarnings = [...hardChecks, ...advisories].some((check) => check.status === "warn" && check.kind === "deterministic");
@@ -1062,7 +1145,7 @@ export function akmHealth(options = {}) {
1062
1145
  windowResults = windowSpecs.map((spec) => {
1063
1146
  const winSince = parseHealthSince(spec.since);
1064
1147
  const winUntil = spec.until ? parseHealthSince(spec.until) : new Date(now()).toISOString();
1065
- const bundle = buildWindowMetrics(db, stateDbPath, winSince, winUntil, now);
1148
+ const bundle = buildWindowMetrics(db, stateDbPath, winSince, winUntil, now, logsDb);
1066
1149
  return {
1067
1150
  name: spec.name,
1068
1151
  since: winSince,
@@ -1112,6 +1195,14 @@ export function akmHealth(options = {}) {
1112
1195
  }
1113
1196
  finally {
1114
1197
  db.close();
1198
+ if (logsDb) {
1199
+ try {
1200
+ logsDb.close();
1201
+ }
1202
+ catch {
1203
+ // best-effort
1204
+ }
1205
+ }
1115
1206
  }
1116
1207
  }
1117
1208
  // ── Markdown renderers ───────────────────────────────────────────────────────
@@ -42,6 +42,14 @@ import { buildExtractPrompt, EXTRACT_JSON_SCHEMA, parseExtractPayload } from "./
42
42
  import { buildSessionSummaryPrompt, parseSessionSummary, SESSION_SUMMARY_JSON_SCHEMA, sessionMeetsDurationGate, writeSessionAsset, } from "./session-asset.js";
43
43
  /** Default minimum session duration (minutes) for session indexing (#561). */
44
44
  const DEFAULT_MIN_SESSION_DURATION_MINUTES = 5;
45
+ /**
46
+ * Default minimum raw session size (chars) below which the extract LLM call is
47
+ * skipped (#595/#596). Deliberately tiny: analysis of 218 candidate-producing
48
+ * sessions showed sessions of 22–368 raw chars regularly yield 1–5 candidates,
49
+ * so size is not a reliable proxy for value — only truly empty sessions
50
+ * (0 chars, journal files) are safe to skip.
51
+ */
52
+ const DEFAULT_MIN_CONTENT_CHARS = 10;
45
53
  // ── Helpers ──────────────────────────────────────────────────────────────────
46
54
  /**
47
55
  * Parse a since-string into an absolute ms-epoch cutoff. Accepts:
@@ -115,7 +123,7 @@ function buildCandidateProposal(candidate, sourceRef) {
115
123
  * proposal validation failure) the session result records a warning and
116
124
  * keeps going — one session's bad luck never aborts a multi-session run.
117
125
  */
118
- async function processSession(harness, sessionRef, stashDir, config, llmConfig, chat, ctx, sourceRun, dryRun, timeoutMs, maxTotalChars, sessionIndexing) {
126
+ async function processSession(harness, sessionRef, stashDir, config, llmConfig, chat, ctx, sourceRun, dryRun, timeoutMs, maxTotalChars, minContentChars, sessionIndexing) {
119
127
  const warnings = [];
120
128
  let data;
121
129
  try {
@@ -136,6 +144,31 @@ async function processSession(harness, sessionRef, stashDir, config, llmConfig,
136
144
  const filtered = preFilterSession(data, {
137
145
  ...(typeof maxTotalChars === "number" ? { maxTotalChars } : {}),
138
146
  });
147
+ // #595/#596 — minContentChars gate: skip the LLM call for sessions whose RAW
148
+ // size is below threshold. Measured on the raw event text BEFORE the noise
149
+ // pre-filter, NOT on post-filter output — the pre-filter strips boilerplate
150
+ // so aggressively that even signal-bearing sessions can have tiny output
151
+ // (#596: gating post-filter filtered out 100% of sessions). Note: the 0.8.x
152
+ // fix gated on `filtered.stats.inputCount`, which is an EVENT count, not a
153
+ // char count — this port measures actual raw chars so the threshold matches
154
+ // the config key's documented unit.
155
+ const rawContentChars = data.events.reduce((sum, event) => sum + event.text.length, 0);
156
+ if (minContentChars > 0 && rawContentChars < minContentChars) {
157
+ return {
158
+ sessionId: sessionRef.sessionId,
159
+ harness: harness.name,
160
+ candidateCount: 0,
161
+ proposalIds: [],
162
+ preFilter: {
163
+ inputCount: filtered.stats.inputCount,
164
+ outputCount: filtered.stats.outputCount,
165
+ truncatedCount: filtered.stats.truncatedCount,
166
+ },
167
+ warnings: [],
168
+ skipped: true,
169
+ skipReason: "too_short",
170
+ };
171
+ }
139
172
  const prompt = buildExtractPrompt({ data, events: filtered.events, inlineRefs: data.inlineRefs });
140
173
  // #561 — ADDITIVE session indexing. Generate + write the session asset
141
174
  // (`sessions/<harness>/<id>.md`). FAIL-OPEN: any failure only records a
@@ -339,6 +372,9 @@ export async function akmExtract(options) {
339
372
  60_000;
340
373
  // Pre-filter budget — process config can raise it for large-context models.
341
374
  const maxTotalChars = typeof extractProcess?.maxTotalChars === "number" ? extractProcess.maxTotalChars : undefined;
375
+ // #595/#596 — minimum raw session size; sessions below it skip the LLM call
376
+ // entirely. Set `processes.extract.minContentChars: 0` to disable the gate.
377
+ const minContentChars = typeof extractProcess?.minContentChars === "number" ? extractProcess.minContentChars : DEFAULT_MIN_CONTENT_CHARS;
342
378
  // Default discovery window — process config can override the built-in 24h.
343
379
  const effectiveSince = options.since ?? extractProcess?.defaultSince;
344
380
  // #561 — resolve session-indexing config. Default ON: we only reach this code
@@ -483,7 +519,7 @@ export async function akmExtract(options) {
483
519
  continue;
484
520
  }
485
521
  try {
486
- const result = await processSession(harness, summary, stashDir, config, llmConfig, chat, options.ctx, sourceRun, dryRun, timeoutMs, maxTotalChars, sessionIndexing);
522
+ const result = await processSession(harness, summary, stashDir, config, llmConfig, chat, options.ctx, sourceRun, dryRun, timeoutMs, maxTotalChars, minContentChars, sessionIndexing);
487
523
  sessions.push(result);
488
524
  if (result.skipped)
489
525
  skippedCount += 1;
@@ -4,7 +4,7 @@
4
4
  import { loadConfig } from "../../core/config/config.js";
5
5
  import { appendEvent } from "../../core/events.js";
6
6
  import { info, warn } from "../../core/warn.js";
7
- import { promoteProposal } from "../proposal/validators/proposals.js";
7
+ import { promoteProposal, recordGateDecision } from "../proposal/validators/proposals.js";
8
8
  // ---------------------------------------------------------------------------
9
9
  // Gate implementation
10
10
  // ---------------------------------------------------------------------------
@@ -26,14 +26,40 @@ export async function runAutoAcceptGate(candidates, cfg, promoteFn = promoteProp
26
26
  }
27
27
  const effectiveThreshold = Math.max(cfg.globalThreshold, cfg.minimumThreshold ?? 0) / 100;
28
28
  const resolvedConfig = typeof cfg.config === "function" ? cfg.config() : cfg.config;
29
+ const gateLabel = `improve:${cfg.phase}`;
30
+ // #577: stamp the gate's verdict onto each proposal so `akm proposal show`
31
+ // can explain why a proposal is pending (e.g. "deferred: below-threshold,
32
+ // 0.72 < 0.90"). Best-effort — a recording failure must never abort the gate.
33
+ const stamp = (proposalId, decision) => {
34
+ try {
35
+ recordGateDecision(cfg.stashDir, proposalId, decision);
36
+ }
37
+ catch (err) {
38
+ warn(`[improve] ${cfg.phase} failed to record gate decision for ${proposalId}: ${err instanceof Error ? err.message : String(err)}`);
39
+ }
40
+ };
29
41
  for (const candidate of candidates) {
30
42
  const { proposalId, confidence } = candidate;
31
43
  if (confidence === undefined || confidence < effectiveThreshold) {
44
+ stamp(proposalId, {
45
+ outcome: "deferred",
46
+ reason: confidence === undefined ? "no-confidence" : "below-threshold",
47
+ ...(confidence !== undefined ? { confidence } : {}),
48
+ thresholds: { autoAccept: effectiveThreshold },
49
+ gate: gateLabel,
50
+ });
32
51
  result.skipped.push(proposalId);
33
52
  continue;
34
53
  }
35
54
  try {
36
55
  const promotion = await promoteFn(cfg.stashDir, resolvedConfig, proposalId, {}, undefined);
56
+ stamp(promotion.proposal.id, {
57
+ outcome: "auto-accepted",
58
+ reason: "above-threshold",
59
+ confidence,
60
+ thresholds: { autoAccept: effectiveThreshold },
61
+ gate: gateLabel,
62
+ });
37
63
  appendEvent({
38
64
  eventType: "promoted",
39
65
  ref: promotion.ref,