codemem 0.23.0 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,5 +1,5 @@
1
1
  #!/usr/bin/env node
2
- import { DEFAULT_COORDINATOR_DB_PATH, MemoryStore, ObserverClient, RawEventSweeper, SyncRetentionRunner, VERSION, applyBootstrapSnapshot, backfillTagsText, backfillVectors, buildAuthHeaders, buildBaseUrl, buildRawEventEnvelopeFromHook, compareMemoryRoleReports, connect, coordinatorCreateGroupAction, coordinatorCreateInviteAction, coordinatorDisableDeviceAction, coordinatorEnrollDeviceAction, coordinatorImportInviteAction, coordinatorListBootstrapGrantsAction, coordinatorListDevicesAction, coordinatorListGroupsAction, coordinatorListJoinRequestsAction, coordinatorRemoveDeviceAction, coordinatorRenameDeviceAction, coordinatorReviewJoinRequestAction, coordinatorRevokeBootstrapGrantAction, createBetterSqliteCoordinatorApp, deactivateLowSignalMemories, deactivateLowSignalObservations, ensureDeviceIdentity, exportMemories, fetchAllSnapshotPages, fingerprintPublicKey, getMemoryRoleReport, getRawEventRelinkPlan, getRawEventRelinkReport, getRawEventStatus, getWorkspaceCodememConfigPath, hasUnsyncedSharedMemoryChanges, importMemories, initDatabase, isEmbeddingDisabled, loadPublicKey, loadSqliteVec, planReplicationOpsAgePrune, pruneReplicationOpsUntilCaughtUp, rawEventsGate, readCodememConfigFile, readCodememConfigFileAtPath, readCoordinatorSyncConfig, readImportPayload, requestJson, resolveCodememConfigPath, resolveDbPath, resolveHookProject, resolveProject, retryRawEventFailures, runSyncDaemon, runSyncPass, schema, setPeerProjectFilter, stripJsonComments, stripPrivateObj, stripTrailingCommas, syncPassPreflight, updatePeerAddresses, vacuumDatabase, writeCodememConfigFile } from "@codemem/core";
2
+ import { DEFAULT_COORDINATOR_DB_PATH, MemoryStore, ObserverClient, RawEventSweeper, SyncRetentionRunner, VERSION, applyBootstrapSnapshot, backfillTagsText, backfillVectors, buildAuthHeaders, buildBaseUrl, buildRawEventEnvelopeFromHook, compareMemoryRoleReports, connect, coordinatorCreateGroupAction, coordinatorCreateInviteAction, coordinatorDisableDeviceAction, coordinatorEnrollDeviceAction, coordinatorImportInviteAction, coordinatorListBootstrapGrantsAction, coordinatorListDevicesAction, coordinatorListGroupsAction, coordinatorListJoinRequestsAction, coordinatorRemoveDeviceAction, coordinatorRenameDeviceAction, coordinatorReviewJoinRequestAction, coordinatorRevokeBootstrapGrantAction, createBetterSqliteCoordinatorApp, deactivateLowSignalMemories, deactivateLowSignalObservations, ensureDeviceIdentity, exportMemories, fetchAllSnapshotPages, fingerprintPublicKey, getExtractionBenchmarkProfile, getInjectionEvalScenarioPack, getInjectionEvalScenarioPrompts, getMemoryRoleReport, getRawEventRelinkPlan, getRawEventRelinkReport, getRawEventStatus, getSessionExtractionEval, getSessionExtractionEvalScenario, getWorkspaceCodememConfigPath, hasUnsyncedSharedMemoryChanges, importMemories, initDatabase, isEmbeddingDisabled, loadObserverConfig, loadPublicKey, loadSqliteVec, planReplicationOpsAgePrune, pruneReplicationOpsUntilCaughtUp, rawEventsGate, readCodememConfigFile, readCodememConfigFileAtPath, readCoordinatorSyncConfig, readImportPayload, replayBatchExtraction, replayBatchExtractionWithTierRouting, requestJson, resolveCodememConfigPath, resolveDbPath, resolveHookProject, resolveProject, retryRawEventFailures, runSyncDaemon, runSyncPass, schema, setPeerProjectFilter, stripJsonComments, stripPrivateObj, stripTrailingCommas, syncPassPreflight, updatePeerAddresses, vacuumDatabase, writeCodememConfigFile } from "@codemem/core";
3
3
  import { Command, Option } from "commander";
4
4
  import omelette from "omelette";
5
5
  import { existsSync, mkdirSync, readFileSync, rmSync, statSync, writeFileSync } from "node:fs";
@@ -1767,16 +1767,19 @@ function createInjectMemoryCommand() {
1767
1767
  return cmd;
1768
1768
  }
1769
1769
  function createMemoryRoleReportCommand() {
1770
- const cmd = new Command("role-report").configureHelp(helpStyle).description("Analyze inferred memory roles in a DB snapshot").option("--project <project>", "project identifier (defaults to git repo root)").option("--all-projects", "analyze across all projects").option("--probe <query>", "run a retrieval probe query against the snapshot", (value, prev) => [...prev, value], []).option("--inactive", "include inactive memories");
1770
+ const cmd = new Command("role-report").configureHelp(helpStyle).description("Analyze inferred memory roles in a DB snapshot").option("--project <project>", "project identifier (defaults to git repo root)").option("--all-projects", "analyze across all projects").option("--probe <query>", "run a retrieval probe query against the snapshot", (value, prev) => [...prev, value], []).option("--scenario <id>", "run a named injection-first eval scenario pack (can be repeated)", (value, prev) => [...prev, value], []).option("--inactive", "include inactive memories");
1771
1771
  addDbOption(cmd);
1772
1772
  addJsonOption(cmd);
1773
1773
  cmd.action((opts) => {
1774
1774
  const project = opts.allProjects === true ? null : opts.project?.trim() || process.env.CODEMEM_PROJECT?.trim() || resolveProject(process.cwd(), null);
1775
+ const invalidScenario = (opts.scenario ?? []).find((id) => getInjectionEvalScenarioPack(id) == null);
1776
+ if (invalidScenario) throw new Error(`Unknown eval scenario pack: ${invalidScenario}`);
1777
+ const probes = [...opts.probe ?? [], ...getInjectionEvalScenarioPrompts(opts.scenario ?? [])];
1775
1778
  const result = getMemoryRoleReport(resolveDbOpt(opts), {
1776
1779
  project,
1777
1780
  allProjects: opts.allProjects === true,
1778
1781
  includeInactive: opts.inactive === true,
1779
- probes: opts.probe
1782
+ probes
1780
1783
  });
1781
1784
  if (opts.json) {
1782
1785
  console.log(JSON.stringify(result, null, 2));
@@ -1800,17 +1803,23 @@ function createMemoryRoleReportCommand() {
1800
1803
  p.log.message(` summary_unmapped ${result.summary_mapping.unmapped}`);
1801
1804
  p.log.info("Project quality:");
1802
1805
  for (const [bucket, count] of Object.entries(result.project_quality)) p.log.message(` ${bucket.padEnd(12)} ${String(count)}`);
1806
+ p.log.info("Session classes:");
1807
+ for (const [bucket, count] of Object.entries(result.session_class_buckets)) p.log.message(` ${bucket.padEnd(20)} ${String(count)}`);
1808
+ p.log.info("Summary dispositions:");
1809
+ for (const [bucket, count] of Object.entries(result.summary_disposition_buckets)) p.log.message(` ${bucket.padEnd(20)} ${String(count)}`);
1803
1810
  if (result.probe_results.length > 0) {
1804
1811
  p.log.info("Probe results:");
1805
1812
  for (const probe of result.probe_results) {
1806
1813
  p.log.message(` query: ${probe.query}`);
1814
+ if (probe.scenario_id) p.log.message(` scenario: ${probe.scenario_id} (${probe.scenario_category ?? "unknown"})${probe.scenario_title ? ` — ${probe.scenario_title}` : ""}`);
1807
1815
  p.log.message(` mode: ${probe.mode}`);
1808
1816
  p.log.message(` top roles: durable=${probe.top_role_counts.durable} recap=${probe.top_role_counts.recap} ephemeral=${probe.top_role_counts.ephemeral} general=${probe.top_role_counts.general}`);
1809
1817
  p.log.message(` top mapping: mapped=${probe.top_mapping_counts.mapped} unmapped=${probe.top_mapping_counts.unmapped}`);
1810
1818
  p.log.message(` burden: recap_share=${probe.top_burden.recap_share.toFixed(2)} unmapped_share=${probe.top_burden.unmapped_share.toFixed(2)} recap_unmapped_share=${probe.top_burden.recap_unmapped_share.toFixed(2)}`);
1811
1819
  if (probe.simulated_demoted_unmapped_recap) p.log.message(` simulated demote-unmapped-recap burden: recap_share=${probe.simulated_demoted_unmapped_recap.top_burden.recap_share.toFixed(2)} unmapped_share=${probe.simulated_demoted_unmapped_recap.top_burden.unmapped_share.toFixed(2)} recap_unmapped_share=${probe.simulated_demoted_unmapped_recap.top_burden.recap_unmapped_share.toFixed(2)}`);
1812
1820
  if (probe.simulated_demoted_unmapped_recap_and_ephemeral) p.log.message(` simulated demote-unmapped-recap+ephemeral burden: recap_share=${probe.simulated_demoted_unmapped_recap_and_ephemeral.top_burden.recap_share.toFixed(2)} unmapped_share=${probe.simulated_demoted_unmapped_recap_and_ephemeral.top_burden.unmapped_share.toFixed(2)} recap_unmapped_share=${probe.simulated_demoted_unmapped_recap_and_ephemeral.top_burden.recap_unmapped_share.toFixed(2)}`);
1813
- for (const item of probe.items.slice(0, 5)) p.log.message(` [${item.id}] (${item.kind}/${item.role}/${item.mapping}) ${item.title} ${item.role_reason}`);
1821
+ if (probe.scenario_score) p.log.message(` scenario score: mode_match=${probe.scenario_score.mode_match ? "yes" : "no"} top1_primary=${probe.scenario_score.primary_in_top1 ? "yes" : "no"} top3_primary=${probe.scenario_score.primary_in_top3_count} top1_anti=${probe.scenario_score.anti_signal_in_top1 ? "yes" : "no"} primary=${probe.scenario_score.primary_match_count} anti=${probe.scenario_score.anti_signal_count} recap=${probe.scenario_score.recap_count} unmapped_recap=${probe.scenario_score.unmapped_recap_count} chatter=${probe.scenario_score.administrative_chatter_count} net=${probe.scenario_score.score}`);
1822
+ for (const item of probe.items.slice(0, 5)) p.log.message(` [${item.id}] (${item.kind}/${item.role}/${item.mapping}/${item.session_class}/${item.summary_disposition}) ${item.title} — ${item.role_reason}`);
1814
1823
  }
1815
1824
  }
1816
1825
  p.outro("done");
@@ -1818,14 +1827,18 @@ function createMemoryRoleReportCommand() {
1818
1827
  return cmd;
1819
1828
  }
1820
1829
  function createMemoryRoleCompareCommand() {
1821
- const cmd = new Command("role-compare").configureHelp(helpStyle).description("Compare inferred memory-role and probe metrics across two DB snapshots").argument("<baseline_db>", "baseline sqlite database path").argument("<candidate_db>", "candidate sqlite database path").option("--project <project>", "project identifier (defaults to git repo root)").option("--all-projects", "analyze across all projects").option("--probe <query>", "run a retrieval probe query against both snapshots", (value, prev) => [...prev, value], []).option("--inactive", "include inactive memories");
1830
+ const cmd = new Command("role-compare").configureHelp(helpStyle).description("Compare inferred memory-role and probe metrics across two DB snapshots").argument("<baseline_db>", "baseline sqlite database path").argument("<candidate_db>", "candidate sqlite database path").option("--project <project>", "project identifier (defaults to git repo root)").option("--all-projects", "analyze across all projects").option("--probe <query>", "run a retrieval probe query against both snapshots", (value, prev) => [...prev, value], []).option("--scenario <id>", "run a named injection-first eval scenario pack (can be repeated)", (value, prev) => [...prev, value], []).option("--inactive", "include inactive memories");
1822
1831
  addJsonOption(cmd);
1823
1832
  cmd.action((baselineDb, candidateDb, opts) => {
1833
+ const project = opts.allProjects === true ? null : opts.project?.trim() || process.env.CODEMEM_PROJECT?.trim() || resolveProject(process.cwd(), null);
1834
+ const invalidScenario = (opts.scenario ?? []).find((id) => getInjectionEvalScenarioPack(id) == null);
1835
+ if (invalidScenario) throw new Error(`Unknown eval scenario pack: ${invalidScenario}`);
1836
+ const probes = [...opts.probe ?? [], ...getInjectionEvalScenarioPrompts(opts.scenario ?? [])];
1824
1837
  const result = compareMemoryRoleReports(baselineDb, candidateDb, {
1825
- project: opts.allProjects === true ? null : opts.project?.trim() || process.env.CODEMEM_PROJECT?.trim() || resolveProject(process.cwd(), null),
1838
+ project,
1826
1839
  allProjects: opts.allProjects === true,
1827
1840
  includeInactive: opts.inactive === true,
1828
- probes: opts.probe
1841
+ probes
1829
1842
  });
1830
1843
  if (opts.json) {
1831
1844
  console.log(JSON.stringify(result, null, 2));
@@ -1843,6 +1856,10 @@ function createMemoryRoleCompareCommand() {
1843
1856
  ].join("\n"));
1844
1857
  p.log.info("Role deltas:");
1845
1858
  for (const [role, count] of Object.entries(result.delta.counts_by_role)) p.log.message(` ${role.padEnd(10)} ${String(count)}`);
1859
+ p.log.info("Session class deltas:");
1860
+ for (const [bucket, count] of Object.entries(result.delta.session_class_buckets)) p.log.message(` ${bucket.padEnd(20)} ${String(count)}`);
1861
+ p.log.info("Summary disposition deltas:");
1862
+ for (const [bucket, count] of Object.entries(result.delta.summary_disposition_buckets)) p.log.message(` ${bucket.padEnd(20)} ${String(count)}`);
1846
1863
  if (result.probe_comparisons.length > 0) {
1847
1864
  p.log.info("Probe comparisons:");
1848
1865
  for (const probe of result.probe_comparisons) {
@@ -1851,12 +1868,288 @@ function createMemoryRoleCompareCommand() {
1851
1868
  p.log.message(` overlap: shared_top_keys=${probe.shared_item_keys.length} baseline_top=${probe.baseline_item_ids.slice(0, 5).join(",") || "-"} candidate_top=${probe.candidate_item_ids.slice(0, 5).join(",") || "-"}`);
1852
1869
  if (probe.delta_top_burden) p.log.message(` burden delta: recap_share=${probe.delta_top_burden.recap_share.toFixed(2)} unmapped_share=${probe.delta_top_burden.unmapped_share.toFixed(2)} recap_unmapped_share=${probe.delta_top_burden.recap_unmapped_share.toFixed(2)}`);
1853
1870
  if (probe.delta_top_mapping_counts) p.log.message(` mapping delta: mapped=${probe.delta_top_mapping_counts.mapped} unmapped=${probe.delta_top_mapping_counts.unmapped}`);
1871
+ if (probe.baseline_scenario_score || probe.candidate_scenario_score) p.log.message(` scenario scores: baseline=${probe.baseline_scenario_score?.score ?? "-"} candidate=${probe.candidate_scenario_score?.score ?? "-"}`);
1872
+ if (probe.delta_scenario_score) p.log.message(` scenario delta: mode_match=${probe.delta_scenario_score.mode_match ?? "-"} top1_primary=${probe.delta_scenario_score.primary_in_top1 ?? "-"} top3_primary=${probe.delta_scenario_score.primary_in_top3_count ?? "-"} top1_anti=${probe.delta_scenario_score.anti_signal_in_top1 ?? "-"} primary=${probe.delta_scenario_score.primary_match_count ?? "-"} anti=${probe.delta_scenario_score.anti_signal_count ?? "-"} recap=${probe.delta_scenario_score.recap_count ?? "-"} unmapped_recap=${probe.delta_scenario_score.unmapped_recap_count ?? "-"} chatter=${probe.delta_scenario_score.administrative_chatter_count ?? "-"} net=${probe.delta_scenario_score.score ?? "-"}`);
1854
1873
  }
1855
1874
  }
1856
1875
  p.outro("done");
1857
1876
  });
1858
1877
  return cmd;
1859
1878
  }
1879
/**
 * Build the `extraction-report` subcommand.
 *
 * The action requires exactly one of `--session-id` / `--batch-id`, resolves
 * the requested scenario via `getSessionExtractionEvalScenario`, runs
 * `getSessionExtractionEval` against the resolved database, and prints the
 * result either as pretty-printed JSON (`--json`) or as a formatted report.
 *
 * @returns {Command} the configured commander command
 * @throws {Error} from the action when the target flags are missing, both
 *   given, or rejected by `parseStrictPositiveId`, or when the scenario ID is
 *   not recognised
 */
function createMemoryExtractionReportCommand() {
  const cmd = new Command("extraction-report")
    .configureHelp(helpStyle)
    .description("Score extracted memories for a session against a built-in extraction eval rubric")
    .option("--session-id <id>", "session ID to evaluate")
    .option("--batch-id <id>", "raw-event flush batch ID to evaluate")
    .requiredOption("--scenario <id>", "built-in extraction eval scenario ID")
    .option("--inactive", "include inactive memories");
  addDbOption(cmd);
  addJsonOption(cmd);

  cmd.action((opts) => {
    const rawSessionId = opts.sessionId?.trim() ?? "";
    const rawBatchId = opts.batchId?.trim() ?? "";
    const sessionGiven = rawSessionId.length > 0;
    const batchGiven = rawBatchId.length > 0;
    // Equal flags means either none or both were supplied; both are rejected.
    if (sessionGiven === batchGiven) {
      throw new Error("Provide exactly one of --session-id or --batch-id");
    }

    let sessionId = null;
    if (sessionGiven) {
      sessionId = parseStrictPositiveId(rawSessionId);
      if (sessionId === null) {
        throw new Error(`Invalid session ID: ${rawSessionId || opts.sessionId}`);
      }
    }

    let batchId = null;
    if (batchGiven) {
      batchId = parseStrictPositiveId(rawBatchId);
      if (batchId === null) {
        throw new Error(`Invalid batch ID: ${rawBatchId || opts.batchId}`);
      }
    }

    const scenarioId = opts.scenario?.trim() ?? "";
    const scenario = getSessionExtractionEvalScenario(scenarioId);
    if (!scenario) {
      throw new Error(`Unknown extraction eval scenario: ${scenarioId || opts.scenario}`);
    }

    // The evaluation is keyed by batch when one was given, otherwise by session.
    const target = batchId != null ? { batchId } : { sessionId };
    const result = getSessionExtractionEval(resolveDbOpt(opts), {
      ...target,
      scenarioId: scenario.id,
      includeInactive: opts.inactive === true
    });

    if (opts.json) {
      console.log(JSON.stringify(result, null, 2));
      return;
    }

    const yesNo = (flag) => (flag ? "yes" : "no");

    p.intro("codemem memory extraction-report");

    const contextLines = [
      `Scenario: ${result.scenario.id} — ${result.scenario.title}`,
      `Target: ${result.target.type}${result.target.batchId != null ? ` #${result.target.batchId}` : ""}`,
      `Session: ${result.session.id} (${result.session.project ?? "no-project"})`,
      `Session class: ${result.session.sessionClass}`,
      `Summary disposition: ${result.session.summaryDisposition}`
    ];
    p.log.info(contextLines.join("\n"));

    const scoreLines = [
      `Pass: ${yesNo(result.pass)}`,
      `Summary count: ${result.counts.summaries}`,
      `Observation count: ${result.counts.observations}`,
      `Summary thread coverage: ${result.coverage.summaryThreadCoverage}`,
      `Observation thread coverage: ${result.coverage.observationThreadCoverage}`,
      `Total thread coverage: ${result.coverage.totalThreadCoverage}`,
      `Duplicate observation threads: ${result.coverage.duplicateObservationThreads}`
    ];
    p.log.info(scoreLines.join("\n"));

    if (result.failureReasons.length > 0) {
      p.log.warn("Failure reasons:");
      for (const reason of result.failureReasons) {
        p.log.message(` - ${reason}`);
      }
    }

    p.log.info("Thread coverage:");
    for (const thread of result.threads) {
      p.log.message(` ${thread.id.padEnd(22)} summary=${yesNo(thread.summaryMatch)} observations=${yesNo(thread.observationMatch)}`);
    }

    p.outro("done");
  });

  return cmd;
}
1936
/**
 * Build the `extraction-replay` subcommand.
 *
 * The action validates the batch ID, scenario, and optional numeric overrides,
 * layers the replay-only overrides on top of `loadObserverConfig()`, then runs
 * either `replayBatchExtractionWithTierRouting` (when `--observer-tier-routing`
 * is set) or `replayBatchExtraction`, and prints the result as JSON or as a
 * formatted report.
 *
 * NOTE(review): the responses/reasoning overrides are written unconditionally,
 * so any values for those keys in the loaded config are replaced even when the
 * flags are absent — confirm this is intended for replay.
 *
 * @returns {Command} the configured commander command
 * @throws {Error} from the action when the batch ID, scenario, transcript
 *   budget, observer temperature, or max output tokens fail validation
 */
function createMemoryExtractionReplayCommand() {
  const cmd = new Command("extraction-replay")
    .configureHelp(helpStyle)
    .description("Re-run the observer on a historical flush batch without persisting, then score the fresh output")
    .requiredOption("--batch-id <id>", "raw-event flush batch ID to replay")
    .option("--transcript-budget <chars>", "override replay transcript budget in characters (replay only)")
    .option("--observer-tier-routing", "use replay-only benchmark-backed observer tier routing")
    .option("--observer-temperature <value>", "override observer temperature for replay only")
    .option("--openai-responses", "use OpenAI Responses API for replay only")
    .option("--reasoning-effort <level>", "set OpenAI reasoning.effort for replay only (responses path)")
    .option("--reasoning-summary <mode>", "set OpenAI reasoning.summary for replay only (responses path)")
    .option("--max-output-tokens <n>", "override OpenAI max_output_tokens for replay only (responses path)")
    .requiredOption("--scenario <id>", "built-in extraction eval scenario ID");
  addDbOption(cmd);
  addJsonOption(cmd);

  cmd.action(async (opts) => {
    const rawBatchId = opts.batchId?.trim() ?? "";
    const batchId = parseStrictPositiveId(rawBatchId);
    if (batchId === null) {
      throw new Error(`Invalid batch ID: ${rawBatchId || opts.batchId}`);
    }

    const scenarioId = opts.scenario?.trim() ?? "";
    const scenario = getSessionExtractionEvalScenario(scenarioId);
    if (!scenario) {
      throw new Error(`Unknown extraction eval scenario: ${scenarioId || opts.scenario}`);
    }

    // Optional overrides: blank input means "not provided" and is left unset.
    const rawBudget = opts.transcriptBudget?.trim() ?? "";
    let transcriptBudget = null;
    if (rawBudget.length > 0) {
      transcriptBudget = parseStrictPositiveId(rawBudget);
      if (transcriptBudget === null) {
        throw new Error(`Invalid transcript budget: ${rawBudget || opts.transcriptBudget}`);
      }
    }

    const rawTemperature = opts.observerTemperature?.trim() ?? "";
    const observerTemperature = rawTemperature.length > 0 ? Number(rawTemperature) : undefined;
    if (observerTemperature !== undefined && !Number.isFinite(observerTemperature)) {
      throw new Error(`Invalid observer temperature: ${rawTemperature || opts.observerTemperature}`);
    }

    const rawMaxOutputTokens = opts.maxOutputTokens?.trim() ?? "";
    let maxOutputTokens = null;
    if (rawMaxOutputTokens.length > 0) {
      maxOutputTokens = parseStrictPositiveId(rawMaxOutputTokens);
      if (maxOutputTokens === null) {
        throw new Error(`Invalid max output tokens: ${rawMaxOutputTokens || opts.maxOutputTokens}`);
      }
    }

    const observerConfig = loadObserverConfig();
    const observerConfigWithOverrides = {
      ...observerConfig,
      observerTemperature: observerTemperature ?? observerConfig.observerTemperature,
      observerOpenAIUseResponses: opts.openaiResponses === true,
      observerReasoningEffort: opts.reasoningEffort?.trim() || null,
      observerReasoningSummary: opts.reasoningSummary?.trim() || null,
      observerMaxOutputTokens: maxOutputTokens ?? observerConfig.observerMaxTokens
    };
    const observer = new ObserverClient(observerConfigWithOverrides);

    const replayRequest = {
      batchId,
      scenarioId: scenario.id,
      transcriptBudget: transcriptBudget ?? undefined
    };
    let result;
    if (opts.observerTierRouting === true) {
      // Tier routing receives the raw config rather than the constructed client.
      result = await replayBatchExtractionWithTierRouting(resolveDbOpt(opts), observerConfigWithOverrides, replayRequest);
    } else {
      result = await replayBatchExtraction(resolveDbOpt(opts), observer, replayRequest);
    }

    if (opts.json) {
      console.log(JSON.stringify(result, null, 2));
      return;
    }

    const yesNo = (flag) => (flag ? "yes" : "no");

    p.intro("codemem memory extraction-replay");

    const overviewLines = [
      `Scenario: ${result.scenario.id} — ${result.scenario.title}`,
      `Batch: ${result.target.batchId}`,
      `Session: ${result.target.sessionId}`,
      `Observer: ${result.observer.provider}/${result.observer.model}`,
      `Tier: ${result.observer.tier ?? "manual"}`,
      `OpenAI Responses: ${yesNo(result.observer.openaiUseResponses)}`,
      `Reasoning effort: ${result.observer.reasoningEffort ?? "none"}`,
      `Classification: ${result.classification.status}`,
      `Pass: ${yesNo(result.evaluation.pass)}`
    ];
    p.log.info(overviewLines.join("\n"));

    if (result.classification.reason) {
      p.log.message(`Classification reason: ${result.classification.reason}`);
    }

    if (result.evaluation.failureReasons.length > 0) {
      p.log.warn("Failure reasons:");
      for (const reason of result.evaluation.failureReasons) {
        p.log.message(` - ${reason}`);
      }
    }

    const coverageLines = [
      `Fresh summaries: ${result.evaluation.counts.summaries}`,
      `Fresh observations: ${result.evaluation.counts.observations}`,
      `Summary thread coverage: ${result.evaluation.coverage.summaryThreadCoverage}`,
      `Observation thread coverage: ${result.evaluation.coverage.observationThreadCoverage}`,
      `Total thread coverage: ${result.evaluation.coverage.totalThreadCoverage}`
    ];
    p.log.info(coverageLines.join("\n"));

    p.outro("done");
  });

  return cmd;
}
2011
/**
 * Build the `extraction-benchmark` subcommand.
 *
 * The action resolves a benchmark profile via `getExtractionBenchmarkProfile`,
 * applies per-run observer overrides on top of `loadObserverConfig()`, replays
 * every batch in the profile one after another (tier-routed when
 * `--observer-tier-routing` is set), and emits a scoreboard made of a benchmark
 * header, an observer summary, aggregate counts, and one row per batch — either
 * as JSON (`--json`) or as formatted log output.
 *
 * @returns {Command} the configured commander command
 * @throws {Error} from the action when the benchmark ID is unknown or the
 *   transcript budget, observer temperature, or max output tokens are invalid
 */
function createMemoryExtractionBenchmarkCommand() {
  const cmd = new Command("extraction-benchmark")
    .configureHelp(helpStyle)
    .description("Run the formal extraction replay benchmark set and print a cost/quality scoreboard")
    .requiredOption("--benchmark <id>", "benchmark profile id")
    .option("--observer-provider <provider>", "override observer provider for this benchmark run")
    .option("--observer-model <model>", "override observer model for this benchmark run")
    .option("--observer-tier-routing", "use replay-only benchmark-backed observer tier routing")
    .option("--openai-responses", "use OpenAI Responses API for this benchmark run")
    .option("--reasoning-effort <level>", "set OpenAI reasoning.effort for this benchmark run (responses path)")
    .option("--reasoning-summary <mode>", "set OpenAI reasoning.summary for this benchmark run (responses path)")
    .option("--max-output-tokens <n>", "override OpenAI max_output_tokens for this benchmark run (responses path)")
    .option("--observer-temperature <value>", "override observer temperature for this benchmark run")
    .option("--transcript-budget <chars>", "override replay transcript budget in characters for this benchmark run");
  addDbOption(cmd);
  addJsonOption(cmd);

  cmd.action(async (opts) => {
    const benchmarkId = opts.benchmark?.trim() ?? "";
    const benchmark = getExtractionBenchmarkProfile(benchmarkId);
    if (!benchmark) {
      throw new Error(`Unknown extraction benchmark: ${benchmarkId || opts.benchmark}`);
    }

    // Optional overrides: blank input means "not provided" and is left unset.
    const rawBudget = opts.transcriptBudget?.trim() ?? "";
    let transcriptBudget = null;
    if (rawBudget.length > 0) {
      transcriptBudget = parseStrictPositiveId(rawBudget);
      if (transcriptBudget === null) {
        throw new Error(`Invalid transcript budget: ${rawBudget || opts.transcriptBudget}`);
      }
    }

    const rawTemperature = opts.observerTemperature?.trim() ?? "";
    const observerTemperature = rawTemperature.length > 0 ? Number(rawTemperature) : undefined;
    if (observerTemperature !== undefined && !Number.isFinite(observerTemperature)) {
      throw new Error(`Invalid observer temperature: ${rawTemperature || opts.observerTemperature}`);
    }

    const rawMaxOutputTokens = opts.maxOutputTokens?.trim() ?? "";
    let maxOutputTokens = null;
    if (rawMaxOutputTokens.length > 0) {
      maxOutputTokens = parseStrictPositiveId(rawMaxOutputTokens);
      if (maxOutputTokens === null) {
        throw new Error(`Invalid max output tokens: ${rawMaxOutputTokens || opts.maxOutputTokens}`);
      }
    }

    const observerConfig = loadObserverConfig();
    const observerConfigWithOverrides = {
      ...observerConfig,
      observerProvider: opts.observerProvider?.trim() || observerConfig.observerProvider,
      observerModel: opts.observerModel?.trim() || observerConfig.observerModel,
      observerTemperature: observerTemperature ?? observerConfig.observerTemperature,
      observerOpenAIUseResponses: opts.openaiResponses === true,
      observerReasoningEffort: opts.reasoningEffort?.trim() || null,
      observerReasoningSummary: opts.reasoningSummary?.trim() || null,
      observerMaxOutputTokens: maxOutputTokens ?? observerConfig.observerMaxTokens
    };
    const observer = new ObserverClient(observerConfigWithOverrides);
    const useTierRouting = opts.observerTierRouting === true;

    // Batches are replayed sequentially; each produces one scoreboard row.
    const runs = [];
    for (const batch of benchmark.batches) {
      // A batch may pin its own scenario; otherwise the profile's scenario applies.
      const scenarioId = batch.scenarioId ?? benchmark.scenarioId;
      const replayRequest = {
        batchId: batch.batchId,
        scenarioId,
        transcriptBudget: transcriptBudget ?? undefined
      };
      let result;
      if (useTierRouting) {
        result = await replayBatchExtractionWithTierRouting(resolveDbOpt(opts), observerConfigWithOverrides, replayRequest);
      } else {
        result = await replayBatchExtraction(resolveDbOpt(opts), observer, replayRequest);
      }

      const { analysis, classification, observer: usedObserver, evaluation } = result;
      runs.push({
        batchId: batch.batchId,
        sessionId: batch.sessionId,
        label: batch.label,
        purpose: batch.purpose,
        complexity: batch.complexity,
        scenarioId,
        expectedTier: batch.expectedTier ?? null,
        analysis: {
          eventSpan: analysis.eventSpan,
          promptCount: analysis.promptCount,
          toolCount: analysis.toolCount,
          transcriptLength: analysis.transcriptLength
        },
        status: classification.status,
        reason: classification.reason,
        tier: usedObserver.tier ?? "manual",
        provider: usedObserver.provider,
        model: usedObserver.model,
        openaiUseResponses: usedObserver.openaiUseResponses,
        reasoningEffort: usedObserver.reasoningEffort,
        reasoningSummary: usedObserver.reasoningSummary,
        maxOutputTokens: usedObserver.maxOutputTokens,
        temperature: usedObserver.temperature,
        summaries: evaluation.counts.summaries,
        observations: evaluation.counts.observations,
        repairApplied: usedObserver.repairApplied
      });
    }

    const tally = (predicate) => runs.filter(predicate).length;
    const isShapeQuality = (run) => run.purpose === "shape_quality";
    const hasExpectedTier = (run) => run.expectedTier != null;
    const summary = {
      total: runs.length,
      shapeQualityTotal: tally(isShapeQuality),
      shapeQualityPasses: tally((run) => isShapeQuality(run) && run.status === "pass"),
      shapeQualityFails: tally((run) => isShapeQuality(run) && run.status === "shape_fail"),
      expectedTierTotal: tally(hasExpectedTier),
      expectedTierMatches: tally((run) => hasExpectedTier(run) && run.expectedTier === run.tier),
      robustnessNoOutput: tally((run) => run.status === "observer_no_output")
    };

    // Distinct provider/model/API-path combinations actually used across runs.
    const uniqueObserverKeys = [
      ...new Set(runs.map((run) => `${run.provider}::${run.model}::${run.openaiUseResponses ? "responses" : "chat"}`))
    ];

    const singleObserver = uniqueObserverKeys.length === 1;
    const firstRun = runs[0];
    // With tier routing, report the first run's value only when every run used
    // the same observer key; otherwise report the given "mixed" marker.
    const routedField = (field, whenMixed) => (singleObserver ? (firstRun?.[field] ?? observer[field]) : whenMixed);

    let observerSummary;
    if (useTierRouting) {
      observerSummary = {
        provider: routedField("provider", "mixed"),
        model: routedField("model", "mixed"),
        tierRouting: true,
        openaiUseResponses: routedField("openaiUseResponses", null),
        reasoningEffort: routedField("reasoningEffort", "mixed"),
        reasoningSummary: routedField("reasoningSummary", "mixed"),
        maxOutputTokens: routedField("maxOutputTokens", null),
        temperature: routedField("temperature", null),
        transcriptBudget: transcriptBudget ?? null,
        selectedObservers: uniqueObserverKeys
      };
    } else {
      observerSummary = {
        provider: observer.provider,
        model: observer.model,
        tierRouting: false,
        openaiUseResponses: observer.openaiUseResponses,
        reasoningEffort: observer.reasoningEffort,
        reasoningSummary: observer.reasoningSummary,
        maxOutputTokens: observer.maxOutputTokens,
        temperature: observer.temperature,
        transcriptBudget: transcriptBudget ?? null,
        selectedObservers: uniqueObserverKeys
      };
    }

    const output = {
      benchmark: {
        id: benchmark.id,
        title: benchmark.title,
        scenarioId: benchmark.scenarioId
      },
      observer: observerSummary,
      summary,
      runs
    };

    if (opts.json) {
      console.log(JSON.stringify(output, null, 2));
      return;
    }

    const yesNo = (flag) => (flag ? "yes" : "no");

    // A null API-path flag is the summary's marker for "runs disagreed".
    let responsesLabel;
    if (observerSummary.openaiUseResponses === null) {
      responsesLabel = "mixed";
    } else {
      responsesLabel = yesNo(observerSummary.openaiUseResponses);
    }

    p.intro("codemem memory extraction-benchmark");

    const scoreboardLines = [
      `Benchmark: ${benchmark.id} — ${benchmark.title}`,
      `Observer: ${observerSummary.provider}/${observerSummary.model}`,
      `Tier routing: ${yesNo(useTierRouting)}`,
      `OpenAI Responses: ${responsesLabel}`,
      `Reasoning effort: ${observerSummary.reasoningEffort ?? "none"}`,
      `Reasoning summary: ${observerSummary.reasoningSummary ?? "none"}`,
      `Max output tokens: ${observerSummary.maxOutputTokens ?? "mixed"}`,
      `Temperature: ${observerSummary.temperature ?? "mixed"}`,
      `Transcript budget override: ${transcriptBudget ?? "default"}`,
      `Shape-quality passes: ${summary.shapeQualityPasses}/${summary.shapeQualityTotal}`,
      `Shape-quality fails: ${summary.shapeQualityFails}`,
      `Expected-tier matches: ${summary.expectedTierMatches}/${summary.expectedTierTotal}`,
      `Observer no-output cases: ${summary.robustnessNoOutput}`
    ];
    p.log.info(scoreboardLines.join("\n"));

    for (const run of runs) {
      p.log.message(` [${run.batchId}] ${run.status.padEnd(18)} ${run.complexity.padEnd(10)} tier=${run.tier.padEnd(6)} expected=${(run.expectedTier ?? "n/a").padEnd(6)} span=${String(run.analysis.eventSpan).padEnd(3)} prompts=${run.analysis.promptCount} tools=${String(run.analysis.toolCount).padEnd(2)} transcript=${run.analysis.transcriptLength} ${run.provider}/${run.model}${run.openaiUseResponses ? " [responses]" : ""} summaries=${run.summaries} observations=${run.observations} repair=${yesNo(run.repairApplied)} — ${run.label}`);
    }

    p.outro("done");
  });

  return cmd;
}
1860
2153
  function createMemoryRelinkReportCommand() {
1861
2154
  const cmd = new Command("relink-report").configureHelp(helpStyle).description("Analyze dry-run raw-event session relinking and compaction opportunities").option("--project <project>", "project identifier (defaults to git repo root)").option("--all-projects", "analyze across all projects").option("--limit <n>", "max groups to print", "25");
1862
2155
  addDbOption(cmd);
@@ -1935,6 +2228,9 @@ memoryCommand.addCommand(createRememberMemoryCommand());
1935
2228
  memoryCommand.addCommand(createInjectMemoryCommand());
1936
2229
  memoryCommand.addCommand(createMemoryRoleReportCommand());
1937
2230
  memoryCommand.addCommand(createMemoryRoleCompareCommand());
2231
+ memoryCommand.addCommand(createMemoryExtractionReportCommand());
2232
+ memoryCommand.addCommand(createMemoryExtractionReplayCommand());
2233
+ memoryCommand.addCommand(createMemoryExtractionBenchmarkCommand());
1938
2234
  memoryCommand.addCommand(createMemoryRelinkReportCommand());
1939
2235
  memoryCommand.addCommand(createMemoryRelinkPlanCommand());
1940
2236
  //#endregion