nlm-memory 0.5.0 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (257) hide show
  1. package/README.md +89 -34
  2. package/dist/cli/digest.d.ts +20 -0
  3. package/dist/cli/digest.js +142 -0
  4. package/dist/cli/digest.js.map +1 -0
  5. package/dist/cli/nlm.d.ts +1 -0
  6. package/dist/cli/nlm.js +25 -1
  7. package/dist/cli/nlm.js.map +1 -1
  8. package/dist/core/digest/compose.d.ts +38 -0
  9. package/dist/core/digest/compose.js +93 -0
  10. package/dist/core/digest/compose.js.map +1 -0
  11. package/dist/core/digest/hook-liveness.d.ts +32 -0
  12. package/dist/core/digest/hook-liveness.js +54 -0
  13. package/dist/core/digest/hook-liveness.js.map +1 -0
  14. package/dist/http/app.js +2 -1
  15. package/dist/http/app.js.map +1 -1
  16. package/dist/mcp/server.js +20 -1
  17. package/dist/mcp/server.js.map +1 -1
  18. package/dist/ui/assets/{index-C8cpwbYJ.css → index-Beo8psd-.css} +1 -1
  19. package/dist/ui/assets/{index-CB50QnL-.js → index-CSPTTeeM.js} +8 -8
  20. package/dist/ui/index.html +2 -2
  21. package/package.json +26 -1
  22. package/.agents/plugins/marketplace.json +0 -20
  23. package/.github/workflows/ci.yml +0 -30
  24. package/docs/methodology/re-derivation-rate.md +0 -112
  25. package/docs/methodology/useful-hit-rate.md +0 -79
  26. package/docs/plans/2026-05-20-fts5-lexical-recall.md +0 -1088
  27. package/docs/plans/2026-05-20-recall-daemon-wedge-fix.md +0 -662
  28. package/docs/plans/2026-05-20-recall-hook-design.md +0 -131
  29. package/docs/plans/2026-05-20-recall-hook-implementation.md +0 -1222
  30. package/docs/plans/desktop-product.md +0 -69
  31. package/docs/plans/factstore-design.md +0 -236
  32. package/logs/CHANGELOG/CHANGELOG-2026.md +0 -1575
  33. package/logs/CHANGELOG/CHANGELOG.md +0 -209
  34. package/migrations/000_initial_schema.sql +0 -174
  35. package/migrations/001_entity_type_rename.sql +0 -17
  36. package/migrations/002_adapter_state_extend.sql +0 -12
  37. package/migrations/003_session_embeddings.sql +0 -11
  38. package/migrations/004_facts.sql +0 -46
  39. package/migrations/005_sources.sql +0 -31
  40. package/migrations/006_providers.sql +0 -33
  41. package/migrations/007_source_tokens.sql +0 -17
  42. package/migrations/008_fts_rebuild.sql +0 -9
  43. package/migrations/009_session_embedding_chunks.sql +0 -46
  44. package/migrations/010_sources_opencode.sql +0 -30
  45. package/migrations/011_sources_hermes_agent.sql +0 -30
  46. package/migrations/012_sources_aider.sql +0 -30
  47. package/migrations/013_adapter_state_failure_count.sql +0 -12
  48. package/migrations/014_sources_cursor.sql +0 -30
  49. package/migrations/015_sources_windsurf.sql +0 -30
  50. package/plugin-hermes-agent/README.md +0 -49
  51. package/plugin-hermes-agent/__init__.py +0 -75
  52. package/plugin-hermes-agent/plugin.yaml +0 -15
  53. package/scripts/backfill-citations.mjs +0 -0
  54. package/scripts/build-codex-plugin.mjs +0 -61
  55. package/scripts/deepseek-probe.mjs +0 -67
  56. package/scripts/extract-triples.mjs +0 -207
  57. package/scripts/longmemeval/embedding-cache.ts +0 -77
  58. package/scripts/longmemeval/fetch-dataset.sh +0 -25
  59. package/scripts/longmemeval/run-harness.ts +0 -315
  60. package/scripts/longmemeval/scorer.ts +0 -99
  61. package/scripts/longmemeval/tsconfig.json +0 -9
  62. package/scripts/longmemeval/types.ts +0 -35
  63. package/scripts/nlm-daily-digest.py +0 -239
  64. package/scripts/nlm-daily-digest.sh +0 -28
  65. package/src/cli/classify-parity.ts +0 -257
  66. package/src/cli/launchctl-helpers.ts +0 -49
  67. package/src/cli/nlm.ts +0 -1078
  68. package/src/core/actions/actions-log.ts +0 -118
  69. package/src/core/actions/overlay.ts +0 -117
  70. package/src/core/adapters/aider.ts +0 -205
  71. package/src/core/adapters/claude-code.ts +0 -293
  72. package/src/core/adapters/common.ts +0 -54
  73. package/src/core/adapters/cursor.ts +0 -486
  74. package/src/core/adapters/from-source.ts +0 -67
  75. package/src/core/adapters/hermes-agent.ts +0 -240
  76. package/src/core/adapters/hermes.ts +0 -277
  77. package/src/core/adapters/jsonl-generic.ts +0 -208
  78. package/src/core/adapters/opencode.ts +0 -281
  79. package/src/core/adapters/pi.ts +0 -264
  80. package/src/core/adapters/windsurf.ts +0 -386
  81. package/src/core/classifier/prompt.ts +0 -200
  82. package/src/core/dataset/build-dataset.ts +0 -463
  83. package/src/core/embedding/chunk-body.ts +0 -76
  84. package/src/core/embedding/embed-backfill.ts +0 -210
  85. package/src/core/embedding/embed-normalize.ts +0 -135
  86. package/src/core/facts/backfill-facts.ts +0 -254
  87. package/src/core/facts/extract-facts.ts +0 -50
  88. package/src/core/hook/citation-detect.ts +0 -124
  89. package/src/core/hook/cite-memo.ts +0 -68
  90. package/src/core/hook/claude-settings.ts +0 -187
  91. package/src/core/hook/gate.ts +0 -25
  92. package/src/core/hook/hook-log.ts +0 -41
  93. package/src/core/hook/memo-sweep.ts +0 -164
  94. package/src/core/hook/memo.ts +0 -67
  95. package/src/core/hook/pointer-block.ts +0 -26
  96. package/src/core/hook/select.ts +0 -32
  97. package/src/core/hook/transcript.ts +0 -121
  98. package/src/core/ingest/ingest-session.ts +0 -111
  99. package/src/core/providers/provider-models.ts +0 -100
  100. package/src/core/providers/provider-registry.ts +0 -196
  101. package/src/core/recall/citation-log.ts +0 -108
  102. package/src/core/recall/filter.ts +0 -27
  103. package/src/core/recall/index.ts +0 -6
  104. package/src/core/recall/match-fields.ts +0 -40
  105. package/src/core/recall/query-log.ts +0 -149
  106. package/src/core/recall/query-shape.ts +0 -66
  107. package/src/core/recall/recall-service.ts +0 -320
  108. package/src/core/recall/recent-log.ts +0 -59
  109. package/src/core/recall/tokenize.ts +0 -18
  110. package/src/core/recall/useful-scan.ts +0 -336
  111. package/src/core/recall-facts/fact-query-log.ts +0 -150
  112. package/src/core/recall-facts/fact-recall-service.ts +0 -327
  113. package/src/core/scheduler/scan-once.ts +0 -142
  114. package/src/core/scheduler/scheduler.ts +0 -225
  115. package/src/core/sources/source-registry.ts +0 -278
  116. package/src/core/storage/db-restore.ts +0 -133
  117. package/src/core/storage/live-status.ts +0 -45
  118. package/src/core/storage/migrate.ts +0 -72
  119. package/src/core/storage/sqlite-fact-store.ts +0 -304
  120. package/src/core/storage/sqlite-session-store.ts +0 -810
  121. package/src/hook/hook-auth.ts +0 -18
  122. package/src/hook/prompt-recall-hook.ts +0 -180
  123. package/src/hook/session-end-hook.ts +0 -81
  124. package/src/hook/session-start-hook.ts +0 -168
  125. package/src/hook/stop-hook.ts +0 -239
  126. package/src/http/app.ts +0 -1215
  127. package/src/install/claude-code.ts +0 -128
  128. package/src/install/codex.ts +0 -367
  129. package/src/install/cursor.ts +0 -68
  130. package/src/install/hermes-agent.ts +0 -76
  131. package/src/install/hermes.ts +0 -78
  132. package/src/install/nlm-dir-perms.ts +0 -55
  133. package/src/install/ollama.ts +0 -284
  134. package/src/install/setup.ts +0 -489
  135. package/src/install/windsurf.ts +0 -68
  136. package/src/llm/classifier-box.ts +0 -64
  137. package/src/llm/deepseek-client.ts +0 -150
  138. package/src/llm/env-autoload.ts +0 -55
  139. package/src/llm/ollama-client.ts +0 -189
  140. package/src/mcp/server.ts +0 -534
  141. package/src/ports/fact-store.ts +0 -102
  142. package/src/ports/llm-client.ts +0 -52
  143. package/src/ports/logger.ts +0 -16
  144. package/src/ports/session-store.ts +0 -45
  145. package/src/ports/transcript-adapter.ts +0 -55
  146. package/src/shared/types.ts +0 -149
  147. package/src/ui/App.tsx +0 -58
  148. package/src/ui/components/PromoteOpenButton.tsx +0 -65
  149. package/src/ui/components/SessionDrawer.tsx +0 -199
  150. package/src/ui/components/SideNav.tsx +0 -162
  151. package/src/ui/components/Skeleton.tsx +0 -107
  152. package/src/ui/index.html +0 -13
  153. package/src/ui/lib/actions.ts +0 -30
  154. package/src/ui/lib/api.ts +0 -92
  155. package/src/ui/lib/dataset.ts +0 -141
  156. package/src/ui/lib/registries.ts +0 -155
  157. package/src/ui/lib/view-settings.ts +0 -41
  158. package/src/ui/main.tsx +0 -15
  159. package/src/ui/pages/Live.tsx +0 -229
  160. package/src/ui/pages/Pulse.tsx +0 -415
  161. package/src/ui/pages/Recall.tsx +0 -190
  162. package/src/ui/pages/River.tsx +0 -354
  163. package/src/ui/pages/Search.tsx +0 -386
  164. package/src/ui/pages/Stub.tsx +0 -9
  165. package/src/ui/pages/Thread.tsx +0 -473
  166. package/src/ui/pages/settings/Classifier.tsx +0 -227
  167. package/src/ui/pages/settings/Data.tsx +0 -190
  168. package/src/ui/pages/settings/Index.tsx +0 -65
  169. package/src/ui/pages/settings/Labels.tsx +0 -224
  170. package/src/ui/pages/settings/Providers.tsx +0 -305
  171. package/src/ui/pages/settings/SettingsSubnav.tsx +0 -28
  172. package/src/ui/pages/settings/Sources.tsx +0 -326
  173. package/src/ui/pages/settings/Views.tsx +0 -96
  174. package/src/ui/styles.css +0 -1890
  175. package/src/ui/tsconfig.json +0 -21
  176. package/src/ui/vite.config.ts +0 -19
  177. package/tests/fixtures/claude_code/short_session.jsonl +0 -2
  178. package/tests/fixtures/claude_code/standard_iso.jsonl +0 -4
  179. package/tests/fixtures/claude_code/tool_heavy.jsonl +0 -8
  180. package/tests/fixtures/claude_code/with_subagent.jsonl +0 -7
  181. package/tests/fixtures/facts.ts +0 -17
  182. package/tests/fixtures/golden-corpus.ts +0 -85
  183. package/tests/fixtures/hermes/paired_request_dump.json +0 -24
  184. package/tests/fixtures/hermes/paired_session.json +0 -23
  185. package/tests/fixtures/hermes/request_dump.json +0 -28
  186. package/tests/fixtures/hermes/session_iso.json +0 -38
  187. package/tests/fixtures/hermes/session_unix.json +0 -38
  188. package/tests/fixtures/hermes/system_only.json +0 -18
  189. package/tests/fixtures/pi/error-connection-abort.jsonl +0 -8
  190. package/tests/fixtures/pi/short-successful.jsonl +0 -5
  191. package/tests/fixtures/pi/with-custom-message.jsonl +0 -6
  192. package/tests/fixtures/sessions.ts +0 -22
  193. package/tests/integration/backfill-facts.test.ts +0 -362
  194. package/tests/integration/citation-explicit.test.ts +0 -111
  195. package/tests/integration/cite-event.test.ts +0 -169
  196. package/tests/integration/cite-memo.test.ts +0 -87
  197. package/tests/integration/db-restore.test.ts +0 -153
  198. package/tests/integration/embed-backfill.test.ts +0 -176
  199. package/tests/integration/fact-supersedence.test.ts +0 -313
  200. package/tests/integration/fts-index.test.ts +0 -60
  201. package/tests/integration/getbyids-sqlite.test.ts +0 -100
  202. package/tests/integration/hermes-agent-hooks.test.ts +0 -248
  203. package/tests/integration/hook-claude-settings.test.ts +0 -218
  204. package/tests/integration/hook-log.test.ts +0 -54
  205. package/tests/integration/hook-memo.test.ts +0 -68
  206. package/tests/integration/hook-pre-compact.test.ts +0 -105
  207. package/tests/integration/hook-subagent-start.test.ts +0 -102
  208. package/tests/integration/http.test.ts +0 -401
  209. package/tests/integration/keyword-search-fts.test.ts +0 -66
  210. package/tests/integration/mcp-recall-logging.test.ts +0 -88
  211. package/tests/integration/mcp.test.ts +0 -260
  212. package/tests/integration/memo-sweep.test.ts +0 -91
  213. package/tests/integration/prompt-recall-hook.test.ts +0 -88
  214. package/tests/integration/provider-registry.test.ts +0 -107
  215. package/tests/integration/recall-golden.test.ts +0 -59
  216. package/tests/integration/recall-sqlite.test.ts +0 -169
  217. package/tests/integration/scheduler.test.ts +0 -391
  218. package/tests/integration/session-end-hook.test.ts +0 -48
  219. package/tests/integration/session-start-hook.test.ts +0 -126
  220. package/tests/integration/source-registry.test.ts +0 -122
  221. package/tests/integration/sqlite-fact-store.test.ts +0 -346
  222. package/tests/integration/stop-hook.test.ts +0 -560
  223. package/tests/integration/wal-checkpoint.test.ts +0 -49
  224. package/tests/unit/cli/launchctl-helpers.test.ts +0 -60
  225. package/tests/unit/core/adapters/aider.test.ts +0 -230
  226. package/tests/unit/core/adapters/claude-code.test.ts +0 -118
  227. package/tests/unit/core/adapters/cursor.test.ts +0 -485
  228. package/tests/unit/core/adapters/hermes-agent.test.ts +0 -329
  229. package/tests/unit/core/adapters/hermes.test.ts +0 -81
  230. package/tests/unit/core/adapters/jsonl-generic.test.ts +0 -142
  231. package/tests/unit/core/adapters/opencode.test.ts +0 -354
  232. package/tests/unit/core/adapters/pi.test.ts +0 -110
  233. package/tests/unit/core/adapters/windsurf.test.ts +0 -416
  234. package/tests/unit/core/classifier/prompt.test.ts +0 -126
  235. package/tests/unit/core/embedding/chunk-body.test.ts +0 -100
  236. package/tests/unit/core/facts/extract-facts.test.ts +0 -117
  237. package/tests/unit/core/filter.test.ts +0 -40
  238. package/tests/unit/core/hook/citation-detect-cite-session.test.ts +0 -96
  239. package/tests/unit/core/hook/citation-detect.test.ts +0 -124
  240. package/tests/unit/core/hook/gate.test.ts +0 -29
  241. package/tests/unit/core/hook/pointer-block.test.ts +0 -22
  242. package/tests/unit/core/hook/select.test.ts +0 -66
  243. package/tests/unit/core/match-fields.test.ts +0 -39
  244. package/tests/unit/core/mcp-cite-session.test.ts +0 -51
  245. package/tests/unit/core/providers/provider-models.test.ts +0 -101
  246. package/tests/unit/core/query-shape.test.ts +0 -92
  247. package/tests/unit/core/recall-facts/fact-recall-service.test.ts +0 -258
  248. package/tests/unit/core/recall-service.test.ts +0 -200
  249. package/tests/unit/core/storage/live-status.test.ts +0 -54
  250. package/tests/unit/core/tokenize.test.ts +0 -32
  251. package/tests/unit/core/useful-scan.test.ts +0 -537
  252. package/tests/unit/llm/embed.test.ts +0 -93
  253. package/tests/unit/llm/ollama-client.test.ts +0 -124
  254. package/tests/unit/scripts/longmemeval-scorer.test.ts +0 -114
  255. package/tsconfig.json +0 -31
  256. package/tsconfig.test.json +0 -11
  257. package/vitest.config.ts +0 -22
@@ -1,77 +0,0 @@
1
- /**
2
- * SHA256-keyed on-disk embedding cache. The LongMemEval-S haystack has
3
- * ~24K session bodies (~19K unique); embedding them via local Ollama takes
4
- * ~30 min the first time. Reruns must be instant — calibrating retrieval
5
- * parameters means dozens of re-evaluations, and re-embedding each time
6
- * would burn hours of wall clock for no signal.
7
- *
8
- * Backed by a small SQLite at $LONGMEMEVAL_CACHE_DIR/embeddings.sqlite.
9
- * Key = sha256(kind + ":" + text); value = Float32Array as BLOB.
10
- */
11
-
12
- import Database from "better-sqlite3";
13
- import type { Database as DB } from "better-sqlite3";
14
- import { createHash } from "node:crypto";
15
- import { mkdirSync } from "node:fs";
16
- import { dirname } from "node:path";
17
- import type { EmbeddingKind, LLMClient } from "../../src/ports/llm-client.js";
18
-
19
/** DDL for the single cache table; `IF NOT EXISTS` makes re-opening an existing cache a no-op. */
const CREATE_SQL =
  "CREATE TABLE IF NOT EXISTS embeddings (key TEXT PRIMARY KEY, vector BLOB NOT NULL)";

/** Construction options for {@link EmbeddingCache}. */
export interface EmbeddingCacheOptions {
  /** Path of the SQLite cache file; missing parent directories are created. */
  readonly dbPath: string;
  /** Client asked for an embedding whenever the cache has no usable row. */
  readonly llm: LLMClient;
}

/**
 * Decode a cached BLOB into a standalone Float32Array.
 *
 * The bytes are copied instead of being viewed in place: a Float32Array view
 * requires a 4-byte-aligned byteOffset, which a Buffer slice does not
 * guarantee, and a copy also detaches the result from the driver's memory.
 *
 * @returns the decoded vector, or `undefined` when the blob length is not a
 *          whole number of 32-bit floats (a corrupt or truncated row).
 */
function blobToFloat32(blob: Buffer): Float32Array | undefined {
  const width = Float32Array.BYTES_PER_ELEMENT;
  if (blob.byteLength % width !== 0) return undefined;
  const out = new Float32Array(blob.byteLength / width);
  new Uint8Array(out.buffer).set(blob);
  return out;
}

/**
 * On-disk embedding cache backed by one SQLite table.
 *
 * Rows are keyed by sha256(`${kind}:${text}`) and store the vector's raw
 * bytes as a BLOB.
 *
 * NOTE(review): the key does not include the embedding model, so reusing a
 * cache file after switching models would presumably return vectors from the
 * previous model — confirm, and clear the cache when changing models.
 */
export class EmbeddingCache {
  private readonly db: DB;
  private readonly llm: LLMClient;
  private readonly getStmt: ReturnType<DB["prepare"]>;
  private readonly putStmt: ReturnType<DB["prepare"]>;
  private readonly countStmt: ReturnType<DB["prepare"]>;

  /** Open (creating if needed) the cache database and prepare its statements. */
  constructor(opts: EmbeddingCacheOptions) {
    mkdirSync(dirname(opts.dbPath), { recursive: true });
    this.db = new Database(opts.dbPath);
    this.db.pragma("journal_mode = WAL");
    this.db.prepare(CREATE_SQL).run();
    this.getStmt = this.db.prepare(
      "SELECT vector FROM embeddings WHERE key = @key",
    );
    this.putStmt = this.db.prepare(
      "INSERT OR REPLACE INTO embeddings (key, vector) VALUES (@key, @vector)",
    );
    // Prepared once: size() is polled repeatedly for progress logging.
    this.countStmt = this.db.prepare("SELECT COUNT(*) AS n FROM embeddings");
    this.llm = opts.llm;
  }

  /**
   * Return the embedding for `text`, computing and storing it on a miss.
   *
   * A row whose blob cannot be decoded is treated as a miss; the fresh
   * vector then overwrites it via INSERT OR REPLACE.
   *
   * @param text - Text to embed.
   * @param kind - Embedding kind; part of the cache key and forwarded to the client.
   * @returns the cached or newly computed vector. Errors from the client's
   *          `embed` propagate to the caller and nothing is stored.
   */
  async embed(text: string, kind: EmbeddingKind): Promise<Float32Array> {
    const key = createHash("sha256").update(`${kind}:${text}`).digest("hex");
    const row = this.getStmt.get({ key }) as { vector: Buffer } | undefined;
    if (row) {
      const cached = blobToFloat32(row.vector);
      if (cached) return cached;
    }
    const result = await this.llm.embed(text, kind);
    const blob = Buffer.from(
      result.vector.buffer,
      result.vector.byteOffset,
      result.vector.byteLength,
    );
    this.putStmt.run({ key, vector: blob });
    return result.vector;
  }

  /** Number of rows currently stored in the cache. */
  size(): number {
    const row = this.countStmt.get() as { n: number };
    return row.n;
  }

  /** Close the underlying database handle. */
  close(): void {
    this.db.close();
  }
}
@@ -1,25 +0,0 @@
1
- #!/usr/bin/env bash
2
- # Fetch the LongMemEval-S (small) dataset from HuggingFace into a local cache.
3
- # Idempotent — skips download if the target file already exists with non-zero size.
4
- #
5
- # Source: https://huggingface.co/datasets/xiaowu0162/longmemeval-cleaned
6
- # Variant: longmemeval_s_cleaned.json — 500 questions × ~40 haystack sessions.
7
-
8
set -euo pipefail

CACHE_DIR="${LONGMEMEVAL_CACHE_DIR:-$HOME/.cache/longmemeval}"
VARIANT="${LONGMEMEVAL_VARIANT:-longmemeval_s_cleaned.json}"
URL="https://huggingface.co/datasets/xiaowu0162/longmemeval-cleaned/resolve/main/${VARIANT}"
TARGET="${CACHE_DIR}/${VARIANT}"
# Download to a sibling temp file so an interrupted transfer never looks like a finished one.
TMP="${TARGET}.tmp"

mkdir -p "$CACHE_DIR"

if [[ -s "$TARGET" ]]; then
  echo "longmemeval-fetch: ${TARGET} already present ($(wc -c <"$TARGET") bytes) — skipping."
  exit 0
fi

# Remove a partial download on every exit path (curl failure, signal, empty
# file). After a successful mv the temp file no longer exists, so this is a no-op.
trap 'rm -f "$TMP"' EXIT

echo "longmemeval-fetch: downloading ${URL}"
curl -fsSL --retry 3 -o "$TMP" "$URL"

# Never promote an empty download to the cache file.
if [[ ! -s "$TMP" ]]; then
  echo "longmemeval-fetch: download of ${URL} produced an empty file — aborting." >&2
  exit 1
fi

mv "$TMP" "$TARGET"
echo "longmemeval-fetch: wrote ${TARGET} ($(wc -c <"$TARGET") bytes)"
@@ -1,315 +0,0 @@
1
- /**
2
- * LongMemEval-S baseline harness for NLM.
3
- *
4
- * Body-only ingest (skip classifier) + local Ollama nomic-embed-text. For
5
- * each evaluation instance: spin up an in-memory NLM corpus loaded with
6
- * the haystack sessions, query in each retrieval mode (keyword / semantic
7
- * / hybrid+RRF), score R@5 plus the session-body companion metric.
8
- *
9
- * Pure body-only retrieval — this measures the retrieval *algorithm*, not
10
- * the full classifier-in-loop NLM pipeline. The number is comparable to
11
- * agentmemory's published R@5 because both bench bodies-only.
12
- *
13
- * Usage:
14
- * node dist/scripts/longmemeval/run-harness.js \
15
- * --variant longmemeval_s_cleaned.json \
16
- * --modes keyword,semantic,hybrid \
17
- * --limit 500 \
18
- * --report-dir reports/longmemeval
19
- *
20
- * Re-runs are fast: embeddings cache in ~/.cache/longmemeval/embeddings.sqlite
21
- * keyed by sha256(kind + ":" + text). First run = ~30 min embedding; subsequent = seconds.
22
- */
23
-
24
- import { mkdtempSync, mkdirSync, readFileSync, rmSync, writeFileSync } from "node:fs";
25
- import { homedir, tmpdir } from "node:os";
26
- import { dirname, join, resolve } from "node:path";
27
- import { fileURLToPath } from "node:url";
28
-
29
- const __filename = fileURLToPath(import.meta.url);
30
- const __dirname = dirname(__filename);
31
- import { RecallService } from "../../src/core/recall/recall-service.js";
32
- import { SqliteSessionStore } from "../../src/core/storage/sqlite-session-store.js";
33
- import { OllamaClient } from "../../src/llm/ollama-client.js";
34
- import type {
35
- EmbedResult,
36
- EmbeddingKind,
37
- LLMClient,
38
- } from "../../src/ports/llm-client.js";
39
- import type { RecallMode } from "../../src/shared/types.js";
40
- import { EmbeddingCache } from "./embedding-cache.js";
41
- import { scoreOne, aggregate, type SingleScore } from "./scorer.js";
42
- import { turnsToBody, type LongMemEvalInstance } from "./types.js";
43
- import { chunkSessionText } from "../../src/core/embedding/chunk-body.js";
44
-
45
/** Resolved command-line configuration for one harness run. */
interface Args {
  /** Dataset JSON file: the cache directory joined with `--variant`. */
  readonly datasetPath: string;
  /** Retrieval modes to evaluate, in the order given on the command line. */
  readonly modes: ReadonlyArray<RecallMode>;
  /** Maximum number of dataset instances to evaluate. */
  readonly limit: number;
  /** Cut-off used both as the search limit and as the k in R@k. */
  readonly k: number;
  /** Directory under which a timestamped report folder is written. */
  readonly reportDir: string;
  /** Directory holding the dataset and the embedding cache. */
  readonly cacheDir: string;
  /** Migrations directory handed to the per-instance session store. */
  readonly migrationsDir: string;
}

/**
 * Parse the harness flags (`--variant`, `--modes`, `--limit`, `--k`,
 * `--report-dir`), applying a default for every flag that is absent.
 *
 * @param argv - Arguments after the script name.
 * @returns the resolved configuration.
 * @throws Error when a flag is present without a value, when `--limit` or
 *         `--k` is not a positive integer, or when `--modes` names no mode.
 *         Without these checks a bad value became `NaN` and the run silently
 *         evaluated zero instances.
 */
function parseArgs(argv: ReadonlyArray<string>): Args {
  const get = (flag: string, fallback?: string): string => {
    const i = argv.indexOf(flag);
    if (i < 0) {
      if (fallback === undefined) throw new Error(`missing required flag: ${flag}`);
      return fallback;
    }
    const value = argv[i + 1];
    // A following "--x" token is the next flag, not this flag's value.
    if (value === undefined || value === "" || value.startsWith("--")) {
      throw new Error(`flag ${flag} requires a value`);
    }
    return value;
  };

  const getPositiveInt = (flag: string, fallback: string): number => {
    const raw = get(flag, fallback);
    // Digits only: Number.parseInt alone would accept "5abc" as 5.
    const parsed = /^\d+$/.test(raw) ? Number.parseInt(raw, 10) : Number.NaN;
    if (!Number.isSafeInteger(parsed) || parsed < 1) {
      throw new Error(`flag ${flag} expects a positive integer, got "${raw}"`);
    }
    return parsed;
  };

  const cacheDir =
    process.env["LONGMEMEVAL_CACHE_DIR"] ?? join(homedir(), ".cache", "longmemeval");
  const variant = get("--variant", "longmemeval_s_cleaned.json");
  const datasetPath = join(cacheDir, variant);

  // Drop empty entries ("a,,b", trailing comma) and repeats so each mode is
  // evaluated and reported once. Values are not checked against the
  // RecallMode union, which is declared outside this file.
  const modeNames = get("--modes", "keyword,semantic,hybrid")
    .split(",")
    .map((m) => m.trim())
    .filter((m) => m.length > 0);
  if (modeNames.length === 0) {
    throw new Error("flag --modes must name at least one retrieval mode");
  }
  const modes = [...new Set(modeNames)] as RecallMode[];

  const limit = getPositiveInt("--limit", "500");
  const k = getPositiveInt("--k", "5");
  const reportDir = get("--report-dir", resolve("reports/longmemeval"));
  const migrationsDir = resolve(__dirname, "../../migrations");
  return { datasetPath, modes, limit, k, reportDir, cacheDir, migrationsDir };
}
77
-
78
/**
 * LLMClient adapter used by the harness: every embed() call is answered by
 * the on-disk EmbeddingCache, and classify() always rejects because the
 * body-only harness never classifies.
 */
class CachingEmbedder implements LLMClient {
  private readonly cache: EmbeddingCache;

  constructor(cache: EmbeddingCache) {
    this.cache = cache;
  }

  /** Resolve the vector through the cache and tag it with the cached-model label. */
  async embed(text: string, kind: EmbeddingKind): Promise<EmbedResult> {
    return {
      vector: await this.cache.embed(text, kind),
      model: "nomic-embed-text@cached",
    };
  }

  /** Not supported by this adapter; always rejects. */
  async classify(): Promise<never> {
    const reason = "classify not used in LongMemEval body-only harness";
    throw new Error(reason);
  }
}
89
-
90
/** Outcome of evaluating one dataset question. */
interface InstanceResult {
  readonly question_id: string;
  readonly question_type: string;
  /** One score per retrieval mode that ran, plus the ids that mode returned. */
  readonly by_mode: Record<string, SingleScore & { returnedIds: string[] }>;
  /** Number of chunk embeddings that threw while the haystack was loaded. */
  readonly embed_failures: number;
}

/**
 * Evaluate one question: load its haystack into a throwaway SQLite store,
 * run the question through every requested retrieval mode, and score each
 * result list.
 *
 * @param instance - Question plus its haystack sessions and gold session ids.
 * @param args - Harness configuration (modes, k, migrations directory).
 * @param cache - Embedding cache used for the document chunks.
 * @param embedder - LLM client handed to the RecallService.
 * @returns per-mode scores and the number of failed chunk embeddings.
 *
 * The temp directory is removed on every exit path, including a failure to
 * open the store; the store is closed before the directory is deleted.
 */
async function runInstance(
  instance: LongMemEvalInstance,
  args: Args,
  cache: EmbeddingCache,
  embedder: LLMClient,
): Promise<InstanceResult> {
  // Document embeddings are only needed when a vector-based mode will run.
  const needsEmbeddings = args.modes.some(
    (m) => m === "semantic" || m === "hybrid",
  );
  const tmpDir = mkdtempSync(join(tmpdir(), "nlm-lmeval-"));
  try {
    // Constructed inside the outer try so that a constructor failure still
    // removes tmpDir.
    const store = new SqliteSessionStore({
      dbPath: join(tmpDir, "canonical.sqlite"),
      migrationsDir: args.migrationsDir,
    });
    try {
      const bodyById = new Map<string, string>();
      let embedFailures = 0;
      const seen = new Set<string>();

      for (let i = 0; i < instance.haystack_sessions.length; i++) {
        const id = instance.haystack_session_ids[i];
        const date = instance.haystack_dates[i];
        const turns = instance.haystack_sessions[i];
        // Skip entries whose parallel arrays are incomplete, and ids already inserted.
        if (!id || !date || !turns) continue;
        if (seen.has(id)) continue;
        seen.add(id);

        const body = turnsToBody(turns);
        bodyById.set(id, body);
        store.insertSessionForTest({
          id,
          runtime: "longmemeval",
          runtimeSessionId: id,
          startedAt: date,
          endedAt: date,
          durationMin: 0,
          label: "",
          summary: "",
          body,
          status: "closed",
          transcriptKind: "longmemeval-jsonl",
          transcriptPath: null,
          entities: [],
          decisions: [],
          open: [],
        });

        if (needsEmbeddings) {
          const chunks = chunkSessionText({ body });
          for (let c = 0; c < chunks.length; c++) {
            try {
              const vector = await cache.embed(chunks[c]!, "document");
              store.insertChunkEmbeddingForTest(id, c, vector);
            } catch {
              // A failed chunk is non-fatal: the session's other chunks are
              // still inserted. The counter records failures, not attempts,
              // and is reported on the result so a degraded run is visible.
              embedFailures++;
            }
          }
        }
      }

      const recall = new RecallService({ store, llm: embedder });
      const byMode: InstanceResult["by_mode"] = {};
      for (const mode of args.modes) {
        const result = await recall.search({
          query: instance.question,
          mode,
          limit: args.k,
        });
        const returnedIds = result.results.map((r) => r.id);
        // Ids missing from bodyById score against an empty body.
        const returnedBodies = returnedIds.map((id) => bodyById.get(id) ?? "");
        const score = scoreOne({
          returnedIds,
          goldIds: instance.answer_session_ids,
          returnedBodies,
          answer: instance.answer,
          k: args.k,
        });
        byMode[mode] = { ...score, returnedIds };
      }

      return {
        question_id: instance.question_id,
        question_type: instance.question_type,
        by_mode: byMode,
        embed_failures: embedFailures,
      };
    } finally {
      store.close();
    }
  } finally {
    rmSync(tmpDir, { recursive: true, force: true });
  }
}
186
-
187
/**
 * Harness entry point: load the dataset, evaluate up to `--limit` instances
 * one after another, aggregate the scores overall and per question type, and
 * write `results.json` plus `summary.md` into a timestamped report folder.
 *
 * @throws Error when the dataset file does not contain a JSON array. Errors
 *         from file I/O, argument parsing, or an instance run propagate; the
 *         embedding cache is closed on every exit path.
 */
async function main(): Promise<void> {
  const args = parseArgs(process.argv.slice(2));
  console.log(`longmemeval-harness: loading ${args.datasetPath}`);
  const parsed: unknown = JSON.parse(readFileSync(args.datasetPath, "utf8"));
  if (!Array.isArray(parsed)) {
    throw new Error(
      `longmemeval-harness: expected ${args.datasetPath} to contain a JSON array of instances`,
    );
  }
  // Only the top-level shape is validated; individual instances are trusted.
  const dataset = parsed as LongMemEvalInstance[];
  const slice = dataset.slice(0, args.limit);
  console.log(
    `longmemeval-harness: ${slice.length}/${dataset.length} instances, modes=${args.modes.join(",")}, k=${args.k}`,
  );

  // Build the embedding client and the on-disk cache that fronts it.
  const llm = new OllamaClient({ embedModel: "nomic-embed-text" });
  const cache = new EmbeddingCache({
    dbPath: join(args.cacheDir, "embeddings.sqlite"),
    llm,
  });

  try {
    const embedder = new CachingEmbedder(cache);
    console.log(`longmemeval-harness: cache contains ${cache.size()} embeddings on entry`);

    const results: InstanceResult[] = [];
    const t0 = Date.now();
    for (let i = 0; i < slice.length; i++) {
      const inst = slice[i];
      if (!inst) continue;
      const result = await runInstance(inst, args, cache, embedder);
      results.push(result);
      // Progress line every 10 instances and after the last one.
      if ((i + 1) % 10 === 0 || i === slice.length - 1) {
        const elapsed = ((Date.now() - t0) / 1000).toFixed(1);
        const cached = cache.size();
        console.log(
          `  [${i + 1}/${slice.length}] ${elapsed}s elapsed, cache=${cached}`,
        );
      }
    }

    // Failed chunk embeddings are tolerated per instance; report the total
    // so a run with an unhealthy embedder is not mistaken for a clean one.
    const embedFailures = results.reduce((sum, r) => sum + r.embed_failures, 0);
    if (embedFailures > 0) {
      console.warn(
        `longmemeval-harness: ${embedFailures} chunk embedding(s) failed; semantic/hybrid scores may be affected`,
      );
    }

    // Aggregate.
    const aggregated: Record<string, ReturnType<typeof aggregate>> = {};
    for (const mode of args.modes) {
      aggregated[mode] = aggregate(
        results.map((r) => r.by_mode[mode] as SingleScore).filter(Boolean),
      );
    }

    // Per-question-type breakdown.
    const byType: Record<string, Record<string, ReturnType<typeof aggregate>>> = {};
    const types = new Set(results.map((r) => r.question_type));
    for (const t of types) {
      const perMode: Record<string, ReturnType<typeof aggregate>> = {};
      const ofType = results.filter((r) => r.question_type === t);
      for (const mode of args.modes) {
        perMode[mode] = aggregate(
          ofType.map((r) => r.by_mode[mode] as SingleScore).filter(Boolean),
        );
      }
      byType[t] = perMode;
    }

    // ISO timestamp with ":" and "T" replaced by "-", cut to whole seconds.
    const stamp = new Date().toISOString().replace(/[:T]/g, "-").slice(0, 19);
    const outDir = join(args.reportDir, stamp);
    mkdirSync(outDir, { recursive: true });
    const json = {
      dataset: args.datasetPath,
      n: results.length,
      k: args.k,
      modes: args.modes,
      aggregate: aggregated,
      by_question_type: byType,
      results,
      elapsed_seconds: (Date.now() - t0) / 1000,
    };
    // Rendered once and reused for both the file and the console.
    const summary = renderSummary(json);
    writeFileSync(join(outDir, "results.json"), JSON.stringify(json, null, 2));
    writeFileSync(join(outDir, "summary.md"), summary);
    console.log(`longmemeval-harness: wrote ${outDir}/`);
    console.log(summary);
  } finally {
    cache.close();
  }
}
265
-
266
/**
 * Render the run as a Markdown report: a title and run facts, an aggregate
 * table with one row per mode, and a per-question-type table with one column
 * per mode. Modes without an aggregate entry are omitted from the first
 * table and shown as an em dash in the second.
 *
 * @returns the Markdown text, ending with a newline.
 */
function renderSummary(json: {
  dataset: string;
  n: number;
  k: number;
  modes: ReadonlyArray<RecallMode>;
  aggregate: Record<string, ReturnType<typeof aggregate>>;
  by_question_type: Record<string, Record<string, ReturnType<typeof aggregate>>>;
  elapsed_seconds: number;
}): string {
  const { modes, k } = json;
  const pct = (rate: number): string => `${(rate * 100).toFixed(1)}%`;

  const aggregateRows = modes.flatMap((mode) => {
    const stats = json.aggregate[mode];
    return stats
      ? [`| ${mode} | ${pct(stats.recallAtK)} | ${pct(stats.sessionBodyHitRate)} |`]
      : [];
  });

  const typeRows = Object.keys(json.by_question_type)
    .sort()
    .map((questionType) => {
      const perMode = json.by_question_type[questionType]!;
      const cells = modes.map((mode) => {
        const stats = perMode[mode];
        return stats ? `${pct(stats.recallAtK)} (n=${stats.n})` : "—";
      });
      return `| ${questionType} | ${cells.join(" | ")} |`;
    });

  return [
    `# LongMemEval-S — NLM baseline (body-only, n=${json.n}, k=${k})`,
    "",
    `Dataset: \`${json.dataset}\``,
    `Elapsed: ${json.elapsed_seconds.toFixed(1)}s`,
    "",
    "## Aggregate",
    "",
    `| Mode | R@${k} | Session-body hit |`,
    `| --- | --- | --- |`,
    ...aggregateRows,
    "",
    "## By question type",
    "",
    `| Question type | ${modes.map((m) => `${m} R@${k}`).join(" | ")} |`,
    `| --- | ${modes.map(() => "---").join(" | ")} |`,
    ...typeRows,
    "",
  ].join("\n");
}
311
-
312
// Script entry: a rejection from main() is logged and the process exits with status 1.
const reportFatal = (err: unknown): never => {
  console.error("longmemeval-harness: fatal", err);
  process.exit(1);
};

void main().catch(reportFatal);
@@ -1,99 +0,0 @@
1
- /**
2
- * Pure scoring functions for the LongMemEval harness. Two metrics:
3
- *
4
- * - R@k (recall at k): did the retriever return any gold session ID in
5
- * its top-k results? Standard benchmark metric.
6
- * - Session-body hit: did the gold answer text appear anywhere in the
7
- * bodies of the top-k returned sessions? NLM-specific companion that
8
- * captures session-as-primary-unit value the strict-ID R@k can miss
9
- * (e.g. a session that supersedes the gold session and quotes its
10
- * decision).
11
- *
12
- * Both functions are deterministic and dependency-free so the harness can
13
- * test them with synthetic inputs.
14
- */
15
-
16
/** Inputs for scoring one question. */
export interface ScoreInputs {
  /** Session ids in the order the retriever returned them. */
  readonly returnedIds: ReadonlyArray<string>;
  /** Ids of the sessions that contain the answer. */
  readonly goldIds: ReadonlyArray<string>;
  /** Bodies of the returned sessions, in the same order as `returnedIds`. */
  readonly returnedBodies: ReadonlyArray<string>;
  /** Some LongMemEval answers are ints (counting questions); coerced to string. */
  readonly answer: string | number | boolean;
  /** Only the first `k` ids and bodies are scored. */
  readonly k: number;
}

/** 0/1 indicators for one question. */
export interface SingleScore {
  readonly recallAtK: 0 | 1;
  readonly sessionBodyHit: 0 | 1;
}

/**
 * Score a single question. Returns 0/1 indicators that aggregate via mean.
 *
 * - `recallAtK` is 1 when any gold id appears among the first `k` returned ids.
 * - `sessionBodyHit` is 1 when the normalized answer appears in any of the
 *   first `k` returned bodies. Multi-word or long answers match as a
 *   substring. A short single-token answer (fewer than 4 characters, e.g.
 *   "3" or "yes") must equal a whole token, so "3" does not hit "30" or "13th".
 *
 * Whole-token comparison runs on the normalized text, whose tokens are runs
 * of Unicode letters and digits separated by single spaces. A `\b` regex is
 * not used: its notion of a word character is ASCII-only, so short answers
 * in other scripts could never match.
 *
 * A negative or NaN `k` is treated as 0, giving both indicators 0.
 */
export function scoreOne(input: ScoreInputs): SingleScore {
  const k = Number.isNaN(input.k) ? 0 : Math.max(0, input.k);
  const goldSet = new Set(input.goldIds);
  const recallAtK = input.returnedIds.slice(0, k).some((id) => goldSet.has(id)) ? 1 : 0;

  const ans = normalize(String(input.answer));
  if (ans.length === 0) {
    // An answer with no letters or digits cannot be matched.
    return { recallAtK, sessionBodyHit: 0 };
  }

  const isShortToken = !ans.includes(" ") && ans.length < 4;
  const bodyMatches = isShortToken
    ? (body: string): boolean => normalize(body).split(" ").includes(ans)
    : (body: string): boolean => normalize(body).includes(ans);
  const sessionBodyHit = input.returnedBodies.slice(0, k).some(bodyMatches) ? 1 : 0;

  return { recallAtK, sessionBodyHit };
}

/** Mean rates over a set of per-question scores. */
export interface Aggregate {
  /** Number of scores aggregated. */
  readonly n: number;
  /** Mean of `recallAtK`, rounded to three decimals. */
  readonly recallAtK: number;
  /** Mean of `sessionBodyHit`, rounded to three decimals. */
  readonly sessionBodyHitRate: number;
}

/** Aggregate per-question scores into mean rates; an empty input yields all zeros. */
export function aggregate(scores: ReadonlyArray<SingleScore>): Aggregate {
  const n = scores.length;
  if (n === 0) {
    return { n: 0, recallAtK: 0, sessionBodyHitRate: 0 };
  }
  let r = 0;
  let s = 0;
  for (const x of scores) {
    r += x.recallAtK;
    s += x.sessionBodyHit;
  }
  return {
    n,
    recallAtK: round3(r / n),
    sessionBodyHitRate: round3(s / n),
  };
}

/**
 * Lower-case the text, replace every character that is not a Unicode letter,
 * digit, or whitespace with a space, collapse whitespace runs to one space,
 * and trim.
 */
function normalize(s: string): string {
  return s
    .toLowerCase()
    .replace(/[^\p{L}\p{N}\s]/gu, " ")
    .replace(/\s+/g, " ")
    .trim();
}

/** Round to three decimal places. */
function round3(x: number): number {
  return Math.round(x * 1000) / 1000;
}
@@ -1,9 +0,0 @@
1
- {
2
- "extends": "../../tsconfig.json",
3
- "compilerOptions": {
4
- "rootDir": "../..",
5
- "outDir": "../../dist-scripts",
6
- "noEmit": true
7
- },
8
- "include": ["./*.ts", "../../src/**/*.ts"]
9
- }
@@ -1,35 +0,0 @@
1
- /**
2
- * LongMemEval dataset schema. Mirrors the published JSON shape from
3
- * huggingface.co/datasets/xiaowu0162/longmemeval-cleaned.
4
- *
5
- * Each instance: a question against a haystack of past chat sessions. The
6
- * gold session IDs are in `answer_session_ids` — that's what the retrieval
7
- * step is scored against (R@k: was any gold ID returned in the top k).
8
- */
9
-
10
/** One message inside a haystack session. */
export interface LongMemEvalTurn {
  readonly role: "user" | "assistant";
  readonly content: string;
  readonly has_answer?: boolean;
}

/** One evaluation instance: a question, its haystack of sessions, and the gold session ids. */
export interface LongMemEvalInstance {
  readonly question_id: string;
  readonly question_type: string;
  readonly question: string;
  // Counting and temporal-reasoning questions sometimes carry int/boolean
  // answers, so callers coerce to string where they need text.
  readonly answer: string | number | boolean;
  readonly question_date: string;
  readonly haystack_session_ids: readonly string[];
  readonly haystack_dates: readonly string[];
  readonly haystack_sessions: readonly (readonly LongMemEvalTurn[])[];
  readonly answer_session_ids: readonly string[];
}

/**
 * Flatten a session's turns into one body string for NLM ingest: each turn
 * becomes "User: …" or "Assistant: …", and turns are separated by a blank line.
 */
export function turnsToBody(turns: ReadonlyArray<LongMemEvalTurn>): string {
  const rendered: string[] = [];
  for (const { role, content } of turns) {
    const speaker = role === "user" ? "User" : "Assistant";
    rendered.push(`${speaker}: ${content}`);
  }
  return rendered.join("\n\n");
}