nlm-memory 0.4.2 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (285)
  1. package/README.md +72 -34
  2. package/dist/cli/nlm.js +223 -33
  3. package/dist/cli/nlm.js.map +1 -1
  4. package/dist/core/adapters/cursor.d.ts +45 -0
  5. package/dist/core/adapters/cursor.js +397 -0
  6. package/dist/core/adapters/cursor.js.map +1 -0
  7. package/dist/core/adapters/from-source.js +10 -0
  8. package/dist/core/adapters/from-source.js.map +1 -1
  9. package/dist/core/adapters/windsurf.d.ts +44 -0
  10. package/dist/core/adapters/windsurf.js +299 -0
  11. package/dist/core/adapters/windsurf.js.map +1 -0
  12. package/dist/core/hook/claude-settings.d.ts +12 -5
  13. package/dist/core/hook/claude-settings.js +21 -6
  14. package/dist/core/hook/claude-settings.js.map +1 -1
  15. package/dist/core/sources/source-registry.d.ts +1 -1
  16. package/dist/core/sources/source-registry.js +18 -0
  17. package/dist/core/sources/source-registry.js.map +1 -1
  18. package/dist/core/storage/sqlite-session-store.d.ts +2 -0
  19. package/dist/core/storage/sqlite-session-store.js +38 -2
  20. package/dist/core/storage/sqlite-session-store.js.map +1 -1
  21. package/dist/hook/hook-auth.d.ts +13 -0
  22. package/dist/hook/hook-auth.js +19 -0
  23. package/dist/hook/hook-auth.js.map +1 -0
  24. package/dist/hook/prompt-recall-hook.js +7 -1
  25. package/dist/hook/prompt-recall-hook.js.map +1 -1
  26. package/dist/hook/session-start-hook.js +4 -1
  27. package/dist/hook/session-start-hook.js.map +1 -1
  28. package/dist/hook/stop-hook.js +4 -1
  29. package/dist/hook/stop-hook.js.map +1 -1
  30. package/dist/http/app.d.ts +2 -0
  31. package/dist/http/app.js +76 -1
  32. package/dist/http/app.js.map +1 -1
  33. package/dist/install/claude-code.js +1 -1
  34. package/dist/install/claude-code.js.map +1 -1
  35. package/dist/install/cursor.d.ts +25 -0
  36. package/dist/install/cursor.js +43 -0
  37. package/dist/install/cursor.js.map +1 -0
  38. package/dist/install/nlm-dir-perms.d.ts +19 -0
  39. package/dist/install/nlm-dir-perms.js +43 -0
  40. package/dist/install/nlm-dir-perms.js.map +1 -0
  41. package/dist/install/ollama.d.ts +18 -1
  42. package/dist/install/ollama.js +62 -7
  43. package/dist/install/ollama.js.map +1 -1
  44. package/dist/install/setup.d.ts +4 -0
  45. package/dist/install/setup.js +141 -18
  46. package/dist/install/setup.js.map +1 -1
  47. package/dist/install/windsurf.d.ts +25 -0
  48. package/dist/install/windsurf.js +43 -0
  49. package/dist/install/windsurf.js.map +1 -0
  50. package/dist/mcp/server.js +20 -1
  51. package/dist/mcp/server.js.map +1 -1
  52. package/dist/shared/types.d.ts +4 -0
  53. package/dist/ui/assets/{index-BA6IpU8g.css → index-Beo8psd-.css} +1 -1
  54. package/dist/ui/assets/index-CSPTTeeM.js +69 -0
  55. package/dist/ui/index.html +2 -2
  56. package/package.json +26 -1
  57. package/plugin/scripts/prompt-recall-hook.mjs +55 -4
  58. package/plugin/scripts/stop-hook.mjs +57 -6
  59. package/.agents/plugins/marketplace.json +0 -20
  60. package/.github/workflows/ci.yml +0 -30
  61. package/dist/ui/assets/index-B_qIVV0k.js +0 -69
  62. package/docs/methodology/re-derivation-rate.md +0 -112
  63. package/docs/methodology/useful-hit-rate.md +0 -79
  64. package/docs/plans/2026-05-20-fts5-lexical-recall.md +0 -1088
  65. package/docs/plans/2026-05-20-recall-daemon-wedge-fix.md +0 -662
  66. package/docs/plans/2026-05-20-recall-hook-design.md +0 -131
  67. package/docs/plans/2026-05-20-recall-hook-implementation.md +0 -1222
  68. package/docs/plans/desktop-product.md +0 -69
  69. package/docs/plans/factstore-design.md +0 -236
  70. package/logs/CHANGELOG/CHANGELOG-2026.md +0 -1389
  71. package/logs/CHANGELOG/CHANGELOG.md +0 -337
  72. package/migrations/000_initial_schema.sql +0 -174
  73. package/migrations/001_entity_type_rename.sql +0 -17
  74. package/migrations/002_adapter_state_extend.sql +0 -12
  75. package/migrations/003_session_embeddings.sql +0 -11
  76. package/migrations/004_facts.sql +0 -46
  77. package/migrations/005_sources.sql +0 -31
  78. package/migrations/006_providers.sql +0 -33
  79. package/migrations/007_source_tokens.sql +0 -17
  80. package/migrations/008_fts_rebuild.sql +0 -9
  81. package/migrations/009_session_embedding_chunks.sql +0 -46
  82. package/migrations/010_sources_opencode.sql +0 -30
  83. package/migrations/011_sources_hermes_agent.sql +0 -30
  84. package/migrations/012_sources_aider.sql +0 -30
  85. package/migrations/013_adapter_state_failure_count.sql +0 -12
  86. package/plugin-hermes-agent/README.md +0 -49
  87. package/plugin-hermes-agent/__init__.py +0 -75
  88. package/plugin-hermes-agent/plugin.yaml +0 -15
  89. package/scripts/backfill-citations.mjs +0 -0
  90. package/scripts/build-codex-plugin.mjs +0 -61
  91. package/scripts/deepseek-probe.mjs +0 -67
  92. package/scripts/extract-triples.mjs +0 -207
  93. package/scripts/longmemeval/embedding-cache.ts +0 -77
  94. package/scripts/longmemeval/fetch-dataset.sh +0 -25
  95. package/scripts/longmemeval/run-harness.ts +0 -315
  96. package/scripts/longmemeval/scorer.ts +0 -99
  97. package/scripts/longmemeval/tsconfig.json +0 -9
  98. package/scripts/longmemeval/types.ts +0 -35
  99. package/scripts/nlm-daily-digest.py +0 -239
  100. package/scripts/nlm-daily-digest.sh +0 -28
  101. package/src/cli/classify-parity.ts +0 -257
  102. package/src/cli/launchctl-helpers.ts +0 -49
  103. package/src/cli/nlm.ts +0 -885
  104. package/src/core/actions/actions-log.ts +0 -118
  105. package/src/core/actions/overlay.ts +0 -117
  106. package/src/core/adapters/aider.ts +0 -205
  107. package/src/core/adapters/claude-code.ts +0 -293
  108. package/src/core/adapters/common.ts +0 -54
  109. package/src/core/adapters/from-source.ts +0 -57
  110. package/src/core/adapters/hermes-agent.ts +0 -240
  111. package/src/core/adapters/hermes.ts +0 -277
  112. package/src/core/adapters/jsonl-generic.ts +0 -208
  113. package/src/core/adapters/opencode.ts +0 -281
  114. package/src/core/adapters/pi.ts +0 -264
  115. package/src/core/classifier/prompt.ts +0 -200
  116. package/src/core/dataset/build-dataset.ts +0 -463
  117. package/src/core/embedding/chunk-body.ts +0 -76
  118. package/src/core/embedding/embed-backfill.ts +0 -210
  119. package/src/core/embedding/embed-normalize.ts +0 -135
  120. package/src/core/facts/backfill-facts.ts +0 -254
  121. package/src/core/facts/extract-facts.ts +0 -50
  122. package/src/core/hook/citation-detect.ts +0 -124
  123. package/src/core/hook/cite-memo.ts +0 -68
  124. package/src/core/hook/claude-settings.ts +0 -166
  125. package/src/core/hook/gate.ts +0 -25
  126. package/src/core/hook/hook-log.ts +0 -41
  127. package/src/core/hook/memo-sweep.ts +0 -164
  128. package/src/core/hook/memo.ts +0 -67
  129. package/src/core/hook/pointer-block.ts +0 -26
  130. package/src/core/hook/select.ts +0 -32
  131. package/src/core/hook/transcript.ts +0 -121
  132. package/src/core/ingest/ingest-session.ts +0 -111
  133. package/src/core/providers/provider-models.ts +0 -100
  134. package/src/core/providers/provider-registry.ts +0 -196
  135. package/src/core/recall/citation-log.ts +0 -108
  136. package/src/core/recall/filter.ts +0 -27
  137. package/src/core/recall/index.ts +0 -6
  138. package/src/core/recall/match-fields.ts +0 -40
  139. package/src/core/recall/query-log.ts +0 -149
  140. package/src/core/recall/query-shape.ts +0 -66
  141. package/src/core/recall/recall-service.ts +0 -320
  142. package/src/core/recall/recent-log.ts +0 -59
  143. package/src/core/recall/tokenize.ts +0 -18
  144. package/src/core/recall/useful-scan.ts +0 -336
  145. package/src/core/recall-facts/fact-query-log.ts +0 -150
  146. package/src/core/recall-facts/fact-recall-service.ts +0 -327
  147. package/src/core/scheduler/scan-once.ts +0 -142
  148. package/src/core/scheduler/scheduler.ts +0 -225
  149. package/src/core/sources/source-registry.ts +0 -260
  150. package/src/core/storage/db-restore.ts +0 -133
  151. package/src/core/storage/live-status.ts +0 -45
  152. package/src/core/storage/migrate.ts +0 -72
  153. package/src/core/storage/sqlite-fact-store.ts +0 -304
  154. package/src/core/storage/sqlite-session-store.ts +0 -765
  155. package/src/hook/prompt-recall-hook.ts +0 -174
  156. package/src/hook/session-end-hook.ts +0 -81
  157. package/src/hook/session-start-hook.ts +0 -165
  158. package/src/hook/stop-hook.ts +0 -236
  159. package/src/http/app.ts +0 -1137
  160. package/src/install/claude-code.ts +0 -128
  161. package/src/install/codex.ts +0 -367
  162. package/src/install/hermes-agent.ts +0 -76
  163. package/src/install/hermes.ts +0 -78
  164. package/src/install/ollama.ts +0 -211
  165. package/src/install/setup.ts +0 -368
  166. package/src/llm/classifier-box.ts +0 -64
  167. package/src/llm/deepseek-client.ts +0 -150
  168. package/src/llm/env-autoload.ts +0 -55
  169. package/src/llm/ollama-client.ts +0 -189
  170. package/src/mcp/server.ts +0 -534
  171. package/src/ports/fact-store.ts +0 -102
  172. package/src/ports/llm-client.ts +0 -52
  173. package/src/ports/logger.ts +0 -16
  174. package/src/ports/session-store.ts +0 -45
  175. package/src/ports/transcript-adapter.ts +0 -55
  176. package/src/shared/types.ts +0 -145
  177. package/src/ui/App.tsx +0 -58
  178. package/src/ui/components/PromoteOpenButton.tsx +0 -65
  179. package/src/ui/components/SessionDrawer.tsx +0 -136
  180. package/src/ui/components/SideNav.tsx +0 -162
  181. package/src/ui/components/Skeleton.tsx +0 -107
  182. package/src/ui/index.html +0 -13
  183. package/src/ui/lib/actions.ts +0 -30
  184. package/src/ui/lib/api.ts +0 -92
  185. package/src/ui/lib/dataset.ts +0 -141
  186. package/src/ui/lib/registries.ts +0 -155
  187. package/src/ui/lib/view-settings.ts +0 -41
  188. package/src/ui/main.tsx +0 -15
  189. package/src/ui/pages/Live.tsx +0 -229
  190. package/src/ui/pages/Pulse.tsx +0 -415
  191. package/src/ui/pages/Recall.tsx +0 -190
  192. package/src/ui/pages/River.tsx +0 -308
  193. package/src/ui/pages/Search.tsx +0 -93
  194. package/src/ui/pages/Stub.tsx +0 -9
  195. package/src/ui/pages/Thread.tsx +0 -262
  196. package/src/ui/pages/settings/Classifier.tsx +0 -227
  197. package/src/ui/pages/settings/Data.tsx +0 -190
  198. package/src/ui/pages/settings/Index.tsx +0 -65
  199. package/src/ui/pages/settings/Labels.tsx +0 -224
  200. package/src/ui/pages/settings/Providers.tsx +0 -305
  201. package/src/ui/pages/settings/SettingsSubnav.tsx +0 -28
  202. package/src/ui/pages/settings/Sources.tsx +0 -326
  203. package/src/ui/pages/settings/Views.tsx +0 -96
  204. package/src/ui/styles.css +0 -1766
  205. package/src/ui/tsconfig.json +0 -21
  206. package/src/ui/vite.config.ts +0 -19
  207. package/tests/fixtures/claude_code/short_session.jsonl +0 -2
  208. package/tests/fixtures/claude_code/standard_iso.jsonl +0 -4
  209. package/tests/fixtures/claude_code/tool_heavy.jsonl +0 -8
  210. package/tests/fixtures/claude_code/with_subagent.jsonl +0 -7
  211. package/tests/fixtures/facts.ts +0 -17
  212. package/tests/fixtures/golden-corpus.ts +0 -85
  213. package/tests/fixtures/hermes/paired_request_dump.json +0 -24
  214. package/tests/fixtures/hermes/paired_session.json +0 -23
  215. package/tests/fixtures/hermes/request_dump.json +0 -28
  216. package/tests/fixtures/hermes/session_iso.json +0 -38
  217. package/tests/fixtures/hermes/session_unix.json +0 -38
  218. package/tests/fixtures/hermes/system_only.json +0 -18
  219. package/tests/fixtures/pi/error-connection-abort.jsonl +0 -8
  220. package/tests/fixtures/pi/short-successful.jsonl +0 -5
  221. package/tests/fixtures/pi/with-custom-message.jsonl +0 -6
  222. package/tests/fixtures/sessions.ts +0 -22
  223. package/tests/integration/backfill-facts.test.ts +0 -362
  224. package/tests/integration/citation-explicit.test.ts +0 -111
  225. package/tests/integration/cite-event.test.ts +0 -169
  226. package/tests/integration/cite-memo.test.ts +0 -87
  227. package/tests/integration/db-restore.test.ts +0 -153
  228. package/tests/integration/embed-backfill.test.ts +0 -176
  229. package/tests/integration/fact-supersedence.test.ts +0 -313
  230. package/tests/integration/fts-index.test.ts +0 -60
  231. package/tests/integration/getbyids-sqlite.test.ts +0 -60
  232. package/tests/integration/hermes-agent-hooks.test.ts +0 -248
  233. package/tests/integration/hook-claude-settings.test.ts +0 -205
  234. package/tests/integration/hook-log.test.ts +0 -54
  235. package/tests/integration/hook-memo.test.ts +0 -68
  236. package/tests/integration/hook-pre-compact.test.ts +0 -105
  237. package/tests/integration/hook-subagent-start.test.ts +0 -102
  238. package/tests/integration/http.test.ts +0 -401
  239. package/tests/integration/keyword-search-fts.test.ts +0 -66
  240. package/tests/integration/mcp-recall-logging.test.ts +0 -88
  241. package/tests/integration/mcp.test.ts +0 -248
  242. package/tests/integration/memo-sweep.test.ts +0 -91
  243. package/tests/integration/prompt-recall-hook.test.ts +0 -88
  244. package/tests/integration/provider-registry.test.ts +0 -107
  245. package/tests/integration/recall-golden.test.ts +0 -59
  246. package/tests/integration/recall-sqlite.test.ts +0 -169
  247. package/tests/integration/scheduler.test.ts +0 -391
  248. package/tests/integration/session-end-hook.test.ts +0 -48
  249. package/tests/integration/session-start-hook.test.ts +0 -126
  250. package/tests/integration/source-registry.test.ts +0 -120
  251. package/tests/integration/sqlite-fact-store.test.ts +0 -346
  252. package/tests/integration/stop-hook.test.ts +0 -560
  253. package/tests/integration/wal-checkpoint.test.ts +0 -49
  254. package/tests/unit/cli/launchctl-helpers.test.ts +0 -60
  255. package/tests/unit/core/adapters/aider.test.ts +0 -230
  256. package/tests/unit/core/adapters/claude-code.test.ts +0 -118
  257. package/tests/unit/core/adapters/hermes-agent.test.ts +0 -329
  258. package/tests/unit/core/adapters/hermes.test.ts +0 -81
  259. package/tests/unit/core/adapters/jsonl-generic.test.ts +0 -142
  260. package/tests/unit/core/adapters/opencode.test.ts +0 -354
  261. package/tests/unit/core/adapters/pi.test.ts +0 -110
  262. package/tests/unit/core/classifier/prompt.test.ts +0 -126
  263. package/tests/unit/core/embedding/chunk-body.test.ts +0 -100
  264. package/tests/unit/core/facts/extract-facts.test.ts +0 -117
  265. package/tests/unit/core/filter.test.ts +0 -40
  266. package/tests/unit/core/hook/citation-detect-cite-session.test.ts +0 -96
  267. package/tests/unit/core/hook/citation-detect.test.ts +0 -124
  268. package/tests/unit/core/hook/gate.test.ts +0 -29
  269. package/tests/unit/core/hook/pointer-block.test.ts +0 -22
  270. package/tests/unit/core/hook/select.test.ts +0 -66
  271. package/tests/unit/core/match-fields.test.ts +0 -39
  272. package/tests/unit/core/mcp-cite-session.test.ts +0 -51
  273. package/tests/unit/core/providers/provider-models.test.ts +0 -101
  274. package/tests/unit/core/query-shape.test.ts +0 -92
  275. package/tests/unit/core/recall-facts/fact-recall-service.test.ts +0 -258
  276. package/tests/unit/core/recall-service.test.ts +0 -200
  277. package/tests/unit/core/storage/live-status.test.ts +0 -54
  278. package/tests/unit/core/tokenize.test.ts +0 -32
  279. package/tests/unit/core/useful-scan.test.ts +0 -537
  280. package/tests/unit/llm/embed.test.ts +0 -93
  281. package/tests/unit/llm/ollama-client.test.ts +0 -124
  282. package/tests/unit/scripts/longmemeval-scorer.test.ts +0 -114
  283. package/tsconfig.json +0 -31
  284. package/tsconfig.test.json +0 -11
  285. package/vitest.config.ts +0 -22
@@ -1,76 +0,0 @@
1
- /**
2
- * chunk-body — split a session body into ≤MAX_CHUNK_CHARS-char windows
3
- * for the chunk + max-pool semantic index. Header (label + summary) is
4
- * prepended to the first chunk so it's always part of the index without
5
- * inflating later chunk sizes.
6
- *
7
- * MAX_CHUNK_CHARS sized for nomic-embed-text's 2048-token context. Char
8
- * density varies by content: prose ~4 chars/token, code/tool-output ~3
9
- * chars/token. The 2026-05-26 backfill bisect found the cliff at ~6,388
10
- * chars for token-dense Claude Code session bodies — 5,500 holds a safe
11
- * margin and eliminates the "input exceeds context length" 500s that
12
- * drove ~76% per-chunk rejection at 7,500. See 2026-05-26 CHANGELOG.
13
- *
14
- * OVERLAP_CHARS preserves context across boundaries so a phrase split
15
- * mid-chunk still appears intact in one neighboring chunk.
16
- *
17
- * Pure function. No I/O, no allocations beyond the returned array.
18
- */
19
-
20
/**
 * Default upper bound, in characters, for one chunk. The file header gives
 * the reasoning behind the value (headroom under the embedding model's
 * context window for token-dense session bodies).
 */
export const MAX_CHUNK_CHARS = 5_500;

/** Default number of characters shared by two consecutive body windows. */
export const OVERLAP_CHARS = 500;

/** Text fields of one session. Each may be absent, null, or blank. */
export interface ChunkInput {
  readonly label?: string | null;
  readonly summary?: string | null;
  readonly body?: string | null;
}

/** Per-call overrides for the chunking defaults. */
export interface ChunkOptions {
  /** Chunk size limit in characters; falls back to MAX_CHUNK_CHARS. */
  readonly maxChars?: number;
  /** Overlap between consecutive windows; falls back to OVERLAP_CHARS. */
  readonly overlap?: number;
}
33
-
34
/**
 * Split a session's text into chunks of at most `maxChars` characters.
 *
 * The label and summary are trimmed, joined with a single space, and
 * prepended to the first chunk only. Later chunks are plain body windows of
 * up to `maxChars` characters that advance by `maxChars - overlap`, so
 * neighbouring windows share `overlap` characters. Every chunk is trimmed
 * and empty windows are dropped.
 *
 * A header that leaves no room for body text (header + separator >= maxChars)
 * is emitted as its own first chunk, truncated to `maxChars`, and the body
 * windows then start at offset 0. This keeps the size limit true for every
 * chunk instead of letting the first one overflow.
 *
 * @param input - label, summary and body; each may be missing, null or blank.
 * @param opts - optional overrides for the chunk size and the overlap.
 * @returns the chunks in order; `[]` when there is no text at all.
 * @throws Error when `maxChars` is not > 0, or when `overlap` is outside
 *   `0 <= overlap < maxChars`. NaN fails both checks.
 */
export function chunkSessionText(
  input: ChunkInput,
  opts: ChunkOptions = {},
): string[] {
  const maxChars = opts.maxChars ?? MAX_CHUNK_CHARS;
  const overlap = opts.overlap ?? OVERLAP_CHARS;
  // Negated comparisons so NaN is rejected as well; a plain `<= 0` test lets
  // NaN through and the body is then silently dropped.
  if (!(maxChars > 0)) throw new Error("chunkSessionText: maxChars must be > 0");
  if (!(overlap >= 0 && overlap < maxChars)) {
    throw new Error("chunkSessionText: overlap must satisfy 0 <= overlap < maxChars");
  }

  const header = [input.label ?? "", input.summary ?? ""]
    .map((s) => s.trim())
    .filter((s) => s.length > 0)
    .join(" ");
  const body = (input.body ?? "").trim();

  if (!header && !body) return [];

  // Header clipped to the chunk limit; only differs from `header` when the
  // header alone is longer than one chunk.
  const cappedHeader = header.slice(0, maxChars).trimEnd();
  if (!body) return [cappedHeader];

  // First chunk: header + as much body as fits, provided at least one body
  // character fits next to the header.
  const headerPrefix = header ? header + " " : "";
  const headerFits = headerPrefix.length < maxChars;
  const firstBodyBudget = headerFits ? maxChars - headerPrefix.length : 0;

  if (headerFits && body.length <= firstBodyBudget) {
    return [(headerPrefix + body).trim()];
  }

  const chunks: string[] = [
    headerFits
      ? (headerPrefix + body.slice(0, firstBodyBudget)).trim()
      : cappedHeader,
  ];

  // Subsequent chunks: body windows with overlap. When the header took the
  // whole first chunk, firstBodyBudget is 0 and the windows start at 0.
  const step = maxChars - overlap;
  let pos = Math.max(0, firstBodyBudget - overlap);
  while (pos < body.length) {
    const end = Math.min(pos + maxChars, body.length);
    const slice = body.slice(pos, end).trim();
    if (slice.length > 0) chunks.push(slice);
    if (end >= body.length) break;
    pos += step;
  }
  return chunks;
}
@@ -1,210 +0,0 @@
1
- /**
2
- * embed-backfill — re-embed every session in canonical.sqlite into the
3
- * chunk + max-pool index (session_embedding_chunks). Replaces the prior
4
- * one-vector-per-session backfill that wrote to session_embeddings.
5
- *
6
- * For each session: chunk (label + summary + body) via chunkSessionText,
7
- * embed each chunk with kind="document", and write to the chunk table +
8
- * session_chunk_map via the same INSERT pair used by the live ingest path.
9
- *
10
- * Resumable via a JSON state file at $NLM_EMBED_STATE (default
11
- * ~/.nlm/embed_reembed.state). Interrupting + rerunning skips already-done
12
- * session ids. A session is considered "done" once at least one of its
13
- * chunks is embedded and written — only sessions with no stored chunks are retried.
14
- *
15
- * Layering: depends on the LLMClient port. SQLite touched directly via
16
- * better-sqlite3 because this is a one-shot operational tool, not a hot
17
- * path. Lives under core/ but is invoked from the CLI composition root.
18
- */
19
-
20
import {
  existsSync,
  mkdirSync,
  readFileSync,
  renameSync,
  unlinkSync,
  writeFileSync,
} from "node:fs";
import { dirname, join } from "node:path";
import { homedir } from "node:os";
import Database from "better-sqlite3";
import * as sqliteVec from "sqlite-vec";
import type { LLMClient } from "@ports/llm-client.js";
import { LLMUnreachableError } from "@ports/llm-client.js";
import { chunkSessionText } from "@core/embedding/chunk-body.js";
28
-
29
/** Resume-state file used when the caller does not pass `statePath`. */
const DEFAULT_STATE_PATH = join(homedir(), ".nlm", "embed_reembed.state");

/** The state file is flushed after every SAVE_EVERY successful sessions. */
const SAVE_EVERY = 25;

/** Inputs for one re-embedding run. */
export interface BackfillOptions {
  /** SQLite database file; a missing file ends the run with `dbMissing`. */
  readonly dbPath: string;
  /** Client whose `embed(chunk, "document")` produces the chunk vectors. */
  readonly embedder: LLMClient;
  /** Resume-state file; falls back to DEFAULT_STATE_PATH. */
  readonly statePath?: string;
  /** When truthy, truncated and applied as a SQL LIMIT on the session query. */
  readonly limit?: number;
  /** Called per reported session with (1-based index, total, session id, status). */
  readonly onProgress?: (i: number, total: number, sid: string, status: string) => void;
}

/** Counters describing one re-embedding run. */
export interface BackfillReport {
  /** Number of sessions returned by the session query. */
  readonly total: number;
  /** succeeded + failed + skippedAlreadyDone. */
  readonly processed: number;
  /** Sessions with at least one chunk embedded and stored. */
  readonly succeeded: number;
  /** Sessions with no text, no embeddable chunk, or a failed database write. */
  readonly failed: number;
  /** Sessions skipped because the state file already listed them. */
  readonly skippedAlreadyDone: number;
  /** True when `dbPath` did not exist and nothing was done. */
  readonly dbMissing: boolean;
}

/** Columns read from the `sessions` table. */
interface SessionRow {
  id: string;
  label: string | null;
  summary: string | null;
  body: string | null;
}
55
-
56
/**
 * Read the set of already-processed session ids from the state file.
 *
 * Best effort by design: a missing, unreadable, or malformed file yields an
 * empty set, which makes the run start from scratch.
 *
 * @param path - location of the JSON state file (`{ "done": [...] }`).
 * @returns the ids listed under `done`, or an empty set.
 */
function loadState(path: string): Set<string> {
  const nothingDone = (): Set<string> => new Set<string>();

  if (!existsSync(path)) {
    return nothingDone();
  }

  try {
    const raw = readFileSync(path, "utf8");
    const parsed = JSON.parse(raw) as { done?: string[] };
    const ids = parsed.done ?? [];
    return new Set(ids);
  } catch {
    return nothingDone();
  }
}
65
-
66
/**
 * Persist the set of processed session ids as `{ "done": [...] }`, sorted.
 *
 * The JSON is written to a sibling `<path>.tmp` file and then renamed over
 * the target. Writing the target in place would leave a truncated file if
 * the process were killed mid-write, and loadState treats an unparsable file
 * as "nothing done", which would restart the whole backfill.
 *
 * @param path - destination of the state file; parent directories are created.
 * @param done - ids of the sessions that are finished.
 */
function saveState(path: string, done: Set<string>): void {
  mkdirSync(dirname(path), { recursive: true });
  const payload = JSON.stringify({ done: [...done].sort() });
  const tmpPath = `${path}.tmp`;
  writeFileSync(tmpPath, payload);
  renameSync(tmpPath, path);
}
70
-
71
-
72
/**
 * Re-embed every session that has a label, summary or body into the chunk
 * index (session_embedding_chunks + session_chunk_map).
 *
 * Sessions are read in started_at order (optionally capped by `opts.limit`),
 * chunked with chunkSessionText, and each chunk is embedded as a "document".
 * A chunk that raises LLMUnreachableError is retried once after 200 ms and
 * then dropped; any other embedder error aborts the run. A session counts as
 * succeeded, and is recorded in the state file, as soon as at least one
 * chunk was embedded and stored.
 *
 * `failed` in the report covers sessions with no text, sessions where no
 * chunk could be embedded, and sessions whose database write threw.
 *
 * @param opts - database path, embedder, and optional state path, limit and
 *   progress callback.
 * @returns per-run counters; `dbMissing` is true (and nothing else happens)
 *   when `opts.dbPath` does not exist.
 * @throws whatever the embedder throws other than LLMUnreachableError, plus
 *   errors from opening or querying the database. Progress made before an
 *   abort is still written to the state file.
 */
export async function reembedCorpus(opts: BackfillOptions): Promise<BackfillReport> {
  const statePath = opts.statePath ?? DEFAULT_STATE_PATH;

  if (!existsSync(opts.dbPath)) {
    return { total: 0, processed: 0, succeeded: 0, failed: 0, skippedAlreadyDone: 0, dbMissing: true };
  }

  const db = new Database(opts.dbPath);
  let total = 0;
  let succeeded = 0;
  let failed = 0;
  let skipped = 0;

  // Everything after the open sits inside try/finally so the handle is
  // released even when loading the extension or preparing a statement throws.
  try {
    sqliteVec.load(db);

    // Backfill every session with content; live ingest covers ongoing writes.
    // The state file dedupes across runs so partial completion resumes cleanly.
    const sql =
      "SELECT s.id, s.label, s.summary, s.body FROM sessions s " +
      "WHERE s.body IS NOT NULL OR s.summary IS NOT NULL OR s.label IS NOT NULL " +
      "ORDER BY s.started_at" +
      (opts.limit ? ` LIMIT ${Math.trunc(opts.limit)}` : "");
    const rows = db.prepare<[], SessionRow>(sql).all();
    total = rows.length;

    const done = loadState(statePath);

    const selectChunks = db.prepare<[string], { chunk_id: number }>(
      "SELECT chunk_id FROM session_chunk_map WHERE session_id = ?",
    );
    const delChunks = (sessionId: string): void => {
      const existing = selectChunks.all(sessionId);
      if (existing.length === 0) return;
      const placeholders = existing.map(() => "?").join(",");
      const ids = existing.map((r) => r.chunk_id);
      db.prepare(
        `DELETE FROM session_embedding_chunks WHERE chunk_id IN (${placeholders})`,
      ).run(...ids);
      db.prepare("DELETE FROM session_chunk_map WHERE session_id = ?").run(sessionId);
    };
    const insChunk = db.prepare(
      "INSERT INTO session_embedding_chunks (embedding, session_id, chunk_idx) VALUES (?, ?, ?)",
    );
    const insMap = db.prepare(
      "INSERT INTO session_chunk_map (chunk_id, session_id, chunk_idx) VALUES (?, ?, ?)",
    );

    // One transaction per session: if any insert throws, the delete of the
    // previous chunks is rolled back too, so a failed session keeps its old
    // index rows instead of ending up with a partial mix.
    const replaceChunks = db.transaction(
      (sessionId: string, vectors: readonly { idx: number; vec: Float32Array }[]): void => {
        delChunks(sessionId);
        for (const { idx: cidx, vec } of vectors) {
          const blob = Buffer.from(vec.buffer, vec.byteOffset, vec.byteLength);
          // BigInt cast so vec0's aux chunk_idx column receives an INTEGER.
          const info = insChunk.run(blob, sessionId, BigInt(cidx));
          insMap.run(Number(info.lastInsertRowid), sessionId, cidx);
        }
      },
    );

    try {
      for (let i = 0; i < rows.length; i++) {
        const row = rows[i]!;
        const idx = i + 1;
        if (done.has(row.id)) {
          skipped += 1;
          continue;
        }
        const chunks = chunkSessionText({
          label: row.label,
          summary: row.summary,
          body: row.body,
        });
        if (chunks.length === 0) {
          failed += 1;
          opts.onProgress?.(idx, total, row.id, "SKIP (no text)");
          continue;
        }

        // Per-chunk failure tolerance matches live ingest: one chunk hitting
        // the Ollama edge-cliff 500 must not zero out an entire session's
        // coverage. Single retry on LLMUnreachableError catches transient
        // failures; persistent ones are dropped. Session is "done" if any
        // chunk landed — partial max-pool coverage beats none.
        const vectors: { idx: number; vec: Float32Array }[] = [];
        let chunkSkipped = 0;
        for (let c = 0; c < chunks.length; c++) {
          const chunk = chunks[c]!;
          let lastErr: unknown;
          for (let attempt = 0; attempt < 2; attempt++) {
            try {
              const out = await opts.embedder.embed(chunk, "document");
              vectors.push({ idx: c, vec: out.vector });
              lastErr = undefined;
              break;
            } catch (e) {
              lastErr = e;
              if (!(e instanceof LLMUnreachableError)) throw e;
              if (attempt === 0) await new Promise((r) => setTimeout(r, 200));
            }
          }
          if (lastErr !== undefined) chunkSkipped += 1;
        }
        if (vectors.length === 0) {
          failed += 1;
          opts.onProgress?.(idx, total, row.id, `FAIL (embedder, ${chunkSkipped}/${chunks.length} chunks)`);
          continue;
        }

        try {
          replaceChunks(row.id, vectors);
        } catch (e) {
          failed += 1;
          opts.onProgress?.(idx, total, row.id, `FAIL (db): ${(e as Error).message}`);
          continue;
        }

        done.add(row.id);
        succeeded += 1;
        const status =
          chunkSkipped === 0
            ? `OK (${vectors.length} chunks)`
            : `PARTIAL (${vectors.length}/${chunks.length} chunks, ${chunkSkipped} skipped)`;
        opts.onProgress?.(idx, total, row.id, status);
        if (succeeded % SAVE_EVERY === 0) saveState(statePath, done);
      }
    } catch (e) {
      // An unexpected embedder error aborts the run. Flush what was finished
      // so the next run resumes instead of redoing up to SAVE_EVERY - 1
      // sessions; a failure to save must not hide the original error.
      try {
        saveState(statePath, done);
      } catch {
        /* keep the original error */
      }
      throw e;
    }
    saveState(statePath, done);
  } finally {
    db.close();
  }

  return {
    total,
    processed: succeeded + failed + skipped,
    succeeded,
    failed,
    skippedAlreadyDone: skipped,
    dbMissing: false,
  };
}
204
-
205
/**
 * Delete the resume-state file so the next run re-embeds every session.
 * Does nothing when the file does not exist.
 *
 * @param statePath - state file to remove; defaults to DEFAULT_STATE_PATH.
 */
export function clearBackfillState(statePath: string = DEFAULT_STATE_PATH): void {
  if (existsSync(statePath)) {
    // Statically imported rather than pulled in with a lazy require(): this
    // file uses ES module imports, and `require` is not defined in ES module
    // scope.
    unlinkSync(statePath);
  }
}
@@ -1,135 +0,0 @@
1
- /**
2
- * embed-normalize — one-shot migration: L2-normalize every row in
3
- * session_embeddings. Ports `embed_normalize.py`.
4
- *
5
- * vec0 with implicit L2 distance ranks correctly by cosine similarity
6
- * only when stored vectors are unit-length. New writes (post-this-fix)
7
- * are normalized at source by OllamaClient.embed; this module brings
8
- * existing rows to the same invariant.
9
- *
10
- * Idempotent: re-running on already-normalized vectors is a no-op
11
- * within float tolerance (EPS = 1e-3). Each row is rewritten in its
12
- * own transaction so interrupts are safe.
13
- */
14
-
15
- import { existsSync } from "node:fs";
16
- import Database from "better-sqlite3";
17
- import * as sqliteVec from "sqlite-vec";
18
-
19
/** A vector whose length is within EPS of 1 is treated as already normalized. */
const EPS = 1e-3;

/** Number of float32 components per stored vector when `dim` is not given. */
const DEFAULT_DIM = 768;

/** Number of session ids walked per outer loop pass when `batchSize` is not given. */
const DEFAULT_BATCH = 100;

/** Inputs for one normalization run. */
export interface NormalizeOptions {
  /** SQLite database file; a missing file ends the run with `dbMissing`. */
  readonly dbPath: string;
  /** Components per vector; falls back to DEFAULT_DIM. */
  readonly dim?: number;
  /** Ids handled per outer loop pass; falls back to DEFAULT_BATCH. */
  readonly batchSize?: number;
  /** When true, rows are only counted, never rewritten. Defaults to false. */
  readonly dryRun?: boolean;
}

/** Counters describing one normalization run. */
export interface NormalizeReport {
  /** Rows found in session_embeddings. */
  readonly total: number;
  /** Rows whose length was already within EPS of 1. */
  readonly alreadyNormalized: number;
  /** Rows rewritten, or that would have been rewritten in a dry run. */
  readonly rewritten: number;
  /** Rows skipped because their sum of squares was exactly 0. */
  readonly zeroVector: number;
  /** True when `dbPath` did not exist and nothing was done. */
  readonly dbMissing: boolean;
  /** Echo of the effective dry-run flag. */
  readonly dryRun: boolean;
}

/** One row of session_embeddings as read back from SQLite. */
interface EmbeddingRow {
  session_id: string;
  embedding: Buffer;
}

/** Projection used to list the session ids up front. */
interface IdRow {
  session_id: string;
}
47
-
48
/**
 * Interpret a blob as `dim` float32 values.
 *
 * When the buffer's offset into its backing ArrayBuffer is a multiple of 4
 * the result is a view over the same memory. Otherwise the bytes are copied
 * first, because a Float32Array cannot be created at an unaligned offset
 * (the constructor throws a RangeError).
 *
 * @param buf - raw bytes, expected to be exactly `dim * 4` long.
 * @param dim - number of float32 components.
 * @returns the decoded vector.
 * @throws Error when the byte length does not match `dim`.
 */
function bytesToFloats(buf: Buffer, dim: number): Float32Array {
  const expectedBytes = dim * Float32Array.BYTES_PER_ELEMENT;
  if (buf.byteLength !== expectedBytes) {
    throw new Error(`expected ${expectedBytes} bytes, got ${buf.byteLength}`);
  }
  if (buf.byteOffset % Float32Array.BYTES_PER_ELEMENT !== 0) {
    const aligned = new Uint8Array(buf.byteLength);
    aligned.set(buf);
    return new Float32Array(aligned.buffer, 0, dim);
  }
  return new Float32Array(buf.buffer, buf.byteOffset, dim);
}
54
-
55
/**
 * Expose the bytes behind a float vector as a Buffer without copying them;
 * the Buffer covers exactly the vector's slice of its backing ArrayBuffer.
 *
 * @param vec - vector to wrap.
 * @returns a Buffer sharing memory with `vec`.
 */
function floatsToBytes(vec: Float32Array): Buffer {
  const { buffer, byteOffset, byteLength } = vec;
  return Buffer.from(buffer, byteOffset, byteLength);
}
58
-
59
/**
 * L2-normalize every row of session_embeddings.
 *
 * For each stored vector the Euclidean length is computed. All-zero vectors
 * and vectors whose length is already within EPS of 1 are left alone; every
 * other row is replaced by its unit-length version. The replacement is a
 * DELETE followed by an INSERT, run inside one transaction per row so that
 * an interruption can never leave a session without its embedding.
 *
 * @param opts - database path plus optional dimension, batch size and
 *   dry-run flag. In a dry run rows are counted but not written.
 * @returns counters for the run; `dbMissing` is true (and nothing else
 *   happens) when `opts.dbPath` does not exist.
 * @throws Error when `batchSize` is not greater than 0, or when a stored
 *   blob does not hold exactly `dim` float32 values.
 */
export function normalizeEmbeddings(opts: NormalizeOptions): NormalizeReport {
  const dim = opts.dim ?? DEFAULT_DIM;
  const batchSize = opts.batchSize ?? DEFAULT_BATCH;
  const dryRun = opts.dryRun ?? false;

  if (!existsSync(opts.dbPath)) {
    return { total: 0, alreadyNormalized: 0, rewritten: 0, zeroVector: 0, dbMissing: true, dryRun };
  }

  // A step of 0 or less would never advance the batch loop, and NaN would
  // end it before any row is looked at.
  if (!(batchSize > 0)) {
    throw new Error("normalizeEmbeddings: batchSize must be > 0");
  }

  const db = new Database(opts.dbPath);

  let total = 0;
  let alreadyNormalized = 0;
  let rewritten = 0;
  let zeroVector = 0;

  // The extension load sits inside try/finally as well, so the handle is
  // closed even if it fails.
  try {
    sqliteVec.load(db);

    const ids = db
      .prepare<[], IdRow>("SELECT session_id FROM session_embeddings")
      .all()
      .map((r) => r.session_id);
    total = ids.length;

    const sel = db.prepare<[string], EmbeddingRow>(
      "SELECT session_id, embedding FROM session_embeddings WHERE session_id = ?",
    );
    const del = db.prepare("DELETE FROM session_embeddings WHERE session_id = ?");
    const ins = db.prepare("INSERT INTO session_embeddings (session_id, embedding) VALUES (?, ?)");

    // Delete + insert as one atomic unit per row.
    const rewriteRow = db.transaction((sid: string, blob: Buffer): void => {
      del.run(sid);
      ins.run(sid, blob);
    });

    for (let start = 0; start < total; start += batchSize) {
      const batch = ids.slice(start, start + batchSize);
      for (const sid of batch) {
        const row = sel.get(sid);
        if (!row) continue;

        const vec = bytesToFloats(row.embedding, dim);
        let sumSq = 0;
        for (let i = 0; i < dim; i++) {
          const v = vec[i] ?? 0;
          sumSq += v * v;
        }
        if (sumSq === 0) {
          zeroVector += 1;
          continue;
        }
        const norm = Math.sqrt(sumSq);
        if (Math.abs(norm - 1) <= EPS) {
          alreadyNormalized += 1;
          continue;
        }
        if (dryRun) {
          rewritten += 1;
          continue;
        }
        const normalized = new Float32Array(dim);
        for (let i = 0; i < dim; i++) {
          normalized[i] = (vec[i] ?? 0) / norm;
        }
        rewriteRow(sid, floatsToBytes(normalized));
        rewritten += 1;
      }
    }
  } finally {
    db.close();
  }

  return {
    total,
    alreadyNormalized,
    rewritten,
    zeroVector,
    dbMissing: false,
    dryRun,
  };
}
@@ -1,254 +0,0 @@
1
- /**
2
- * backfill-facts — one-shot population of the FactStore from the existing
3
- * session corpus. Phase B.5, see docs/plans/factstore-design.md Section 7.
4
- *
5
- * For each session in `sessions` that has no facts yet (and was started
6
- * before the script's start timestamp, to avoid racing with live ingest),
7
- * runs the classifier over its body, extracts facts, and writes them via
8
- * SqliteSessionStore.insertFactsForSession.
9
- *
10
- * Resumable via a JSON state file (mirrors core/embedding/embed-backfill).
11
- * Interrupting and rerunning skips already-processed sessions. State path
12
- * defaults to ~/.nlm/backfill_facts.state.
13
- *
14
- * Layering: depends on the LLMClient + FactStore ports through the
15
- * SqliteSessionStore + SqliteFactStore composition. Lives under core/ but
16
- * is invoked from the CLI composition root, like embed-backfill.
17
- */
18
-
19
- import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
20
- import { dirname, join } from "node:path";
21
- import { homedir } from "node:os";
22
- import { extractFacts } from "@core/facts/extract-facts.js";
23
- import type { SqliteFactStore } from "@core/storage/sqlite-fact-store.js";
24
- import type { SqliteSessionStore } from "@core/storage/sqlite-session-store.js";
25
- import type { LLMClient } from "@ports/llm-client.js";
26
- import { LLMUnreachableError } from "@ports/llm-client.js";
27
-
28
- const DEFAULT_STATE_PATH = join(homedir(), ".nlm", "backfill_facts.state");
29
- const SAVE_EVERY = 25;
30
-
31
- export interface BackfillFactsOptions {
32
- readonly store: SqliteSessionStore;
33
- readonly factStore: SqliteFactStore;
34
- readonly classifier: LLMClient;
35
- /** Optional embedder. When omitted, facts are written without semantic vectors. */
36
- readonly embedder?: LLMClient | null;
37
- readonly statePath?: string;
38
- /** Cap on sessions processed this run. Default: all eligible. */
39
- readonly limit?: number;
40
- /**
41
- * Resume from a specific session id. When set, sessions with id
42
- * lexicographically <= this value are skipped on top of the state file's
43
- * done set. Useful when the state file is lost but the operator
44
- * remembers the last successful id.
45
- */
46
- readonly from?: string;
47
- /** Don't write — just count what would happen. */
48
- readonly dryRun?: boolean;
49
- /**
50
- * Re-process sessions that already have facts. Default: false (skip).
51
- * Use when iterating the classifier prompt to refresh the corpus.
52
- */
53
- readonly reprocess?: boolean;
54
- readonly onProgress?: (
55
- i: number,
56
- total: number,
57
- sessionId: string,
58
- status: BackfillStatus,
59
- details?: string,
60
- ) => void;
61
- }
62
-
63
- export type BackfillStatus =
64
- | "ok"
65
- | "skipped_done"
66
- | "skipped_existing_facts"
67
- | "skipped_no_body"
68
- | "skipped_low_confidence"
69
- | "classify_failed"
70
- | "storage_failed";
71
-
72
- export interface BackfillFactsReport {
73
- readonly total: number;
74
- readonly processed: number;
75
- readonly factsWritten: number;
76
- readonly skippedAlreadyDone: number;
77
- readonly skippedExistingFacts: number;
78
- readonly skippedNoBody: number;
79
- readonly skippedLowConfidence: number;
80
- readonly classifyFailures: number;
81
- readonly storageFailures: number;
82
- }
83
-
84
- interface CandidateRow {
85
- id: string;
86
- started_at: string;
87
- body: string | null;
88
- }
89
-
90
- function loadState(path: string): Set<string> {
91
- if (!existsSync(path)) return new Set();
92
- try {
93
- const data = JSON.parse(readFileSync(path, "utf8")) as { done?: string[] };
94
- return new Set(data.done ?? []);
95
- } catch {
96
- return new Set();
97
- }
98
- }
99
-
100
- function saveState(path: string, done: Set<string>): void {
101
- const dir = dirname(path);
102
- if (!existsSync(dir)) mkdirSync(dir, { recursive: true });
103
- writeFileSync(path, JSON.stringify({ done: Array.from(done) }, null, 0));
104
- }
105
-
106
- export async function backfillFacts(
107
- opts: BackfillFactsOptions,
108
- ): Promise<BackfillFactsReport> {
109
- const startedAtCutoff = new Date().toISOString();
110
- const statePath = opts.statePath ?? DEFAULT_STATE_PATH;
111
- const done = opts.dryRun ? new Set<string>() : loadState(statePath);
112
-
113
- const db = opts.store.rawDb();
114
-
115
- // Eligible sessions: started strictly before this run's cutoff (don't
116
- // race with live ingest), with a non-empty body (the classifier needs
117
- // transcript text). When reprocess=false, exclude sessions that already
118
- // have facts attributed to them.
119
- const sql = opts.reprocess
120
- ? `
121
- SELECT id, started_at, body
122
- FROM sessions
123
- WHERE started_at < ?
124
- AND body IS NOT NULL AND length(body) > 0
125
- ${opts.from ? "AND id > ?" : ""}
126
- ORDER BY started_at ASC, id ASC
127
- `
128
- : `
129
- SELECT s.id, s.started_at, s.body
130
- FROM sessions s
131
- WHERE s.started_at < ?
132
- AND s.body IS NOT NULL AND length(s.body) > 0
133
- AND NOT EXISTS (
134
- SELECT 1 FROM facts f WHERE f.source_session_id = s.id
135
- )
136
- ${opts.from ? "AND s.id > ?" : ""}
137
- ORDER BY s.started_at ASC, s.id ASC
138
- `;
139
- const rows: CandidateRow[] = opts.from
140
- ? db.prepare<[string, string], CandidateRow>(sql).all(startedAtCutoff, opts.from)
141
- : db.prepare<[string], CandidateRow>(sql).all(startedAtCutoff);
142
-
143
- // Filter state-file-known done ids BEFORE applying limit. Without this,
144
- // a dense cluster of previously-skipped (low-confidence) sessions would
145
- // burn the batch's --limit on no-op skips. With it, --limit N means
146
- // "N actually-processable sessions" — much more useful UX for repeated
147
- // small batches that walk forward through the corpus. The pre-filter
148
- // count gets reported as `skippedAlreadyDone` so the operator still sees
149
- // how big the skip region was.
150
- const skippedByStateFile = rows.filter((r) => done.has(r.id)).length;
151
- const candidates = rows.filter((r) => !done.has(r.id));
152
- const limit = opts.limit ?? candidates.length;
153
- const work = candidates.slice(0, limit);
154
- const total = work.length;
155
-
156
- let processed = 0;
157
- let factsWritten = 0;
158
- let skippedAlreadyDone = skippedByStateFile;
159
- let skippedExistingFacts = 0;
160
- let skippedNoBody = 0;
161
- let skippedLowConfidence = 0;
162
- let classifyFailures = 0;
163
- let storageFailures = 0;
164
-
165
- for (let i = 0; i < work.length; i++) {
166
- const row = work[i]!;
167
- const sid = row.id;
168
-
169
- // No per-iteration `done` check needed — `work` is already filtered
170
- // against the state file above.
171
-
172
- if (!row.body || row.body.length === 0) {
173
- skippedNoBody += 1;
174
- opts.onProgress?.(i + 1, total, sid, "skipped_no_body");
175
- continue;
176
- }
177
-
178
- let classification;
179
- try {
180
- classification = await opts.classifier.classify(row.body);
181
- } catch (err) {
182
- classifyFailures += 1;
183
- const detail =
184
- err instanceof LLMUnreachableError
185
- ? "ollama unreachable — stopping run"
186
- : err instanceof Error
187
- ? err.message
188
- : String(err);
189
- opts.onProgress?.(i + 1, total, sid, "classify_failed", detail);
190
- // Ollama-down is fatal: every subsequent classify will fail. Stop
191
- // here so the operator can fix and resume.
192
- if (err instanceof LLMUnreachableError) break;
193
- continue;
194
- }
195
-
196
- const facts = extractFacts(classification, sid, row.started_at);
197
- if (facts.length === 0) {
198
- skippedLowConfidence += 1;
199
- opts.onProgress?.(
200
- i + 1,
201
- total,
202
- sid,
203
- "skipped_low_confidence",
204
- `confidence=${classification.confidence}`,
205
- );
206
- // Mark done so a re-run doesn't keep paying the classifier cost on
207
- // sessions the model can't extract anything from.
208
- done.add(sid);
209
- if (!opts.dryRun && processed % SAVE_EVERY === 0) saveState(statePath, done);
210
- continue;
211
- }
212
-
213
- if (opts.dryRun) {
214
- factsWritten += facts.length;
215
- processed += 1;
216
- opts.onProgress?.(i + 1, total, sid, "ok", `would-write=${facts.length}`);
217
- continue;
218
- }
219
-
220
- try {
221
- await opts.store.insertFactsForSession(
222
- sid,
223
- opts.factStore,
224
- facts,
225
- opts.embedder ?? null,
226
- );
227
- } catch (err) {
228
- storageFailures += 1;
229
- const detail = err instanceof Error ? err.message : String(err);
230
- opts.onProgress?.(i + 1, total, sid, "storage_failed", detail);
231
- continue;
232
- }
233
-
234
- factsWritten += facts.length;
235
- processed += 1;
236
- done.add(sid);
237
- opts.onProgress?.(i + 1, total, sid, "ok", `wrote=${facts.length}`);
238
- if (processed % SAVE_EVERY === 0) saveState(statePath, done);
239
- }
240
-
241
- if (!opts.dryRun) saveState(statePath, done);
242
-
243
- return {
244
- total,
245
- processed,
246
- factsWritten,
247
- skippedAlreadyDone,
248
- skippedExistingFacts,
249
- skippedNoBody,
250
- skippedLowConfidence,
251
- classifyFailures,
252
- storageFailures,
253
- };
254
- }