nlm-memory 0.4.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli/nlm.js +221 -32
- package/dist/cli/nlm.js.map +1 -1
- package/dist/core/adapters/cursor.d.ts +45 -0
- package/dist/core/adapters/cursor.js +397 -0
- package/dist/core/adapters/cursor.js.map +1 -0
- package/dist/core/adapters/from-source.js +10 -0
- package/dist/core/adapters/from-source.js.map +1 -1
- package/dist/core/adapters/windsurf.d.ts +44 -0
- package/dist/core/adapters/windsurf.js +299 -0
- package/dist/core/adapters/windsurf.js.map +1 -0
- package/dist/core/hook/claude-settings.d.ts +12 -5
- package/dist/core/hook/claude-settings.js +21 -6
- package/dist/core/hook/claude-settings.js.map +1 -1
- package/dist/core/sources/source-registry.d.ts +1 -1
- package/dist/core/sources/source-registry.js +18 -0
- package/dist/core/sources/source-registry.js.map +1 -1
- package/dist/core/storage/sqlite-session-store.d.ts +2 -0
- package/dist/core/storage/sqlite-session-store.js +38 -2
- package/dist/core/storage/sqlite-session-store.js.map +1 -1
- package/dist/hook/hook-auth.d.ts +13 -0
- package/dist/hook/hook-auth.js +19 -0
- package/dist/hook/hook-auth.js.map +1 -0
- package/dist/hook/prompt-recall-hook.js +7 -1
- package/dist/hook/prompt-recall-hook.js.map +1 -1
- package/dist/hook/session-start-hook.js +4 -1
- package/dist/hook/session-start-hook.js.map +1 -1
- package/dist/hook/stop-hook.js +4 -1
- package/dist/hook/stop-hook.js.map +1 -1
- package/dist/http/app.d.ts +2 -0
- package/dist/http/app.js +74 -0
- package/dist/http/app.js.map +1 -1
- package/dist/install/claude-code.js +1 -1
- package/dist/install/claude-code.js.map +1 -1
- package/dist/install/cursor.d.ts +25 -0
- package/dist/install/cursor.js +43 -0
- package/dist/install/cursor.js.map +1 -0
- package/dist/install/nlm-dir-perms.d.ts +19 -0
- package/dist/install/nlm-dir-perms.js +43 -0
- package/dist/install/nlm-dir-perms.js.map +1 -0
- package/dist/install/ollama.d.ts +18 -1
- package/dist/install/ollama.js +68 -10
- package/dist/install/ollama.js.map +1 -1
- package/dist/install/setup.d.ts +4 -0
- package/dist/install/setup.js +141 -18
- package/dist/install/setup.js.map +1 -1
- package/dist/install/windsurf.d.ts +25 -0
- package/dist/install/windsurf.js +43 -0
- package/dist/install/windsurf.js.map +1 -0
- package/dist/shared/types.d.ts +4 -0
- package/dist/ui/assets/{index-BA6IpU8g.css → index-C8cpwbYJ.css} +1 -1
- package/dist/ui/assets/index-CB50QnL-.js +69 -0
- package/dist/ui/index.html +2 -2
- package/logs/CHANGELOG/CHANGELOG-2026.md +186 -0
- package/logs/CHANGELOG/CHANGELOG.md +107 -235
- package/migrations/014_sources_cursor.sql +30 -0
- package/migrations/015_sources_windsurf.sql +30 -0
- package/package.json +1 -1
- package/plugin/scripts/prompt-recall-hook.mjs +55 -4
- package/plugin/scripts/stop-hook.mjs +57 -6
- package/src/cli/nlm.ts +224 -31
- package/src/core/adapters/cursor.ts +486 -0
- package/src/core/adapters/from-source.ts +10 -0
- package/src/core/adapters/windsurf.ts +386 -0
- package/src/core/hook/claude-settings.ts +30 -9
- package/src/core/sources/source-registry.ts +19 -1
- package/src/core/storage/sqlite-session-store.ts +46 -1
- package/src/hook/hook-auth.ts +18 -0
- package/src/hook/prompt-recall-hook.ts +7 -1
- package/src/hook/session-start-hook.ts +4 -1
- package/src/hook/stop-hook.ts +4 -1
- package/src/http/app.ts +78 -0
- package/src/install/claude-code.ts +1 -1
- package/src/install/cursor.ts +68 -0
- package/src/install/nlm-dir-perms.ts +55 -0
- package/src/install/ollama.ts +86 -10
- package/src/install/setup.ts +138 -17
- package/src/install/windsurf.ts +68 -0
- package/src/shared/types.ts +4 -0
- package/src/ui/components/SessionDrawer.tsx +97 -34
- package/src/ui/pages/River.tsx +90 -44
- package/src/ui/pages/Search.tsx +357 -64
- package/src/ui/pages/Thread.tsx +267 -56
- package/src/ui/styles.css +129 -5
- package/tests/integration/getbyids-sqlite.test.ts +40 -0
- package/tests/integration/hook-claude-settings.test.ts +14 -1
- package/tests/integration/mcp.test.ts +12 -0
- package/tests/integration/source-registry.test.ts +5 -3
- package/tests/unit/core/adapters/cursor.test.ts +485 -0
- package/tests/unit/core/adapters/windsurf.test.ts +416 -0
- package/dist/ui/assets/index-B_qIVV0k.js +0 -69
|
@@ -2,6 +2,113 @@
|
|
|
2
2
|
|
|
3
3
|
Session-level log per session protocol. Cap: 10 entries — archive older to `CHANGELOG-YYYY.md` when exceeded.
|
|
4
4
|
|
|
5
|
+
## 2026-05-29 — v0.5.0: cross-platform parity, security hardening, model picker
|
|
6
|
+
|
|
7
|
+
**Cross-platform daemon + hooks** — the missing platforms now install cleanly:
|
|
8
|
+
|
|
9
|
+
- **Windows hook command format.** `buildHookCommand` is platform-aware: emits `set NLM_HOOK_MODE=mode && "exec" "script"` on Windows so cmd.exe parses it correctly; `smokeTestHookCommand` dispatches via `cmd.exe /c` instead of `sh -c`. Adds `cmdQuote()` for cmd.exe double-quote escaping. Pre-fix, hook install would silently roll back on any Windows-native install because the POSIX `sh -c` smoke test failed at `spawnSync`.
|
|
10
|
+
- **Linux systemd user unit.** New `~/.config/systemd/user/nlm.service` template — `Type=simple`, `Restart=on-failure`, logs to `~/.nlm/logs/daemon-{out,err}.log`. `nlm install` / `nlm uninstall` / `nlm setup` all branch into systemd on Linux (was hint-only). Detects `XDG_RUNTIME_DIR` + `systemctl --user` presence so headless servers get a `loginctl enable-linger` callout instead of a confusing error.
|
|
11
|
+
- Setup wizard now branches on `process.platform` for all three OSes; macOS LaunchAgent flow unchanged.
|
|
12
|
+
|
|
13
|
+
**Hook mode default flipped: shadow → live.** Shadow remains opt-in via `NLM_HOOK_MODE=shadow` in the command, but the wizard and `nlm hook install` now ship `live` so new users see pointer-block injection on the first prompt. Earlier behavior left the hook silent until the user found the toggle, which undermines the recall hook's own value proposition. Install message text + descriptions updated.
|
|
14
|
+
|
|
15
|
+
**Security hardening** — closes three classes of exposure on top of the existing 127.0.0.1 bind:
|
|
16
|
+
|
|
17
|
+
- **`~/.nlm/` perms backfill.** New `src/install/nlm-dir-perms.ts` recursively chmods dirs → `0o700` and files → `0o600`, idempotent, runs on every `nlm setup`, `nlm install`, and `nlm start`. Covers the upgrade path for installs from before v0.4.2 (when explicit chmod was added only to `writeClassifierConfig`'s output) — existing `~/.nlm/.env` and `canonical.sqlite` would otherwise stay `0o644` forever.
|
|
18
|
+
- **Local-only HTTP middleware** on `/api/*`. Threat model: external network blocked by bind, but DNS rebinding, browser drive-by from cross-origin tabs, and port-forwarded clients on other machines remained. New middleware enforces (1) Host header on the allowed loopback list with or without port, (2) Origin header (when present) on the same loopback list, (3) Bearer token (`Authorization: Bearer ${NLM_MCP_TOKEN}`, timing-safe compared) when Origin is absent. `/api/health` bypasses Origin/Bearer for liveness probes but is still Host-checked. Skipped under Vitest via `!!process.env["VITEST"]` so in-process `app.request()` tests still work.
|
|
19
|
+
- **Auto-generated `NLM_MCP_TOKEN`.** New `ensureMcpToken()` in `src/install/ollama.ts` generates 32 random bytes (64-char hex, 256-bit entropy) and persists to `~/.nlm/.env` if no token is set. Idempotent — re-reads file before writing to survive parallel setup runs. Called from `runSetup` and `nlm start` so existing installs upgrade without operator action.
|
|
20
|
+
- **Hook auth headers.** New `src/hook/hook-auth.ts` exports `hookAuthHeaders()` that attaches Bearer when `NLM_MCP_TOKEN` is set. All three hooks (`prompt-recall-hook`, `session-start-hook`, `stop-hook`) now call `autoloadEnv()` at startup and route their fetch headers through it so they continue to reach `/api/recall` and `/api/recall/cite-event` after the gate goes on.
|
|
21
|
+
|
|
22
|
+
**Classifier provider + model picker.** Wizard now asks for provider (DeepSeek cloud / Ollama local) and model:
|
|
23
|
+
|
|
24
|
+
- DeepSeek path surfaces an explicit privacy callout before the API key prompt: "DeepSeek classification sends up to 30K chars of each session transcript to api.deepseek.com." Then picks from `deepseek-v4-flash` / `deepseek-v4-pro` / `deepseek-chat`.
|
|
25
|
+
- Ollama path queries `localhost:11434/api/tags`, filters out embedding-only models (`nomic-embed`, `mxbai-embed`, `snowflake-arctic-embed`, `bge-*`), and shows the rest as a sorted list. Falls back to `phi4-mini:latest` with a warning if Ollama isn't reachable.
|
|
26
|
+
- `writeClassifierConfig` is now overloaded: legacy `(choice, apiKey?)` signature still works; new `({choice, model, apiKey})` form persists `NLM_CLASSIFIER`, `NLM_CLASSIFIER_MODEL`, and `DEEPSEEK_API_KEY` in `~/.nlm/.env`.
|
|
27
|
+
|
|
28
|
+
**Switch default-case exhaustiveness.** `runSetup`'s runtime-loop switch now has a `default: never` arm that warns on an unknown id — guards against the `multiselect<RuntimeId>` runtime cast silently producing an unmatched value.
|
|
29
|
+
|
|
30
|
+
**Dry-run path fixes.** `nlm connect claude-code --dry-run` now calls `mcpConfigPath()` instead of hardcoding `~/.mcp.json`, so users with `NLM_MCP_CONFIG` set see the right path in the dry-run output.
|
|
31
|
+
|
|
32
|
+
**Tests:** 601/601 passing. New tests cover the Windows hook command format and the cmd.exe smoke-test branch. The local-only middleware is bypassed in the existing in-process Vitest harness (its skip-gate is active under `VITEST`), so these tests do not exercise the Host/Origin/Bearer checks themselves.
|
|
33
|
+
|
|
34
|
+
**Next:** publish v0.5.0 to npm + tag GitHub. Consider an `nlm hook live` / `nlm hook shadow` toggle command so existing v0.4.x installs can flip without re-running `hook install`.
|
|
35
|
+
|
|
36
|
+
## 2026-05-29 — Search rebuild, Thread runtime filters, SessionDrawer nav, pagination
|
|
37
|
+
|
|
38
|
+
**Search page full rewrite** — replaced a hard-capped 50-result list with a production-grade search UI:
|
|
39
|
+
- Pagination: `PAGE_SIZE_OPTIONS = [10, 25, 50, 100]`, default 25, prev/next/first/last controls
|
|
40
|
+
- Filter chips: runtime, status, entity (top 12 + overflow `<select>`), sort mode (relevance/recent)
|
|
41
|
+
- Match snippets: 120-char window anchored on first token hit, `<mark>` highlighting, XSS-safe via HTML escape before regex
|
|
42
|
+
- Field-origin live-tag: shows which field (label/entity/decision/open/summary) matched
|
|
43
|
+
- Score weights: label×3, entity-exact×4, entity-substring×2, decision×2, open×2, summary×1, phrase-bonus+5
|
|
44
|
+
- Sticky search header, "clear filters" button, `anyFilterActive` empty-state hint
|
|
45
|
+
- SessionDrawer integrated; prev/next from paged slice
|
|
46
|
+
|
|
47
|
+
**Thread page** — runtime/agent filter chips:
|
|
48
|
+
- `EntityPicker` now has: search input, sort chips (most-active/least-active/a-z/z-a), pagination [24,48,96] default 48, runtime filter chips
|
|
49
|
+
- `ThreadSessionList` adds runtime filter chip row (only rendered when `threadRuntimes.length > 1`)
|
|
50
|
+
- Bug fixed: runtime filter reset now depends on `entity` string prop, not `thread` object reference (was resetting on sort)
|
|
51
|
+
|
|
52
|
+
**SessionDrawer** — keyboard and button navigation:
|
|
53
|
+
- `prevSessionId` / `nextSessionId` props; ← / → arrow key nav
|
|
54
|
+
- Chevron SVG buttons in drawer header
|
|
55
|
+
|
|
56
|
+
**UI/UX review loop** — spec-first pass (Opus) before developer subagent; review pass after. Caught 3 runtime bugs: hooks ordering, Vitest env detection (`VITEST=1` not `"true"`), CSS currentColor misuse on dot-pulse.
|
|
57
|
+
|
|
58
|
+
**Tests:** 601/601 passing.
|
|
59
|
+
|
|
60
|
+
**Next:** Supersedence visible in River + `get_session` MCP response (editable timeline moat needs to be visible in the UI).
|
|
61
|
+
|
|
62
|
+
## 2026-05-29 — Workspace coverage, global DB mode, CLI connect/disconnect (9503042)
|
|
63
|
+
|
|
64
|
+
**CursorAdapter expansion** — three-format coverage via prefix-based dispatch:
|
|
65
|
+
- `cr_` — global `cursorDiskKV` (current, v1.x+; already shipped)
|
|
66
|
+
- `crw_` — workspace `ItemTable` `composer.composerData` → `allComposers[]` (v0.43–v1.x migration artifact)
|
|
67
|
+
- `crc_` — workspace `ItemTable` `chatdata` tabs (all versions)
|
|
68
|
+
|
|
69
|
+
`parseSession()` routes by prefix; `workspaceStorageDir()` derived from global DB path parent-of-parent so no extra config needed. `discover()` deduplicates across global + all workspace DBs via a `seen` Set.
|
|
70
|
+
|
|
71
|
+
**WindsurfAdapter expansion** — global DB agent/flow sessions (`wsg_` prefix):
|
|
72
|
+
- Tries `cursorDiskKV` first (`composerData:*`, `agentData:*`, `flowData:*`); falls back to `ItemTable` LIKE query on `%agent%`/`%flow%`/`%cascade%` keys when `cursorDiskKV` absent
|
|
73
|
+
- `since` filter bug fixed: `lastSendTime=0` previously matched `0 < cutoff` and was filtered out; guard changed to `ts > 0 && ts < cutoff` so zero-timestamps (unknown age) are always included
|
|
74
|
+
|
|
75
|
+
**All discover() IDs now prefixed** — prefix is the routing token, not decoration. Legacy unprefixed IDs still accepted in `parseSession()` via fallthrough.
|
|
76
|
+
|
|
77
|
+
**CLI commands wired** — `nlm connect cursor`, `nlm connect windsurf`, `nlm disconnect cursor`, `nlm disconnect windsurf`. Each opens the NLM `SqliteSessionStore`, creates a `SourceRegistry`, calls the appropriate install function, prints a one-line report. Supports `--dry-run` and `--db-path`/`--user-dir` overrides. `exactOptionalPropertyTypes` fix: optional CLI option values spread conditionally (`...(opts.x ? { x: opts.x } : {})`).
|
|
78
|
+
|
|
79
|
+
**Tests** — 596/596 passing (up from 543). Adapter tests grew from 39 to 53. Added workspace composer (`crw_`), chat tab (`crc_`), Windsurf global DB (`wsg_`), since=0 fix, and since-filter for all three tab types.
|
|
80
|
+
|
|
81
|
+
**State:** v0.4.2 on npm (no bump this session — workspace+CLI work is additive on 0.4.2). 596 tests green. Commit `9503042`.
|
|
82
|
+
|
|
83
|
+
**Next:** Dedicated UI session — supersedence visible in River + `get_session` MCP response (editable timeline moat needs to be visible).
|
|
84
|
+
|
|
85
|
+
## 2026-05-29 — Cursor adapter + Windsurf adapter (NocoDB #182, #183)
|
|
86
|
+
|
|
87
|
+
**CursorAdapter** (`src/core/adapters/cursor.ts`)
|
|
88
|
+
|
|
89
|
+
Reads Cursor AI composer sessions from `globalStorage/state.vscdb` (macOS: `~/Library/Application Support/Cursor/User/globalStorage/state.vscdb`, Linux: `~/.config/Cursor/User/globalStorage/state.vscdb`). Schema: `cursorDiskKV` key-value table. Session = one `composerData:<composerId>` entry. Messages read from inline `conversation[]` (v1.x) or `bubbleId:<composerId>:*` separate storage (v1.5+). Type `1` = user, `2` = assistant. Session ID prefix `cr_`. Env override: `NLM_CURSOR_DB_PATH`. Migration 014 adds `'cursor'` to the `sources.kind` CHECK constraint.
|
|
90
|
+
|
|
91
|
+
**WindsurfAdapter** (`src/core/adapters/windsurf.ts`)
|
|
92
|
+
|
|
93
|
+
Reads Windsurf (Codeium Cascade) chat sessions from workspace-scoped SQLite DBs in `<UserDir>/workspaceStorage/<hash>/state.vscdb`. Schema: `ItemTable`, key `workbench.panel.aichat.view.aichat.chatdata`, value JSON with `tabs[]`. Each tab = one session. Bubble role: `type: 'user'`→user, `type: 'ai'`→assistant; prefers `rawText` over `text`. Session ID prefix `ws_`. `pathOrUrl` = User directory (adapter discovers all workspace DBs by scanning). Env override: `NLM_WINDSURF_USER_DIR`. Migration 015 adds `'windsurf'` to the constraint.
|
|
94
|
+
|
|
95
|
+
**Wiring:**
|
|
96
|
+
- `from-source.ts`: `cursor` and `windsurf` cases added
|
|
97
|
+
- `source-registry.ts`: `SourceKind` extended; `seedDefaults()` now seeds 8 presets (cursor + windsurf auto-enabled if their paths exist)
|
|
98
|
+
- `tests/integration/source-registry.test.ts`: preset assertions updated to 8
|
|
99
|
+
|
|
100
|
+
**State:** 582 tests passing (was 543, +39 new). Build clean, typecheck clean.
|
|
101
|
+
|
|
102
|
+
**Next:** Training pipeline (#185, deferred until `nlm useful-scan --stats` shows >50 useful-hit-log positives). Consider bumping to v0.5.0 after `nlm connect cursor` CLI wiring.
|
|
103
|
+
|
|
104
|
+
## 2026-05-29 — Credential file permissions hardening (v0.4.2)
|
|
105
|
+
|
|
106
|
+
**Security fix (automated plugin review catch):** `src/install/ollama.ts` `writeClassifierConfig()` now creates `~/.nlm` with `mode: 0o700` and writes `.env` with `mode: 0o600`. Added `chmodSync` on both dir and file to repair permissions on pre-existing installations. This was flagged as HIGH by the `security-guidance@claude-code-plugins` stop hook; the manual audit and peer review had both already noted the `644` file issue, but the prior session ended before it was fixed. Published as `nlm-memory@0.4.2`.
|
|
107
|
+
|
|
108
|
+
**State:** 543 tests passing. GitHub tag `v0.4.2` pushed.
|
|
109
|
+
|
|
110
|
+
**Next:** Cursor adapter (NocoDB task #182), Windsurf adapter (#183). Training pipeline (#185) deferred until >50 useful hits in `useful-hit-log.jsonl`.
|
|
111
|
+
|
|
5
112
|
## 2026-05-29 — Security hardening: bind address, timing-safe auth, backup/restore gate
|
|
6
113
|
|
|
7
114
|
**Changes:**
|
|
@@ -99,239 +206,4 @@ CLI: `nlm connect hermes-agent` / `nlm disconnect hermes-agent` added to `src/cl
|
|
|
99
206
|
|
|
100
207
|
**Next:** transcript adapter for NousResearch Hermes sessions (session files stored in `~/.hermes-agent/sessions/` or equivalent); C2 Aider adapter; B3 extract-triples improvements.
|
|
101
208
|
|
|
102
|
-
## 2026-05-28 — C1: OpenCode adapter (SQLite-based, `opencode/1.0`)
|
|
103
|
-
|
|
104
|
-
OpenCode stores all sessions in a single SQLite DB (`~/Library/Application Support/opencode/opencode.db` on macOS, `$XDG_DATA_HOME/opencode/opencode.db` on Linux) rather than per-session JSONL files. The adapter reads it via `better-sqlite3` in readonly mode, reusing the same `TranscriptAdapter` port as Claude Code, Hermes, and pi.
|
|
105
|
-
|
|
106
|
-
**What ships**
|
|
107
|
-
|
|
108
|
-
- `src/core/adapters/opencode.ts` (new) — `OpenCodeAdapter` class. `detect()` checks for the DB file. `discover()` queries `session WHERE time_archived IS NULL` with optional `time_updated >= since` filter. `parseSession(sessionId)` joins the `session`, `message`, and `part` tables: extracts `text` parts (non-ignored) and `tool` parts (summarized as `[tool: <name>]`), skips structural parts (step-start/finish, reasoning, compaction, snapshot, patch, agent, retry). Label comes from `session.title` unless it's `"New session"`, in which case it falls back to the first user turn. `gitBranch` read from `.git/HEAD` in `session.directory`. `sourcePath` is `${dbPath}::${sessionId}`.
|
|
109
|
-
- `migrations/010_sources_opencode.sql` (new) — SQLite table-recreate migration to add `"opencode"` to the `sources.kind` CHECK constraint (SQLite does not support `ALTER COLUMN`). Copies existing rows, drops old table, renames new.
|
|
110
|
-
- `src/core/adapters/from-source.ts` — `"opencode"` case added to `adapterFromSource` switch.
|
|
111
|
-
- `src/core/sources/source-registry.ts` — `SourceKind` union extended; `seedDefaults()` now seeds 4 presets (added OpenCode row, auto-enabled if DB exists).
|
|
112
|
-
- `tests/unit/core/adapters/opencode.test.ts` (new) — 15 tests: detect enabled/disabled, discover (all sessions, archived exclusion, since filter, absent DB), parseSession (null for unknown, null for no usable turns, turn count + roles, ignored-part skipping, tool-part summarization, title label, fallback label, sourcePath format, projectDir, absent DB, ISO timestamps), and metadata assertions.
|
|
113
|
-
- `tests/integration/source-registry.test.ts` — two assertions updated: "seeds three presets" → "seeds four presets"; kind list updated to include `"opencode"`.
|
|
114
|
-
|
|
115
|
-
**Architecture note**
|
|
116
|
-
|
|
117
|
-
The `discover()` / `parseSession()` contract treats session IDs (not file paths) as the identifying string — the interface's `path: string` param is opaque, so this is valid. Users with OpenCode already installed get the source auto-enabled on first `nlm migrate` + daemon restart with no manual configuration.
|
|
118
|
-
|
|
119
|
-
**Tests: 488 pass** (was 470 before this session). All 57 test files green, build clean.
|
|
120
|
-
|
|
121
|
-
**Next:** README rewrite (D) — drop "self-improving accuracy" promise; lead with the three moats (editable timeline, cross-runtime MCP reach, 97.2% R@5). Then NousResearch Hermes adapter (#165, P1).
|
|
122
|
-
|
|
123
|
-
## 2026-05-28 — Code review: HOOK_SCRIPT_MARKERS bug caught and patched (44fec62)
|
|
124
|
-
|
|
125
|
-
`code-review:code-review` skill run against commits `10c16ac..285fe9e`. One confirmed bug found and fixed: `HOOK_SCRIPT_MARKERS` in `claude-settings.ts` did not include the three Phase 2 hook filenames (`session-start-hook.js`, `pre-compact-hook.js`, `subagent-start-hook.js`). Consequence: `nlm hook uninstall` silently left all three hooks behind; each reinstall appended a duplicate instead of replacing. Live settings had two `SessionStart` NLM entries. Fix: added three filenames to `HOOK_SCRIPT_MARKERS`, updated stale file-level comment, rebuilt, reinstalled. Settings deduplicated (1 entry per event × 6 hooks). 436/436 tests pass. No other confirmed bugs from the review — four lower-confidence items scored below 80 and were not acted on.
|
|
126
|
-
|
|
127
|
-
**State:** `nlm v0.3.0` installed globally. 6 hooks clean in `~/.claude/settings.json`. Shadow mode live.
|
|
128
|
-
|
|
129
|
-
**Next:** `nlm useful-scan` CLI (B1 full); C1 OpenCode adapter #180 (P1); B3 extract-triples redesign; tests for `session-start-hook.ts`.
|
|
130
|
-
|
|
131
|
-
## 2026-05-28 — Deploy v0.3.0: 6 hooks live; cite_session double-count fixed; useful_hit_rate stub; session-start source added
|
|
132
|
-
|
|
133
|
-
Four commits on main (`976e549` → `d013caf`). All 436 tests green throughout.
|
|
134
|
-
|
|
135
|
-
1. **B2 double-count fix** (`976e549`): `citation-detect.ts` was re-detecting `cite_session` tool_uses in the Stop hook and writing a second citation log entry. MCP handler already calls `appendCitation()` directly. Fix: skip `cite_session` in Stop hook detector; updated 5 tests in `citation-detect-cite-session.test.ts`.
|
|
136
|
-
2. **B1 stub** (`976e549`): added `useful_hit_rate: null` to `StatsResult` + both `recallStats()` return paths. Daily digest shows "pending" cleanly instead of a field-access error. Unblocks schema for future `nlm useful-scan` CLI.
|
|
137
|
-
3. **Phase 2 hook wiring** (`becb591`): `ALL_HOOKS` now includes SessionStart, PreCompact, SubagentStart. Version string corrected 0.2.0-dev → 0.3.0.
|
|
138
|
-
4. **session-start source** (`d013caf`): `src/hook/session-start-hook.ts` written against current interfaces (stale dist imported `loadSurfacedForBudget` that no longer exists). `ClaudeHookEvent` union extended with `SessionStart` + `SubagentStart`.
|
|
139
|
-
|
|
140
|
-
**State:** `nlm v0.3.0` installed globally, all 6 hooks active in shadow mode. Live measurement window open. Three weekly metrics per D5 start accumulating: cite_session call rate, useful_hit_rate (shows pending until nlm useful-scan lands), null-recall rate.
|
|
141
|
-
|
|
142
|
-
**Next:** `nlm useful-scan` CLI (B1 full implementation); B3 extract-triples redesign; C1 OpenCode adapter #180.
|
|
143
|
-
|
|
144
|
-
## 2026-05-28 — D4 thesis pivot: citation moat downgraded permanently; adapter breadth + editable timeline elevated; Phase 0/2/3 engineering landed
|
|
145
|
-
|
|
146
|
-
Full-day arc on 2026-05-27 producing three clusters of work: a 3-agent audit exposing recall-layer defects, five engineering branches integrated (Phases 0/2/3 of the 90-day plan), and a D4 strategic-pivot decision ending in a permanent thesis revision. The cite_session MCP tool lands on this branch (`phase-1c-cite-tool`) as the last Phase 0 piece.
|
|
147
|
-
|
|
148
|
-
**Morning 3-agent audit findings (functional + comparative + operability lenses, parallel dispatch)**
|
|
149
|
-
|
|
150
|
-
All three agents converged on the same structural defect: `src/hook/citation-detect.ts:54-64` matches session IDs only in tool *inputs*, but `recall_sessions` inputs carry `{query, mode, limit}` — never a session ID. Only `get_session({id})` triggers a match. The dominant agent usage pattern (read the surfaced digest → answer from it without a follow-up `get_session`) generates zero citations. Additional defects found: `useful:true` field never written by any code path (metric permanently reads zero); conversation memo cap of 10 silently kills injection past turn 4 because the memo is never pruned mid-conversation; hook locked to `mode=keyword` so Build F's force-include logic never activates at hook time.
|
|
151
|
-
|
|
152
|
-
**Phases 0/2/3 engineering shipped (5 branches integrated)**
|
|
153
|
-
|
|
154
|
-
- Phase 0.1 — Stop hook citation detector fixed: scans ALL assistant turns, not just the last (CHANGELOG entry "Stop-hook multi-turn citation detection" above)
|
|
155
|
-
- Phase 0.2 — `useful:true` write path added; `useful_hit_rate` goes from structurally 0% to a real metric
|
|
156
|
-
- Phase 0.3 — Conversation memo pruning: memo capped at 15 and pruned oldest-first when full; late-turn injection no longer silently dies at turn 4
|
|
157
|
-
- Phase 0.4 — Hook mode aligned to `hybrid` so Build F's force-include logic activates at hook time
|
|
158
|
-
- Phase 2 — SessionStart injection + opinionated 3-hook subset (SessionEnd, PreCompact, SubagentStart)
|
|
159
|
-
- Phase 3 — Training-data collection scaffolding (citation-log schema extended)
|
|
160
|
-
- Phase 1c (this branch) — `cite_session` MCP tool: explicit citation primitive that lets MCP clients signal "I used session X," bypassing the Stop-hook inference chain entirely
|
|
161
|
-
|
|
162
|
-
**Phase 1 simulation methodology failures (do not repeat)**
|
|
163
|
-
|
|
164
|
-
Two simulations run during the day produced misleading numbers and were invalidated before they influenced any decision:
|
|
165
|
-
|
|
166
|
-
1. A 70-prompt simulation of hook injection reported 15.7% `useful_hit_rate` but used a content-full injection format instead of production's pointer-only format. The content-full format inflates compliance because the session body is present in context; in production the body is NOT injected, only the ID + label. Methodology bug caught by a follow-up validator. The simulation's number was correct for the simulation's format; it is not an estimate of production rates.
|
|
167
|
-
2. A 6-session "Arm C" simulation was generated by the evaluating agent itself with deliberate compliance and reported 100% session compliance. This is not a population estimate — it is a demonstration that the measurement pipeline works end-to-end. The agent knows it is being measured and authored the sessions to pass. Not a useful number for forecasting real-system citation rates.
|
|
168
|
-
|
|
169
|
-
Rule extracted: simulation is valid for pipeline-validity checks (does the detector fire, does the rate field get written, does the log grow?). Simulation cannot estimate rates because rates are population statistics that require real-system samples.
|
|
170
|
-
|
|
171
|
-
**Pivot to `cite_session` MCP tool primitive**
|
|
172
|
-
|
|
173
|
-
The per-prompt ID injection approach (injecting surfaced session IDs into every UserPromptSubmit) was prototyped, tested against real Claude Code behavior, and rejected as not shippable. Real Claude showed a 0% prose citation rate on pointer-only injection: the model does not naturally surface NLM session IDs in its prose responses, and the pointer-only format (no body) doesn't give the model enough content to form a meaningful citation. Reverting that branch (`revert(hook): remove cite-by-ID injection`) was correct before `cite_session` landed as the alternative: an explicit MCP tool the agent can call to record a citation when it genuinely used a session.
|
|
174
|
-
|
|
175
|
-
**D4 thesis pivot (permanent)**
|
|
176
|
-
|
|
177
|
-
Senior ML engineer review concluded the citation-trained-reranker moat hypothesis fails on fundamentals:
|
|
178
|
-
|
|
179
|
-
1. Per-operator reranker trained on ~500-1000 citation events/year with prose-channel noise cannot beat the existing hybrid+RRF baseline at 97.2% R@5. The training set is too small and the signal-to-noise ratio too low for a meaningful quality lift.
|
|
180
|
-
2. Aggregated cross-operator training violates local-first. The privacy property is the product; pooling operator data to make reranking better destroys it.
|
|
181
|
-
3. Per-operator moat math: at ~70 daily recalls with ~15% compliance, the per-operator corpus is ~3,800 labeled rows/year. Cross-encoder reranking needs 10K+ to statistically separate from the baseline. The gap does not close in reasonable product timelines.
|
|
182
|
-
|
|
183
|
-
**Citation feedback loop's new role (quality-monitoring, not reranker training):** detect trash recalls, populate a "frequently useful" dashboard the operator can act on, seed future supervised work if/when aggregated data becomes available under explicit consent. NOT a moat. NOT a reranker pipeline. Permanently downgraded.
|
|
184
|
-
|
|
185
|
-
**Three elevated moats (D4 thesis):**
|
|
186
|
-
|
|
187
|
-
1. **Editable timeline / supersedence** — schema-level, retrofit-impossible for append-only-with-decay competitors (agentmemory, mem0, honcho). The supersedence link is the only non-destructive timeline primitive in any of the four systems.
|
|
188
|
-
2. **Cross-runtime reach via MCP** — Claude Code + Codex shipped. OpenCode (#164) and NousResearch Hermes (#165) adapters UNSHELVED — both elevated to P1. Cross-runtime distribution is now load-bearing, not nice-to-have.
|
|
189
|
-
3. **Passive corpus quality** — 97.2% R@5 on LongMemEval-S (better than agentmemory's published 95.2%). Build F's force-include + hybrid+RRF is the foundation. This number is credible, reproducible, and does not require the citation moat to be true.
|
|
190
|
-
|
|
191
|
-
**90-day direction (revised)**
|
|
192
|
-
|
|
193
|
-
Adapter breadth is the primary workstream: #164 OpenCode (~2 weeks), #165 NousResearch Hermes (after #164), then pi.dev/Cursor/Aider via `nlm-wrap` MCP wrapper. Editable-timeline UX visibility (making supersedence queryable from the River UI and the MCP `get_session` surface) is the secondary workstream — the moat is structural but invisible until the UI exposes it. Phase 1 passive measurement window continues; 97.2% R@5 baseline is the floor to defend.
|
|
194
|
-
|
|
195
|
-
**Next priorities**
|
|
196
|
-
|
|
197
|
-
1. #164 OpenCode adapter (P1, ~2 weeks) — cross-runtime is load-bearing
|
|
198
|
-
2. #165 NousResearch Hermes adapter (P1, after #164)
|
|
199
|
-
3. Editable-timeline UX: supersedence visible in River + `get_session` response
|
|
200
|
-
4. Retain 97.2% R@5 baseline — no retrieval algorithm changes without clearing this bar
|
|
201
|
-
|
|
202
|
-
## 2026-05-27 — Codex CLI adapter: marketplace plugin + MCP config wiring + interactive-mode hook dispatch
|
|
203
|
-
|
|
204
|
-
Cross-runtime adapter work, first target landed. NLM is now installable on Codex CLI via `nlm connect codex`, which registers a local plugin marketplace, installs the `nlm-memory` plugin, writes a sentinel-bracketed `[mcp_servers.nlm-memory]` block to `~/.codex/config.toml`, and (optionally with `--with-hooks`) drops a legacy `~/.codex/hooks.json` fallback. Designed to mirror agentmemory's distribution pattern but the integration surface for current Codex (0.134.0) is materially different from both Codex Desktop and the wiki's 2026-05-23 prediction.
|
|
205
|
-
|
|
206
|
-
**What ships**
|
|
207
|
-
|
|
208
|
-
- `plugin/.codex-plugin/plugin.json` — Codex plugin manifest declaring `mcpServers: "./.mcp.json"` and `hooks: "./hooks/hooks.json"` pointers
|
|
209
|
-
- `plugin/hooks/hooks.json` — `UserPromptSubmit` + `Stop` event registrations, scripts referenced via `${CLAUDE_PLUGIN_ROOT}`
|
|
210
|
-
- `plugin/.mcp.json` — MCP server registration (spawns `nlm mcp` over stdio); duplicated by the direct config.toml writer for redundancy
|
|
211
|
-
- `plugin/scripts/{prompt-recall-hook,stop-hook}.mjs` — esbuild single-file bundles of the existing TS hook entries, build pinned in `scripts/build-codex-plugin.mjs`
|
|
212
|
-
- `.agents/plugins/marketplace.json` — marketplace manifest declaring the plugin and its source path (`./plugin`)
|
|
213
|
-
- `src/install/codex.ts` — `connectCodex` / `disconnectCodex` / `writeMcpServerToConfig` / `removeMcpServerFromConfig` / `writeLegacyHooks` / `removeLegacyHooks`. Marketplace + plugin add are delegated to the `codex` binary (it owns trust + snapshot state); MCP config and hooks.json are written directly with sentinel markers so disconnect can strip exact regions without touching user-authored content.
|
|
214
|
-
- `src/cli/nlm.ts` — `nlm connect codex` and `nlm disconnect codex` commands. Flags: `--source <owner/repo>` (default `pbmagnet4/nlm-memory-ts`), `--local` shortcut for dev, `--with-hooks` to also write the legacy fallback, `--dry-run`.
|
|
215
|
-
|
|
216
|
-
**The four wrong-then-right turns worth keeping in memory**
|
|
217
|
-
|
|
218
|
-
1. *Codex hooks are not Claude-Code-shape settings.json entries.* The 2026-05-23 wiki claim of "identical schema, ~95% script reuse" was wrong on the install mechanism. Codex uses a marketplace + plugin architecture. Hook *contract* (events, stdin payload, stdout convention) is identical to Claude Code; install path is entirely different. Script logic reuses verbatim.
|
|
219
|
-
2. *Marketplace requires a `.agents/plugins/marketplace.json` at the repo root.* First connect attempt failed with `marketplace root does not contain a supported manifest` until that file landed. Reverse-engineered from `~/.codex/.tmp/plugins/.agents/plugins/marketplace.json` shipped by `openai-curated`.
|
|
220
|
-
3. *The marketplace policy field is enum-constrained.* `authentication: "NONE"` rejected as `unknown variant`; only `"ON_INSTALL"` and `"ON_USE"` accepted. NLM has no auth to do, so `"ON_USE"` was picked as a no-op-on-use default. Marketplace went green after the swap.
|
|
221
|
-
4. *`--dangerously-bypass-hook-trust` is misleadingly named.* The flag warns "hooks may run without review for this invocation" but in practice does not bypass trust at all. Hooks dispatched only after persisting trust via an interactive Codex session. Once trust landed in `[hooks.state]`, hooks fired in subsequent `codex exec` (non-interactive) calls too. The bypass flag's real role is unclear.
|
|
222
|
-
|
|
223
|
-
**Verified end-to-end** (`019e69fa-4ea1-7b10-8c66-70bda64ba086` is the codex session used for final validation)
|
|
224
|
-
|
|
225
|
-
- ✅ `codex plugin marketplace add ./` (local source) succeeds
|
|
226
|
-
- ✅ `codex plugin add nlm-memory@nlm-memory-ts` produces `installed, enabled` in `codex plugin list`
|
|
227
|
-
- ✅ Cached plugin at `~/.codex/plugins/cache/nlm-memory-ts/nlm-memory/0.3.0/` contains all expected files including dotfile dirs (`.codex-plugin/`, `.mcp.json`)
|
|
228
|
-
- ✅ `[mcp_servers.nlm-memory]` block written to `~/.codex/config.toml` between sentinels; idempotent under repeated connects; cleanly stripped on disconnect
|
|
229
|
-
- ✅ `UserPromptSubmit` hook dispatches from plugin path: codex stdout shows `hook: UserPromptSubmit` / `hook: UserPromptSubmit Completed`, hook-log gains an entry with codex session UUID (`019e...`), recall ran, gate evaluated, would-inject populated, shadow mode logged correctly
|
|
230
|
-
- ✅ Plugin-only default (`nlm connect codex` without `--with-hooks`) fires UserPromptSubmit exactly once per turn. The earlier double-fire with `--with-hooks` enabled (plugin path + legacy `~/.codex/hooks.json` both fired) is exactly why `--with-hooks` stays opt-in
|
|
231
|
-
- ✅ `codex_features list` confirms `hooks: stable, true` (so the runtime supports them) but `plugin_hooks: removed, false` (the older feature flag is dead; current path is the `hooks` engine with plugin-bundled config pointers)
|
|
232
|
-
|
|
233
|
-
**Not yet verified**
|
|
234
|
-
|
|
235
|
-
- ⏳ `Stop` hook dispatch — needs a one-time interactive trust approval before it fires (Codex only prompts for trust on hooks that have a chance to run; `codex exec` -p with bypass-trust did not surface a Stop prompt). Will land on Edward's next interactive `codex` turn.
|
|
236
|
-
- ⏳ Remote marketplace install (`codex plugin marketplace add pbmagnet4/nlm-memory-ts`). The local install is the harder code path (the marketplace.json had to be authored from scratch); remote install reuses the same files via git fetch. Verifying in this session's tail after the GitHub push.
|
|
237
|
-
|
|
238
|
-
**Trust mechanics, for the future**
|
|
239
|
-
|
|
240
|
-
Codex persists hook trust per `(source, event, ...)` tuple under `[hooks.state]` in `config.toml`. Once a user approves a hook the first time, subsequent invocations (including `codex exec`) fire without prompting. The hash is content-addressed — a release that changes a script binary requires re-trust. This means `nlm connect codex` from a fresh install always requires one interactive `codex` turn to bootstrap trust before hooks fire; we cannot do that step on the user's behalf.
|
|
241
|
-
|
|
242
|
-
**Build pipeline**
|
|
243
|
-
|
|
244
|
-
`npm run build` now chains `build:server` (tsc) + `build:ui` (vite) + `build:codex-plugin` (esbuild). The codex-plugin build is single-file per entry (no dependency tree shipped), platform=node, format=esm, target=node20. Each .mjs is under 10KB.
|
|
245
|
-
|
|
246
|
-
**Tests**
|
|
247
|
-
|
|
248
|
-
414 unit + integration pass unchanged. No new test files added in this commit — the install path is exercised by the verified end-to-end smoke flow (`nlm connect codex --local` → `codex exec` → hook-log delta inspection). Test surface for install/codex.ts and the build script should land in a follow-up.
|
|
249
|
-
|
|
250
|
-
**Wiki correction owed**
|
|
251
|
-
|
|
252
|
-
`Whtnxt Agent Vault/Ventures/nlm-memory/learnings.md` line 218 lists Codex CLI as "`~/.codex/` JSON-config hooks (identical schema to Claude Code) … ~95% script reuse from Claude Code". The script reuse claim is correct (the .ts files port verbatim); the install-mechanism claim is wrong (marketplace + plugin, not settings.json). Wiki update is the next priority after this commit lands.
|
|
253
|
-
|
|
254
|
-
**Next priorities** (revised from the morning's stack)
|
|
255
|
-
|
|
256
|
-
1. Wiki update correcting the 2026-05-23 cross-runtime hook landscape table and adding a Codex plugin Tool Lesson. ← **Up next.**
|
|
257
|
-
2. Stop hook validation on Edward's first interactive codex turn (passive — happens whenever).
|
|
258
|
-
3. NousResearch Hermes Agent (#165) — has the cleanest `plugin.yaml` hook surface and was identified in the wiki as the next runtime worth a real adapter. I can validate it end-to-end without a TTY, unlike Codex.
|
|
259
|
-
4. Mode B pre-mortem and alt-embedding A/B remain shelved.
|
|
260
|
-
|
|
261
|
-
## 2026-05-27 — Stop-hook multi-turn citation detection: useful_hit_rate goes from structurally 0% to a real metric
|
|
262
|
-
|
|
263
|
-
Bug-fix to the Stop hook's citation detector. The previous implementation scanned only the LAST assistant turn of the transcript, but `tool_use` blocks live in earlier turns — the typical pattern is `tool_use → tool_result → prose summary`, and Stop fires after the summary. The detector saw prose, found no tool_use, logged 0 citations. Production evidence: 348 Stop firings with surfaced IDs, **zero** citations recorded, despite 23 real `mcp__nlm-memory__*` tool_uses in the matching transcripts over the last 7 days.
|
|
264
|
-
|
|
265
|
-
**Diagnosis path.** Cross-referenced `~/.nlm/hook-log.jsonl` (stop entries, all `citedIds:[]`) against `~/.claude/projects/<workspace>/<conv>.jsonl` (real assistant turns). Drilled into `1fc5a8f1-00fa-4ff5-85e7-a239072082b2`: recall hook surfaced `cc_7ff73609-…`, the assistant called `get_session({id:"cc_7ff73609-…"})` in turn N-1, then wrote a prose summary in turn N; the Stop hook scanned only turn N and logged `citedIds:[]`. Confirmed by code path at `transcript.ts:48` — the loop returns on the first assistant line found walking from the end.
|
|
266
|
-
|
|
267
|
-
**Changes**
|
|
268
|
-
- `src/core/hook/transcript.ts` — added `readAllAssistantTurns(transcriptPath): ReadonlyArray<AssistantTurn>` that returns every assistant turn in order. Kept `readLastAssistantTurn` as a thin wrapper (single test caller; back-compat for non-Stop callers).
|
|
269
|
-
- `src/core/hook/cite-memo.ts` (new) — per-conversation cited-set memo mirroring `memo.ts`. Same state dir (`~/.nlm/hook-state/`, overridable via `NLM_HOOK_STATE_DIR`), filename suffix `.cited.json` so memo-sweep's existing dir-walk cleans both surfaced and cited memos by mtime. `loadCited` / `recordCited` / `clearCited`.
|
|
270
|
-
- `src/hook/stop-hook.ts` — `runStopHook` now reads all assistant turns, unions text + tool_uses across them, runs `detectCitations` over the union, dedupes against `loadCited(conversationId)`, posts the fresh ones, and persists via `recordCited`. The `responsePreview` stays as the LAST turn's prose (that's the text Edward saw when Stop fired). Daemon remains blind-append; dedup is hook-local.
|
|
271
|
-
- `src/hook/session-end-hook.ts` — `runSessionEnd` now also calls `clearCited` so both memos are cleaned on session close.
|
|
272
|
-
- `scripts/backfill-citations.mjs` (new) — one-shot historical replay. Walks `~/.nlm/hook-log.jsonl` to collect surfaced-ID sets per conversation, finds matching transcripts under `~/.claude/projects/`, runs the same detector, dedupes against existing `~/.nlm/citation-log.jsonl` entries, appends fresh citations with a `backfill:true` marker. Idempotent. Dry-run by default; `--commit` writes.
|
|
273
|
-
|
|
274
|
-
**Validation**
|
|
275
|
-
- Tests: 414 unit + integration tests pass (was 396, +18 new). New cases cover: tool_use detected when it's in an earlier turn and the last turn is prose-only (the real-world pattern); dedup across repeated Stop firings on a growing transcript; local memo update even when `postCitation` fails (no double-count on next fire); 10 `cite-memo` cases (load/record/clear/corrupt-file/non-array/path-safety); 3 `readAllAssistantTurns` cases; 2 new session-end cases.
|
|
276
|
-
- Typecheck clean on changes (pre-existing `SessionEnd` error in `hook-claude-settings.test.ts` is unrelated and predates this work).
|
|
277
|
-
- Backfill dry-run against the live `~/.nlm/hook-log.jsonl`: 42 conversations had surfaced IDs, 37 had a matching transcript, **4 conversations contain at least one tool_use citation the old detector missed**. This is lower than the upper bound suggested by the raw tool-use count (23) because many tool_uses were `recall_sessions`/`recall_facts` (no surfaced-ID-in-input — those are pull, not push-follow-up). The 4 captured citations are the ones where the model actually drilled into a surfaced session via `get_session(id=...)`.
|
|
278
|
-
|
|
279
|
-
**Impact.** `useful_hit_rate` (cited / surfaced) goes from a structural 0% to a real signal. This is the training-data substrate for the future learned reranker (each row in the citation log is a `(query, returned_id, was_cited)` triple once joined against `~/.nlm/query_log.jsonl` by `conversation_id`). The 348 stop firings that previously generated zero training rows would have generated ~10-15 if the detector had been working — small but real, and growing with every conversation going forward.
|
|
280
|
-
|
|
281
|
-
**Methodology note worth keeping.** The bug was diagnosable in <10 minutes by cross-referencing two existing log streams (hook-log.jsonl × Claude Code transcripts) before touching code. Tomorrow's-self version of this rule: when a telemetry metric reads structurally zero, scan the raw inputs the metric is supposed to consume before assuming the metric is correct. Filing in `Operations/what-works/code-quality.md` candidate set.
|
|
282
|
-
|
|
283
|
-
**Next priorities (unchanged from earlier today's update):**
|
|
284
|
-
|
|
285
|
-
1. ~~Stop hook citation rate.~~ Shipped.
|
|
286
|
-
2. Pre-mortem Mode B before any code. Ceiling +1.5% hybrid temporal — current recommendation is to shelve unless a separate driver emerges.
|
|
287
|
-
3. Cross-runtime hook adapters (Hermes / pi / Codex). Unchanged.
|
|
288
|
-
4. Alt-embedding A/B — still deferred.
|
|
289
|
-
|
|
290
|
-
**Source:** Whtnxt Agent orchestrator session 2026-05-27 (continuation from Build F ship). Diagnosis grounded in `~/.nlm/hook-log.jsonl` (342 stop entries, 0 citations) and `~/.claude/projects/-Users-echalupa-Documents-Coding-Projects-Whtnxt-Agent/*.jsonl` (23 NLM tool_uses across 7 days).
|
|
291
|
-
|
|
292
|
-
## 2026-05-27 — Build F shipped: force-include keyword rank-1 on temporal+entity shape; hybrid temporal +3.0 / aggregate +0.8 / hybrid beats keyword for the first time
|
|
293
|
-
|
|
294
|
-
Single session arc, ~6 hours: Build E′ (asymmetric RRF multiplicative boost) shipped → harness-tested → falsified by head-baseline → reverted → diagnosed via per-question `results.json` → Probes 1 & 2 designed and run → Build F (post-merge force-include) shipped → confirmed by clean A/B head-baseline → shipped. Three full harness runs (1 cold ~50 min + 2 hot ~25s) plus two probe scripts. Zero false ships.
|
|
295
|
-
|
|
296
|
-
**Build E′ (falsified path, recorded for audit trail).** Built `src/core/recall/query-shape.ts` with `detectQueryShape(query)` returning `{hasTemporal, hasNamedEntity}` (temporal regex covers "N days/weeks/months ago", "last <day>", "when did", "before/after I", "yesterday/today/tomorrow"; named-entity accepts ALL-CAPS acronyms and mixed-case tokens, excludes days of week and month names to avoid Mode B false-fires). Modified `mergeHybrid` to accept a `boostKeyword` param and multiply the keyword leg's `1/(RRF_K + rank_kw)` by 1.75 on shape match. Added 27 unit tests for `detectQueryShape`. Harness run `2026-05-26-16-39-52` (n=500, ~48 min, partial cache): hybrid temporal 91.0 → 92.5 / aggregate 95.8 → 96.4. Head-baseline rerun with boost disabled on the same cache (`2026-05-26-16-57-47`, 26.3s): **byte-identical numbers**. The lift was 100% cache enrichment from the 7,500→5,500 chunk-size change populating new embeddings; the boost contributed zero. Post-mortem probe: detector fires on 23/133 temporal queries, but on those 23 the multiplicative boost changed zero top-5 results — the boost magnitude (1.75×) was too small to overcome the "session appears in both lists at lower rank" advantage in RRF. Reverted; recorded in [[track-record]].
|
|
297
|
-
|
|
298
|
-
**Build F (shipped).** Replaced the failed multiplicative boost with a post-merge **force-include**: when shape is `temporal && namedEntity`, ensure `kwHits[0].session.id` is in the merged top-`limit` set; if not, insert at position `limit - 1`, displacing the lowest-confidence merged hit. Sidesteps RRF arithmetic entirely. ~10 lines in `forceIncludeKeywordTop()` helper at `src/core/recall/recall-service.ts`; detector unchanged from E′.
|
|
299
|
-
|
|
300
|
-
**Pre-build probes justified the build.** Probe 1 joined each hybrid temporal miss's keyword `returnedIds` against the dataset's `answer_session_ids` to compute keyword's rank for the gold session — on the 7 KW-FOUND misses, 5 had keyword rank=1 and 2 were within rank 5 (force-include trivially recovers all 7 if the detector fires). Probe 2 measured detector fire rate by `question_type`: 17.3% on temporal-reasoning, 0% on the two paraphrase types (single-session-preference, single-session-assistant), 1.4-2.6% on the other non-temporal types — bounded blast radius of ~5 queries across 367 non-temporal questions.
|
|
301
|
-
|
|
302
|
-
**Clean A/B (same hot cache, identical code except the force-include branch).** Build F (`2026-05-26-22-47-07`, cold rebuild ~85 min) vs head-baseline boost-off (`2026-05-26-22-56-53`, 22.1s on now-hot cache):
|
|
303
|
-
|
|
304
|
-
| Metric | Off | On | Δ |
|
|
305
|
-
|---|---|---|---|
|
|
306
|
-
| hybrid aggregate | 96.4 | **97.2** | **+0.8** |
|
|
307
|
-
| hybrid temporal | 92.5 | **95.5** | **+3.0** |
|
|
308
|
-
| all other types | byte-identical | byte-identical | 0 |
|
|
309
|
-
| keyword aggregate | 96.6 | 96.6 | 0 |
|
|
310
|
-
| semantic aggregate | 91.6 | 91.6 | 0 |
|
|
311
|
-
|
|
312
|
-
Zero regression on any question type. Detector unchanged from E′ — the difference is force-include sidestepping the RRF math rather than trying to outmuscle it.
|
|
313
|
-
|
|
314
|
-
**Hybrid finally beats keyword on aggregate** (97.2 > 96.6) — first time on this benchmark. Resolves the structural tension from 2026-05-25 where keyword led aggregate R@5. The 2026-05-23 MCP default flip to hybrid is now backed by k=5 numbers, not just the k=20 ablation.
|
|
315
|
-
|
|
316
|
-
**Gate check vs the 2026-05-26 brief:** target was `hybrid temporal R@5 ≥ +4 (target ~95+)`. Landed at +3.0 / 95.5 — one question shy of +4 but inside the 95+ landing target. The miss is "Who did I meet with during the lunch last Tuesday?" — detector skips because day-of-week is excluded from the named-entity set (necessary to avoid Mode B false-fires). Adding day-of-week as NE would catch this one question but cost the Mode B exclusions. Not worth the trade at scale.
|
|
317
|
-
|
|
318
|
-
**Tests:** 186 unit tests pass (added 27 for `detectQueryShape`); typecheck clean on changes (pre-existing `SessionEnd` error in `hook-claude-settings.test.ts` unrelated). Daemon unchanged (Build F is recall-path code, not ingest/embed).
|
|
319
|
-
|
|
320
|
-
**Operational gotcha filed.** Mid-session, `~/.cache/longmemeval/{embeddings.sqlite,longmemeval_s_cleaned.json}` vanished between two harness runs — macOS Sonoma+ auto-cleanup of `~/.cache/` during an idle window. Cost ~90 min of cold rebuild + 277 MB redownload. Mitigation: move the cache outside `~/.cache/` via `LONGMEMEVAL_CACHE_DIR=$HOME/.local/share/longmemeval` before the next harness run. Full diagnosis in `Operations/Tool Lessons/longmemeval-harness.md` (vault) — also captures the harness performance envelope and the pre-build probing methodology.
|
|
321
|
-
|
|
322
|
-
**Methodology lesson worth keeping.** Two-to-five-line probe scripts catch dead hypotheses cheaper than a full harness run. Pattern: (a) probe detector fire rate on the target distribution, (b) probe detector fire rate on the non-target distribution (blast radius), (c) probe the failure mode's mechanism (rank position, candidate-set membership). Run before harness; the result is right whether or not the build ships. Filed in `Ventures/nlm-memory/track-record.md` and `Operations/Tool Lessons/longmemeval-harness.md`. Candidate addition to `Operations/what-works/code-quality.md` if the pattern recurs outside NLM.
|
|
323
|
-
|
|
324
|
-
**Next priorities (updated):**
|
|
325
|
-
|
|
326
|
-
1. **Stop hook citation rate.** Now the highest-leverage moat work — hybrid is structurally sound at 97.2 aggregate; further R@5 work hits diminishing returns until a different lever gets pre-mortem'd.
|
|
327
|
-
2. **Pre-mortem Mode B before any code.** Only 2 of 10 hybrid temporal misses are both-leg misses. Ceiling on a successful Mode B fix is +2/133 = +1.5% hybrid temporal. Probe: can a query-time date parser actually resolve those 2 questions' answer windows? If under 50%, the build doesn't justify itself.
|
|
328
|
-
3. **Cross-runtime hook adapters** (Hermes / pi / Codex). Unchanged from prior handoff.
|
|
329
|
-
4. **Alt-embedding A/B** — still deferred. Hybrid 97.2 is a higher floor than the alt-embedding work was originally framed against. Reopen only when migration 010 is justified by a separate driver.
|
|
330
|
-
|
|
331
|
-
**Source:** Whtnxt Agent orchestrator session 2026-05-26 → 2026-05-27 (continuation); harness reports `reports/longmemeval/2026-05-26-16-39-52/` (E′ on partial cache), `…16-57-47/` (head-baseline boost off, byte-identical to E′), `…22-47-07/` (Build F on cold rebuild), `…22-56-53/` (head-baseline force-include off, same hot cache as 22-47-07). Probe scripts ephemeral at `/tmp/nlm-eprime/`.
|
|
332
|
-
|
|
333
|
-
_Older entries archived in CHANGELOG-2026.md_
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
209
|
_Older entries archived in CHANGELOG-2026.md_
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
-- Migration 014: add 'cursor' to the sources.kind CHECK constraint.
|
|
2
|
+
--
|
|
3
|
+
-- SQLite does not support ALTER COLUMN to modify CHECK constraints in place.
|
|
4
|
+
-- Standard approach: create new table → copy rows → drop old table → rename new table into place.
|
|
5
|
+
|
|
6
|
+
PRAGMA foreign_keys = OFF;
|
|
7
|
+
|
|
8
|
+
CREATE TABLE sources_new (
|
|
9
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
10
|
+
kind TEXT NOT NULL CHECK (kind IN ('claude-code', 'hermes', 'hermes-agent', 'aider', 'cursor', 'opencode', 'pi', 'jsonl-generic', 'webhook')),
|
|
11
|
+
name TEXT NOT NULL UNIQUE,
|
|
12
|
+
path_or_url TEXT,
|
|
13
|
+
runtime_label TEXT NOT NULL,
|
|
14
|
+
parse_config TEXT NOT NULL DEFAULT '{}',
|
|
15
|
+
enabled INTEGER NOT NULL DEFAULT 1,
|
|
16
|
+
token TEXT,
|
|
17
|
+
created_at TEXT NOT NULL DEFAULT (datetime('now')),
|
|
18
|
+
updated_at TEXT NOT NULL DEFAULT (datetime('now'))
|
|
19
|
+
);
|
|
20
|
+
|
|
21
|
+
INSERT INTO sources_new SELECT id, kind, name, path_or_url, runtime_label, parse_config, enabled, token, created_at, updated_at FROM sources;
|
|
22
|
+
|
|
23
|
+
DROP TABLE sources;
|
|
24
|
+
ALTER TABLE sources_new RENAME TO sources;
|
|
25
|
+
|
|
26
|
+
CREATE INDEX IF NOT EXISTS idx_sources_enabled ON sources(enabled) WHERE enabled = 1;
|
|
27
|
+
|
|
28
|
+
PRAGMA foreign_keys = ON;
|
|
29
|
+
|
|
30
|
+
INSERT OR IGNORE INTO schema_migrations (version, name) VALUES (14, '014_sources_cursor');
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
-- Migration 015: add 'windsurf' to the sources.kind CHECK constraint.
|
|
2
|
+
--
|
|
3
|
+
-- SQLite does not support ALTER COLUMN to modify CHECK constraints in place.
|
|
4
|
+
-- Standard approach: create new table → copy rows → drop old table → rename new table into place.
|
|
5
|
+
|
|
6
|
+
PRAGMA foreign_keys = OFF;
|
|
7
|
+
|
|
8
|
+
CREATE TABLE sources_new (
|
|
9
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
10
|
+
kind TEXT NOT NULL CHECK (kind IN ('claude-code', 'hermes', 'hermes-agent', 'aider', 'cursor', 'windsurf', 'opencode', 'pi', 'jsonl-generic', 'webhook')),
|
|
11
|
+
name TEXT NOT NULL UNIQUE,
|
|
12
|
+
path_or_url TEXT,
|
|
13
|
+
runtime_label TEXT NOT NULL,
|
|
14
|
+
parse_config TEXT NOT NULL DEFAULT '{}',
|
|
15
|
+
enabled INTEGER NOT NULL DEFAULT 1,
|
|
16
|
+
token TEXT,
|
|
17
|
+
created_at TEXT NOT NULL DEFAULT (datetime('now')),
|
|
18
|
+
updated_at TEXT NOT NULL DEFAULT (datetime('now'))
|
|
19
|
+
);
|
|
20
|
+
|
|
21
|
+
INSERT INTO sources_new SELECT id, kind, name, path_or_url, runtime_label, parse_config, enabled, token, created_at, updated_at FROM sources;
|
|
22
|
+
|
|
23
|
+
DROP TABLE sources;
|
|
24
|
+
ALTER TABLE sources_new RENAME TO sources;
|
|
25
|
+
|
|
26
|
+
CREATE INDEX IF NOT EXISTS idx_sources_enabled ON sources(enabled) WHERE enabled = 1;
|
|
27
|
+
|
|
28
|
+
PRAGMA foreign_keys = ON;
|
|
29
|
+
|
|
30
|
+
INSERT OR IGNORE INTO schema_migrations (version, name) VALUES (15, '015_sources_windsurf');
|
package/package.json
CHANGED
|
@@ -87,6 +87,56 @@ function selectHits(params) {
|
|
|
87
87
|
return eligible.slice(0, limit);
|
|
88
88
|
}
|
|
89
89
|
|
|
90
|
+
// src/llm/env-autoload.ts
|
|
91
|
+
import { readFileSync as readFileSync2, existsSync as existsSync2 } from "node:fs";
|
|
92
|
+
import { homedir as homedir3 } from "node:os";
|
|
93
|
+
import { resolve } from "node:path";
|
|
94
|
+
// Candidate .env locations, in the order they are listed: a home-relative
// ~/.nlm/.env first, then .env in the current directory and up to two parent
// directories.
var DEFAULT_SEARCH_PATHS = ["~/.nlm/.env", "./.env", "../.env", "../../.env"];
|
|
100
|
+
/**
 * Expand a leading "~/" to the current user's home directory.
 *
 * Only the exact "~/" prefix is expanded; every other input (including a bare
 * "~" or "~user/...") is returned unchanged.
 *
 * @param {string} p - Path that may start with "~/".
 * @returns {string} Absolute path under the home directory, or `p` as given.
 */
function expandHome(p) {
  const isHomeRelative = p.startsWith("~/");
  return isHomeRelative ? resolve(homedir3(), p.slice(2)) : p;
}
|
|
104
|
+
/**
 * Load KEY=VALUE pairs from .env-style files into `process.env`.
 *
 * Files are visited in order: `DEFAULT_SEARCH_PATHS` first, then `extraPaths`.
 * A variable is only set when it is currently undefined, so values already in
 * the environment are never overwritten and earlier files win over later ones.
 *
 * Parsing rules: blank lines, lines starting with "#", and lines without "="
 * are skipped; the key is the text before the first "="; the value is the
 * remainder, trimmed, with one pair of matching surrounding single or double
 * quotes removed.
 *
 * @param {string[]} [extraPaths=[]] - Additional files to try after the defaults.
 * @returns {string[]} Expanded paths of the files that were read successfully.
 */
function autoloadEnv(extraPaths = []) {
  const loaded = [];
  const paths = [...DEFAULT_SEARCH_PATHS, ...extraPaths];
  for (const raw of paths) {
    const path = expandHome(raw);
    if (!existsSync2(path)) continue;
    try {
      const content = readFileSync2(path, "utf8");
      for (const line of content.split("\n")) {
        const trimmed = line.trim();
        if (!trimmed || trimmed.startsWith("#")) continue;
        const eq = trimmed.indexOf("=");
        if (eq === -1) continue;
        const key = trimmed.slice(0, eq).trim();
        let value = trimmed.slice(eq + 1).trim();
        // Strip quotes only when there is a real opening/closing pair. Without
        // the length check a value that is a single quote character would match
        // itself as both ends and be collapsed to an empty string.
        const quote = value[0];
        const isQuoted =
          value.length >= 2 &&
          (quote === '"' || quote === "'") &&
          value.endsWith(quote);
        if (isQuoted) {
          value = value.slice(1, -1);
        }
        if (key && process.env[key] === undefined) {
          process.env[key] = value;
        }
      }
      loaded.push(path);
    } catch {
      // Best-effort: a file that cannot be read is skipped, not fatal.
      continue;
    }
  }
  return loaded;
}
|
|
132
|
+
|
|
133
|
+
// src/hook/hook-auth.ts
|
|
134
|
+
/**
 * Build request headers for hook-originated HTTP calls.
 *
 * Returns a shallow copy of `extra`. When the NLM_MCP_TOKEN environment
 * variable is set to a non-empty value, an `authorization: Bearer <token>`
 * entry is added, replacing any `authorization` key present in `extra`.
 *
 * @param {Record<string, string>} [extra={}] - Headers to include as-is.
 * @returns {Record<string, string>} A new headers object.
 */
function hookAuthHeaders(extra = {}) {
  const headers = { ...extra };
  const bearer = process.env["NLM_MCP_TOKEN"];
  if (bearer) {
    headers.authorization = `Bearer ${bearer}`;
  }
  return headers;
}
|
|
139
|
+
|
|
90
140
|
// src/hook/prompt-recall-hook.ts
|
|
91
141
|
var SCORE_THRESHOLD = 0;
|
|
92
142
|
var PER_FIRE_CAP = 3;
|
|
@@ -143,12 +193,12 @@ async function runHook(input, deps) {
|
|
|
143
193
|
return "";
|
|
144
194
|
}
|
|
145
195
|
function readStdin() {
|
|
146
|
-
return new Promise((
|
|
196
|
+
return new Promise((resolve2) => {
|
|
147
197
|
let data = "";
|
|
148
198
|
process.stdin.setEncoding("utf8");
|
|
149
199
|
process.stdin.on("data", (chunk) => data += chunk);
|
|
150
|
-
process.stdin.on("end", () =>
|
|
151
|
-
process.stdin.on("error", () =>
|
|
200
|
+
process.stdin.on("end", () => resolve2(data));
|
|
201
|
+
process.stdin.on("error", () => resolve2(data));
|
|
152
202
|
});
|
|
153
203
|
}
|
|
154
204
|
async function recallOverHttp(prompt) {
|
|
@@ -158,7 +208,7 @@ async function recallOverHttp(prompt) {
|
|
|
158
208
|
const timer = setTimeout(() => controller.abort(), RECALL_TIMEOUT_MS);
|
|
159
209
|
try {
|
|
160
210
|
const res = await fetch(url, {
|
|
161
|
-
headers: { "x-recall-source": "hook" },
|
|
211
|
+
headers: hookAuthHeaders({ "x-recall-source": "hook" }),
|
|
162
212
|
signal: controller.signal
|
|
163
213
|
});
|
|
164
214
|
if (!res.ok) return [];
|
|
@@ -180,6 +230,7 @@ async function recallOverHttp(prompt) {
|
|
|
180
230
|
}
|
|
181
231
|
async function main() {
|
|
182
232
|
try {
|
|
233
|
+
autoloadEnv();
|
|
183
234
|
const raw = await readStdin();
|
|
184
235
|
const payload = JSON.parse(raw);
|
|
185
236
|
const prompt = typeof payload.prompt === "string" ? payload.prompt : "";
|