sanook-cli 0.4.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238) hide show
  1. package/.env.example +19 -0
  2. package/CHANGELOG.md +173 -0
  3. package/README.md +153 -20
  4. package/README.th.md +136 -0
  5. package/dist/agentContext.js +4 -0
  6. package/dist/approval.js +6 -0
  7. package/dist/bin.js +405 -57
  8. package/dist/brain.js +92 -59
  9. package/dist/brand.js +47 -0
  10. package/dist/checkpoint.js +37 -0
  11. package/dist/commands.js +86 -6
  12. package/dist/compaction.js +76 -5
  13. package/dist/config.js +100 -12
  14. package/dist/cost.js +60 -3
  15. package/dist/doctor.js +92 -0
  16. package/dist/gateway/auth.js +2 -2
  17. package/dist/gateway/ledger.js +2 -2
  18. package/dist/gateway/scheduler.js +1 -0
  19. package/dist/gateway/serve.js +6 -4
  20. package/dist/gateway/server.js +10 -2
  21. package/dist/git.js +11 -2
  22. package/dist/hooks.js +43 -17
  23. package/dist/knowledge.js +48 -49
  24. package/dist/loop.js +182 -66
  25. package/dist/lsp/client.js +173 -0
  26. package/dist/lsp/framing.js +56 -0
  27. package/dist/lsp/index.js +138 -0
  28. package/dist/lsp/servers.js +82 -0
  29. package/dist/mcp-server.js +244 -0
  30. package/dist/mcp.js +184 -29
  31. package/dist/memory-store.js +559 -0
  32. package/dist/memory.js +143 -29
  33. package/dist/orchestrate.js +150 -0
  34. package/dist/providers/codex.js +21 -7
  35. package/dist/providers/keys.js +3 -2
  36. package/dist/providers/models.js +22 -6
  37. package/dist/providers/registry.js +155 -1
  38. package/dist/repomap.js +93 -0
  39. package/dist/search/chunk.js +158 -0
  40. package/dist/search/embed-store.js +187 -0
  41. package/dist/search/engine.js +203 -0
  42. package/dist/search/fuse.js +35 -0
  43. package/dist/search/index-core.js +187 -0
  44. package/dist/search/indexer.js +241 -0
  45. package/dist/search/store.js +77 -0
  46. package/dist/session.js +42 -8
  47. package/dist/skill-install.js +10 -10
  48. package/dist/skills.js +12 -9
  49. package/dist/summarize.js +31 -0
  50. package/dist/tools/bash.js +21 -2
  51. package/dist/tools/diagnostics.js +41 -0
  52. package/dist/tools/edit.js +29 -7
  53. package/dist/tools/index.js +8 -1
  54. package/dist/tools/list.js +7 -2
  55. package/dist/tools/permission.js +90 -9
  56. package/dist/tools/read.js +23 -4
  57. package/dist/tools/remember.js +1 -1
  58. package/dist/tools/sandbox.js +61 -0
  59. package/dist/tools/search.js +105 -4
  60. package/dist/tools/task.js +195 -29
  61. package/dist/tools/timeout.js +35 -0
  62. package/dist/tools/util.js +10 -0
  63. package/dist/tools/write.js +6 -4
  64. package/dist/trust.js +89 -0
  65. package/dist/ui/app.js +228 -31
  66. package/dist/ui/banner.js +4 -9
  67. package/dist/ui/brain-wizard.js +2 -2
  68. package/dist/ui/history.js +30 -0
  69. package/dist/ui/mentions.js +44 -0
  70. package/dist/ui/render.js +55 -15
  71. package/dist/ui/setup.js +97 -12
  72. package/dist/ui/useEditor.js +83 -0
  73. package/dist/update.js +114 -0
  74. package/dist/worktree.js +173 -0
  75. package/package.json +11 -5
  76. package/scripts/postinstall.mjs +33 -0
  77. package/second-brain/.agents/_Index.md +30 -0
  78. package/second-brain/.agents/skills/_Index.md +30 -0
  79. package/second-brain/.agents/workflows/_Index.md +30 -0
  80. package/second-brain/AGENTS.md +4 -4
  81. package/second-brain/Acceptance/_Index.md +30 -0
  82. package/second-brain/Acceptance/golden-case-template.md +39 -0
  83. package/second-brain/Areas/_Index.md +30 -0
  84. package/second-brain/Bugs/System-OS/_Index.md +30 -0
  85. package/second-brain/Bugs/_Index.md +30 -0
  86. package/second-brain/CLAUDE.md +4 -1
  87. package/second-brain/Checklists/_Index.md +30 -0
  88. package/second-brain/Checklists/preflight-postflight-template.md +29 -0
  89. package/second-brain/Distillations/_Index.md +30 -0
  90. package/second-brain/Entities/_Index.md +30 -0
  91. package/second-brain/Entities/entity-template.md +33 -0
  92. package/second-brain/Evals/_Index.md +30 -0
  93. package/second-brain/Evals/correction-pairs.md +24 -0
  94. package/second-brain/Evals/failure-taxonomy.md +24 -0
  95. package/second-brain/Evals/golden-set.md +25 -0
  96. package/second-brain/Evals/quality-ledger.md +23 -0
  97. package/second-brain/Evals/self-eval-rubric.md +23 -0
  98. package/second-brain/GEMINI.md +4 -4
  99. package/second-brain/Goals/_Index.md +30 -0
  100. package/second-brain/Handoffs/_Index.md +30 -0
  101. package/second-brain/Home.md +7 -0
  102. package/second-brain/Intake/Raw Sources/_Index.md +30 -0
  103. package/second-brain/Intake/_Index.md +30 -0
  104. package/second-brain/Intake/_Quarantine/_Index.md +30 -0
  105. package/second-brain/Learning/_Index.md +30 -0
  106. package/second-brain/Playbooks/_Index.md +30 -0
  107. package/second-brain/Playbooks/playbook-template.md +23 -0
  108. package/second-brain/Projects/_Index.md +30 -0
  109. package/second-brain/Prompts/_Index.md +30 -0
  110. package/second-brain/README.md +2 -1
  111. package/second-brain/Research/_Index.md +30 -0
  112. package/second-brain/Retrospectives/_Index.md +30 -0
  113. package/second-brain/Reviews/_Index.md +30 -0
  114. package/second-brain/Runbooks/_Index.md +30 -0
  115. package/second-brain/Runbooks/eval-loop.md +24 -0
  116. package/second-brain/Sessions/_Index.md +30 -0
  117. package/second-brain/Shared/AI-Context-Index.md +20 -0
  118. package/second-brain/Shared/AI-Threads/_Index.md +30 -0
  119. package/second-brain/Shared/Archive/_Index.md +30 -0
  120. package/second-brain/Shared/Assets/_Index.md +30 -0
  121. package/second-brain/Shared/Context-Packs/_Index.md +30 -0
  122. package/second-brain/Shared/Context7-Docs/_Index.md +30 -0
  123. package/second-brain/Shared/Coordination/NOW.md +28 -0
  124. package/second-brain/Shared/Coordination/_Index.md +30 -0
  125. package/second-brain/Shared/Coordination/agent-registry.md +24 -0
  126. package/second-brain/Shared/Coordination/task-board/_Index.md +30 -0
  127. package/second-brain/Shared/Coordination/task-board/task-template.md +43 -0
  128. package/second-brain/Shared/Coordination/task-board.md +32 -0
  129. package/second-brain/Shared/Core-Facts/_Index.md +30 -0
  130. package/second-brain/Shared/Decision-Memory/_Index.md +30 -0
  131. package/second-brain/Shared/Glossary/_Index.md +30 -0
  132. package/second-brain/Shared/Memory-Inbox/_Index.md +30 -0
  133. package/second-brain/Shared/Operating-State/_Index.md +30 -0
  134. package/second-brain/Shared/Prompting/_Index.md +30 -0
  135. package/second-brain/Shared/Provenance/_Index.md +30 -0
  136. package/second-brain/Shared/Rules/_Index.md +30 -0
  137. package/second-brain/Shared/Rules/contextual-note-rule.md +30 -0
  138. package/second-brain/Shared/Rules/frontmatter-standard.md +10 -0
  139. package/second-brain/Shared/Rules/memory-write-protocol.md +28 -0
  140. package/second-brain/Shared/Rules/procedural-runbook-header.md +40 -0
  141. package/second-brain/Shared/Rules/review-and-staleness-policy.md +22 -0
  142. package/second-brain/Shared/Rules/rules-formatting.md +34 -0
  143. package/second-brain/Shared/Scripts/_Index.md +30 -0
  144. package/second-brain/Shared/Scripts-Archive/_Index.md +30 -0
  145. package/second-brain/Shared/Tech-Standards/_Index.md +30 -0
  146. package/second-brain/Shared/Tech-Standards/verification-standard.md +40 -0
  147. package/second-brain/Shared/User-Memory/_Index.md +30 -0
  148. package/second-brain/Shared/User-Persona/_Index.md +30 -0
  149. package/second-brain/Shared/User-Persona/owner-profile.md +25 -0
  150. package/second-brain/Shared/Working-Memory/_Index.md +30 -0
  151. package/second-brain/Shared/_Index.md +30 -0
  152. package/second-brain/Shared/mcp-servers/_Index.md +30 -0
  153. package/second-brain/Skills/_Index.md +30 -0
  154. package/second-brain/Templates/_Index.md +30 -0
  155. package/second-brain/Templates/bug.md +2 -0
  156. package/second-brain/Templates/handoff.md +2 -0
  157. package/second-brain/Templates/session.md +2 -0
  158. package/second-brain/Tools/_Index.md +30 -0
  159. package/second-brain/Traces/_Index.md +30 -0
  160. package/second-brain/Vault Structure Map.md +33 -1
  161. package/second-brain/copilot/_Index.md +30 -0
  162. package/skills/audit-license-compliance/SKILL.md +117 -0
  163. package/skills/author-codemod/SKILL.md +110 -0
  164. package/skills/build-audit-logging/SKILL.md +112 -0
  165. package/skills/build-cdc-streaming-pipeline/SKILL.md +123 -0
  166. package/skills/build-cli-tool/SKILL.md +108 -0
  167. package/skills/build-data-table/SKILL.md +141 -0
  168. package/skills/build-native-mobile-ui/SKILL.md +154 -0
  169. package/skills/build-offline-first-sync/SKILL.md +118 -0
  170. package/skills/build-realtime-channel/SKILL.md +122 -0
  171. package/skills/build-vector-search/SKILL.md +131 -0
  172. package/skills/compose-local-dev-stack/SKILL.md +149 -0
  173. package/skills/configure-bundler-build/SKILL.md +166 -0
  174. package/skills/configure-dns-tls/SKILL.md +142 -0
  175. package/skills/configure-reverse-proxy-lb/SKILL.md +129 -0
  176. package/skills/configure-security-headers-csp/SKILL.md +122 -0
  177. package/skills/contract-testing/SKILL.md +140 -0
  178. package/skills/datetime-timezone-correctness/SKILL.md +125 -0
  179. package/skills/debug-ci-pipeline-failure/SKILL.md +134 -0
  180. package/skills/debug-flaky-tests/SKILL.md +128 -0
  181. package/skills/defend-llm-prompt-injection/SKILL.md +110 -0
  182. package/skills/deliver-webhooks/SKILL.md +116 -0
  183. package/skills/design-api-pagination/SKILL.md +144 -0
  184. package/skills/design-authorization-model/SKILL.md +119 -0
  185. package/skills/design-backup-dr-recovery/SKILL.md +113 -0
  186. package/skills/design-event-sourcing-cqrs/SKILL.md +143 -0
  187. package/skills/design-multi-tenancy/SKILL.md +100 -0
  188. package/skills/design-protobuf-grpc-service/SKILL.md +146 -0
  189. package/skills/design-relational-schema/SKILL.md +129 -0
  190. package/skills/design-search-index-infra/SKILL.md +151 -0
  191. package/skills/design-state-machine/SKILL.md +108 -0
  192. package/skills/design-token-system/SKILL.md +109 -0
  193. package/skills/distributed-locks-leases/SKILL.md +120 -0
  194. package/skills/encrypt-sensitive-data/SKILL.md +148 -0
  195. package/skills/feature-flags-rollout/SKILL.md +130 -0
  196. package/skills/file-upload-object-storage/SKILL.md +107 -0
  197. package/skills/fuzz-dynamic-security-test/SKILL.md +111 -0
  198. package/skills/harden-llm-app-reliability/SKILL.md +126 -0
  199. package/skills/i18n-localization-setup/SKILL.md +113 -0
  200. package/skills/idempotency-keys/SKILL.md +107 -0
  201. package/skills/implement-push-notifications/SKILL.md +142 -0
  202. package/skills/ingest-webhook-secure/SKILL.md +120 -0
  203. package/skills/integrate-oauth-oidc/SKILL.md +126 -0
  204. package/skills/load-stress-test/SKILL.md +129 -0
  205. package/skills/map-privacy-data-gdpr/SKILL.md +146 -0
  206. package/skills/model-nosql-data/SKILL.md +118 -0
  207. package/skills/money-decimal-arithmetic/SKILL.md +123 -0
  208. package/skills/monitor-ml-drift/SKILL.md +109 -0
  209. package/skills/numeric-precision-units/SKILL.md +144 -0
  210. package/skills/optimize-llm-cost-latency/SKILL.md +103 -0
  211. package/skills/optimize-react-rerenders/SKILL.md +124 -0
  212. package/skills/orchestrate-agent-workflow/SKILL.md +100 -0
  213. package/skills/payments-billing-integration/SKILL.md +114 -0
  214. package/skills/pin-toolchain-versions/SKILL.md +116 -0
  215. package/skills/plan-strangler-migration/SKILL.md +95 -0
  216. package/skills/property-based-testing/SKILL.md +108 -0
  217. package/skills/publish-package-registry/SKILL.md +130 -0
  218. package/skills/recover-git-state/SKILL.md +119 -0
  219. package/skills/remediate-web-vulnerabilities/SKILL.md +125 -0
  220. package/skills/resilience-timeouts-retries/SKILL.md +104 -0
  221. package/skills/resolve-merge-rebase-conflict/SKILL.md +97 -0
  222. package/skills/rewrite-git-history/SKILL.md +109 -0
  223. package/skills/scaffold-cross-platform-app/SKILL.md +137 -0
  224. package/skills/schema-evolution-compatibility/SKILL.md +121 -0
  225. package/skills/send-transactional-email/SKILL.md +126 -0
  226. package/skills/serve-deploy-ml-model/SKILL.md +107 -0
  227. package/skills/setup-cdn-edge-waf/SKILL.md +107 -0
  228. package/skills/setup-devcontainer-env/SKILL.md +131 -0
  229. package/skills/setup-lint-format-precommit/SKILL.md +140 -0
  230. package/skills/setup-monorepo-tooling/SKILL.md +125 -0
  231. package/skills/ship-mobile-app-store-release/SKILL.md +137 -0
  232. package/skills/structured-output-llm/SKILL.md +86 -0
  233. package/skills/supply-chain-sbom-provenance/SKILL.md +120 -0
  234. package/skills/test-data-factories/SKILL.md +158 -0
  235. package/skills/threat-model-stride/SKILL.md +123 -0
  236. package/skills/train-evaluate-ml-model/SKILL.md +109 -0
  237. package/skills/unicode-text-correctness/SKILL.md +109 -0
  238. package/skills/visual-regression-testing/SKILL.md +120 -0
@@ -0,0 +1,203 @@
1
+ // ============================================================================
2
+ // src/search/engine.ts — the search orchestrator (the one module callers use).
3
+ //
4
+ // Implements the degradation ladder as a single search() call:
5
+ // mode='fts' → pure BM25 (the always-on floor)
6
+ // mode='semantic' → cosine over BYOK vectors (full recall)
7
+ // mode='hybrid' → BM25 ⊕ cosine ⊕ memory-importance prior, fused by RRF
8
+ // mode='auto' → hybrid when vectors are usable, else fts (the smart default)
9
+ //
10
+ // rankSearch() is the PURE core (index + optional vectors + optional query vector
11
+ // in, ranked hits out) so the whole ranking pipeline unit-tests with zero disk
12
+ // and zero network. search() is the thin disk/embedding wrapper: it caches the
13
+ // index by mtime, caches query embeddings in an LRU, resolves a BYOK embedder
14
+ // lazily, and on ANY embedding error degrades to BM25 with a `degraded` flag —
15
+ // search must never throw at the floor.
16
+ // ============================================================================
17
+ import { readFile } from 'node:fs/promises';
18
+ import { appHomePath } from '../brand.js';
19
+ import { bm25Search, termList } from './index-core.js';
20
+ import { rrfFuse } from './fuse.js';
21
+ import { cosineTopK, embedQuery, getEmbedder, loadVectors, vectorsMtimeMs, } from './embed-store.js';
22
+ import { indexMtimeMs, loadIndex } from './store.js';
// Depth of each ranking leg's candidate pool: passed as the limit to both
// bm25Search() and cosineTopK() in rankSearch(), before fusion and the final cut.
const CAND = 60; // candidate pool depth per leg before fusion/limit
// Default snippet half-width in characters: makeSnippet() keeps this many
// characters either side of the match start, or twice this from the head of
// the text when no query term matched.
const SNIPPET_WIDTH = 64;
/**
 * Build a short display snippet for a hit.
 *
 * Whitespace runs are collapsed to single spaces first. The snippet is the
 * window of `width` characters on either side of the earliest position at
 * which any query term occurs (case-insensitive substring match), with an
 * ellipsis on each side that was cut. When no term occurs at all (e.g. a
 * semantic-only hit) the first `2 * width` characters are returned instead.
 *
 * @param {string} text - Raw document text.
 * @param {Iterable<string>} qTerms - Lower-cased query terms to look for.
 * @param {number} [width=SNIPPET_WIDTH] - Half-width of the window in characters.
 * @returns {string} The snippet.
 */
function makeSnippet(text, qTerms, width = SNIPPET_WIDTH) {
    const collapsed = text.replace(/\s+/g, ' ').trim();
    const haystack = collapsed.toLowerCase();

    // Earliest occurrence across all terms; -1 means nothing matched.
    let anchor = -1;
    for (const term of qTerms) {
        const at = haystack.indexOf(term);
        if (at < 0)
            continue;
        anchor = anchor < 0 ? at : Math.min(anchor, at);
    }

    if (anchor < 0) {
        const headLength = width * 2;
        if (collapsed.length <= headLength)
            return collapsed;
        return `${collapsed.slice(0, headLength).trim()}…`;
    }

    const from = Math.max(0, anchor - width);
    const to = Math.min(collapsed.length, anchor + width);
    const lead = from > 0 ? '…' : '';
    const tail = to < collapsed.length ? '…' : '';
    return `${lead}${collapsed.slice(from, to).trim()}${tail}`;
}
/**
 * Collect the ids of every indexed doc whose `source` is in the allow-list.
 *
 * @param {{docs: Map<string, {id: string, source: string}>}} index - The search index.
 * @param {Set<string>|undefined} sources - Allowed sources; falsy means "no filter".
 * @returns {Set<string>|undefined} The allowed ids, or `undefined` when no filter was given.
 */
function sourceFilteredIds(index, sources) {
    if (!sources)
        return undefined;
    const permitted = [...index.docs.values()]
        .filter((meta) => sources.has(meta.source))
        .map((meta) => meta.id);
    return new Set(permitted);
}
/**
 * PURE ranking core: index, optional vectors and optional query vector in,
 * ranked hits out. Performs no disk or network access itself.
 *
 * Mode resolution:
 *  - `fts` (or any unrecognized mode) → BM25 only.
 *  - `semantic` / `hybrid` → honored when a query vector whose length equals
 *    `vectors.dim` is supplied; otherwise BM25 with
 *    `degraded = 'semantic-unavailable'`.
 *  - `auto` (default) → `hybrid` when semantic ranking is possible, else BM25
 *    with no degraded flag.
 *
 * @param {object} index - Search index (`docs` map plus postings).
 * @param {string} query - User query.
 * @param {{mode?: string, limit?: number, sources?: string[], snippets?: boolean}} [opts]
 * @param {object} [vectors] - Stored document vectors (must expose `dim`).
 * @param {ArrayLike<number>} [queryVec] - Embedded query.
 * @returns {{hits: object[], mode: string, degraded: string|undefined, total: number}}
 *   `hits` holds at most `limit` (default 8) entries; `total` is the number of
 *   distinct ranked ids before the limit was applied.
 */
export function rankSearch(index, query, opts = {}, vectors, queryVec) {
    const requested = opts.mode ?? 'auto';
    const maxHits = opts.limit ?? 8;
    const allowedSources = opts.sources?.length ? new Set(opts.sources) : undefined;
    const qTerms = [...new Set(termList(query))];
    const lexicalIds = bm25Search(index, query, CAND, allowedSources).map((hit) => hit.id);
    const canRankSemantically = Boolean(vectors && vectors.dim && queryVec && queryVec.length === vectors.dim);

    // Resolve the mode actually executed, plus a reason when an explicit
    // semantic/hybrid request could not be honored.
    let exec = 'fts';
    let degraded;
    if (requested === 'semantic' || requested === 'hybrid') {
        if (canRankSemantically)
            exec = requested;
        else
            degraded = 'semantic-unavailable';
    }
    else if (requested === 'auto' && canRankSemantically) {
        exec = 'hybrid';
    }

    let orderedIds = lexicalIds;
    if (exec !== 'fts') {
        const allowedIds = sourceFilteredIds(index, allowedSources);
        // Drop vector hits whose doc is no longer present in the index.
        const vectorIds = cosineTopK(vectors, queryVec, CAND, allowedIds)
            .filter((hit) => index.docs.has(hit.id))
            .map((hit) => hit.id);
        if (exec === 'semantic') {
            orderedIds = vectorIds;
        }
        else {
            // hybrid: a third, lower-weight list ranks the memory docs already in
            // either candidate pool by importance; all three are fused by rank.
            const memoryDocs = [];
            for (const id of new Set([...lexicalIds, ...vectorIds])) {
                const meta = index.docs.get(id);
                if (meta && meta.source === 'memory' && meta.importance != null)
                    memoryDocs.push(meta);
            }
            memoryDocs.sort((a, b) => (b.importance ?? 0) - (a.importance ?? 0));
            orderedIds = rrfFuse([
                { ids: lexicalIds },
                { ids: vectorIds },
                { ids: memoryDocs.map((meta) => meta.id), weight: 0.4 },
            ]);
        }
    }

    const withSnippets = opts.snippets !== false;
    const hits = [];
    for (const id of orderedIds.slice(0, maxHits)) {
        const meta = index.docs.get(id);
        if (!meta)
            continue;
        hits.push({
            id: meta.id,
            source: meta.source,
            title: meta.title,
            path: meta.path,
            noteType: meta.noteType,
            tags: meta.tags,
            score: 0, // rank-based: only the order of `hits` is meaningful
            snippet: withSnippets ? makeSnippet(meta.text, qTerms) : '',
            importance: meta.importance,
        });
    }
    return { hits, mode: exec, degraded, total: new Set(orderedIds).size };
}
// ---- disk/embedding wrapper (the only impure part) -------------------------
// Module-level, per-process caches; resetSearchCaches() clears all three.
let indexCache = null; // { index, mtime } from the last loadIndex(), or null
let vectorCache = null; // { vectors, mtime } from the last loadVectors(), or null
const queryVecLRU = new Map(); // key = `${tag}\n${query}`
const LRU_CAP = 100; // max entries kept in queryVecLRU before the oldest key is dropped
/**
 * Return the search index, reloading it only when the mtime reported by
 * indexMtimeMs() differs from the one recorded with the cached copy.
 *
 * @returns {Promise<object>} The (possibly cached) index.
 */
async function cachedIndex() {
    const stamp = await indexMtimeMs();
    const isFresh = Boolean(indexCache) && indexCache.mtime === stamp;
    if (!isFresh) {
        const loaded = await loadIndex();
        indexCache = { index: loaded.index, mtime: stamp };
    }
    return indexCache.index;
}
/**
 * Return the stored document vectors, reloading them only when the mtime
 * reported by vectorsMtimeMs() differs from the one recorded with the cache.
 *
 * @returns {Promise<object>} The (possibly cached) vector set.
 */
async function cachedVectors() {
    const stamp = await vectorsMtimeMs();
    const isFresh = Boolean(vectorCache) && vectorCache.mtime === stamp;
    if (!isFresh) {
        const loaded = await loadVectors();
        vectorCache = { vectors: loaded, mtime: stamp };
    }
    return vectorCache.vectors;
}
/**
 * Read the optional `embeddingModel` setting from the app-home `config.json`.
 *
 * Best-effort by design: a missing file, unreadable file, or invalid JSON all
 * yield `undefined` so the caller can fall through to its next source.
 *
 * @returns {Promise<unknown>} The configured value, or `undefined`.
 */
async function configEmbeddingModel() {
    try {
        const raw = await readFile(appHomePath('config.json'), 'utf8');
        const { embeddingModel } = JSON.parse(raw);
        return embeddingModel;
    }
    catch {
        return undefined;
    }
}
/**
 * Forget every in-process search cache: the cached query embeddings, the
 * cached vector set and the cached index. Intended for tests and for use
 * after a reindex within the same process.
 *
 * @returns {void}
 */
export function resetSearchCaches() {
    queryVecLRU.clear();
    vectorCache = null;
    indexCache = null;
}
/**
 * The public search entrypoint: the disk/embedding wrapper around rankSearch().
 *
 * Flow:
 *  1. Load the (mtime-cached) index; `mode: 'fts'` returns BM25 immediately.
 *  2. Resolve the embedding spec from `opts.embeddingModel`, then the
 *     `SANOOK_EMBEDDING_MODEL` env var, then the app config file, and ask
 *     getEmbedder() for an embedder. Without one, fall back to BM25
 *     (`degraded = 'no-embedder'` for explicit semantic/hybrid requests).
 *  3. Load the (mtime-cached) vectors. If there are none, or they were built
 *     with a different embedder tag, fall back to BM25; explicit
 *     semantic/hybrid requests get `'no-vectors'` or
 *     `'embedding-model-changed'`, while `auto` reports no degradation.
 *  4. Embed the query (cached per `tag + query`) and delegate to rankSearch().
 *     A failure while embedding falls back to BM25 with
 *     `degraded = 'semantic-unavailable'`.
 *
 * @param {string} query - User query.
 * @param {object} [opts] - rankSearch() options plus an optional `embeddingModel` spec.
 * @returns {Promise<{hits: object[], mode: string, degraded: string|undefined, total: number}>}
 */
export async function search(query, opts = {}) {
    const index = await cachedIndex();
    const mode = opts.mode ?? 'auto';
    if (mode === 'fts')
        return rankSearch(index, query, opts);
    const spec = opts.embeddingModel ?? process.env.SANOOK_EMBEDDING_MODEL ?? (await configEmbeddingModel());
    const embedder = getEmbedder(spec);
    if (!embedder) {
        const res = rankSearch(index, query, opts);
        if (mode === 'semantic' || mode === 'hybrid')
            res.degraded = 'no-embedder';
        return res;
    }
    const vectors = await cachedVectors();
    // a model change (different tag) invalidates the stored vectors → behave as no-vectors until reindex
    if (!vectors.dim || vectors.tag !== embedder.tag) {
        const res = rankSearch(index, query, opts);
        res.degraded = vectors.dim ? 'embedding-model-changed' : 'no-vectors';
        return mode === 'auto' ? { ...res, degraded: undefined } : res;
    }
    let queryVec;
    try {
        const key = `${embedder.tag}\n${query}`;
        const cached = queryVecLRU.get(key);
        if (cached) {
            // Map iterates in insertion order, so re-inserting on a hit moves the
            // key to the "most recent" end. Without this the cache evicts in
            // FIFO order and a frequently used query can be dropped first.
            queryVecLRU.delete(key);
            queryVecLRU.set(key, cached);
            queryVec = cached;
        }
        else {
            queryVec = await embedQuery(embedder, query);
            queryVecLRU.set(key, queryVec);
            // Evict the least recently used entry (first key in insertion order).
            if (queryVecLRU.size > LRU_CAP)
                queryVecLRU.delete(queryVecLRU.keys().next().value);
        }
    }
    catch {
        const res = rankSearch(index, query, opts); // embedding failed mid-query → BM25 floor
        res.degraded = 'semantic-unavailable';
        return res;
    }
    return rankSearch(index, query, opts, vectors, queryVec);
}
@@ -0,0 +1,35 @@
1
+ // ============================================================================
2
+ // src/search/fuse.ts — Reciprocal Rank Fusion (RRF).
3
+ //
4
+ // arra-oracle blends results with a hand-tuned linear formula
5
+ // (fts*0.7 + vec*0.65 + 0.12*overlap) that mixes BM25 magnitudes with cosine
6
+ // distances — two scales that are not comparable, so the weights are fragile and
7
+ // corpus-dependent. RRF sidesteps the whole problem: it fuses on RANK, not score,
8
+ // so a document's contribution depends only on where it placed in each list, not
9
+ // on the (incomparable) raw numbers. A doc that ranks well in two lists naturally
10
+ // sums two reciprocals and outranks a doc strong in only one. k=60 is the
11
+ // standard Cormack et al. constant. Pure, deterministic, parameter-light.
12
+ // ============================================================================
// Default RRF damping constant: added to every rank in the denominator of
// rrf(), so it controls how steeply a list's contribution falls with rank.
const RRF_K = 60;
/**
 * Fuse N ranked id-lists into a single score map (higher = better).
 *
 *   score(d) = Σ_lists weight / (k + rank_in_list(d)),  rank 0-based.
 *
 * Each list contributes at most ONE term per id: if an id appears more than
 * once in a list, only its first (best-ranked) occurrence counts, so a
 * duplicated id cannot inflate its own score. Positions are not compacted,
 * so scores for duplicate-free lists are exactly what they were before.
 *
 * @param {Array<{ids: string[], weight?: number}>} lists - Ranked lists, best first;
 *   `weight` defaults to 1.
 * @param {number} [k=RRF_K] - Damping constant added to every rank.
 * @returns {Map<string, number>} Fused score per id, in first-seen order.
 */
export function rrf(lists, k = RRF_K) {
    const scores = new Map();
    for (const list of lists) {
        const w = list.weight ?? 1;
        const seen = new Set();
        list.ids.forEach((id, rank) => {
            if (seen.has(id))
                return; // already credited at a better rank in this list
            seen.add(id);
            scores.set(id, (scores.get(id) ?? 0) + w / (k + rank));
        });
    }
    return scores;
}
/**
 * Run rrf() over the lists and return the ids ordered best-first.
 * Equal scores are ordered by ascending id so the output is deterministic.
 *
 * @param {Array<{ids: string[], weight?: number}>} lists - Ranked lists to fuse.
 * @param {number} [limit] - Maximum ids to return; `null`/`undefined` returns all.
 * @param {number} [k=RRF_K] - Damping constant forwarded to rrf().
 * @returns {string[]} Fused ids, best first.
 */
export function rrfFuse(lists, limit, k = RRF_K) {
    const ranked = Array.from(rrf(lists, k));
    ranked.sort(([idA, scoreA], [idB, scoreB]) => {
        const byScore = scoreB - scoreA;
        if (byScore)
            return byScore;
        if (idA < idB)
            return -1;
        return idA > idB ? 1 : 0;
    });
    const ids = ranked.map((entry) => entry[0]);
    if (limit == null)
        return ids;
    return ids.slice(0, limit);
}
@@ -0,0 +1,187 @@
1
+ // ============================================================================
2
+ // src/search/index-core.ts — the zero-dependency search FLOOR.
3
+ //
4
+ // A pure-TS inverted index with REAL BM25 (k1=1.2, b=0.75, genuine corpus-stat
5
+ // IDF via df/N). No SQLite, no Bun, no native binary, no network — it works the
6
+ // instant a corpus exists, on any OS Node 22 runs on. This is deliberately NOT
7
+ // node:sqlite FTS5: that is experimental, its FTS5 build is not guaranteed across
8
+ // platforms, and it reintroduces a quasi-native dependency that fights the
9
+ // zero-config/portability contract. A few hundred lines of TS give us a real
10
+ // ranking model that FTS5's bm25() only approximates without true global IDF.
11
+ //
12
+ // Tokenization REUSES memory-store.ts normalize() (the canonical, Thai-safe,
13
+ // stopword-aware tokenizer) so memory matching and search matching never drift.
14
+ //
15
+ // addDoc/removeDoc MUTATE the index in place and return it — an index over a
16
+ // large vault must not deep-copy its postings map on every chunk (that is the
17
+ // one place we diverge from memory-store's small-array immutability). bm25Search
18
+ // is pure and read-only. Re-adding the same doc id replaces its postings, so the
19
+ // index can never accumulate duplicate postings the way arra's FTS5
20
+ // delete-then-insert can drift.
21
+ // ============================================================================
22
+ import { normalize } from '../memory-store.js';
// Source labels for indexed docs — presumably the full set of values a doc's
// `source` field may take; confirm against the indexer.
export const SEARCH_SOURCES = ['memory', 'vault', 'session', 'skill'];
/** BM25 params — Robertson/Spärck-Jones defaults; title terms get weighted tf. */
const K1 = 1.2; // scales the length-normalization term in bm25Search's denominator
const B = 0.75; // weight of (doc length / avgdl) in that normalization
const TITLE_BOOST = 2; // a term in a doc's title counts this many times toward tf
// Word-granularity segmenter used by termList(); `undefined` locale = runtime default.
const WORD_SEG = new Intl.Segmenter(undefined, { granularity: 'word' });
// Stamped onto every index by emptyIndex(); indexFromJSON() rejects any other value.
export const INDEX_VERSION = 1;
/**
 * Create a fresh index with no documents.
 *
 * @returns {{version: number, postings: Map, docs: Map, totalDl: number}}
 *   `postings` maps term → posting list, `docs` maps id → doc metadata, and
 *   `totalDl` is the running sum of document lengths.
 */
export function emptyIndex() {
    const blank = {
        version: INDEX_VERSION,
        postings: new Map(),
        docs: new Map(),
        totalDl: 0,
    };
    return blank;
}
/**
 * Tokenize text into an ordered list of terms, keeping repeats so callers can
 * count term frequencies.
 *
 * The text is passed through normalize() and then split by the word-level
 * Intl.Segmenter. Segments that are not word-like are dropped, each remaining
 * segment is trimmed, and tokens of one character or fewer are discarded.
 *
 * @param {string} text - Raw text (document body, title, or query).
 * @returns {string[]} Terms in document order, duplicates included.
 */
export function termList(text) {
    return Array.from(WORD_SEG.segment(normalize(text)))
        .filter((piece) => piece.isWordLike)
        .map((piece) => piece.segment.trim())
        .filter((token) => token.length > 1);
}
/**
 * Compute a document's term-frequency map and its token length.
 *
 * Each body occurrence of a term adds 1; each title occurrence adds
 * TITLE_BOOST. Body terms are inserted before title-only terms, which fixes
 * the map's iteration order.
 *
 * @param {string} title - Document title.
 * @param {string} text - Document body.
 * @returns {{tf: Map<string, number>, dl: number}} `dl` is the body token count
 *   plus the title token count (unweighted).
 */
function termFreqs(title, text) {
    const bodyTerms = termList(text);
    const titleTerms = termList(title);
    const tf = new Map();
    const bump = (term, amount) => {
        tf.set(term, (tf.get(term) ?? 0) + amount);
    };
    bodyTerms.forEach((term) => bump(term, 1));
    titleTerms.forEach((term) => bump(term, TITLE_BOOST));
    return { tf, dl: bodyTerms.length + titleTerms.length };
}
/**
 * Add a document to the index, replacing any existing document with the same
 * id. Mutates `idx` in place and returns it.
 *
 * An existing entry is removed first so its old postings cannot linger. The
 * stored metadata keeps the full text, with `tags` and `links` defaulting to
 * empty arrays.
 *
 * @param {object} idx - Index to mutate.
 * @param {object} doc - Document with `id`, `source`, `title`, `text` and the
 *   optional `path`, `noteType`, `tags`, `links`, `importance`, `updatedMs`.
 * @returns {object} The same `idx`.
 */
export function addDoc(idx, doc) {
    const { id } = doc;
    if (idx.docs.has(id))
        removeDoc(idx, id); // replace → no posting creep
    const stats = termFreqs(doc.title, doc.text);
    idx.docs.set(id, {
        id,
        source: doc.source,
        title: doc.title,
        text: doc.text,
        path: doc.path,
        noteType: doc.noteType,
        tags: doc.tags ?? [],
        links: doc.links ?? [],
        importance: doc.importance,
        updatedMs: doc.updatedMs,
        dl: stats.dl,
    });
    idx.totalDl += stats.dl;
    stats.tf.forEach((freq, term) => {
        const posting = { docId: id, tf: freq };
        const bucket = idx.postings.get(term);
        if (bucket)
            bucket.push(posting);
        else
            idx.postings.set(term, [posting]);
    });
    return idx;
}
/**
 * Remove a document and all of its postings. Mutates `idx` in place and
 * returns it; does nothing when the id is not indexed.
 *
 * The terms to clean up are re-derived from the stored title and text. A term
 * whose posting list becomes empty is deleted from the postings map.
 *
 * @param {object} idx - Index to mutate.
 * @param {string} id - Id of the document to remove.
 * @returns {object} The same `idx`.
 */
export function removeDoc(idx, id) {
    const meta = idx.docs.get(id);
    if (!meta)
        return idx;
    const terms = termFreqs(meta.title, meta.text).tf.keys();
    for (const term of terms) {
        const bucket = idx.postings.get(term);
        if (!bucket)
            continue;
        const survivors = bucket.filter((posting) => posting.docId !== id);
        if (survivors.length === 0)
            idx.postings.delete(term);
        else
            idx.postings.set(term, survivors);
    }
    idx.totalDl -= meta.dl;
    idx.docs.delete(id);
    return idx;
}
/**
 * BM25 ranking over the index. Read-only: `idx` is never modified.
 *
 * IDF uses the always-positive form ln(1 + (N - df + 0.5) / (df + 0.5)), with
 * N the number of indexed docs and df the length of the term's posting list
 * (df is NOT narrowed by the `sources` filter). Term frequency is normalized
 * by document length relative to the average length (`avgdl`, falling back to
 * 1 when it would be 0).
 *
 * @param {object} idx - Search index.
 * @param {string} query - Query text; tokenized with termList() and de-duplicated.
 * @param {number} [limit=50] - Maximum number of results.
 * @param {Set<string>} [sources] - When given, only docs whose `source` is in the set are scored.
 * @returns {Array<{id: string, score: number}>} Best first; ties broken by ascending id.
 */
export function bm25Search(idx, query, limit = 50, sources) {
    const docCount = idx.docs.size;
    if (!docCount)
        return [];
    const avgdl = idx.totalDl / docCount || 1;
    const uniqueTerms = [...new Set(termList(query))];
    if (!uniqueTerms.length)
        return [];

    const scores = new Map();
    for (const term of uniqueTerms) {
        const bucket = idx.postings.get(term);
        if (!bucket)
            continue;
        const df = bucket.length;
        const idf = Math.log(1 + (docCount - df + 0.5) / (df + 0.5));
        for (const { docId, tf } of bucket) {
            const meta = idx.docs.get(docId);
            if (!meta || (sources && !sources.has(meta.source)))
                continue;
            const denom = tf + K1 * (1 - B + B * (meta.dl / avgdl));
            const contrib = idf * ((tf * (K1 + 1)) / denom);
            scores.set(docId, (scores.get(docId) ?? 0) + contrib);
        }
    }

    const ranked = Array.from(scores, ([id, score]) => ({ id, score }));
    ranked.sort((a, b) => {
        const byScore = b.score - a.score;
        if (byScore)
            return byScore;
        if (a.id < b.id)
            return -1;
        return a.id > b.id ? 1 : 0;
    });
    return ranked.slice(0, limit);
}
/**
 * Remove every document that belongs to the given source. Mutates `idx`.
 *
 * The matching ids are collected before any removal so the docs map is not
 * modified while it is being iterated.
 *
 * @param {object} idx - Index to mutate.
 * @param {string} source - Source label to purge.
 * @returns {number} How many documents were removed.
 */
export function removeSource(idx, source) {
    const doomed = [...idx.docs.values()]
        .filter((meta) => meta.source === source)
        .map((meta) => meta.id);
    doomed.forEach((id) => {
        removeDoc(idx, id);
    });
    return doomed.length;
}
/**
 * Summarize the index: document count, distinct term count, per-source
 * document counts, and average document length (0 for an empty index).
 *
 * @param {object} idx - Search index.
 * @returns {{docs: number, terms: number, bySource: Object<string, number>, avgdl: number}}
 */
export function indexStats(idx) {
    const docCount = idx.docs.size;
    const bySource = {};
    idx.docs.forEach((meta) => {
        const soFar = bySource[meta.source] ?? 0;
        bySource[meta.source] = soFar + 1;
    });
    const avgdl = docCount ? idx.totalDl / docCount : 0;
    return { docs: docCount, terms: idx.postings.size, bySource, avgdl };
}
/**
 * Convert the index into a plain, JSON-serializable object.
 *
 * The postings Map becomes a plain object keyed by term, and the docs Map
 * becomes an array of its values. Posting lists and doc metadata are shared
 * with the live index, not copied.
 *
 * Object.fromEntries defines every term as an OWN data property. Plain
 * assignment (`obj[term] = list`) would treat a term spelled `__proto__` as a
 * prototype assignment and silently drop its postings from the output.
 *
 * @param {object} idx - Search index.
 * @returns {{version: number, totalDl: number, postings: Object<string, Array>, docs: object[]}}
 */
export function indexToJSON(idx) {
    const postings = Object.fromEntries(idx.postings);
    return { version: idx.version, totalDl: idx.totalDl, postings, docs: [...idx.docs.values()] };
}
/**
 * Rebuild an index from the object produced by indexToJSON().
 *
 * Anything that is not a current-version payload (falsy input, a different
 * `version`, missing `postings`, or a non-array `docs`) yields an empty index
 * instead of throwing. Posting lists and doc metadata are adopted as-is,
 * without validation or copying.
 *
 * @param {unknown} raw - Parsed JSON.
 * @returns {object} A populated index, or an empty one for unusable input.
 */
export function indexFromJSON(raw) {
    const obj = raw;
    if (!obj)
        return emptyIndex();
    if (obj.version !== INDEX_VERSION)
        return emptyIndex();
    if (!obj.postings || !Array.isArray(obj.docs))
        return emptyIndex(); // unknown/old shape degrades to empty rather than throwing
    const idx = emptyIndex();
    idx.totalDl = obj.totalDl ?? 0;
    Object.entries(obj.postings).forEach(([term, plist]) => {
        idx.postings.set(term, plist);
    });
    obj.docs.forEach((meta) => {
        idx.docs.set(meta.id, meta);
    });
    return idx;
}
@@ -0,0 +1,241 @@
1
+ // ============================================================================
2
+ // src/search/indexer.ts — incremental, O(delta) vault indexer.
3
+ //
4
+ // Beats arra-oracle's indexer on three axes:
5
+ // 1. NO directory convention. arra requires a `ψ/memory/…` tree; we index the
6
+ // user's EXISTING second-brain vault via getBrainPath(), any layout.
7
+ // 2. TRUE incremental. arra full-re-indexes every pass (guarded only by a >50%
8
+ // delete abort). We diff a per-file manifest: an unchanged file costs ONE
9
+ // stat(); only changed files are read+sha256+re-chunked; deleted files have
10
+ // their chunks evicted precisely (manifest stores each file's chunk ids).
11
+ // 3. ONE unified surface. Vault chunks, active memory Facts, recent session
12
+ // turns, and skills all land in the SAME ranked index — the unification arra
13
+ // never did (its memory store and search index use divorced formats).
14
+ //
15
+ // The file-walk is injected (VaultFS) so the core logic unit-tests against an
16
+ // in-memory fs + clock with zero disk, exactly like memory-store.ts.
17
+ // ============================================================================
18
+ import { createHash } from 'node:crypto';
19
+ import { readFile, readdir, stat } from 'node:fs/promises';
20
+ import { join } from 'node:path';
21
+ import { appHomePath } from '../brand.js';
22
+ import { getBrainPath } from '../memory.js';
23
+ import { loadSkills } from '../skills.js';
24
+ import { activeFacts, effImportance, loadStore } from '../memory-store.js';
25
+ import { chunkMarkdown } from './chunk.js';
26
+ import { addDoc, removeDoc, removeSource } from './index-core.js';
27
+ import { loadIndex, saveIndex } from './store.js';
28
/**
 * Human-readable title fallback for a chunk that has no heading: the last
 * `/`-separated segment of the path with a trailing `.md` (any case) removed.
 *
 * @param rel vault-relative path using `/` separators
 * @returns the bare file name without its `.md` extension
 */
function fileTitle(rel) {
    const segments = rel.split('/');
    const baseName = segments[segments.length - 1] ?? rel;
    return baseName.replace(/\.md$/i, '');
}
32
/**
 * Incremental vault pass. Mutates `index` in place and returns the NEXT
 * manifest plus a diff of what changed. Performs no disk access of its own:
 * every file operation goes through the injected `fs`.
 *
 * Per listed file:
 *   - no fingerprint, or ENOENT on read → treated as deleted
 *   - mtime + size unchanged            → skipped without reading the file
 *   - sha unchanged                     → skipped, fingerprint refreshed, chunks kept
 *   - otherwise                         → old chunks evicted, file re-chunked and re-added
 * Files present in `manifest` but not seen this pass have their chunks evicted.
 *
 * @param index    search index, mutated through addDoc/removeDoc
 * @param manifest previous pass's map of relPath → { mtimeMs, size, sha, ids }
 * @param fs       VaultFS: listMarkdown(), fingerprint(rel), read(rel), hash(content)
 * @returns `{ manifest, diff }` where `diff = { added, updated, removed, skipped }`
 * @throws any error raised by `fs`, except an ENOENT raised by `fs.read`
 */
export async function indexVaultFiles(index, manifest, fs) {
    const next = {};
    const diff = { added: 0, updated: 0, removed: 0, skipped: 0 };
    const paths = await fs.listMarkdown();
    const seenExisting = new Set();
    for (const rel of paths) {
        const fp = await fs.fingerprint(rel);
        if (!fp)
            continue; // vanished between listing and stat → treat as deletion below
        seenExisting.add(rel);
        const prev = manifest[rel];
        // cheap path: mtime + size unchanged ⇒ skip without reading the file
        if (prev && prev.mtimeMs === fp.mtimeMs && prev.size === fp.size) {
            next[rel] = prev;
            diff.skipped++;
            continue;
        }
        let content;
        try {
            content = await fs.read(rel);
        }
        catch (err) {
            // Same race one step later: the file vanished between stat and read.
            // Handle it like a missing fingerprint (deletion below) instead of
            // aborting the whole pass. Any other failure is a real error.
            if (err?.code !== 'ENOENT')
                throw err;
            seenExisting.delete(rel);
            continue;
        }
        const sha = fs.hash(content);
        // touched but content identical (mtime bumped by a sync) ⇒ refresh fingerprint, keep chunks
        if (prev && prev.sha === sha) {
            next[rel] = { ...prev, mtimeMs: fp.mtimeMs, size: fp.size };
            diff.skipped++;
            continue;
        }
        // changed or new ⇒ evict old chunks, re-chunk, re-add
        if (prev) {
            for (const id of prev.ids)
                removeDoc(index, id);
        }
        const parsed = chunkMarkdown(rel, content);
        const title0 = fileTitle(rel);
        const ids = [];
        for (const c of parsed.chunks) {
            addDoc(index, {
                id: c.id,
                source: 'vault',
                title: c.heading || title0, // heading-less chunk falls back to the file name
                text: c.text,
                path: rel,
                noteType: parsed.frontmatter.noteType,
                tags: parsed.frontmatter.tags,
                links: parsed.links,
                updatedMs: fp.mtimeMs,
            });
            ids.push(c.id);
        }
        // remember the chunk ids so a later pass can evict exactly these
        next[rel] = { mtimeMs: fp.mtimeMs, size: fp.size, sha, ids };
        if (prev)
            diff.updated++;
        else
            diff.added++;
    }
    // deletions: present last time, absent now ⇒ evict their chunks
    for (const rel of Object.keys(manifest)) {
        if (seenExisting.has(rel))
            continue;
        for (const id of manifest[rel].ids)
            removeDoc(index, id);
        diff.removed++;
    }
    return { manifest: next, diff };
}
99
/**
 * Refresh the live memory corpus: drop every existing `memory` doc, then re-add
 * the facts that are `active` and not in the `inbox` tier, each carrying an
 * importance value computed by `effImportance(fact, now)`.
 *
 * @param index search index to mutate
 * @param facts candidate facts
 * @param now   timestamp forwarded to `effImportance`
 * @returns number of facts added to the index
 */
export function foldFacts(index, facts, now) {
    removeSource(index, 'memory');
    const isSearchable = (fact) => fact.status === 'active' && fact.tier !== 'inbox';
    const kept = facts.filter(isSearchable);
    kept.forEach((fact) => {
        const doc = {
            id: fact.id, // memory-store deriveId — stable, dedups against itself
            source: 'memory',
            title: '',
            text: fact.text,
            noteType: fact.noteType,
            tags: fact.tags,
            importance: effImportance(fact, now),
            updatedMs: fact.updated,
        };
        addDoc(index, doc);
    });
    return kept.length;
}
117
/**
 * Refresh the session corpus: drop every existing `session` doc, then add one
 * doc per supplied session entry (untitled; text and timestamp copied through).
 *
 * @param index    search index to mutate
 * @param sessions entries shaped `{ id, text, updatedMs }`
 * @returns number of session entries supplied
 */
export function foldSessions(index, sessions) {
    removeSource(index, 'session');
    for (const entry of sessions) {
        const doc = {
            id: entry.id,
            source: 'session',
            title: '',
            text: entry.text,
            updatedMs: entry.updatedMs,
        };
        addDoc(index, doc);
    }
    return sessions.length;
}
125
/**
 * Refresh the skill corpus: drop every existing `skill` doc, then add one doc
 * per supplied skill, titled with the skill's name.
 *
 * @param index  search index to mutate
 * @param skills entries shaped `{ id, name, text }`
 * @returns number of skill entries supplied
 */
export function foldSkills(index, skills) {
    removeSource(index, 'skill');
    for (const skill of skills) {
        const doc = {
            id: skill.id,
            source: 'skill',
            title: skill.name,
            text: skill.text,
        };
        addDoc(index, doc);
    }
    return skills.length;
}
133
+ // ---- real-filesystem wiring ------------------------------------------------
134
/** Directory names the vault walk never descends into (dot-directories are skipped as well). */
const IGNORE_DIRS = new Set([
    'node_modules', 'dist', 'build', 'coverage', '.next', '.cache', '.git',
    '.obsidian', 'vendor', '.turbo', '.vercel',
]);
/**
 * node:fs implementation of VaultFS rooted at `root`.
 *
 * - `listMarkdown()` — sorted `/`-joined relative paths of every `.md` file
 *   (extension matched case-insensitively) below `root`, skipping ignored and
 *   dot-prefixed directories; a directory that cannot be read contributes nothing.
 * - `fingerprint(rel)` — `{ mtimeMs, size }`, or `null` when stat fails.
 * - `read(rel)` — the file's content decoded as UTF-8.
 * - `hash(content)` — hex SHA-256 digest of the content.
 *
 * @param root absolute path of the vault directory
 */
export function nodeVaultFS(root) {
    const toAbsolute = (relPath) => join(root, relPath);
    const childRel = (parentRel, name) => (parentRel ? `${parentRel}/${name}` : name);
    const isSkippedDir = (name) => IGNORE_DIRS.has(name) || name.startsWith('.');
    const isMarkdown = (name) => name.toLowerCase().endsWith('.md');
    /** Depth-first collection of markdown paths below `dir` into `found`. */
    async function collect(dir, rel, found) {
        let entries;
        try {
            entries = await readdir(dir, { withFileTypes: true });
        }
        catch {
            return; // unreadable directory: contributes no files
        }
        for (const entry of entries) {
            const { name } = entry;
            if (entry.isDirectory()) {
                if (!isSkippedDir(name)) {
                    await collect(join(dir, name), childRel(rel, name), found);
                }
                continue;
            }
            if (entry.isFile() && isMarkdown(name)) {
                found.push(childRel(rel, name));
            }
        }
    }
    return {
        async listMarkdown() {
            const found = [];
            await collect(root, '', found);
            found.sort();
            return found;
        },
        async fingerprint(relPath) {
            try {
                const { mtimeMs, size } = await stat(toAbsolute(relPath));
                return { mtimeMs, size };
            }
            catch {
                return null;
            }
        },
        read: (relPath) => readFile(toAbsolute(relPath), 'utf8'),
        hash: (content) => createHash('sha256').update(content).digest('hex'),
    };
}
178
/** Directory holding the per-session `.json` files. */
const SESSIONS_DIR = appHomePath('sessions');
/**
 * Load the first user message of the most recently modified sessions, for the
 * session corpus.
 *
 * The `.json` files in SESSIONS_DIR are ordered newest-first by mtime (ties
 * broken by file name, descending) and cut to `limit` BEFORE they are parsed,
 * so sessions that are corrupt or have no non-blank string user message reduce
 * the result below `limit`.
 *
 * @param limit maximum number of session files to inspect (default 60)
 * @returns entries `{ id, text, updatedMs }`; `text` is capped at 2000 characters.
 *          An empty array when the directory cannot be listed.
 */
export async function loadRecentSessions(limit = 60) {
    /** stat one session file; null when it cannot be stat'ed. */
    const describe = async (file) => {
        const full = join(SESSIONS_DIR, file);
        try {
            const { mtimeMs } = await stat(full);
            return { file, full, mtimeMs };
        }
        catch {
            return null;
        }
    };
    const newestFirst = (a, b) => b.mtimeMs - a.mtimeMs || b.file.localeCompare(a.file);
    let newest;
    try {
        const names = await readdir(SESSIONS_DIR);
        const jsonNames = names.filter((name) => name.endsWith('.json'));
        const described = await Promise.all(jsonNames.map((name) => describe(name)));
        newest = described
            .filter((entry) => entry !== null)
            .sort(newestFirst)
            .slice(0, limit);
    }
    catch {
        return [];
    }
    const results = [];
    for (const candidate of newest) {
        try {
            const session = JSON.parse(await readFile(candidate.full, 'utf8'));
            const firstUser = (session.messages ?? []).find((msg) => msg.role === 'user');
            const text = typeof firstUser?.content === 'string' ? firstUser.content : '';
            if (text.trim()) {
                results.push({
                    id: `sess:${session.id ?? candidate.file}`,
                    text: text.slice(0, 2000),
                    updatedMs: candidate.mtimeMs,
                });
            }
        }
        catch {
            /* skip a corrupt session file */
        }
    }
    return results;
}
217
/**
 * Full incremental reindex: vault (located via getBrainPath) + memory +
 * sessions + skills, then persisted with saveIndex. This is what `sanook index`
 * and the MCP `sanook_index` tool call.
 *
 * When no brain path is configured the vault pass is skipped: the loaded
 * manifest is saved back unchanged and the vault counters stay at zero.
 *
 * @param now timestamp forwarded to loadStore and foldFacts (defaults to Date.now())
 * @returns `{ added, updated, removed, skipped, memory, sessions, skills, vaultPath }`
 */
export async function reindex(now = Date.now()) {
    const loaded = await loadIndex();
    const { index } = loaded;
    const brain = await getBrainPath();
    // Vault pass only when a brain path exists; otherwise carry the manifest over as-is.
    const vault = brain
        ? await indexVaultFiles(index, loaded.manifest, nodeVaultFS(brain))
        : { manifest: loaded.manifest, diff: { added: 0, updated: 0, removed: 0, skipped: 0 } };
    const store = await loadStore(now);
    const memory = foldFacts(index, activeFacts(store), now);
    const recentSessions = await loadRecentSessions();
    const sessions = foldSessions(index, recentSessions);
    const loadedSkills = await loadSkills();
    const skillDocs = loadedSkills.map((skill) => ({
        id: `skill:${skill.name}`,
        name: skill.name,
        text: `${skill.description} ${skill.whenToUse ?? ''}`.trim(),
    }));
    const skills = foldSkills(index, skillDocs);
    await saveIndex(index, vault.manifest);
    return { ...vault.diff, memory, sessions, skills, vaultPath: brain ?? null };
}