sanook-cli 0.4.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +19 -0
- package/CHANGELOG.md +173 -0
- package/README.md +153 -20
- package/README.th.md +136 -0
- package/dist/agentContext.js +4 -0
- package/dist/approval.js +6 -0
- package/dist/bin.js +405 -57
- package/dist/brain.js +92 -59
- package/dist/brand.js +47 -0
- package/dist/checkpoint.js +37 -0
- package/dist/commands.js +86 -6
- package/dist/compaction.js +76 -5
- package/dist/config.js +100 -12
- package/dist/cost.js +60 -3
- package/dist/doctor.js +92 -0
- package/dist/gateway/auth.js +2 -2
- package/dist/gateway/ledger.js +2 -2
- package/dist/gateway/scheduler.js +1 -0
- package/dist/gateway/serve.js +6 -4
- package/dist/gateway/server.js +10 -2
- package/dist/git.js +11 -2
- package/dist/hooks.js +43 -17
- package/dist/knowledge.js +48 -49
- package/dist/loop.js +182 -66
- package/dist/lsp/client.js +173 -0
- package/dist/lsp/framing.js +56 -0
- package/dist/lsp/index.js +138 -0
- package/dist/lsp/servers.js +82 -0
- package/dist/mcp-server.js +244 -0
- package/dist/mcp.js +184 -29
- package/dist/memory-store.js +559 -0
- package/dist/memory.js +143 -29
- package/dist/orchestrate.js +150 -0
- package/dist/providers/codex.js +21 -7
- package/dist/providers/keys.js +3 -2
- package/dist/providers/models.js +22 -6
- package/dist/providers/registry.js +155 -1
- package/dist/repomap.js +93 -0
- package/dist/search/chunk.js +158 -0
- package/dist/search/embed-store.js +187 -0
- package/dist/search/engine.js +203 -0
- package/dist/search/fuse.js +35 -0
- package/dist/search/index-core.js +187 -0
- package/dist/search/indexer.js +241 -0
- package/dist/search/store.js +77 -0
- package/dist/session.js +42 -8
- package/dist/skill-install.js +10 -10
- package/dist/skills.js +12 -9
- package/dist/summarize.js +31 -0
- package/dist/tools/bash.js +21 -2
- package/dist/tools/diagnostics.js +41 -0
- package/dist/tools/edit.js +29 -7
- package/dist/tools/index.js +8 -1
- package/dist/tools/list.js +7 -2
- package/dist/tools/permission.js +90 -9
- package/dist/tools/read.js +23 -4
- package/dist/tools/remember.js +1 -1
- package/dist/tools/sandbox.js +61 -0
- package/dist/tools/search.js +105 -4
- package/dist/tools/task.js +195 -29
- package/dist/tools/timeout.js +35 -0
- package/dist/tools/util.js +10 -0
- package/dist/tools/write.js +6 -4
- package/dist/trust.js +89 -0
- package/dist/ui/app.js +228 -31
- package/dist/ui/banner.js +4 -9
- package/dist/ui/brain-wizard.js +2 -2
- package/dist/ui/history.js +30 -0
- package/dist/ui/mentions.js +44 -0
- package/dist/ui/render.js +55 -15
- package/dist/ui/setup.js +97 -12
- package/dist/ui/useEditor.js +83 -0
- package/dist/update.js +114 -0
- package/dist/worktree.js +173 -0
- package/package.json +11 -5
- package/scripts/postinstall.mjs +33 -0
- package/second-brain/.agents/_Index.md +30 -0
- package/second-brain/.agents/skills/_Index.md +30 -0
- package/second-brain/.agents/workflows/_Index.md +30 -0
- package/second-brain/AGENTS.md +4 -4
- package/second-brain/Acceptance/_Index.md +30 -0
- package/second-brain/Acceptance/golden-case-template.md +39 -0
- package/second-brain/Areas/_Index.md +30 -0
- package/second-brain/Bugs/System-OS/_Index.md +30 -0
- package/second-brain/Bugs/_Index.md +30 -0
- package/second-brain/CLAUDE.md +4 -1
- package/second-brain/Checklists/_Index.md +30 -0
- package/second-brain/Checklists/preflight-postflight-template.md +29 -0
- package/second-brain/Distillations/_Index.md +30 -0
- package/second-brain/Entities/_Index.md +30 -0
- package/second-brain/Entities/entity-template.md +33 -0
- package/second-brain/Evals/_Index.md +30 -0
- package/second-brain/Evals/correction-pairs.md +24 -0
- package/second-brain/Evals/failure-taxonomy.md +24 -0
- package/second-brain/Evals/golden-set.md +25 -0
- package/second-brain/Evals/quality-ledger.md +23 -0
- package/second-brain/Evals/self-eval-rubric.md +23 -0
- package/second-brain/GEMINI.md +4 -4
- package/second-brain/Goals/_Index.md +30 -0
- package/second-brain/Handoffs/_Index.md +30 -0
- package/second-brain/Home.md +7 -0
- package/second-brain/Intake/Raw Sources/_Index.md +30 -0
- package/second-brain/Intake/_Index.md +30 -0
- package/second-brain/Intake/_Quarantine/_Index.md +30 -0
- package/second-brain/Learning/_Index.md +30 -0
- package/second-brain/Playbooks/_Index.md +30 -0
- package/second-brain/Playbooks/playbook-template.md +23 -0
- package/second-brain/Projects/_Index.md +30 -0
- package/second-brain/Prompts/_Index.md +30 -0
- package/second-brain/README.md +2 -1
- package/second-brain/Research/_Index.md +30 -0
- package/second-brain/Retrospectives/_Index.md +30 -0
- package/second-brain/Reviews/_Index.md +30 -0
- package/second-brain/Runbooks/_Index.md +30 -0
- package/second-brain/Runbooks/eval-loop.md +24 -0
- package/second-brain/Sessions/_Index.md +30 -0
- package/second-brain/Shared/AI-Context-Index.md +20 -0
- package/second-brain/Shared/AI-Threads/_Index.md +30 -0
- package/second-brain/Shared/Archive/_Index.md +30 -0
- package/second-brain/Shared/Assets/_Index.md +30 -0
- package/second-brain/Shared/Context-Packs/_Index.md +30 -0
- package/second-brain/Shared/Context7-Docs/_Index.md +30 -0
- package/second-brain/Shared/Coordination/NOW.md +28 -0
- package/second-brain/Shared/Coordination/_Index.md +30 -0
- package/second-brain/Shared/Coordination/agent-registry.md +24 -0
- package/second-brain/Shared/Coordination/task-board/_Index.md +30 -0
- package/second-brain/Shared/Coordination/task-board/task-template.md +43 -0
- package/second-brain/Shared/Coordination/task-board.md +32 -0
- package/second-brain/Shared/Core-Facts/_Index.md +30 -0
- package/second-brain/Shared/Decision-Memory/_Index.md +30 -0
- package/second-brain/Shared/Glossary/_Index.md +30 -0
- package/second-brain/Shared/Memory-Inbox/_Index.md +30 -0
- package/second-brain/Shared/Operating-State/_Index.md +30 -0
- package/second-brain/Shared/Prompting/_Index.md +30 -0
- package/second-brain/Shared/Provenance/_Index.md +30 -0
- package/second-brain/Shared/Rules/_Index.md +30 -0
- package/second-brain/Shared/Rules/contextual-note-rule.md +30 -0
- package/second-brain/Shared/Rules/frontmatter-standard.md +10 -0
- package/second-brain/Shared/Rules/memory-write-protocol.md +28 -0
- package/second-brain/Shared/Rules/procedural-runbook-header.md +40 -0
- package/second-brain/Shared/Rules/review-and-staleness-policy.md +22 -0
- package/second-brain/Shared/Rules/rules-formatting.md +34 -0
- package/second-brain/Shared/Scripts/_Index.md +30 -0
- package/second-brain/Shared/Scripts-Archive/_Index.md +30 -0
- package/second-brain/Shared/Tech-Standards/_Index.md +30 -0
- package/second-brain/Shared/Tech-Standards/verification-standard.md +40 -0
- package/second-brain/Shared/User-Memory/_Index.md +30 -0
- package/second-brain/Shared/User-Persona/_Index.md +30 -0
- package/second-brain/Shared/User-Persona/owner-profile.md +25 -0
- package/second-brain/Shared/Working-Memory/_Index.md +30 -0
- package/second-brain/Shared/_Index.md +30 -0
- package/second-brain/Shared/mcp-servers/_Index.md +30 -0
- package/second-brain/Skills/_Index.md +30 -0
- package/second-brain/Templates/_Index.md +30 -0
- package/second-brain/Templates/bug.md +2 -0
- package/second-brain/Templates/handoff.md +2 -0
- package/second-brain/Templates/session.md +2 -0
- package/second-brain/Tools/_Index.md +30 -0
- package/second-brain/Traces/_Index.md +30 -0
- package/second-brain/Vault Structure Map.md +33 -1
- package/second-brain/copilot/_Index.md +30 -0
- package/skills/audit-license-compliance/SKILL.md +117 -0
- package/skills/author-codemod/SKILL.md +110 -0
- package/skills/build-audit-logging/SKILL.md +112 -0
- package/skills/build-cdc-streaming-pipeline/SKILL.md +123 -0
- package/skills/build-cli-tool/SKILL.md +108 -0
- package/skills/build-data-table/SKILL.md +141 -0
- package/skills/build-native-mobile-ui/SKILL.md +154 -0
- package/skills/build-offline-first-sync/SKILL.md +118 -0
- package/skills/build-realtime-channel/SKILL.md +122 -0
- package/skills/build-vector-search/SKILL.md +131 -0
- package/skills/compose-local-dev-stack/SKILL.md +149 -0
- package/skills/configure-bundler-build/SKILL.md +166 -0
- package/skills/configure-dns-tls/SKILL.md +142 -0
- package/skills/configure-reverse-proxy-lb/SKILL.md +129 -0
- package/skills/configure-security-headers-csp/SKILL.md +122 -0
- package/skills/contract-testing/SKILL.md +140 -0
- package/skills/datetime-timezone-correctness/SKILL.md +125 -0
- package/skills/debug-ci-pipeline-failure/SKILL.md +134 -0
- package/skills/debug-flaky-tests/SKILL.md +128 -0
- package/skills/defend-llm-prompt-injection/SKILL.md +110 -0
- package/skills/deliver-webhooks/SKILL.md +116 -0
- package/skills/design-api-pagination/SKILL.md +144 -0
- package/skills/design-authorization-model/SKILL.md +119 -0
- package/skills/design-backup-dr-recovery/SKILL.md +113 -0
- package/skills/design-event-sourcing-cqrs/SKILL.md +143 -0
- package/skills/design-multi-tenancy/SKILL.md +100 -0
- package/skills/design-protobuf-grpc-service/SKILL.md +146 -0
- package/skills/design-relational-schema/SKILL.md +129 -0
- package/skills/design-search-index-infra/SKILL.md +151 -0
- package/skills/design-state-machine/SKILL.md +108 -0
- package/skills/design-token-system/SKILL.md +109 -0
- package/skills/distributed-locks-leases/SKILL.md +120 -0
- package/skills/encrypt-sensitive-data/SKILL.md +148 -0
- package/skills/feature-flags-rollout/SKILL.md +130 -0
- package/skills/file-upload-object-storage/SKILL.md +107 -0
- package/skills/fuzz-dynamic-security-test/SKILL.md +111 -0
- package/skills/harden-llm-app-reliability/SKILL.md +126 -0
- package/skills/i18n-localization-setup/SKILL.md +113 -0
- package/skills/idempotency-keys/SKILL.md +107 -0
- package/skills/implement-push-notifications/SKILL.md +142 -0
- package/skills/ingest-webhook-secure/SKILL.md +120 -0
- package/skills/integrate-oauth-oidc/SKILL.md +126 -0
- package/skills/load-stress-test/SKILL.md +129 -0
- package/skills/map-privacy-data-gdpr/SKILL.md +146 -0
- package/skills/model-nosql-data/SKILL.md +118 -0
- package/skills/money-decimal-arithmetic/SKILL.md +123 -0
- package/skills/monitor-ml-drift/SKILL.md +109 -0
- package/skills/numeric-precision-units/SKILL.md +144 -0
- package/skills/optimize-llm-cost-latency/SKILL.md +103 -0
- package/skills/optimize-react-rerenders/SKILL.md +124 -0
- package/skills/orchestrate-agent-workflow/SKILL.md +100 -0
- package/skills/payments-billing-integration/SKILL.md +114 -0
- package/skills/pin-toolchain-versions/SKILL.md +116 -0
- package/skills/plan-strangler-migration/SKILL.md +95 -0
- package/skills/property-based-testing/SKILL.md +108 -0
- package/skills/publish-package-registry/SKILL.md +130 -0
- package/skills/recover-git-state/SKILL.md +119 -0
- package/skills/remediate-web-vulnerabilities/SKILL.md +125 -0
- package/skills/resilience-timeouts-retries/SKILL.md +104 -0
- package/skills/resolve-merge-rebase-conflict/SKILL.md +97 -0
- package/skills/rewrite-git-history/SKILL.md +109 -0
- package/skills/scaffold-cross-platform-app/SKILL.md +137 -0
- package/skills/schema-evolution-compatibility/SKILL.md +121 -0
- package/skills/send-transactional-email/SKILL.md +126 -0
- package/skills/serve-deploy-ml-model/SKILL.md +107 -0
- package/skills/setup-cdn-edge-waf/SKILL.md +107 -0
- package/skills/setup-devcontainer-env/SKILL.md +131 -0
- package/skills/setup-lint-format-precommit/SKILL.md +140 -0
- package/skills/setup-monorepo-tooling/SKILL.md +125 -0
- package/skills/ship-mobile-app-store-release/SKILL.md +137 -0
- package/skills/structured-output-llm/SKILL.md +86 -0
- package/skills/supply-chain-sbom-provenance/SKILL.md +120 -0
- package/skills/test-data-factories/SKILL.md +158 -0
- package/skills/threat-model-stride/SKILL.md +123 -0
- package/skills/train-evaluate-ml-model/SKILL.md +109 -0
- package/skills/unicode-text-correctness/SKILL.md +109 -0
- package/skills/visual-regression-testing/SKILL.md +120 -0
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
// ============================================================================
|
|
2
|
+
// src/search/engine.ts — the search orchestrator (the one module callers use).
|
|
3
|
+
//
|
|
4
|
+
// Implements the degradation ladder as a single search() call:
|
|
5
|
+
// mode='fts' → pure BM25 (the always-on floor)
|
|
6
|
+
// mode='semantic' → cosine over BYOK vectors (full recall)
|
|
7
|
+
// mode='hybrid' → BM25 ⊕ cosine ⊕ memory-importance prior, fused by RRF
|
|
8
|
+
// mode='auto' → hybrid when vectors are usable, else fts (the smart default)
|
|
9
|
+
//
|
|
10
|
+
// rankSearch() is the PURE core (index + optional vectors + optional query vector
|
|
11
|
+
// in, ranked hits out) so the whole ranking pipeline unit-tests with zero disk
|
|
12
|
+
// and zero network. search() is the thin disk/embedding wrapper: it caches the
|
|
13
|
+
// index by mtime, caches query embeddings in an LRU, resolves a BYOK embedder
|
|
14
|
+
// lazily, and on ANY embedding error degrades to BM25 with a `degraded` flag —
|
|
15
|
+
// search must never throw at the floor.
|
|
16
|
+
// ============================================================================
|
|
17
|
+
import { readFile } from 'node:fs/promises';
|
|
18
|
+
import { appHomePath } from '../brand.js';
|
|
19
|
+
import { bm25Search, termList } from './index-core.js';
|
|
20
|
+
import { rrfFuse } from './fuse.js';
|
|
21
|
+
import { cosineTopK, embedQuery, getEmbedder, loadVectors, vectorsMtimeMs, } from './embed-store.js';
|
|
22
|
+
import { indexMtimeMs, loadIndex } from './store.js';
|
|
23
|
+
const CAND = 60; // candidate pool depth per leg before fusion/limit
|
|
24
|
+
const SNIPPET_WIDTH = 64;
|
|
25
|
+
/**
 * Build a short display snippet for a hit.
 *
 * Whitespace runs in `text` are collapsed to single spaces and the result is
 * trimmed. The snippet is the window of ±`width` characters around the earliest
 * position at which any query term occurs (case-insensitively). When no term
 * occurs — e.g. a semantic-only hit — the first `2 * width` characters are used.
 * An ellipsis marks every side on which text was cut.
 *
 * @param {string} text - raw document text.
 * @param {Iterable<string>} qTerms - query terms, expected to be lowercase already.
 * @param {number} [width] - half-width of the window, in characters.
 * @returns {string} the snippet.
 */
function makeSnippet(text, qTerms, width = SNIPPET_WIDTH) {
    const collapsed = text.replace(/\s+/g, ' ').trim();
    const haystack = collapsed.toLowerCase();
    // Collect the first occurrence of every term that appears at all.
    const offsets = [];
    for (const term of qTerms) {
        const at = haystack.indexOf(term);
        if (at >= 0) {
            offsets.push(at);
        }
    }
    if (offsets.length === 0) {
        // No lexical match: fall back to the head of the text.
        const headLength = width * 2;
        if (collapsed.length > headLength) {
            return `${collapsed.slice(0, headLength).trim()}…`;
        }
        return collapsed;
    }
    const anchor = Math.min(...offsets);
    const from = Math.max(0, anchor - width);
    const to = Math.min(collapsed.length, anchor + width);
    const lead = from > 0 ? '…' : '';
    const trail = to < collapsed.length ? '…' : '';
    return `${lead}${collapsed.slice(from, to).trim()}${trail}`;
}
|
|
41
|
+
/**
 * Collect the ids of every indexed doc whose `source` is in the allow-list.
 *
 * @param {{docs: Map<string, {id: string, source: string}>}} index - the search index.
 * @param {Set<string>|undefined} sources - allowed sources; falsy means "no filter".
 * @returns {Set<string>|undefined} the allowed ids, or `undefined` when no filter was given.
 */
function sourceFilteredIds(index, sources) {
    if (!sources) {
        return undefined;
    }
    const allowedIds = [...index.docs.values()]
        .filter((meta) => sources.has(meta.source))
        .map((meta) => meta.id);
    return new Set(allowedIds);
}
|
|
51
|
+
/**
 * PURE ranking core: index (+ optional vectors and query vector) in, ranked hits out.
 *
 * Mode resolution:
 *  - 'fts' (or any mode that is not semantic/hybrid/auto) runs BM25 only.
 *  - 'semantic' / 'hybrid' run as requested when `vectors` and `queryVec` are
 *    present and their dimensions agree; otherwise BM25 runs and `degraded` is
 *    set to 'semantic-unavailable'.
 *  - 'auto' runs hybrid when semantic is possible, else BM25 with no degraded flag.
 *
 * Hybrid fuses three ranked lists with rrfFuse: BM25 ids, cosine ids, and a
 * 0.4-weighted list of memory-sourced candidates ordered by importance.
 *
 * @param {object} index - search index exposing `docs` (Map of id → doc meta).
 * @param {string} query - the user query.
 * @param {{mode?: string, limit?: number, sources?: string[], snippets?: boolean}} [opts]
 * @param {object} [vectors] - stored vectors; must expose `dim`.
 * @param {ArrayLike<number>} [queryVec] - embedding of `query`.
 * @returns {{hits: object[], mode: string, degraded: (string|undefined), total: number}}
 *   `hits` holds at most `limit` (default 8) entries; `total` is the number of
 *   distinct ranked ids before the limit was applied.
 */
export function rankSearch(index, query, opts = {}, vectors, queryVec) {
    const requested = opts.mode ?? 'auto';
    const maxHits = opts.limit ?? 8;
    const sourceSet = opts.sources?.length ? new Set(opts.sources) : undefined;
    const qTerms = [...new Set(termList(query))];
    const lexicalIds = bm25Search(index, query, CAND, sourceSet).map((hit) => hit.id);
    const canRunSemantic = Boolean(vectors && vectors.dim && queryVec && queryVec.length === vectors.dim);

    // Resolve the mode actually executed, plus a reason when an explicit request can't be honored.
    let exec = 'fts';
    let degraded;
    if (requested === 'semantic' || requested === 'hybrid' || requested === 'auto') {
        if (canRunSemantic) {
            exec = requested === 'auto' ? 'hybrid' : requested;
        }
        else if (requested !== 'auto') {
            degraded = 'semantic-unavailable';
        }
    }

    let orderedIds = lexicalIds;
    if (exec !== 'fts') {
        const allowed = sourceFilteredIds(index, sourceSet);
        // Drop vector hits whose doc is no longer in the index.
        const cosineIds = cosineTopK(vectors, queryVec, CAND, allowed)
            .filter((hit) => index.docs.has(hit.id))
            .map((hit) => hit.id);
        if (exec === 'semantic') {
            orderedIds = cosineIds;
        }
        else {
            // hybrid: BM25 ⊕ cosine ⊕ memory-importance prior, fused by rank (scale-free)
            const memoryCandidates = [];
            for (const id of new Set([...lexicalIds, ...cosineIds])) {
                const meta = index.docs.get(id);
                if (meta && meta.source === 'memory' && meta.importance != null) {
                    memoryCandidates.push(meta);
                }
            }
            memoryCandidates.sort((a, b) => (b.importance ?? 0) - (a.importance ?? 0));
            orderedIds = rrfFuse([
                { ids: lexicalIds },
                { ids: cosineIds },
                { ids: memoryCandidates.map((meta) => meta.id), weight: 0.4 },
            ]);
        }
    }

    const withSnippets = opts.snippets !== false;
    const hits = orderedIds
        .slice(0, maxHits)
        .map((id) => index.docs.get(id))
        .filter((meta) => Boolean(meta))
        .map((meta) => ({
            id: meta.id,
            source: meta.source,
            title: meta.title,
            path: meta.path,
            noteType: meta.noteType,
            tags: meta.tags,
            score: 0, // rank-based; fused score isn't meaningful cross-mode, so we expose rank order
            snippet: withSnippets ? makeSnippet(meta.text, qTerms) : '',
            importance: meta.importance,
        }));
    return { hits, mode: exec, degraded, total: new Set(orderedIds).size };
}
|
|
121
|
+
// ---- disk/embedding wrapper (the only impure part) -------------------------
|
|
122
|
+
let indexCache = null;
|
|
123
|
+
let vectorCache = null;
|
|
124
|
+
const queryVecLRU = new Map(); // key = `${tag}\n${query}`
|
|
125
|
+
const LRU_CAP = 100;
|
|
126
|
+
/**
 * Return the search index, reloading it only when the mtime reported by
 * indexMtimeMs() differs from the one recorded with the cached copy.
 *
 * @returns {Promise<object>} the (possibly cached) index.
 */
async function cachedIndex() {
    const stamp = await indexMtimeMs();
    if (indexCache && indexCache.mtime === stamp) {
        return indexCache.index;
    }
    const { index } = await loadIndex();
    indexCache = { index, mtime: stamp };
    return index;
}
|
|
134
|
+
/**
 * Return the stored vector set, reloading it only when the mtime reported by
 * vectorsMtimeMs() differs from the one recorded with the cached copy.
 *
 * @returns {Promise<object>} the (possibly cached) vectors.
 */
async function cachedVectors() {
    const stamp = await vectorsMtimeMs();
    if (vectorCache && vectorCache.mtime === stamp) {
        return vectorCache.vectors;
    }
    const vectors = await loadVectors();
    vectorCache = { vectors, mtime: stamp };
    return vectors;
}
|
|
141
|
+
/**
 * Read the optional `embeddingModel` entry from the app-home `config.json`.
 *
 * Best-effort by design: a missing file, unreadable file or invalid JSON all
 * yield `undefined` instead of throwing.
 *
 * @returns {Promise<*>} the configured value, or `undefined`.
 */
async function configEmbeddingModel() {
    try {
        const configPath = appHomePath('config.json');
        const rawConfig = await readFile(configPath, 'utf8');
        const { embeddingModel } = JSON.parse(rawConfig);
        return embeddingModel;
    }
    catch {
        return undefined;
    }
}
|
|
151
|
+
/**
 * Forget every in-process cache: the loaded index, the loaded vectors and the
 * cached query embeddings. Intended for tests and for use after a reindex in
 * the same process.
 */
export function resetSearchCaches() {
    queryVecLRU.clear();
    vectorCache = null;
    indexCache = null;
}
|
|
157
|
+
/**
 * The public search entrypoint.
 *
 * Loads the cached index, then:
 *  1. mode 'fts' → BM25 only, no embedder is resolved.
 *  2. resolves an embedder spec from `opts.embeddingModel`, then the
 *     SANOOK_EMBEDDING_MODEL environment variable, then the config file, and
 *     hands it to getEmbedder(); without an embedder the result is BM25
 *     (`degraded: 'no-embedder'` for explicit semantic/hybrid requests).
 *  3. requires a stored vector set whose `tag` matches the embedder; otherwise
 *     the result is BM25, flagged 'no-vectors' or 'embedding-model-changed'
 *     unless the mode is 'auto'.
 *  4. embeds the query (cached per embedder tag + query text) and delegates to
 *     rankSearch. Any error while embedding degrades to BM25 with
 *     `degraded: 'semantic-unavailable'`.
 *
 * @param {string} query - the user query.
 * @param {object} [opts] - rankSearch options plus an optional `embeddingModel`.
 * @returns {Promise<{hits: object[], mode: string, degraded: (string|undefined), total: number}>}
 */
export async function search(query, opts = {}) {
    const index = await cachedIndex();
    const mode = opts.mode ?? 'auto';
    if (mode === 'fts') {
        return rankSearch(index, query, opts);
    }
    const spec = opts.embeddingModel ?? process.env.SANOOK_EMBEDDING_MODEL ?? (await configEmbeddingModel());
    const embedder = getEmbedder(spec);
    if (!embedder) {
        const res = rankSearch(index, query, opts);
        if (mode === 'semantic' || mode === 'hybrid') {
            res.degraded = 'no-embedder';
        }
        return res;
    }
    const vectors = await cachedVectors();
    // a model change (different tag) invalidates the stored vectors → behave as no-vectors until reindex
    if (!vectors.dim || vectors.tag !== embedder.tag) {
        const res = rankSearch(index, query, opts);
        // 'auto' silently falls back; every other mode reports why semantic was skipped.
        if (mode !== 'auto') {
            res.degraded = vectors.dim ? 'embedding-model-changed' : 'no-vectors';
        }
        return res;
    }
    let queryVec;
    try {
        const key = `${embedder.tag}\n${query}`;
        const cached = queryVecLRU.get(key);
        if (cached) {
            queryVec = cached;
            // Map iterates in insertion order, so re-inserting marks this entry as
            // most recently used; without this the cache evicts in FIFO order.
            queryVecLRU.delete(key);
            queryVecLRU.set(key, cached);
        }
        else {
            queryVec = await embedQuery(embedder, query);
            queryVecLRU.set(key, queryVec);
            if (queryVecLRU.size > LRU_CAP) {
                // The first key in iteration order is the least recently used one.
                queryVecLRU.delete(queryVecLRU.keys().next().value);
            }
        }
    }
    catch {
        // Deliberate best-effort: embedding failed mid-query → BM25 floor, never throw.
        const res = rankSearch(index, query, opts);
        res.degraded = 'semantic-unavailable';
        return res;
    }
    return rankSearch(index, query, opts, vectors, queryVec);
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
// ============================================================================
|
|
2
|
+
// src/search/fuse.ts — Reciprocal Rank Fusion (RRF).
|
|
3
|
+
//
|
|
4
|
+
// arra-oracle blends results with a hand-tuned linear formula
|
|
5
|
+
// (fts*0.7 + vec*0.65 + 0.12*overlap) that mixes BM25 magnitudes with cosine
|
|
6
|
+
// distances — two scales that are not comparable, so the weights are fragile and
|
|
7
|
+
// corpus-dependent. RRF sidesteps the whole problem: it fuses on RANK, not score,
|
|
8
|
+
// so a document's contribution depends only on where it placed in each list, not
|
|
9
|
+
// on the (incomparable) raw numbers. A doc that ranks well in two lists naturally
|
|
10
|
+
// sums two reciprocals and outranks a doc strong in only one. k=60 is the
|
|
11
|
+
// standard Cormack et al. constant. Pure, deterministic, parameter-light.
|
|
12
|
+
// ============================================================================
|
|
13
|
+
const RRF_K = 60;
|
|
14
|
+
/**
 * Reciprocal Rank Fusion: combine N ranked id-lists into one score map
 * (higher = better).
 *
 *   score(d) = Σ over lists of  weight / (k + rank_in_list(d)),  rank 0-based.
 *
 * @param {Array<{ids: string[], weight?: number}>} lists - ranked lists, best first;
 *   a missing or null `weight` counts as 1.
 * @param {number} [k] - the RRF smoothing constant.
 * @returns {Map<string, number>} id → fused score, in first-seen order.
 */
export function rrf(lists, k = RRF_K) {
    const fused = new Map();
    for (const list of lists) {
        const weight = list.weight ?? 1;
        for (const [rank, id] of list.ids.entries()) {
            const soFar = fused.get(id) ?? 0;
            fused.set(id, soFar + weight / (k + rank));
        }
    }
    return fused;
}
|
|
29
|
+
/**
 * Run rrf() over the lists and return the fused ids, best score first.
 * Equal scores are ordered by ascending id so the output is deterministic.
 *
 * @param {Array<{ids: string[], weight?: number}>} lists - ranked lists to fuse.
 * @param {number} [limit] - keep only the first `limit` ids; null/undefined keeps all.
 * @param {number} [k] - the RRF smoothing constant.
 * @returns {string[]} fused ids, best first.
 */
export function rrfFuse(lists, limit, k = RRF_K) {
    const byScoreThenId = ([idA, scoreA], [idB, scoreB]) => {
        const delta = scoreB - scoreA;
        if (delta) {
            return delta;
        }
        if (idA < idB) {
            return -1;
        }
        return idA > idB ? 1 : 0;
    };
    const ranked = Array.from(rrf(lists, k))
        .sort(byScoreThenId)
        .map(([id]) => id);
    if (limit == null) {
        return ranked;
    }
    return ranked.slice(0, limit);
}
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
// ============================================================================
|
|
2
|
+
// src/search/index-core.ts — the zero-dependency search FLOOR.
|
|
3
|
+
//
|
|
4
|
+
// A pure-TS inverted index with REAL BM25 (k1=1.2, b=0.75, genuine corpus-stat
|
|
5
|
+
// IDF via df/N). No SQLite, no Bun, no native binary, no network — it works the
|
|
6
|
+
// instant a corpus exists, on any OS Node 22 runs on. This is deliberately NOT
|
|
7
|
+
// node:sqlite FTS5: that is experimental, its FTS5 build is not guaranteed across
|
|
8
|
+
// platforms, and it reintroduces a quasi-native dependency that fights the
|
|
9
|
+
// zero-config/portability contract. A few hundred lines of TS give us a real
|
|
10
|
+
// ranking model that FTS5's bm25() only approximates without true global IDF.
|
|
11
|
+
//
|
|
12
|
+
// Tokenization REUSES memory-store.ts normalize() (the canonical, Thai-safe,
|
|
13
|
+
// stopword-aware tokenizer) so memory matching and search matching never drift.
|
|
14
|
+
//
|
|
15
|
+
// addDoc/removeDoc MUTATE the index in place and return it — an index over a
|
|
16
|
+
// large vault must not deep-copy its postings map on every chunk (that is the
|
|
17
|
+
// one place we diverge from memory-store's small-array immutability). bm25Search
|
|
18
|
+
// is pure and read-only. Re-adding the same doc id replaces its postings, so the
|
|
19
|
+
// index can never accumulate duplicate postings the way arra's FTS5
|
|
20
|
+
// delete-then-insert can drift.
|
|
21
|
+
// ============================================================================
|
|
22
|
+
import { normalize } from '../memory-store.js';
|
|
23
|
+
export const SEARCH_SOURCES = ['memory', 'vault', 'session', 'skill'];
|
|
24
|
+
/** BM25 params — Robertson/Spärck-Jones defaults; title terms get weighted tf. */
|
|
25
|
+
const K1 = 1.2;
|
|
26
|
+
const B = 0.75;
|
|
27
|
+
const TITLE_BOOST = 2; // a term in a doc's title counts this many times toward tf
|
|
28
|
+
const WORD_SEG = new Intl.Segmenter(undefined, { granularity: 'word' });
|
|
29
|
+
export const INDEX_VERSION = 1;
|
|
30
|
+
/**
 * Create a fresh, empty index at the current INDEX_VERSION.
 *
 * @returns {{version: number, postings: Map, docs: Map, totalDl: number}}
 */
export function emptyIndex() {
    const postings = new Map();
    const docs = new Map();
    return { version: INDEX_VERSION, postings, docs, totalDl: 0 };
}
|
|
33
|
+
/**
 * Tokenize text into an ordered term list that KEEPS repeats, so callers can
 * count term frequencies.
 *
 * The text is passed through normalize(), split with the word-granularity
 * Intl.Segmenter, and reduced to word-like segments. Each segment is trimmed,
 * and tokens of a single character (or empty) are dropped.
 *
 * @param {string} text - raw text.
 * @returns {string[]} tokens in document order, duplicates preserved.
 */
export function termList(text) {
    return Array.from(WORD_SEG.segment(normalize(text)))
        .filter((part) => part.isWordLike)
        .map((part) => part.segment.trim())
        .filter((word) => word.length > 1);
}
|
|
52
|
+
/**
 * Build a document's term-frequency map and its token length.
 *
 * Every body token adds 1 to its term's count; every title token adds
 * TITLE_BOOST. The length `dl` counts each body and title token once.
 *
 * @param {string} title - document title.
 * @param {string} text - document body.
 * @returns {{tf: Map<string, number>, dl: number}}
 */
function termFreqs(title, text) {
    const bodyTerms = termList(text);
    const titleTerms = termList(title);
    const tf = new Map();
    const bump = (term, amount) => tf.set(term, (tf.get(term) ?? 0) + amount);
    bodyTerms.forEach((term) => bump(term, 1));
    titleTerms.forEach((term) => bump(term, TITLE_BOOST));
    return { tf, dl: bodyTerms.length + titleTerms.length };
}
|
|
63
|
+
/**
 * Add a document to the index, replacing any existing document with the same
 * id. MUTATES `idx` in place and returns it.
 *
 * @param {object} idx - index to update (`docs`, `postings`, `totalDl`).
 * @param {object} doc - document to add; `tags` and `links` default to `[]`.
 * @returns {object} the same `idx`.
 */
export function addDoc(idx, doc) {
    const { id } = doc;
    if (idx.docs.has(id)) {
        // Evict the old version first so its postings cannot linger.
        removeDoc(idx, id);
    }
    const { tf, dl } = termFreqs(doc.title, doc.text);
    idx.docs.set(id, {
        id,
        source: doc.source,
        title: doc.title,
        text: doc.text,
        path: doc.path,
        noteType: doc.noteType,
        tags: doc.tags ?? [],
        links: doc.links ?? [],
        importance: doc.importance,
        updatedMs: doc.updatedMs,
        dl,
    });
    idx.totalDl += dl;
    for (const [term, freq] of tf) {
        const posting = { docId: id, tf: freq };
        const bucket = idx.postings.get(term);
        if (bucket) {
            bucket.push(posting);
        }
        else {
            idx.postings.set(term, [posting]);
        }
    }
    return idx;
}
|
|
92
|
+
/**
 * Remove a document and all of its postings. MUTATES `idx` in place and
 * returns it; an unknown id is a no-op.
 *
 * The affected terms are found by re-tokenizing the stored title and text.
 *
 * @param {object} idx - index to update.
 * @param {string} id - id of the document to remove.
 * @returns {object} the same `idx`.
 */
export function removeDoc(idx, id) {
    const meta = idx.docs.get(id);
    if (meta) {
        const affectedTerms = termFreqs(meta.title, meta.text).tf.keys();
        for (const term of affectedTerms) {
            const bucket = idx.postings.get(term);
            if (!bucket) {
                continue;
            }
            const kept = bucket.filter((posting) => posting.docId !== id);
            if (kept.length === 0) {
                // Last posting gone: drop the term itself.
                idx.postings.delete(term);
            }
            else {
                idx.postings.set(term, kept);
            }
        }
        idx.totalDl -= meta.dl;
        idx.docs.delete(id);
    }
    return idx;
}
|
|
112
|
+
/**
 * BM25 ranking over the index; does not modify `idx`.
 *
 * Per distinct query term: idf = ln(1 + (N - df + 0.5) / (df + 0.5)), always
 * positive, multiplied by the tf-saturation factor
 * tf·(K1+1) / (tf + K1·(1 - B + B·dl/avgdl)). Contributions are summed per doc.
 *
 * @param {object} idx - index with `docs`, `postings` and `totalDl`.
 * @param {string} query - the user query.
 * @param {number} [limit=50] - maximum number of results.
 * @param {Set<string>} [sources] - optional allow-list of doc sources.
 * @returns {Array<{id: string, score: number}>} best score first; equal scores
 *   are ordered by ascending id.
 */
export function bm25Search(idx, query, limit = 50, sources) {
    const docCount = idx.docs.size;
    if (!docCount) {
        return [];
    }
    const avgdl = idx.totalDl / docCount || 1;
    const uniqueTerms = new Set(termList(query));
    if (uniqueTerms.size === 0) {
        return [];
    }
    const totals = new Map();
    for (const term of uniqueTerms) {
        const bucket = idx.postings.get(term);
        if (!bucket) {
            continue;
        }
        const df = bucket.length;
        const idf = Math.log(1 + (docCount - df + 0.5) / (df + 0.5));
        for (const { docId, tf } of bucket) {
            const meta = idx.docs.get(docId);
            if (!meta || (sources && !sources.has(meta.source))) {
                continue;
            }
            const lengthNorm = 1 - B + B * (meta.dl / avgdl);
            const saturation = (tf * (K1 + 1)) / (tf + K1 * lengthNorm);
            totals.set(docId, (totals.get(docId) ?? 0) + idf * saturation);
        }
    }
    const byScoreThenId = (a, b) => {
        const delta = b.score - a.score;
        if (delta) {
            return delta;
        }
        if (a.id < b.id) {
            return -1;
        }
        return a.id > b.id ? 1 : 0;
    };
    return Array.from(totals, ([id, score]) => ({ id, score }))
        .sort(byScoreThenId)
        .slice(0, limit);
}
|
|
148
|
+
/**
 * Remove every document that belongs to the given source. MUTATES `idx`.
 *
 * The ids are collected before anything is removed, so the docs map is never
 * modified while it is being iterated.
 *
 * @param {object} idx - index to update.
 * @param {string} source - source whose documents are removed.
 * @returns {number} how many documents were removed.
 */
export function removeSource(idx, source) {
    const doomedIds = [...idx.docs.values()]
        .filter((meta) => meta.source === source)
        .map((meta) => meta.id);
    doomedIds.forEach((id) => {
        removeDoc(idx, id);
    });
    return doomedIds.length;
}
|
|
158
|
+
/**
 * Summarize the index.
 *
 * @param {object} idx - index with `docs`, `postings` and `totalDl`.
 * @returns {{docs: number, terms: number, bySource: Object<string, number>, avgdl: number}}
 *   document count, distinct-term count, per-source document counts, and the
 *   average document length (0 for an empty index).
 */
export function indexStats(idx) {
    const docCount = idx.docs.size;
    const bySource = [...idx.docs.values()].reduce((counts, meta) => {
        counts[meta.source] = (counts[meta.source] ?? 0) + 1;
        return counts;
    }, {});
    const avgdl = docCount ? idx.totalDl / docCount : 0;
    return { docs: docCount, terms: idx.postings.size, bySource, avgdl };
}
|
|
169
|
+
/**
 * Convert the index into a plain, JSON-serializable object.
 *
 * The postings map becomes an object keyed by term. It is built with
 * Object.fromEntries so that every term — including `__proto__` — becomes an
 * OWN property. Assigning `obj['__proto__'] = value` on a plain object would
 * replace the object's prototype instead, and the term's postings would be
 * missing from the serialized output.
 *
 * @param {object} idx - index with `version`, `totalDl`, `postings` (Map) and `docs` (Map).
 * @returns {{version: number, totalDl: number, postings: Object<string, Array>, docs: Array}}
 */
export function indexToJSON(idx) {
    return {
        version: idx.version,
        totalDl: idx.totalDl,
        postings: Object.fromEntries(idx.postings),
        docs: [...idx.docs.values()],
    };
}
|
|
175
|
+
/**
 * Rebuild an in-memory index from its persisted JSON form.
 *
 * Anything that does not look like a current-version snapshot (falsy input,
 * version other than INDEX_VERSION, missing `postings`, non-array `docs`)
 * yields a fresh empty index instead of throwing.
 *
 * @param raw parsed JSON value, shaped like the `{ version, totalDl, postings, docs }` object that is checked and read below
 * @returns an index whose `postings` and `docs` maps are filled from `raw`
 */
export function indexFromJSON(raw) {
    const usable = Boolean(raw)
        && raw.version === INDEX_VERSION
        && Boolean(raw.postings)
        && Array.isArray(raw.docs);
    const idx = emptyIndex();
    if (!usable)
        return idx; // unknown/old shape degrades to empty rather than throwing
    idx.totalDl = raw.totalDl ?? 0;
    Object.entries(raw.postings).forEach(([term, plist]) => {
        idx.postings.set(term, plist);
    });
    for (const meta of raw.docs) {
        idx.docs.set(meta.id, meta);
    }
    return idx;
}
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
// ============================================================================
|
|
2
|
+
// src/search/indexer.ts — incremental, O(delta) vault indexer.
|
|
3
|
+
//
|
|
4
|
+
// Beats arra-oracle's indexer on three axes:
|
|
5
|
+
// 1. NO directory convention. arra requires a `ψ/memory/…` tree; we index the
|
|
6
|
+
// user's EXISTING second-brain vault via getBrainPath(), any layout.
|
|
7
|
+
// 2. TRUE incremental. arra full-re-indexes every pass (guarded only by a >50%
|
|
8
|
+
// delete abort). We diff a per-file manifest: an unchanged file costs ONE
|
|
9
|
+
// stat(); only changed files are read+sha256+re-chunked; deleted files have
|
|
10
|
+
// their chunks evicted precisely (manifest stores each file's chunk ids).
|
|
11
|
+
// 3. ONE unified surface. Vault chunks, active memory Facts, recent session
|
|
12
|
+
// turns, and skills all land in the SAME ranked index — the unification arra
|
|
13
|
+
// never did (its memory store and search index use divorced formats).
|
|
14
|
+
//
|
|
15
|
+
// The file-walk is injected (VaultFS) so the core logic unit-tests against an
|
|
16
|
+
// in-memory fs + clock with zero disk, exactly like memory-store.ts.
|
|
17
|
+
// ============================================================================
|
|
18
|
+
import { createHash } from 'node:crypto';
|
|
19
|
+
import { readFile, readdir, stat } from 'node:fs/promises';
|
|
20
|
+
import { join } from 'node:path';
|
|
21
|
+
import { appHomePath } from '../brand.js';
|
|
22
|
+
import { getBrainPath } from '../memory.js';
|
|
23
|
+
import { loadSkills } from '../skills.js';
|
|
24
|
+
import { activeFacts, effImportance, loadStore } from '../memory-store.js';
|
|
25
|
+
import { chunkMarkdown } from './chunk.js';
|
|
26
|
+
import { addDoc, removeDoc, removeSource } from './index-core.js';
|
|
27
|
+
import { loadIndex, saveIndex } from './store.js';
|
|
28
|
+
/** Title fallback for a chunk that has no heading: the last `/`-separated segment of the path, minus a trailing `.md` (any case). */
function fileTitle(rel) {
    const lastSlash = rel.lastIndexOf('/');
    const baseName = rel.slice(lastSlash + 1); // lastSlash === -1 keeps the whole string
    return baseName.replace(/\.md$/i, '');
}
|
|
32
|
+
/**
 * One incremental pass over the vault's markdown files.
 *
 * Mutates `index` in place (through addDoc/removeDoc) and performs no disk
 * access of its own — every file operation goes through the injected `fs`.
 *
 * @param index    search index to update
 * @param manifest previous pass's per-file records, keyed by vault-relative path;
 *                 each record is `{ mtimeMs, size, sha, ids }`
 * @param fs       VaultFS providing `listMarkdown`, `fingerprint`, `read`, `hash`
 * @returns `{ manifest, diff }` — the NEXT manifest and the per-file counters
 *          `{ added, updated, removed, skipped }`
 */
export async function indexVaultFiles(index, manifest, fs) {
    const nextManifest = {};
    const diff = { added: 0, updated: 0, removed: 0, skipped: 0 };
    const evict = (ids) => {
        for (const id of ids) {
            removeDoc(index, id);
        }
    };
    const listed = await fs.listMarkdown();
    const stillPresent = new Set();
    for (const rel of listed) {
        const stamp = await fs.fingerprint(rel);
        if (!stamp) {
            // vanished between listing and stat → picked up by the deletion sweep below
            continue;
        }
        stillPresent.add(rel);
        const known = manifest[rel];
        // cheapest path: identical mtime + size ⇒ keep the old record without reading the file
        const stampUnchanged = Boolean(known)
            && known.mtimeMs === stamp.mtimeMs
            && known.size === stamp.size;
        if (stampUnchanged) {
            nextManifest[rel] = known;
            diff.skipped += 1;
            continue;
        }
        const body = await fs.read(rel);
        const digest = fs.hash(body);
        // touched but byte-identical (e.g. mtime bumped by a sync) ⇒ refresh the stamp, keep the chunks
        if (known && known.sha === digest) {
            nextManifest[rel] = { ...known, mtimeMs: stamp.mtimeMs, size: stamp.size };
            diff.skipped += 1;
            continue;
        }
        // new or genuinely changed ⇒ drop the stale chunks, then chunk and index afresh
        if (known) {
            evict(known.ids);
        }
        const parsed = chunkMarkdown(rel, body);
        const fallbackTitle = fileTitle(rel);
        const ids = [];
        for (const chunk of parsed.chunks) {
            addDoc(index, {
                id: chunk.id,
                source: 'vault',
                title: chunk.heading || fallbackTitle,
                text: chunk.text,
                path: rel,
                noteType: parsed.frontmatter.noteType,
                tags: parsed.frontmatter.tags,
                links: parsed.links,
                updatedMs: stamp.mtimeMs,
            });
            ids.push(chunk.id);
        }
        nextManifest[rel] = { mtimeMs: stamp.mtimeMs, size: stamp.size, sha: digest, ids };
        diff[known ? 'updated' : 'added'] += 1;
    }
    // deletion sweep: recorded last pass, not seen this pass ⇒ evict that file's chunks
    const gone = Object.keys(manifest).filter((rel) => !stillPresent.has(rel));
    for (const rel of gone) {
        evict(manifest[rel].ids);
        diff.removed += 1;
    }
    return { manifest: nextManifest, diff };
}
|
|
99
|
+
/**
 * Rebuild the 'memory' corpus: drop every existing memory doc, then index each
 * fact whose status is 'active' and whose tier is not 'inbox', attaching
 * effImportance(fact, now) as its importance.
 *
 * @param index search index to update in place
 * @param facts candidate facts
 * @param now   timestamp forwarded to effImportance
 * @returns the number of facts that were indexed
 */
export function foldFacts(index, facts, now) {
    removeSource(index, 'memory');
    let indexed = 0;
    for (const fact of facts) {
        const searchable = fact.status === 'active' && fact.tier !== 'inbox';
        if (!searchable) {
            continue;
        }
        const { id, text, noteType, tags } = fact;
        addDoc(index, {
            id, // the fact's own id is reused as the doc id
            source: 'memory',
            title: '',
            text,
            noteType,
            tags,
            importance: effImportance(fact, now),
            updatedMs: fact.updated,
        });
        indexed += 1;
    }
    return indexed;
}
|
|
117
|
+
/**
 * Rebuild the 'session' corpus: drop every existing session doc, then index one
 * untitled doc per entry of `sessions`.
 *
 * @param index    search index to update in place
 * @param sessions entries shaped `{ id, text, updatedMs }`
 * @returns the number of entries in `sessions`
 */
export function foldSessions(index, sessions) {
    removeSource(index, 'session');
    sessions.forEach(({ id, text, updatedMs }) => {
        addDoc(index, { id, source: 'session', title: '', text, updatedMs });
    });
    return sessions.length;
}
|
|
125
|
+
/**
 * Rebuild the 'skill' corpus: drop every existing skill doc, then index one doc
 * per entry of `skills`, using the skill's name as the doc title.
 *
 * @param index  search index to update in place
 * @param skills entries shaped `{ id, name, text }`
 * @returns the number of entries in `skills`
 */
export function foldSkills(index, skills) {
    removeSource(index, 'skill');
    skills.forEach(({ id, name, text }) => {
        addDoc(index, { id, source: 'skill', title: name, text });
    });
    return skills.length;
}
|
|
133
|
+
// ---- real-filesystem wiring ------------------------------------------------
|
|
134
|
+
/** Directory names that the markdown walk in nodeVaultFS refuses to descend into. */
const IGNORE_DIRS = new Set([
    'node_modules',
    'dist',
    'build',
    'coverage',
    '.next',
    '.cache',
    '.git',
    '.obsidian',
    'vendor',
    '.turbo',
    '.vercel',
]);
|
|
138
|
+
/**
 * VaultFS implementation on top of node:fs, rooted at `root`.
 *
 * - `listMarkdown()` walks the tree recursively and resolves to the sorted,
 *   `/`-joined relative paths of every regular file ending in `.md` (any case).
 *   Directories named in IGNORE_DIRS or starting with `.` are not entered.
 * - `fingerprint(relPath)` resolves to `{ mtimeMs, size }`, or null when stat fails.
 * - `read(relPath)` resolves to the file's UTF-8 text.
 * - `hash(content)` returns the hex SHA-256 digest of `content`.
 *
 * NOTE(review): a directory that cannot be read contributes no paths and raises
 * no error, so an unreadable `root` produces an empty listing — which
 * indexVaultFiles treats as every known file having been deleted. Confirm that
 * this best-effort behaviour is intended.
 */
export function nodeVaultFS(root) {
    const absolute = (relPath) => join(root, relPath);
    const childOf = (parentRel, name) => (parentRel ? `${parentRel}/${name}` : name);
    // depth-first walk; appends each markdown file's root-relative path to `found`
    async function collect(dir, rel, found) {
        let entries;
        try {
            entries = await readdir(dir, { withFileTypes: true });
        }
        catch {
            return;
        }
        for (const entry of entries) {
            const { name } = entry;
            if (entry.isDirectory()) {
                const skipped = IGNORE_DIRS.has(name) || name.startsWith('.');
                if (!skipped) {
                    await collect(join(dir, name), childOf(rel, name), found);
                }
                continue;
            }
            if (entry.isFile() && name.toLowerCase().endsWith('.md')) {
                found.push(childOf(rel, name));
            }
        }
    }
    const listMarkdown = async () => {
        const found = [];
        await collect(root, '', found);
        return found.sort();
    };
    const fingerprint = async (relPath) => {
        try {
            const { mtimeMs, size } = await stat(absolute(relPath));
            return { mtimeMs, size };
        }
        catch {
            return null;
        }
    };
    const read = (relPath) => readFile(absolute(relPath), 'utf8');
    const hash = (content) => createHash('sha256').update(content).digest('hex');
    return { listMarkdown, fingerprint, read, hash };
}
|
|
178
|
+
// Directory that loadRecentSessions scans for `*.json` session files; resolved once at module load via appHomePath.
const SESSIONS_DIR = appHomePath('sessions');
|
|
179
|
+
/**
 * Load the first user message of the most recently modified session files, for
 * the session corpus.
 *
 * The `limit` newest `*.json` files in SESSIONS_DIR (by mtime, ties broken by
 * descending file name) are read. A file is skipped when it cannot be
 * stat'ed, read or parsed, or when its first user message is missing, blank,
 * or not a plain string — so fewer than `limit` entries may come back.
 *
 * @param limit maximum number of session files to read (default 60)
 * @returns array of `{ id, text, updatedMs }`: `id` is `sess:` + the session's
 *          own id (or the file name when it has none), `text` is capped at
 *          2000 characters, `updatedMs` is the file's mtime. Empty when
 *          SESSIONS_DIR cannot be listed.
 */
export async function loadRecentSessions(limit = 60) {
    // stat a single session file; failure yields null, which is filtered out below
    const probe = async (file) => {
        const full = join(SESSIONS_DIR, file);
        try {
            const { mtimeMs } = await stat(full);
            return { file, full, mtimeMs };
        }
        catch {
            return null;
        }
    };
    let newest;
    try {
        const names = await readdir(SESSIONS_DIR);
        const jsonNames = names.filter((name) => name.endsWith('.json'));
        const probed = await Promise.all(jsonNames.map((name) => probe(name)));
        newest = probed
            .filter((entry) => entry !== null)
            .sort((a, b) => (b.mtimeMs - a.mtimeMs) || b.file.localeCompare(a.file))
            .slice(0, limit);
    }
    catch {
        return [];
    }
    const sessions = [];
    for (const { file, full, mtimeMs } of newest) {
        try {
            const parsed = JSON.parse(await readFile(full, 'utf8'));
            const opener = (parsed.messages ?? []).find((msg) => msg.role === 'user');
            const content = opener?.content;
            const text = typeof content === 'string' ? content : '';
            if (text.trim()) {
                sessions.push({ id: `sess:${parsed.id ?? file}`, text: text.slice(0, 2000), updatedMs: mtimeMs });
            }
        }
        catch {
            /* skip a corrupt session file */
        }
    }
    return sessions;
}
|
|
217
|
+
/**
 * Full incremental reindex, then persist. This is what `sanook index` and the
 * MCP `sanook_index` tool call.
 *
 * Steps, in order: load the stored index + manifest; if getBrainPath() yields a
 * vault path, run the incremental vault pass over it; refresh the memory,
 * session and skill corpora; save the index together with the resulting
 * manifest (the loaded one is kept as-is when no vault path is available).
 *
 * @param now timestamp passed to loadStore and foldFacts (defaults to Date.now())
 * @returns the vault diff counters `{ added, updated, removed, skipped }`
 *          (all zero without a vault) plus `memory`, `sessions`, `skills`
 *          counts and `vaultPath` (null when getBrainPath() gave null/undefined)
 */
export async function reindex(now = Date.now()) {
    const loaded = await loadIndex();
    const { index } = loaded;
    const vaultPath = await getBrainPath();
    const vaultPass = vaultPath
        ? await indexVaultFiles(index, loaded.manifest, nodeVaultFS(vaultPath))
        : { manifest: loaded.manifest, diff: { added: 0, updated: 0, removed: 0, skipped: 0 } };
    const store = await loadStore(now);
    const memory = foldFacts(index, activeFacts(store), now);
    const recentSessions = await loadRecentSessions();
    const sessions = foldSessions(index, recentSessions);
    const loadedSkills = await loadSkills();
    const skillDocs = loadedSkills.map((skill) => {
        const { name, description, whenToUse } = skill;
        return { id: `skill:${name}`, name, text: `${description} ${whenToUse ?? ''}`.trim() };
    });
    const skills = foldSkills(index, skillDocs);
    await saveIndex(index, vaultPass.manifest);
    return { ...vaultPass.diff, memory, sessions, skills, vaultPath: vaultPath ?? null };
}
|