npm - neurain - Versions diffs - 0.1.0-alpha.4 → 0.1.0-alpha.6 - Mend

neurain 0.1.0-alpha.4 → 0.1.0-alpha.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

package/CHANGELOG.md +10 -0
package/README.md +1 -1
package/docs/development-status.en.md +3 -3
package/docs/development-status.kr.md +3 -3
package/package.json +1 -1
package/src/core/recall.mjs +26 -9
package/src/core/recall_lexical.mjs +9 -3
package/src/core/semantic.mjs +35 -7

package/CHANGELOG.md CHANGED Viewed

@@ -4,6 +4,16 @@
 - No unreleased changes recorded.
+## 0.1.0-alpha.6
+- Performance (hybrid recall): `hybrid-search` now walks the markdown corpus ONCE and shares it across its semantic and routed-lexical branches instead of each branch re-walking and re-reading the whole vault. The walk is shared only when no `--area` is set (the two branches then select the same whole-vault corpus); with an area they still walk independently. Results stay byte-identical (golden-verified) because the shared file list is exactly what each branch would have walked. Measured: `recall hybrid-search` ~970ms -> ~763ms (warm median); combined with alpha.5 that is ~1234ms -> ~763ms (-38%). npm test 153/153.
+## 0.1.0-alpha.5
+- Performance (recall processing): cut recall/search processing time without changing results. The semantic scorer now prepares the query once and precomputes per-doc trigrams (instead of re-tokenizing the query and rebuilding `charTrigrams` per document), and the lexical BM25 counts term frequency with an index loop instead of `String.split`. Measured: `recall hybrid-search` ~1234ms -> ~970ms, `semantic-search` ~1031ms -> ~750ms (warm median), with byte-identical ranking/scores/matched_terms (golden-verified) and npm test 153/153.
 ## 0.1.0-alpha.4
 - Performance: lazy dynamic-import CLI dispatch. Each command now imports only its own `core/*.mjs` module on demand instead of loading all ~54 command modules on every invocation. Engine subprocess latency drops ~60-75ms across the board (`tidy` 150->90ms, `structure-audit` 120->60ms, `--help`/`--version` 50ms), with no change to the command surface or behavior (npm test 153/153; reviewed).

package/README.md CHANGED Viewed

@@ -204,7 +204,7 @@ It exposes read/capture/scan/preview tools only. It does not silently compile, p
 ## Status
-This is `0.1.0-alpha.4`. It is not a public SaaS GA release. The alpha exists to prove installability, local-first onboarding, Codex, Claude, Gemini, and Runtime connectivity, plus safety receipts.
+This is `0.1.0-alpha.6`. It is not a public SaaS GA release. The alpha exists to prove installability, local-first onboarding, Codex, Claude, Gemini, and Runtime connectivity, plus safety receipts.
 Alpha publish command:

package/docs/development-status.en.md CHANGED Viewed

@@ -1,9 +1,9 @@
 # Development Status
 Version: v0.1
-Last updated: 2026-06-19 KST
-Package: `neurain@0.1.0-alpha.4`
-Latest documented commit: `53aba29 perf(cli): lazy dynamic-import dispatch (load only the dispatched command)`
+Last updated: 2026-06-20 KST
+Package: `neurain@0.1.0-alpha.6`
+Latest documented commit: `908d51d perf(recall): share one corpus walk across hybrid branches, byte-identical`
 This document is the canonical product development snapshot for the public package. It tracks what is shipped, what has evidence, and what must not be claimed yet.

package/docs/development-status.kr.md CHANGED Viewed

@@ -1,9 +1,9 @@
 # 개발 진행 상태
 Version: v0.1
-Last updated: 2026-06-19 KST
-Package: `neurain@0.1.0-alpha.4`
-Latest documented commit: `53aba29 perf(cli): lazy dynamic-import dispatch (load only the dispatched command)`
+Last updated: 2026-06-20 KST
+Package: `neurain@0.1.0-alpha.6`
+Latest documented commit: `908d51d perf(recall): share one corpus walk across hybrid branches, byte-identical`
 이 문서는 public package 기준의 canonical 개발 상태 스냅샷입니다. 무엇이 shipped인지, 어떤 증거가 있는지, 아직 주장하면 안 되는 것이 무엇인지 함께 기록합니다.

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "neurain",
-  "version": "0.1.0-alpha.4",
+  "version": "0.1.0-alpha.6",
   "description": "Local-first Neurain Knowledge OS CLI and MCP connector.",
   "type": "module",
   "license": "Apache-2.0",

package/src/core/recall.mjs CHANGED Viewed

@@ -183,7 +183,7 @@ export async function searchRecall(root, query, { top = 10, host = '', fallback
 // corpus. No SQLite required (markdown stays canonical, the default provider
 // needs no generated index), no model calls, no external calls. Private and
 // unsafe docs are excluded exactly like the exact-token path.
-export async function semanticSearchRecall(root, query, { top = 10, host = '', provider = 'local-lexical', minScore = 0.34, scope = '' } = {}) {
+export async function semanticSearchRecall(root, query, { top = 10, host = '', provider = 'local-lexical', minScore = 0.34, scope = '', markdownFiles } = {}) {
   const prov = getProvider(provider);
   const text = String(query || '');
   if (!text.trim()) throw new Error('Recall semantic search requires a query.');
@@ -191,13 +191,19 @@ export async function semanticSearchRecall(root, query, { top = 10, host = '', p
   const hostFilter = String(host || '');
   const scopeFilter = String(scope || '');
   const floor = Number.isFinite(Number(minScore)) ? Math.max(0, Math.min(Number(minScore), 1)) : 0.34;
-  const docs = collectRecallDocs(root)
+  const docs = collectRecallDocs(root, { markdownFiles })
     .filter((doc) => doc.sensitivity !== 'private')
     .filter((doc) => !hostFilter || doc.host === hostFilter)
     .filter((doc) => !scopeFilter || doc.scope === scopeFilter);
+  // Prepare the query ONCE, then score every doc against it (avoids re-tokenizing the
+  // query per document). Falls back to provider.score for providers without the fast path.
+  const preparedQuery = prov.prepareQuery ? prov.prepareQuery(text) : null;
   const scored = docs
     .map((doc) => {
-      const scoredDoc = prov.score(text, `${doc.title} ${doc.body}`);
+      const docText = `${doc.title} ${doc.body}`;
+      const scoredDoc = (preparedQuery && prov.scorePrepared)
+        ? prov.scorePrepared(preparedQuery, docText)
+        : prov.score(text, docText);
       return { doc, score: Number(scoredDoc.score || 0), matched_terms: scoredDoc.matched_terms || [] };
     })
     .filter((item) => item.score >= floor)
@@ -282,7 +288,14 @@ export async function hybridSearchRecall(root, query, { top = 10, host = '', pro
   const scope = scopeForArea(areaDir);
   const routedEnabled = decideRouting(routing, areaDir, root, recallCfg);
   const exact = await searchRecall(root, text, { top: limit, host, scope });
-  const semantic = await semanticSearchRecall(root, text, { top: limit, host, provider, minScore, scope });
+  // Walk the markdown corpus ONCE and share it across the semantic and (routed)
+  // lexical branches, which otherwise each re-walk+read the whole vault. Only when
+  // no area is set, because then both branches select the same whole-vault corpus;
+  // with an area, semantic stays whole-vault while lexical scopes to the area, so
+  // their selections differ and each must walk its own. The shared array is exactly
+  // what each branch would have walked, so results stay byte-identical.
+  const sharedFiles = areaDir ? null : listRecallMarkdownFiles(root, recallCfg);
+  const semantic = await semanticSearchRecall(root, text, { top: limit, host, provider, minScore, scope, markdownFiles: sharedFiles });
   if (!routedEnabled) {
     const merged = mergeHybridResults(exact.results, semantic.results);
@@ -310,7 +323,7 @@ export async function hybridSearchRecall(root, query, { top = 10, host = '', pro
     };
   }
-  const lexicalCtx = buildLexicalContext(root, { area: areaDir, recallCfg });
+  const lexicalCtx = buildLexicalContext(root, { area: areaDir, recallCfg, markdownFiles: sharedFiles });
   const lexical = lexicalSearchWithContext(lexicalCtx, text, { top: limit });
   const merged = mergeRoutedHybridResults(lexical.results, exact.results, semantic.results);
   return {
@@ -1597,9 +1610,9 @@ function buildSqliteIndex(DatabaseSync, file, docs, manifestHash) {
   }
 }
-function collectRecallDocs(root, { recallCfg = recallConfig(root) } = {}) {
+function collectRecallDocs(root, { recallCfg = recallConfig(root), markdownFiles } = {}) {
   const docs = [
-    ...collectMarkdownDocs(root, recallCfg),
+    ...collectMarkdownDocs(root, recallCfg, markdownFiles),
     ...collectEventDocs(root),
     ...collectReceiptDocs(root),
   ];
@@ -1614,8 +1627,12 @@ function collectRecallDocs(root, { recallCfg = recallConfig(root) } = {}) {
 // label resolver (per-file frontmatter + area baseline + boundary path markers),
 // which fixes the old substring gate that dropped `..._tokenomics/` because the
 // path contained `token`. config.recall.include/exclude extend the whitelist.
-function collectMarkdownDocs(root, recallCfg = recallConfig(root)) {
-  return listRecallMarkdownFiles(root, recallCfg).map(({ rel, text, sensitivity }) => docFromText({
+// `markdownFiles`, when given, is a pre-walked listRecallMarkdownFiles() result
+// for the SAME (root, recallCfg, whole-vault) selection, so a caller that already
+// walked the corpus (e.g. hybrid sharing one walk across branches) can skip the
+// redundant walk+read. The mapping is identical, so the docs are byte-identical.
+function collectMarkdownDocs(root, recallCfg = recallConfig(root), markdownFiles) {
+  return (markdownFiles || listRecallMarkdownFiles(root, recallCfg)).map(({ rel, text, sensitivity }) => docFromText({
     path: rel,
     kind: kindForPath(rel),
     host: 'markdown',

package/src/core/recall_lexical.mjs CHANGED Viewed

@@ -111,11 +111,14 @@ function slugish(value) {
 // intel/facts/alias snapshots + the held-aside queue doc), reused across many
 // queries. intel/facts/aliasMap can be injected (tests); otherwise loaded from
 // the registry, degrading to empty when files are absent.
-export function buildLexicalContext(root, { area = '', recallCfg, intel, facts, aliasMap } = {}) {
+export function buildLexicalContext(root, { area = '', recallCfg, intel, facts, aliasMap, markdownFiles } = {}) {
   if (!recallCfg) throw new Error('buildLexicalContext requires recallCfg');
   const dirs = dirsFromConfig(recallCfg);
   const classify = makeLayerClassifier(dirs);
-  const files = listRecallMarkdownFiles(root, recallCfg, { area });
+  // `markdownFiles`, when given, must be a pre-walked listRecallMarkdownFiles()
+  // result for this exact (root, recallCfg, area) selection; a caller that already
+  // walked the corpus (hybrid sharing one walk) passes it to skip the redundant walk.
+  const files = markdownFiles || listRecallMarkdownFiles(root, recallCfg, { area });
   const baseDocs = files.map(({ rel, text }) => ({
     text,
     lower: text.toLowerCase(),
@@ -192,7 +195,10 @@ export function lexicalSearchWithContext(ctx, query, { top = 10, maxPerLayer = 3
     let bm25 = 0;
     for (const term of searchTerms) {
-      const tf = lower.split(term).length - 1;
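+      // Non-overlapping occurrence count (identical to `lower.split(term).length - 1`
+      // for any non-empty `term`) without allocating the split array on every doc/term
+      // pair. An empty `term` would never advance `i`, so `searchTerms` must not contain one.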
+      let tf = 0;
+      for (let i = lower.indexOf(term); i !== -1; i = lower.indexOf(term, i + term.length)) tf += 1;
       if (tf === 0) continue;
       const denom = tf + BM25_K1 * (1 - BM25_B + (BM25_B * length) / avgLength);
       bm25 += (idf[term] || 0) * ((tf * (BM25_K1 + 1)) / denom);

package/src/core/semantic.mjs CHANGED Viewed

@@ -117,9 +117,13 @@ function charTrigrams(token) {
 export function fuzzyOverlap(a, b) {
   if (!a || !b) return 0;
   if (a === b) return 1;
-  const ga = charTrigrams(a);
-  const gb = charTrigrams(b);
-  if (!ga.size || !gb.size) return 0;
+  return trigramJaccard(charTrigrams(a), charTrigrams(b));
+}
+// Jaccard over two PRE-COMPUTED trigram sets (same math as fuzzyOverlap's tail),
+// so the per-doc fuzzy loop can reuse cached trigrams instead of rebuilding them.
+function trigramJaccard(ga, gb) {
+  if (!ga || !gb || !ga.size || !gb.size) return 0;
   let inter = 0;
   for (const g of ga) if (gb.has(g)) inter += 1;
   return inter / (ga.size + gb.size - inter);
@@ -127,13 +131,25 @@ export function fuzzyOverlap(a, b) {
 // Deterministic lexical-semantic score of a query against a document body.
 // Returns { score: 0..1 normalized by query length, matched_terms: [...] }.
-export function lexicalSemanticScore(query, docText) {
-  const queryExpanded = tokenize(query).map(expandToken);
+// Prepare a query ONCE (tokenize + expand) so a corpus scan can reuse it across all
+// docs instead of re-tokenizing the query per document (the per-doc hot path).
+export function prepareSemanticQuery(query) {
+  // Precompute each term's trigrams ONCE so the per-doc fuzzy loop never rebuilds them.
+  return tokenize(query).map(expandToken).map((q) => ({ ...q, trigrams: charTrigrams(q.stem) }));
+}
+// Score a pre-prepared query against a document body. Behaviour is identical to
+// lexicalSemanticScore; only the query preparation is hoisted out.
+export function scorePreparedSemantic(queryExpanded, docText) {
   if (!queryExpanded.length) return { score: 0, matched_terms: [] };
   const docTokens = tokenize(docText).map(expandToken);
   if (!docTokens.length) return { score: 0, matched_terms: [] };
   const docStems = new Set(docTokens.map((d) => d.stem));
   const docCanons = new Set(docTokens.map((d) => d.canon).filter(Boolean));
+  // Build each unique doc-stem's trigrams ONCE per doc (was recomputed per query term
+  // inside fuzzyOverlap -> the charTrigrams hot path).
+  const docStemTrigrams = new Map();
+  for (const s of docStems) docStemTrigrams.set(s, charTrigrams(s));
   const matched = [];
   let total = 0;
   for (const q of queryExpanded) {
@@ -142,9 +158,11 @@ export function lexicalSemanticScore(query, docText) {
     if (docStems.has(q.stem)) { best = 1; how = 'exact'; }
     else if (q.canon && docCanons.has(q.canon)) { best = 0.75; how = 'synonym'; }
     else {
-      // fuzzy: best trigram overlap against any doc stem (typos / variants)
+      // fuzzy: best trigram overlap against any doc stem (typos / variants), using
+      // the precomputed query + doc-stem trigrams instead of rebuilding them.
+      const qTri = q.trigrams || charTrigrams(q.stem);
       for (const d of docStems) {
-        const ov = fuzzyOverlap(q.stem, d);
+        const ov = trigramJaccard(qTri, docStemTrigrams.get(d));
         if (ov > best) { best = ov; how = 'fuzzy'; }
       }
       if (best < 0.6) best = 0;
@@ -155,6 +173,10 @@ export function lexicalSemanticScore(query, docText) {
   return { score: Number((total / queryExpanded.length).toFixed(4)), matched_terms: matched };
 }
+export function lexicalSemanticScore(query, docText) {
+  return scorePreparedSemantic(prepareSemanticQuery(query), docText);
+}
 const PROVIDERS = new Map();
 export function registerProvider(name, impl) {
@@ -181,6 +203,12 @@ registerProvider('local-lexical', {
   expandQuery(query) {
     return tokenize(query).map(expandToken);
   },
+  prepareQuery(query) {
+    return prepareSemanticQuery(query);
+  },
+  scorePrepared(prepared, docText) {
+    return scorePreparedSemantic(prepared, docText);
+  },
   score(query, docText) {
     return lexicalSemanticScore(query, docText);
   },