neurain 0.1.0-alpha.6 → 0.1.0-alpha.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +9 -0
- package/README.md +1 -1
- package/docs/development-status.en.md +2 -2
- package/docs/development-status.kr.md +2 -2
- package/package.json +1 -1
- package/src/core/recall_lexical.mjs +23 -6
- package/src/core/semantic.mjs +9 -1
package/CHANGELOG.md
CHANGED
|
@@ -4,6 +4,15 @@
|
|
|
4
4
|
|
|
5
5
|
- No unreleased changes recorded.
|
|
6
6
|
|
|
7
|
+
## 0.1.0-alpha.7
|
|
8
|
+
|
|
9
|
+
- Hardening (recall perf, from an adversarial review): lock the "byte-identical results" claim and tighten the fast-path contracts, with no change to ranking/scores (golden-identical).
|
|
10
|
+
|
|
11
|
+
- Added `test/perf_recall_equivalence.test.mjs`: an oracle test that `countOccurrences` equals `split(term).length - 1` for every non-empty term (overlap/unicode/surrogate cases included), a proof that `scorePreparedSemantic` (prepare-once + per-doc trigram precompute) equals a naive per-doc reference scorer, a determinism check, and a guard that the shared corpus selector keeps private + secret-bearing files out of every branch.
|
|
12
|
+
- Extracted the lexical BM25 term-frequency count into an exported `countOccurrences(haystack, needle)` with an empty-needle guard (returns 0) so the index-loop form can never spin even if the term filter changes.
|
|
13
|
+
- `buildLexicalContext` now throws if a caller passes shared `markdownFiles` together with an `area` (the only safe share is whole-vault; an area context selects a strict subset, so this prevents a future caller from silently widening the corpus).
|
|
14
|
+
- `prepareSemanticQuery` now returns a frozen prepared query, and the provider fast-path contract (prepared query is immutable, no cross-call mutable state) is documented on the default provider.
|
|
15
|
+
|
|
7
16
|
## 0.1.0-alpha.6
|
|
8
17
|
|
|
9
18
|
- Performance (hybrid recall): `hybrid-search` now walks the markdown corpus ONCE and shares it across its semantic and routed-lexical branches instead of each branch re-walking and re-reading the whole vault. The walk is shared only when no `--area` is set (the two branches then select the same whole-vault corpus); with an area they still walk independently. Results stay byte-identical (golden-verified) because the shared file list is exactly what each branch would have walked. Measured: `recall hybrid-search` ~970ms -> ~763ms (warm median); combined with alpha.5 that is ~1234ms -> ~763ms (-38%). npm test 153/153.
|
package/README.md
CHANGED
|
@@ -204,7 +204,7 @@ It exposes read/capture/scan/preview tools only. It does not silently compile, p
|
|
|
204
204
|
|
|
205
205
|
## Status
|
|
206
206
|
|
|
207
|
-
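This is `0.1.0-alpha.6`. It is not a public SaaS GA release. The alpha exists to prove installability, local-first onboarding, Codex, Claude, Gemini, and Runtime connectivity, plus safety receipts.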
|
|
207
|
+
This is `0.1.0-alpha.7`. It is not a public SaaS GA release. The alpha exists to prove installability, local-first onboarding, Codex, Claude, Gemini, and Runtime connectivity, plus safety receipts.
|
|
208
208
|
|
|
209
209
|
Alpha publish command:
|
|
210
210
|
|
package/docs/development-status.en.md
CHANGED
|
@@ -2,8 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
Version: v0.1
|
|
4
4
|
Last updated: 2026-06-20 KST
|
|
5
|
-
Package: `neurain@0.1.0-alpha.6`
|
|
6
|
-
Latest documented commit: `
|
|
5
|
+
Package: `neurain@0.1.0-alpha.7`
|
|
6
|
+
Latest documented commit: `18bbb9f perf(recall): lock byte-identical claim in CI + harden fast-path contracts`
|
|
7
7
|
|
|
8
8
|
This document is the canonical product development snapshot for the public package. It tracks what is shipped, what has evidence, and what must not be claimed yet.
|
|
9
9
|
|
package/docs/development-status.kr.md
CHANGED
|
@@ -2,8 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
Version: v0.1
|
|
4
4
|
Last updated: 2026-06-20 KST
|
|
5
|
-
Package: `neurain@0.1.0-alpha.6`
|
|
6
|
-
Latest documented commit: `
|
|
5
|
+
Package: `neurain@0.1.0-alpha.7`
|
|
6
|
+
Latest documented commit: `18bbb9f perf(recall): lock byte-identical claim in CI + harden fast-path contracts`
|
|
7
7
|
|
|
8
8
|
이 문서는 public package 기준의 canonical 개발 상태 스냅샷입니다. 무엇이 shipped인지, 어떤 증거가 있는지, 아직 주장하면 안 되는 것이 무엇인지 함께 기록합니다.
|
|
9
9
|
|
package/package.json
CHANGED
|
package/src/core/recall_lexical.mjs
CHANGED
|
@@ -30,6 +30,20 @@ import {
|
|
|
30
30
|
import { factsFor, loadFactIntel } from './recall_facts.mjs';
|
|
31
31
|
|
|
32
32
|
const sourceIdPattern = /\braw-\d{8}-(?:\d{3}|dryrun)\b/i;
|
|
33
|
+
|
|
34
|
+
// Count non-overlapping occurrences of `needle` in `haystack`. For a non-empty
|
|
35
|
+
// needle this is exactly `haystack.split(needle).length - 1` but without
|
|
36
|
+
// allocating the split array on every doc/term pair (the hottest BM25 loop). An
|
|
37
|
+
// empty needle returns 0: every term that reaches scoring is filter(Boolean)'d,
|
|
38
|
+
// so this only future-proofs against an infinite loop if that contract ever
|
|
39
|
+
// changes (the split form would never hit it, but the index form would).
|
|
40
|
+
export function countOccurrences(haystack, needle) {
|
|
41
|
+
if (!needle) return 0;
|
|
42
|
+
let n = 0;
|
|
43
|
+
for (let i = haystack.indexOf(needle); i !== -1; i = haystack.indexOf(needle, i + needle.length)) n += 1;
|
|
44
|
+
return n;
|
|
45
|
+
}
|
|
46
|
+
|
|
33
47
|
// BM25 content weight relative to the additive structural boosts (vault parity).
|
|
34
48
|
const BM25_WEIGHT = 4;
|
|
35
49
|
const BM25_K1 = 1.5;
|
|
@@ -113,11 +127,17 @@ function slugish(value) {
|
|
|
113
127
|
// the registry, degrading to empty when files are absent.
|
|
114
128
|
export function buildLexicalContext(root, { area = '', recallCfg, intel, facts, aliasMap, markdownFiles } = {}) {
|
|
115
129
|
if (!recallCfg) throw new Error('buildLexicalContext requires recallCfg');
|
|
116
|
-
const dirs = dirsFromConfig(recallCfg);
|
|
117
|
-
const classify = makeLayerClassifier(dirs);
|
|
118
130
|
// `markdownFiles`, when given, must be a pre-walked listRecallMarkdownFiles()
|
|
119
131
|
// result for this exact (root, recallCfg, area) selection; a caller that already
|
|
120
132
|
// walked the corpus (hybrid sharing one walk) passes it to skip the redundant walk.
|
|
133
|
+
// The only safe sharing is whole-vault: an area-scoped context selects a strict
|
|
134
|
+
// subset, so accepting whole-vault files under an area would silently widen the
|
|
135
|
+
// corpus and change results. Reject that misuse loudly instead of ranking wrong.
|
|
136
|
+
if (markdownFiles && area) {
|
|
137
|
+
throw new Error('buildLexicalContext: markdownFiles can only be shared for a whole-vault context (no area)');
|
|
138
|
+
}
|
|
139
|
+
const dirs = dirsFromConfig(recallCfg);
|
|
140
|
+
const classify = makeLayerClassifier(dirs);
|
|
121
141
|
const files = markdownFiles || listRecallMarkdownFiles(root, recallCfg, { area });
|
|
122
142
|
const baseDocs = files.map(({ rel, text }) => ({
|
|
123
143
|
text,
|
|
@@ -195,10 +215,7 @@ export function lexicalSearchWithContext(ctx, query, { top = 10, maxPerLayer = 3
|
|
|
195
215
|
|
|
196
216
|
let bm25 = 0;
|
|
197
217
|
for (const term of searchTerms) {
|
|
198
|
-
|
|
199
|
-
// without allocating the split array on every doc/term pair.
|
|
200
|
-
let tf = 0;
|
|
201
|
-
for (let i = lower.indexOf(term); i !== -1; i = lower.indexOf(term, i + term.length)) tf += 1;
|
|
218
|
+
const tf = countOccurrences(lower, term);
|
|
202
219
|
if (tf === 0) continue;
|
|
203
220
|
const denom = tf + BM25_K1 * (1 - BM25_B + (BM25_B * length) / avgLength);
|
|
204
221
|
bm25 += (idf[term] || 0) * ((tf * (BM25_K1 + 1)) / denom);
|
package/src/core/semantic.mjs
CHANGED
|
@@ -135,7 +135,10 @@ function trigramJaccard(ga, gb) {
|
|
|
135
135
|
// docs instead of re-tokenizing the query per document (the per-doc hot path).
|
|
136
136
|
export function prepareSemanticQuery(query) {
|
|
137
137
|
// Precompute each term's trigrams ONCE so the per-doc fuzzy loop never rebuilds them.
|
|
138
|
-
|
|
138
|
+
// The prepared query is the SHARED, immutable input to scorePreparedSemantic across an
|
|
139
|
+
// entire corpus scan: freezing it makes the "no per-call mutable state" contract
|
|
140
|
+
// enforced, not just documented, so a long-lived process can reuse it safely.
|
|
141
|
+
return Object.freeze(tokenize(query).map(expandToken).map((q) => Object.freeze({ ...q, trigrams: charTrigrams(q.stem) })));
|
|
139
142
|
}
|
|
140
143
|
|
|
141
144
|
// Score a pre-prepared query against a document body. Behaviour is identical to
|
|
@@ -203,6 +206,11 @@ registerProvider('local-lexical', {
|
|
|
203
206
|
expandQuery(query) {
|
|
204
207
|
return tokenize(query).map(expandToken);
|
|
205
208
|
},
|
|
209
|
+
// Provider fast-path contract: prepareQuery returns an IMMUTABLE prepared query
|
|
210
|
+
// (frozen) that scorePrepared treats as read-only. A provider must not mutate the
|
|
211
|
+
// prepared object nor keep cross-call mutable state keyed off it, so the same
|
|
212
|
+
// prepared query is safe to reuse across an entire corpus scan and across calls in
|
|
213
|
+
// a long-lived process. The default provider is fully stateless.
|
|
206
214
|
prepareQuery(query) {
|
|
207
215
|
return prepareSemanticQuery(query);
|
|
208
216
|
},
|