neurain 0.1.0-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. package/CHANGELOG.md +19 -0
  2. package/LICENSE +57 -0
  3. package/README.md +205 -0
  4. package/SECURITY.md +22 -0
  5. package/bin/neurain.mjs +7 -0
  6. package/docs/comparison-mem0.en.md +22 -0
  7. package/docs/connect-claude.en.md +48 -0
  8. package/docs/connect-claude.kr.md +51 -0
  9. package/docs/connect-codex.en.md +38 -0
  10. package/docs/connect-codex.kr.md +40 -0
  11. package/docs/connect-gemini.en.md +71 -0
  12. package/docs/connect-gemini.kr.md +71 -0
  13. package/docs/connect-runtime.en.md +61 -0
  14. package/docs/connect-runtime.kr.md +61 -0
  15. package/docs/development-status.en.md +157 -0
  16. package/docs/development-status.kr.md +157 -0
  17. package/docs/knowledge-os.en.md +105 -0
  18. package/docs/knowledge-os.kr.md +106 -0
  19. package/docs/pricing.en.md +14 -0
  20. package/docs/privacy-and-data-flow.en.md +25 -0
  21. package/docs/public-saas-readiness.en.md +39 -0
  22. package/docs/quickstart.en.md +64 -0
  23. package/docs/quickstart.kr.md +64 -0
  24. package/docs/release-checklist.en.md +38 -0
  25. package/docs/safety.en.md +36 -0
  26. package/docs/self-improvement-90-roadmap.en.md +429 -0
  27. package/docs/self-improvement-90-roadmap.kr.md +429 -0
  28. package/docs/self-improving-workflows.en.md +163 -0
  29. package/docs/self-improving-workflows.kr.md +163 -0
  30. package/docs/support.en.md +17 -0
  31. package/docs/troubleshooting.en.md +35 -0
  32. package/package.json +36 -0
  33. package/src/cli.mjs +261 -0
  34. package/src/core/adopt.mjs +304 -0
  35. package/src/core/answer_eval.mjs +450 -0
  36. package/src/core/capabilities.mjs +217 -0
  37. package/src/core/capture_durable.mjs +181 -0
  38. package/src/core/classify.mjs +237 -0
  39. package/src/core/compile_desk.mjs +324 -0
  40. package/src/core/complete.mjs +108 -0
  41. package/src/core/config.mjs +142 -0
  42. package/src/core/connect.mjs +355 -0
  43. package/src/core/curator.mjs +351 -0
  44. package/src/core/daemon.mjs +536 -0
  45. package/src/core/digest.mjs +155 -0
  46. package/src/core/doctor.mjs +115 -0
  47. package/src/core/durable.mjs +96 -0
  48. package/src/core/envelope.mjs +97 -0
  49. package/src/core/flush.mjs +190 -0
  50. package/src/core/fs.mjs +121 -0
  51. package/src/core/init.mjs +194 -0
  52. package/src/core/journal.mjs +269 -0
  53. package/src/core/labels.mjs +117 -0
  54. package/src/core/lessons.mjs +793 -0
  55. package/src/core/lifecycle.mjs +1138 -0
  56. package/src/core/link_check.mjs +180 -0
  57. package/src/core/live_cases.mjs +221 -0
  58. package/src/core/onboard.mjs +175 -0
  59. package/src/core/plan_receipt.mjs +177 -0
  60. package/src/core/plan_writeback.mjs +176 -0
  61. package/src/core/queue.mjs +62 -0
  62. package/src/core/queue_archive.mjs +87 -0
  63. package/src/core/queue_model.mjs +161 -0
  64. package/src/core/queue_write.mjs +28 -0
  65. package/src/core/recall.mjs +1802 -0
  66. package/src/core/recall_bench.mjs +275 -0
  67. package/src/core/recall_corpus.mjs +152 -0
  68. package/src/core/recall_facts.mjs +233 -0
  69. package/src/core/recall_intel.mjs +233 -0
  70. package/src/core/recall_lexical.mjs +269 -0
  71. package/src/core/recap.mjs +78 -0
  72. package/src/core/review_queue.mjs +131 -0
  73. package/src/core/review_worker.mjs +284 -0
  74. package/src/core/route.mjs +73 -0
  75. package/src/core/safety.mjs +57 -0
  76. package/src/core/scheduler.mjs +697 -0
  77. package/src/core/search.mjs +54 -0
  78. package/src/core/secret_scan.mjs +143 -0
  79. package/src/core/semantic.mjs +187 -0
  80. package/src/core/source_digest.mjs +56 -0
  81. package/src/core/source_digest_gen.mjs +311 -0
  82. package/src/core/stage.mjs +105 -0
  83. package/src/core/status.mjs +175 -0
  84. package/src/core/vault_state.mjs +115 -0
  85. package/src/core/watch.mjs +282 -0
  86. package/src/core/wiki_log.mjs +29 -0
  87. package/src/core/wrap.mjs +62 -0
  88. package/src/mcp/server.mjs +865 -0
  89. package/templates/starter-vault/README.md +9 -0
@@ -0,0 +1,275 @@
1
+ // Recall benchmark + scorecard runner (W-A). Repeatably scores a directory of
2
+ // benchmark suites against the engine's routed lexical ranker, the apples-to-
3
+ // apples comparison with the vault's neurain-search (same BM25 + boosts). The
4
+ // suite files and the eval corpus stay external (passed as a runtime arg); this
5
+ // module ships only synthetic fixtures in tests, never real vault data.
6
+ //
7
+ // Why the LEXICAL branch and not the full hybrid: the vault baseline (94.1%) was
8
+ // produced by the vault lexical search. In the engine's routed hybrid the lexical
9
+ // results come FIRST, so hybrid's top-K equals lexical's top-K whenever lexical
10
+ // returns >= K results, and otherwise hybrid only ADDS exact/semantic catches.
11
+ // Lexical source recall is therefore a faithful lower bound on hybrid; gating on
12
+ // it is both fast (one corpus build, no per-case semantic walk) and conservative.
13
+ import fs from 'node:fs';
14
+ import path from 'node:path';
15
+ import { performance } from 'node:perf_hooks';
16
+ import { recallConfig } from './config.mjs';
17
+ import { resolveAreaDir } from './recall_corpus.mjs';
18
+ import { buildLexicalContext, lexicalSearchWithContext } from './recall_lexical.mjs';
19
+
20
// Turn the `suites` option into a directory path. A falsy value yields '' (the
// callers treat that as "no suites given"); an absolute path is returned as-is;
// anything else is joined onto `root`.
function resolveSuitesDir(root, suites) {
  if (suites && path.isAbsolute(suites)) return suites;
  return suites ? path.join(root, suites) : '';
}
24
+
25
// Load every benchmark suite from `dir`. Only *.json files are read, in sorted
// name order. A file may be either a bare array of cases or an object with a
// `cases` array (plus optional `suite` name and `pass_threshold`). Files that
// cannot be parsed, or that carry no cases array (including a literal JSON
// `null`), are skipped rather than aborting the whole run.
// Returns [] when `dir` is empty, missing, or not a directory.
function loadSuites(dir) {
  if (!dir || !fs.existsSync(dir)) return [];
  let names;
  try {
    names = fs.readdirSync(dir);
  } catch (error) {
    // A path that exists but is a plain file is "no suites", not a crash.
    if (error.code === 'ENOTDIR' || error.code === 'ENOENT') return [];
    throw error;
  }
  const out = [];
  for (const file of names.filter((f) => f.endsWith('.json')).sort()) {
    let parsed;
    try {
      parsed = JSON.parse(fs.readFileSync(path.join(dir, file), 'utf8'));
    } catch {
      // Unreadable or malformed suite file: best-effort, skip it.
      continue;
    }
    // `parsed` can legitimately be null (JSON "null"); guard the property read.
    const cases = Array.isArray(parsed) ? parsed : parsed?.cases;
    if (!Array.isArray(cases)) continue;
    out.push({
      file,
      suite: parsed.suite || file.replace(/\.json$/, ''),
      pass_threshold: parsed.pass_threshold || {},
      cases,
    });
  }
  return out;
}
46
+
47
// Last '/'-separated segment of a path; falsy input is treated as ''.
const base = (p) => {
  const text = String(p || '');
  return text.slice(text.lastIndexOf('/') + 1);
};
48
+
49
// Does any expected source appear among the result paths?
// strict = exact path equality (the vault recall-check matcher, how 94.1% was
// computed); loose = strict OR endsWith("/<expected>") OR basename equality (the
// vault scorecard matcher). Both are reported; the gate uses strict.
// Any `mode` other than 'loose' is treated as strict.
function matchExpected(resultPaths, expected, mode) {
  const strictHit = (want) => resultPaths.includes(want);
  const looseHit = (want, got) => got === want || got.endsWith(`/${want}`) || base(got) === base(want);
  if (mode !== 'loose') {
    return expected.some((want) => strictHit(want));
  }
  return expected.some((want) => resultPaths.some((got) => looseHit(want, got)));
}
58
+
59
// 1-based position of the first result path that matches any expected source
// under the given matcher mode; 0 when nothing matches.
function rankOfFirstHit(resultPaths, expected, mode) {
  const index = resultPaths.findIndex((candidate) => matchExpected([candidate], expected, mode));
  return index + 1;
}
65
+
66
// Run one benchmark case through the lexical ranker and grade it.
// `prefix` (the area path prefix, or '') is stripped from result paths so they
// compare against area-relative expected sources. A case with no expected
// sources / entities counts as passing that dimension. `source_ok` follows the
// requested matcher `mode`; the strict and loose verdicts are always reported.
function runCase(ctx, c, { top, maxPerLayer, prefix, mode }) {
  const { results } = lexicalSearchWithContext(ctx, c.question, { top, maxPerLayer });
  const stripPrefix = (p) => (prefix && p.startsWith(prefix) ? p.slice(prefix.length) : p);
  const paths = results.map((hit) => stripPrefix(hit.path));
  const entityList = results.flatMap((hit) => hit.matched_entities || []);
  const entities = new Set(entityList);
  const expSources = Array.isArray(c.expected_sources) ? c.expected_sources : [];
  const expEntities = Array.isArray(c.expected_entities) ? c.expected_entities : [];
  const noSources = expSources.length === 0;
  const sourcesFound = (matcher) => noSources || matchExpected(paths, expSources, matcher);
  return {
    id: c.id || '',
    results,
    paths,
    source_ok_strict: sourcesFound('strict'),
    source_ok_loose: sourcesFound('loose'),
    entity_ok: expEntities.length === 0 || expEntities.every((e) => entities.has(e)),
    source_ok: sourcesFound(mode),
    expected_sources: expSources,
    expected_entities: expEntities,
  };
}
84
+
85
// Score a directory of benchmark suites against the routed lexical ranker.
//
// Options (all optional): `area` (resolved via resolveAreaDir, which throws for
// an unknown area), `top` (results per case, clamped to 1..50, default 5),
// `maxPerLayer` (default 3), `matcher` ('loose' or strict), `suites`/`suiteDir`
// (suite directory, relative to `root` unless absolute), `baseline` (overrides
// the configured baseline source recall), `caseId` + `explain` (single-case
// debugging mode).
//
// Returns a plain report object; `ok` is false with a `reason` when no suites
// are available or the requested case does not exist. In full-run mode `ok`
// requires every suite to meet its thresholds and, when a baseline is known,
// overall strict source recall to reach it.
export function benchRecall(root, opts = {}) {
  const recallCfg = recallConfig(root);
  const areaDir = resolveAreaDir(root, opts.area || '', recallCfg);
  // A non-numeric `top` would otherwise propagate NaN through Math.min/Math.max
  // and silently empty every result list; fall back to the default instead.
  const requestedTop = Number(opts.top || 5);
  const top = Number.isNaN(requestedTop) ? 5 : Math.max(1, Math.min(requestedTop, 50));
  const maxPerLayer = Number.isFinite(Number(opts.maxPerLayer)) ? Number(opts.maxPerLayer) : 3;
  const mode = opts.matcher === 'loose' ? 'loose' : 'strict';
  const suitesDir = resolveSuitesDir(root, opts.suites || opts.suiteDir || '');
  // No configured baseline simply disables the gate (see `gate` below).
  const baseline = Number.isFinite(Number(opts.baseline)) ? Number(opts.baseline) : recallCfg.bench?.baseline_source_recall;
  // Same default areas directory the corpus module falls back to.
  const prefix = areaDir ? `${recallCfg.areas_dir || '10_areas'}/${areaDir}/` : '';
  const stripPrefix = (p) => (prefix && p.startsWith(prefix) ? p.slice(prefix.length) : p);

  if (!suitesDir) {
    return { ok: false, command: 'recall bench', root, durable_write: false, reason: 'no_suites', note: 'Provide --suites <dir> pointing at a directory of benchmark suite JSON files.' };
  }
  const suites = loadSuites(suitesDir);
  if (!suites.length) {
    return { ok: false, command: 'recall bench', root, durable_write: false, reason: 'no_suites', suites_dir: suitesDir, note: 'No suite JSON files with a cases array were found.' };
  }

  const ctx = buildLexicalContext(root, { area: areaDir, recallCfg });
  const corpusPaths = new Set(ctx.baseDocs.map((d) => stripPrefix(d.relPath)));

  // Single-case explain mode for parity debugging.
  if (opts.caseId) {
    for (const suite of suites) {
      const c = suite.cases.find((x) => x.id === opts.caseId);
      if (!c) continue;
      const r = runCase(ctx, c, { top, maxPerLayer, prefix, mode });
      const missing = r.expected_sources.filter((s) => !matchExpected(r.paths, [s], 'loose'));
      return {
        ok: r.source_ok && r.entity_ok,
        command: 'recall bench',
        root,
        durable_write: false,
        area: opts.area || '',
        area_dir: areaDir,
        suite: suite.suite,
        case: {
          id: c.id,
          question: c.question,
          expected_sources: r.expected_sources,
          expected_entities: r.expected_entities,
          source_ok_strict: r.source_ok_strict,
          source_ok_loose: r.source_ok_loose,
          entity_ok: r.entity_ok,
          missing_expected: missing,
          // distinguishes a corpus bug (expected source not indexed at all) from a
          // ranking bug (indexed but below top-K).
          expected_in_corpus: Object.fromEntries(r.expected_sources.map((s) => [s, corpusPaths.has(s)])),
          top_results: opts.explain ? r.results.slice(0, top).map((x) => ({
            path: stripPrefix(x.path),
            matched_by: 'lexical',
            layer: x.layer,
            score: x.score,
            signals: x.signals,
            matched_entities: x.matched_entities,
          })) : undefined,
        },
      };
    }
    return { ok: false, command: 'recall bench', root, durable_write: false, reason: 'case_not_found', case_id: opts.caseId };
  }

  const report = [];
  let allSourcePass = 0;
  let allSourcePassLoose = 0;
  let allEntityPass = 0;
  let allCases = 0;
  const failures = [];
  for (const suite of suites) {
    let sPass = 0;
    let sPassLoose = 0;
    let ePass = 0;
    for (const c of suite.cases) {
      const r = runCase(ctx, c, { top, maxPerLayer, prefix, mode });
      if (r.source_ok_strict) sPass += 1; else failures.push({ suite: suite.suite, id: r.id, expected_in_corpus: r.expected_sources.map((s) => corpusPaths.has(s)) });
      if (r.source_ok_loose) sPassLoose += 1;
      if (r.entity_ok) ePass += 1;
    }
    // Report and accumulate the real case count; only the divisor is guarded,
    // so an empty suite no longer adds a phantom failing case to the totals.
    const count = suite.cases.length;
    const divisor = count || 1;
    const sThr = suite.pass_threshold?.source_recall ?? 0.8;
    const eThr = suite.pass_threshold?.entity_recall ?? 0.8;
    const sR = sPass / divisor;
    const eR = ePass / divisor;
    report.push({
      suite: suite.suite,
      cases: count,
      source_recall: round3(sR),
      source_recall_loose: round3(sPassLoose / divisor),
      entity_recall: round3(eR),
      passed: sR >= sThr && eR >= eThr,
    });
    allSourcePass += sPass;
    allSourcePassLoose += sPassLoose;
    allEntityPass += ePass;
    allCases += count;
  }

  const overall = {
    source_recall: allCases ? round3(allSourcePass / allCases) : 0,
    source_recall_loose: allCases ? round3(allSourcePassLoose / allCases) : 0,
    entity_recall: allCases ? round3(allEntityPass / allCases) : 0,
    cases: allCases,
  };
  const suitesPassed = report.length > 0 && report.every((r) => r.passed);
  const gate = baseline != null
    ? { baseline, metric: 'source_recall_strict', value: overall.source_recall, passed: overall.source_recall >= baseline }
    : null;
  return {
    ok: suitesPassed && (gate ? gate.passed : true),
    command: 'recall bench',
    root,
    durable_write: false,
    strategy: 'routed_lexical',
    note: 'Measures the routed lexical branch (vault-equivalent). The runtime hybrid (lexical union exact union semantic) is a superset, so its source recall is >= these numbers.',
    area: opts.area || '',
    area_dir: areaDir,
    top,
    matcher: mode,
    suites_dir: suitesDir,
    suites: report,
    overall,
    gate,
    suites_passed: suitesPassed,
    failing_cases: failures,
  };
}
211
+
212
// Compute ranking-quality metrics over every case in every suite: hit@k,
// recall@k (fraction of expected sources found, 1 for a case with none), MRR
// (reciprocal rank of the first matching result), entity recall, and the
// average / p95 wall-clock latency of each ranked case in milliseconds.
// Accepts the same `area`, `top`, `maxPerLayer`, `matcher` and
// `suites`/`suiteDir` options as benchRecall. Returns `ok: false` with
// reason 'no_suites' when no usable suite is found.
export function scorecardRecall(root, opts = {}) {
  const recallCfg = recallConfig(root);
  const areaDir = resolveAreaDir(root, opts.area || '', recallCfg);
  // A non-numeric `top` would otherwise propagate NaN through Math.min/Math.max;
  // fall back to the default instead.
  const requestedTop = Number(opts.top || 5);
  const top = Number.isNaN(requestedTop) ? 5 : Math.max(1, Math.min(requestedTop, 50));
  const maxPerLayer = Number.isFinite(Number(opts.maxPerLayer)) ? Number(opts.maxPerLayer) : 3;
  const mode = opts.matcher === 'loose' ? 'loose' : 'strict';
  const suitesDir = resolveSuitesDir(root, opts.suites || opts.suiteDir || '');
  // Same default areas directory the corpus module falls back to.
  const prefix = areaDir ? `${recallCfg.areas_dir || '10_areas'}/${areaDir}/` : '';
  if (!suitesDir) return { ok: false, command: 'recall scorecard', root, durable_write: false, reason: 'no_suites' };
  const suites = loadSuites(suitesDir);
  if (!suites.length) return { ok: false, command: 'recall scorecard', root, durable_write: false, reason: 'no_suites', suites_dir: suitesDir };

  const ctx = buildLexicalContext(root, { area: areaDir, recallCfg });
  let hit = 0;
  let rSum = 0;
  let mrrSum = 0;
  let entHit = 0;
  let total = 0;
  const latencies = [];
  for (const suite of suites) {
    for (const c of suite.cases) {
      const started = performance.now();
      const r = runCase(ctx, c, { top, maxPerLayer, prefix, mode });
      latencies.push(performance.now() - started);
      total += 1;
      if (r.source_ok) hit += 1;
      const exp = r.expected_sources;
      if (exp.length) {
        const found = exp.filter((s) => matchExpected(r.paths, [s], mode)).length;
        rSum += found / exp.length;
      } else {
        // Nothing expected: the case is trivially fully recalled.
        rSum += 1;
      }
      const rank = rankOfFirstHit(r.paths, exp, mode);
      mrrSum += rank ? 1 / rank : 0;
      if (r.entity_ok) entHit += 1;
    }
  }
  latencies.sort((a, b) => a - b);
  const p95 = latencies.length ? latencies[Math.min(latencies.length - 1, Math.floor(latencies.length * 0.95))] : 0;
  const avg = latencies.length ? latencies.reduce((s, x) => s + x, 0) / latencies.length : 0;
  return {
    ok: true,
    command: 'recall scorecard',
    root,
    durable_write: false,
    strategy: 'routed_lexical',
    area: opts.area || '',
    area_dir: areaDir,
    top,
    matcher: mode,
    cases: total,
    hit_at_k: round3(total ? hit / total : 0),
    recall_at_k: round3(total ? rSum / total : 0),
    mrr: round3(total ? mrrSum / total : 0),
    entity_recall: round3(total ? entHit / total : 0),
    latency_ms_avg: Math.round(avg * 100) / 100,
    latency_ms_p95: Math.round(p95 * 100) / 100,
  };
}
272
+
273
// Round to three decimal places. Falsy input (including NaN) is treated as 0.
function round3(value) {
  const numeric = Number(value || 0);
  const fixed = numeric.toFixed(3);
  return Number(fixed);
}
@@ -0,0 +1,152 @@
1
+ // Shared recall corpus selection: the single definition of WHICH markdown files
2
+ // enter the recall corpus (whitelist + config include/exclude + label-based
3
+ // private exclusion + secret/injection content gate). Both the indexed/semantic
4
+ // path (recall.collectMarkdownDocs) and the routed lexical ranker
5
+ // (recall_lexical) consume listRecallMarkdownFiles so they share one corpus; they
6
+ // differ only in how they use the text (normalized snippet vs raw scoring body).
7
+ // Lives in its own module so recall.mjs and recall_lexical.mjs both import it
8
+ // without a cycle.
9
+ import fs from 'node:fs';
10
+ import path from 'node:path';
11
+ import { isTextFile, readText, relPath, walkFiles } from './fs.mjs';
12
+ import { injectionLike, secretLike } from './safety.mjs';
13
+ import { createSensitivityResolver } from './labels.mjs';
14
+ import { recallConfig } from './config.mjs';
15
+
16
// Escape every regex metacharacter in `value` so it can be embedded literally
// in a RegExp source.
export const reEsc = (value) => {
  const text = String(value);
  return text.replace(/[.*+?^${}()|[\]\\]/g, (ch) => `\\${ch}`);
};
17
+
18
// Compile config-supplied regex sources. Invalid sources are collected in
// `errors` (pattern text plus the RegExp error message) instead of throwing, so
// a hand-edited config never aborts a corpus walk. A missing list compiles to
// nothing.
export function compilePatterns(sources) {
  const compiled = { regexes: [], errors: [] };
  for (const entry of sources || []) {
    const pattern = String(entry);
    try {
      compiled.regexes.push(new RegExp(pattern));
    } catch (failure) {
      compiled.errors.push({ pattern, error: failure.message });
    }
  }
  return compiled;
}
32
+
33
// Collect the compile errors of the config's include patterns followed by those
// of its exclude patterns.
export function recallConfigErrors(recallCfg) {
  const includeErrors = compilePatterns(recallCfg.include).errors;
  const excludeErrors = compilePatterns(recallCfg.exclude).errors;
  return includeErrors.concat(excludeErrors);
}
36
+
37
// Build the markdown whitelist matcher once per corpus walk. Order: hard
// excludes -> config excludes -> (built-in includes OR config includes).
// Directory names are config-driven; defaults match the reference vault. The
// built-in include set was expanded in W-A to the general area knowledge class
// (areas/<area>/**.md), hubs, and the area registry.
// Returns a predicate over root-relative, '/'-separated paths.
export function buildRecallPathMatcher(recallCfg = {}) {
  const areasDir = recallCfg.areas_dir || '10_areas';
  const wikiDir = recallCfg.wiki_dir || 'wiki';
  const hubsDir = recallCfg.hubs_dir || '20_hubs';
  const systemDir = recallCfg.system_dir || '00_system';
  const archiveDir = recallCfg.archive_dir || '90_archive';
  const include = compilePatterns(recallCfg.include).regexes;
  const exclude = compilePatterns(recallCfg.exclude).regexes;

  // Anchor a regex at the start of the path on an escaped directory name,
  // followed by a raw regex tail.
  const under = (dir, tail) => new RegExp(`^${reEsc(dir)}${tail}`);

  const hardExclude = [
    under(recallCfg.raw_dir || 'raw', '\\/'),
    under(recallCfg.output_dir || 'output', '\\/receipts\\/'),
    /^node_modules\//,
    /(^|\/)(_trash|_archive)(\/|$)/,
    under(archiveDir, '\\/'),
    /\/agent-mailbox\/runtime\//,
  ];
  const builtinInclude = [
    /^README\.md$/,
    /^index\.md$/,
    /^log\.md$/,
    under(wikiDir, '\\/'),
    under(systemDir, '\\/neurain\\/lessons\\.md$'),
    under(systemDir, '\\/sessions\\/handoffs\\/.+\\.md$'),
    under(systemDir, '\\/area-registry\\.md$'),
    under(areasDir, '\\/[^/]+\\/.+\\.md$'),
    under(hubsDir, '\\/.+\\.md$'),
  ];

  return (rel) => {
    const hits = (re) => re.test(rel);
    if (hardExclude.some(hits) || exclude.some(hits)) return false;
    return builtinInclude.some(hits) || include.some(hits);
  };
}
76
+
77
// Content gate: text is indexable only when neither the secret detector nor the
// injection detector flags it. Falsy input is checked as ''.
export function safeToIndex(text) {
  const value = String(text || '');
  if (secretLike(value)) return false;
  return !injectionLike(value);
}
81
+
82
// Classify a root-relative path into a document kind. Rules are tried in order
// and the first match wins; anything unmatched is plain 'markdown'.
export function kindForPath(rel) {
  const rules = [
    ['wiki', (p) => p.startsWith('wiki/')],
    ['handoff', (p) => p.includes('/current/')],
    ['product', (p) => p.includes('/product/')],
    ['lessons', (p) => p === '00_system/neurain/lessons.md'],
    ['session', (p) => p.includes('/sessions/handoffs/')],
    ['log', (p) => p.endsWith('log.md')],
  ];
  const matched = rules.find(([, applies]) => applies(rel));
  return matched ? matched[0] : 'markdown';
}
91
+
92
// Derive the recall scope of a root-relative path: `area:<name>` for files
// under 10_areas/<name>/ (one leading underscore on the directory is dropped),
// `session:<id>` for session handoff files, otherwise 'global'.
export function scopeForPath(rel) {
  const areaMatch = rel.match(/^10_areas\/_?([^/]+)\//);
  if (areaMatch && areaMatch[1]) {
    return `area:${areaMatch[1]}`;
  }
  const sessionMatch = rel.match(/^00_system\/sessions\/handoffs\/(.+)\.md$/);
  if (sessionMatch && sessionMatch[1]) {
    return `session:${sessionMatch[1]}`;
  }
  return 'global';
}
99
+
100
// Title of a markdown document: the text of its first level-1 heading
// ("# ..."), trimmed; falls back to the file's basename when there is no such
// heading or it is blank.
export function titleForText(text, rel) {
  const heading = String(text || '').match(/^#\s+(.+)$/m);
  const title = heading ? heading[1].trim() : '';
  return title || path.basename(rel);
}
103
+
104
// Resolve a --area name to a real area directory: exact match, else the
// underscore-prefixed convention (--area research -> _research). Empty -> no scope.
// Unknown -> throw with the available area dirs.
// An area is a single directory directly under the areas dir, so names that
// are not one plain path segment ('.', '..', or anything containing a path
// separator) are treated as unknown; otherwise `--area ..` would resolve to a
// directory outside the areas dir and widen the corpus walk.
export function resolveAreaDir(root, name, recallCfg = recallConfig(root)) {
  const value = String(name || '').trim();
  if (!value) return '';
  const areasDir = recallCfg.areas_dir || '10_areas';
  const areasAbs = path.join(root, areasDir);
  const exists = (dir) => {
    try { return fs.statSync(path.join(areasAbs, dir)).isDirectory(); } catch { return false; }
  };
  const isSingleSegment = value !== '.' && value !== '..' && !/[\\/]/.test(value);
  if (isSingleSegment) {
    if (exists(value)) return value;
    if (!value.startsWith('_') && exists(`_${value}`)) return `_${value}`;
  }
  let available = [];
  try {
    available = fs.readdirSync(areasAbs, { withFileTypes: true }).filter((d) => d.isDirectory() && !d.name.startsWith('.')).map((d) => d.name);
  } catch { /* areas dir absent */ }
  throw new Error(`Unknown area "${name}". Available areas: ${available.join(', ') || '(none)'}`);
}
123
+
124
// area dir -> recall scope value, matching scopeForPath which strips one leading
// underscore (10_areas/_research/... -> area:research). Empty input -> ''.
export function scopeForArea(areaDir) {
  if (!areaDir) return '';
  const bare = areaDir.startsWith('_') ? areaDir.slice(1) : areaDir;
  return `area:${bare}`;
}
129
+
130
// The single corpus file selection. Returns `{ rel, text, sensitivity }` for
// every file that belongs in the recall corpus. When `area` (a resolved area
// directory like "_research") is given, only that area's directory is walked;
// the whitelist still matches on full root-relative paths.
// A file is dropped when it fails the path whitelist, is not a text file, is
// resolved as 'private', or fails the secret/injection content gate.
export function listRecallMarkdownFiles(root, recallCfg, { area = '' } = {}) {
  const resolver = createSensitivityResolver(root, recallCfg);
  const matches = buildRecallPathMatcher(recallCfg);
  const areasDir = recallCfg.areas_dir || '10_areas';
  const walkRoot = area ? path.join(root, areasDir, area) : root;
  const selected = [];
  for (const file of walkFiles(walkRoot, { includeRaw: false, maxFiles: 50000 })) {
    const rel = relPath(root, file);
    // Cheap path checks first, so rejected files are never read.
    if (!matches(rel) || !isTextFile(rel)) continue;
    const text = readText(file, '');
    const sensitivity = resolver.sensitivityFor(rel, text);
    if (sensitivity !== 'private' && safeToIndex(text)) {
      selected.push({ rel, text, sensitivity });
    }
  }
  return selected;
}
@@ -0,0 +1,233 @@
1
+ // Fact-ledger fusion for recall (W-A, minimal). A dependency-free, pure-JS port
2
+ // of the vault's memory-intel adapter, scoped to what the lexical ranker needs:
3
+ // it reads per-area fact ledgers registered in the search index registry,
4
+ // parses markdown tables by header (so heterogeneous ledger schemas are read
5
+ // by column name), and exposes factsFor() so curated source docs and ledgers
6
+ // behind a matching fact earn the memory_fusion boost and bypass the diversity
7
+ // cap. Task lookup, conflict detection, and needs-verification (vault memory
8
+ // command surface) are intentionally NOT ported here; they belong to a later
9
+ // wave. Every file is optional: a missing ledger contributes nothing.
10
+ import fs from 'node:fs';
11
+ import path from 'node:path';
12
+ import { includesTermBoundary, normalizeText } from './recall_intel.mjs';
13
+
14
// Read and parse a JSON file, returning `fallback` when the file is missing,
// unreadable, or malformed, and also when it holds a literal JSON `null`:
// callers immediately read properties off the result, so null must not leak.
function readJsonSafe(absPath, fallback) {
  try {
    const parsed = JSON.parse(fs.readFileSync(absPath, 'utf8'));
    return parsed ?? fallback;
  } catch {
    return fallback;
  }
}
21
+
22
// Read a file as UTF-8 text; any read failure yields '' so a missing ledger
// simply contributes nothing.
function readTextSafe(absPath) {
  let content = '';
  try {
    content = fs.readFileSync(absPath, 'utf8');
  } catch {
    content = '';
  }
  return content;
}
29
+
30
// A registered fact_ledger may be a single .md file or a directory of .md tables.
// Returns absolute paths: the file itself, or every *.md entry directly inside
// the directory. An empty value or a path that cannot be stat'ed yields [].
function memoryFiles(root, areaRoot, value) {
  if (!value) return [];
  const target = path.join(root, `${areaRoot}/${value}`);
  let info = null;
  try {
    info = fs.statSync(target);
  } catch {
    info = null;
  }
  if (info === null) return [];
  if (info.isFile()) return [target];
  if (!info.isDirectory()) return [];
  const ledgers = [];
  for (const entry of fs.readdirSync(target)) {
    if (entry.endsWith('.md')) ledgers.push(path.join(target, entry));
  }
  return ledgers;
}
46
+
47
// Split a markdown table row into trimmed cells, honoring escaped pipes (\|).
// The empty cells produced by a leading and a trailing border pipe are dropped
// (at most one from each end).
export function splitRow(line) {
  const cells = [];
  for (const piece of line.split(/(?<!\\)\|/)) {
    cells.push(piece.replace(/\\\|/g, '|').trim());
  }
  // split() always yields at least one piece, so cells[0] exists.
  const start = cells[0] === '' ? 1 : 0;
  const dropLast = cells.length > start && cells[cells.length - 1] === '';
  return cells.slice(start, dropLast ? cells.length - 1 : cells.length);
}
54
+
55
// True when every cell is a table delimiter cell: at least two dashes with an
// optional leading and/or trailing colon, ignoring whitespace. An empty row is
// not a separator.
function isSeparator(cells) {
  if (cells.length === 0) return false;
  for (const cell of cells) {
    const compact = cell.replace(/\s/g, '');
    if (!/^:?-{2,}:?$/.test(compact)) return false;
  }
  return true;
}
58
+
59
// Parse every markdown table into { headers, rows } keyed by normalized header.
// A table starts at a pipe-led line that is immediately followed by a
// delimiter row with the same number of cells; it runs for as long as lines
// keep starting with a pipe. Stray delimiter rows inside the body are skipped,
// and cells missing from a short row become ''.
export function parseTables(text) {
  const lines = String(text || '').split(/\r?\n/);
  const startsWithPipe = (line) => /^\s*\|/.test(line);
  const tables = [];
  let cursor = 0;
  while (cursor < lines.length) {
    if (!startsWithPipe(lines[cursor])) {
      cursor += 1;
      continue;
    }
    // A pipe line on the very last line cannot have a delimiter row.
    if (cursor + 1 >= lines.length) break;
    const header = splitRow(lines[cursor]);
    const delimiter = splitRow(lines[cursor + 1]);
    if (!isSeparator(delimiter) || delimiter.length !== header.length) {
      cursor += 1;
      continue;
    }
    const rows = [];
    let next = cursor + 2;
    while (next < lines.length && startsWithPipe(lines[next])) {
      const cells = splitRow(lines[next]);
      next += 1;
      if (isSeparator(cells)) continue;
      const row = {};
      for (let col = 0; col < header.length; col += 1) {
        row[normalizeText(header[col])] = cells[col] ?? '';
      }
      rows.push(row);
    }
    tables.push({ headers: header.map(normalizeText), rows });
    // Resume scanning at the first line after the table body.
    cursor = next;
  }
  return tables;
}
85
+
86
// First non-blank cell among the candidate column names, as a trimmed string;
// '' when none of the columns is present with content.
function pick(row, names) {
  for (let i = 0; i < names.length; i += 1) {
    const cell = row[names[i]];
    if (cell === undefined) continue;
    const text = String(cell).trim();
    if (text !== '') return text;
  }
  return '';
}
92
+
93
// True when the table has at least one header from `names`.
function tableHas(table, names) {
  for (const header of table.headers) {
    if (names.includes(header)) return true;
  }
  return false;
}
96
+
97
// True when a cell carries no value: blank, or one of the placeholder
// spellings '-', 'n/a', 'na', 'none' (compared after normalizeText + trim).
function emptyCell(value) {
  const placeholders = new Set(['', '-', 'n/a', 'na', 'none']);
  return placeholders.has(normalizeText(value).trim());
}
101
+
102
// Normalized header names that identify a fact-id column.
const FACT_ID = [
  'fact id',
  'factid',
];
// Normalized header names that may hold a fact's source reference, in lookup
// order.
const SRC = [
  'source doc',
  'source',
  'ref',
  'reference',
  'sources',
];
104
+
105
// Optional: honor a registered foreign id_column so non-English fact tables are
// recognized; the built-in FACT_ID headers already cover the reference vault.
// Returns the built-in fact-id headers followed by every lower-cased, trimmed
// id_column registered for `area` in the memory-write registry, skipping
// columns whose name contains 'task'. Duplicates are removed.
function factIdColumns(root, recallCfg, area) {
  const registryRel = recallCfg?.intel?.memory_write_registry || '00_system/neurain/memory-write-registry.json';
  const registry = readJsonSafe(path.join(root, registryRel), { areas: {} });
  const tableDefs = Object.values((registry.areas || {})[area]?.tables || {});
  const registered = tableDefs
    .map((def) => String(def.id_column || '').toLowerCase().trim())
    .filter((column) => column && !column.includes('task'));
  return [...new Set([...FACT_ID, ...registered])];
}
118
+
119
// Resolve a Source Doc cell to real root-relative paths, applying the area's
// path_map. A cell may hold multiple sources: backticked spans and markdown
// link targets are collected first; only when neither is present is the cell
// split on commas/semicolons. URLs and tokens that do not resolve to an
// existing path under the area root are skipped. For each token the literal
// form is tried first, then each path_map rewrite whose prefix matches; the
// first existing candidate wins. Results are de-duplicated in first-seen order.
function resolveSources(root, value, areaRoot, pathMap) {
  const raw = String(value ?? '');
  const captured = (pattern) => Array.from(raw.matchAll(pattern), (found) => found[1]);
  let pieces = [...captured(/`([^`]+)`/g), ...captured(/\]\(([^)\s]+)\)/g)];
  if (pieces.length === 0) pieces = raw.split(/[,;]/);

  const resolved = new Set();
  for (const piece of pieces) {
    const token = piece.replace(/`/g, '').replace(/\\/g, '/').replace(/^\.\//, '').trim();
    if (!token || /^https?:\/\//i.test(token)) continue;
    const candidates = [token];
    for (const rule of pathMap || []) {
      if (rule && rule.prefix && token.startsWith(rule.prefix)) {
        candidates.push(rule.replace_with + token.slice(rule.prefix.length));
      }
    }
    const hit = candidates.find((candidate) => fs.existsSync(path.join(root, `${areaRoot}/${candidate}`)));
    if (hit !== undefined) resolved.add(`${areaRoot}/${hit}`);
  }
  return [...resolved];
}
149
+
150
// Load curated facts from every registered area's fact ledger. No module cache;
// the caller (lexical context builder) owns the snapshot.
// Areas without an `area_root` are skipped, as are tables without a fact-id
// column and rows whose fact id is blank. Each fact keeps the raw row under
// `fields` alongside the picked well-known columns and the resolved sources.
export function loadFactIntel(root, recallCfg) {
  const registryRel = recallCfg?.intel?.registry || '00_system/neurain/search-index-registry.json';
  const registry = readJsonSafe(path.join(root, registryRel), { areas: {} });
  const toRootRelative = (file) => path.relative(root, file).split(path.sep).join('/');
  const facts = [];
  for (const [area, areaDef] of Object.entries(registry.areas || {})) {
    if (!areaDef?.area_root) continue;
    const factIds = factIdColumns(root, recallCfg, area);
    for (const file of memoryFiles(root, areaDef.area_root, areaDef.fact_ledger)) {
      const ledgerTables = parseTables(readTextSafe(file)).filter((table) => tableHas(table, factIds));
      for (const table of ledgerTables) {
        for (const row of table.rows) {
          const id = pick(row, factIds);
          if (!id) continue;
          const source = pick(row, SRC);
          facts.push({
            area,
            file: toRootRelative(file),
            id,
            entity: pick(row, ['entity', 'subject']),
            domain: pick(row, ['domain']),
            attribute: pick(row, ['attribute']),
            value: pick(row, ['value']),
            status: pick(row, ['status', 'state']),
            recorded_at: pick(row, ['recorded at', 'logged']),
            superseded_by: pick(row, ['superseded by']),
            source,
            sources_resolved: resolveSources(root, source, areaDef.area_root, areaDef.path_map),
            fields: { ...row },
          });
        }
      }
    }
  }
  return { facts };
}
187
+
188
// Status markers that flag a fact as no longer current.
const OBSOLETE = ['obsolete', 'superseded', 'deprecated'];

// A fact is outdated when its normalized status contains one of the OBSOLETE
// markers, or when its "superseded by" cell holds a real value.
export function isOutdated(fact) {
  const status = normalizeText(fact.status);
  for (const marker of OBSOLETE) {
    if (status.includes(marker)) return true;
  }
  return !emptyCell(fact.superseded_by);
}
194
+
195
// Normalize a query and split it on whitespace into its non-empty terms.
function tokens(query) {
  const terms = [];
  for (const term of normalizeText(query).split(/\s+/)) {
    if (term) terms.push(term);
  }
  return terms;
}
198
+
199
// Count how many query tokens occur in the normalized field text according to
// includesTermBoundary.
function scoreFields(fieldsText, queryTokens) {
  const haystack = normalizeText(fieldsText);
  return queryTokens.reduce(
    (count, term) => (includesTermBoundary(haystack, term) ? count + 1 : count),
    0
  );
}
205
+
206
// Default intel snapshot: no facts.
const EMPTY_FACTS = { facts: [] };

// Facts whose fields match the query, current facts ranked above outdated ones,
// then by score and recency. Mirrors the vault ordering exactly.
// A fact is scored over all of its raw row fields when it has any, otherwise
// over entity/domain/attribute/value/id. Facts with a zero score are dropped;
// `area` (when given) restricts the candidates and `top` caps the result.
// A query with no tokens returns [].
export function factsFor(query, intel = EMPTY_FACTS, { area, top = 10 } = {}) {
  const qt = tokens(query);
  if (!qt.length) return [];

  const scored = [];
  for (const fact of intel.facts) {
    if (area && fact.area !== area) continue;
    const hasRowFields = fact.fields && Object.keys(fact.fields).length;
    const searchable = hasRowFields
      ? Object.values(fact.fields).join(' ')
      : [fact.entity, fact.domain, fact.attribute, fact.value, fact.id].join(' ');
    const score = scoreFields(searchable, qt);
    if (score > 0) scored.push({ fact, score });
  }

  const staleness = (fact) => (isOutdated(fact) ? 1 : 0);
  scored.sort(
    (a, b) =>
      staleness(a.fact) - staleness(b.fact) ||
      b.score - a.score ||
      String(b.fact.recorded_at).localeCompare(String(a.fact.recorded_at))
  );
  return scored.slice(0, top).map((entry) => entry.fact);
}