neurain 0.1.0-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +19 -0
- package/LICENSE +57 -0
- package/README.md +205 -0
- package/SECURITY.md +22 -0
- package/bin/neurain.mjs +7 -0
- package/docs/comparison-mem0.en.md +22 -0
- package/docs/connect-claude.en.md +48 -0
- package/docs/connect-claude.kr.md +51 -0
- package/docs/connect-codex.en.md +38 -0
- package/docs/connect-codex.kr.md +40 -0
- package/docs/connect-gemini.en.md +71 -0
- package/docs/connect-gemini.kr.md +71 -0
- package/docs/connect-runtime.en.md +61 -0
- package/docs/connect-runtime.kr.md +61 -0
- package/docs/development-status.en.md +157 -0
- package/docs/development-status.kr.md +157 -0
- package/docs/knowledge-os.en.md +105 -0
- package/docs/knowledge-os.kr.md +106 -0
- package/docs/pricing.en.md +14 -0
- package/docs/privacy-and-data-flow.en.md +25 -0
- package/docs/public-saas-readiness.en.md +39 -0
- package/docs/quickstart.en.md +64 -0
- package/docs/quickstart.kr.md +64 -0
- package/docs/release-checklist.en.md +38 -0
- package/docs/safety.en.md +36 -0
- package/docs/self-improvement-90-roadmap.en.md +429 -0
- package/docs/self-improvement-90-roadmap.kr.md +429 -0
- package/docs/self-improving-workflows.en.md +163 -0
- package/docs/self-improving-workflows.kr.md +163 -0
- package/docs/support.en.md +17 -0
- package/docs/troubleshooting.en.md +35 -0
- package/package.json +36 -0
- package/src/cli.mjs +261 -0
- package/src/core/adopt.mjs +304 -0
- package/src/core/answer_eval.mjs +450 -0
- package/src/core/capabilities.mjs +217 -0
- package/src/core/capture_durable.mjs +181 -0
- package/src/core/classify.mjs +237 -0
- package/src/core/compile_desk.mjs +324 -0
- package/src/core/complete.mjs +108 -0
- package/src/core/config.mjs +142 -0
- package/src/core/connect.mjs +355 -0
- package/src/core/curator.mjs +351 -0
- package/src/core/daemon.mjs +536 -0
- package/src/core/digest.mjs +155 -0
- package/src/core/doctor.mjs +115 -0
- package/src/core/durable.mjs +96 -0
- package/src/core/envelope.mjs +97 -0
- package/src/core/flush.mjs +190 -0
- package/src/core/fs.mjs +121 -0
- package/src/core/init.mjs +194 -0
- package/src/core/journal.mjs +269 -0
- package/src/core/labels.mjs +117 -0
- package/src/core/lessons.mjs +793 -0
- package/src/core/lifecycle.mjs +1138 -0
- package/src/core/link_check.mjs +180 -0
- package/src/core/live_cases.mjs +221 -0
- package/src/core/onboard.mjs +175 -0
- package/src/core/plan_receipt.mjs +177 -0
- package/src/core/plan_writeback.mjs +176 -0
- package/src/core/queue.mjs +62 -0
- package/src/core/queue_archive.mjs +87 -0
- package/src/core/queue_model.mjs +161 -0
- package/src/core/queue_write.mjs +28 -0
- package/src/core/recall.mjs +1802 -0
- package/src/core/recall_bench.mjs +275 -0
- package/src/core/recall_corpus.mjs +152 -0
- package/src/core/recall_facts.mjs +233 -0
- package/src/core/recall_intel.mjs +233 -0
- package/src/core/recall_lexical.mjs +269 -0
- package/src/core/recap.mjs +78 -0
- package/src/core/review_queue.mjs +131 -0
- package/src/core/review_worker.mjs +284 -0
- package/src/core/route.mjs +73 -0
- package/src/core/safety.mjs +57 -0
- package/src/core/scheduler.mjs +697 -0
- package/src/core/search.mjs +54 -0
- package/src/core/secret_scan.mjs +143 -0
- package/src/core/semantic.mjs +187 -0
- package/src/core/source_digest.mjs +56 -0
- package/src/core/source_digest_gen.mjs +311 -0
- package/src/core/stage.mjs +105 -0
- package/src/core/status.mjs +175 -0
- package/src/core/vault_state.mjs +115 -0
- package/src/core/watch.mjs +282 -0
- package/src/core/wiki_log.mjs +29 -0
- package/src/core/wrap.mjs +62 -0
- package/src/mcp/server.mjs +865 -0
- package/templates/starter-vault/README.md +9 -0
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import fs from 'node:fs';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import { absPath, isTextFile, relPath, walkFiles } from './fs.mjs';
|
|
4
|
+
|
|
5
|
+
/**
 * CLI handler for `search`: resolves the root from the first positional
 * argument, runs `searchRoot`, and returns either the raw payload or a text
 * report.
 *
 * @param {object} args - Parsed CLI arguments. `args._[0]` is the root; the
 *   query is `args.query` or, failing that, the remaining positionals joined
 *   with spaces. `args.top`, `args.area` and `args.json` are optional.
 * @returns {Promise<{json: true, payload: object} | {text: string}>}
 * @throws {Error} 'Missing query.' when the query is empty or whitespace.
 */
export async function searchCommand(args) {
  const root = absPath(args._[0]);
  // `args.query` may not be a string (e.g. a boolean flag or a numeric value,
  // depending on the argument parser). Coerce it so `.trim()` below cannot
  // throw a TypeError in place of the usage error.
  const flagQuery = args.query && args.query !== true ? String(args.query) : '';
  const query = flagQuery || args._.slice(1).join(' ');
  if (!query.trim()) throw new Error('Missing query.');
  // A non-numeric or non-positive `top` would otherwise turn into NaN / 0 / a
  // negative slice bound and silently distort the result list; use 10 instead.
  const requestedTop = Number(args.top);
  const top = Number.isFinite(requestedTop) && requestedTop >= 1 ? Math.floor(requestedTop) : 10;
  const payload = searchRoot(root, query, { top, area: args.area || '' });
  if (args.json) return { json: true, payload };
  return {
    // filter(Boolean) drops the empty-string entries, so the report contains
    // no blank lines and the "No matches" line only appears when it is set.
    text: [
      '# Neurain search',
      '',
      `- Query: ${query}`,
      ...payload.results.map((item) => `- ${item.path}: score ${item.score}, ${item.snippet}`),
      payload.results.length ? '' : 'No matches found.',
    ].filter(Boolean).join('\n'),
  };
}
|
|
21
|
+
|
|
22
|
+
/**
 * Scores every text file under `root` against the whitespace-separated terms
 * of `query` and returns the best matches.
 *
 * Scoring per term: +10 when the file body contains it, +15 when the file's
 * basename contains it. Matching files get +12 under `wiki/` and -3 under
 * `00_system/`. Ties are broken by path.
 *
 * @param {string} root - Directory to walk.
 * @param {string} query - Search terms (case-insensitive substring match).
 * @param {object} [options]
 * @param {number} [options.top=10] - Maximum number of results. NaN or a
 *   negative value falls back to 10.
 * @param {string} [options.area=''] - When set, only paths starting with
 *   `10_areas/<area>` are considered.
 * @returns {{query: string, top: number, results: Array<{path: string, score: number, snippet: string}>}}
 */
export function searchRoot(root, query, { top = 10, area = '' } = {}) {
  const terms = String(query).toLowerCase().split(/\s+/).filter(Boolean);
  // NOTE(review): this is a plain prefix test, so area "foo" also admits
  // "10_areas/foobar/..." — confirm whether a trailing "/" is wanted.
  const areaPrefix = area ? `10_areas/${String(area).replace(/^_+/, '_')}` : '';
  // slice(0, NaN) yields nothing and slice(0, -n) drops results from the END,
  // so an invalid limit is replaced by the default instead of being passed on.
  const requested = Number(top);
  const limit = Number.isNaN(requested) || requested < 0 ? 10 : Math.floor(requested);
  const results = [];
  for (const file of walkFiles(root, { includeRaw: false })) {
    const rel = relPath(root, file);
    if (areaPrefix && !rel.startsWith(areaPrefix)) continue;
    if (!isTextFile(file)) continue;
    const text = safeRead(file);
    if (!text) continue;
    const lower = text.toLowerCase();
    // Hoisted: the basename does not change between terms.
    const baseName = path.basename(rel).toLowerCase();
    let score = 0;
    for (const term of terms) {
      if (lower.includes(term)) score += 10;
      if (baseName.includes(term)) score += 15;
    }
    if (!score) continue;
    if (rel.startsWith('wiki/')) score += 12;
    if (rel.startsWith('00_system/')) score -= 3;
    // Snippet: the first line that contains any of the terms.
    const line = text.split(/\r?\n/).find((candidate) => {
      const candidateLower = candidate.toLowerCase();
      return terms.some((term) => candidateLower.includes(term));
    }) || '';
    results.push({ path: rel, score, snippet: line.trim().slice(0, 180) });
  }
  results.sort((a, b) => b.score - a.score || a.path.localeCompare(b.path));
  return { query: String(query), top: limit, results: results.slice(0, limit) };
}
|
|
47
|
+
|
|
48
|
+
/**
 * Best-effort UTF-8 read: returns the file contents, or an empty string when
 * the read fails for any reason.
 *
 * @param {string} file - Path to read.
 * @returns {string}
 */
function safeRead(file) {
  let contents = '';
  try {
    contents = fs.readFileSync(file, 'utf8');
  } catch {
    // Unreadable files are treated as empty; the caller skips them.
  }
  return contents;
}
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
// Deterministic, fail-closed secret/credential preflight (W-B). Keyword
|
|
2
|
+
// classifiers only return a label; nothing inspects body text for actual secret
|
|
3
|
+
// VALUES. Any durable canonical write must run this real content scan first and
|
|
4
|
+
// BLOCK on a high-confidence hit, rather than trusting an instruction to "mask
|
|
5
|
+
// secrets". Scope: high-confidence token SHAPES (T1) + labelled BIP39 mnemonics
|
|
6
|
+
// (T2) + context-aware 64-hex keys (T3) + a Shannon-entropy check on long tokens
|
|
7
|
+
// (T4); a result blocks iff it has any high-confidence hit (T5). Faithful,
|
|
8
|
+
// dependency-free port of the vault lib/secret-scan.mjs.
|
|
9
|
+
import fs from 'node:fs';
|
|
10
|
+
|
|
11
|
+
// T1 — high-confidence token shapes. Each `re` is global.
|
|
12
|
+
// High-confidence token shapes. Every `re` carries the `g` flag because the
// scanner iterates them with `matchAll`. All entries are 'high' confidence,
// i.e. a single match makes a scan result blocking.
const PATTERNS = [
  // Legacy OpenAI-style keys: "sk-" followed by 20+ alphanumerics.
  { type: 'openai-key', re: /\bsk-[A-Za-z0-9]{20,}\b/g, confidence: 'high' },
  // Prefixed OpenAI keys contain hyphens/underscores, so the legacy pattern
  // above stops at the second "-" and never matches them.
  { type: 'openai-key', re: /\bsk-(?:proj|svcacct|admin)-[A-Za-z0-9_-]{20,}/g, confidence: 'high' },
  { type: 'anthropic-key', re: /\bsk-ant-[A-Za-z0-9_-]{20,}/g, confidence: 'high' },
  { type: 'github-token', re: /\b(?:ghp|gho|ghu|ghs|ghr)_[A-Za-z0-9]{30,}\b/g, confidence: 'high' },
  // Fine-grained personal access tokens use a different prefix.
  { type: 'github-token', re: /\bgithub_pat_[A-Za-z0-9_]{30,}\b/g, confidence: 'high' },
  { type: 'aws-access-key', re: /\bAKIA[0-9A-Z]{16}\b/g, confidence: 'high' },
  { type: 'google-api-key', re: /\bAIza[0-9A-Za-z\-_]{30,}\b/g, confidence: 'high' },
  { type: 'slack-token', re: /\bxox[baprs]-[A-Za-z0-9-]{10,}\b/g, confidence: 'high' },
  { type: 'jwt', re: /\beyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{6,}\b/g, confidence: 'high' },
  // Also covers PKCS#8 "ENCRYPTED PRIVATE KEY" headers.
  { type: 'pem-private-key', re: /-----BEGIN (?:RSA |EC |OPENSSH |DSA |PGP |ENCRYPTED )?PRIVATE KEY-----/g, confidence: 'high' },
  { type: 'private-key-assignment', re: /\b(?:private[_-]?key|secret[_-]?key|api[_-]?secret)\s*[:=]\s*["']?[A-Za-z0-9+/_\-]{16,}/gi, confidence: 'high' },
  // 64-hex blobs (ETH key vs public tx/block hash) are handled by findHexKeys
  // below, which is context-aware so it can block a likely private key without
  // flooding crypto notes full of tx hashes.
];
|
|
25
|
+
|
|
26
|
+
// A key-context keyword on the same line as a 64-hex blob promotes it from a
|
|
27
|
+
// possible public tx/block hash (low) to a likely private key (high). Kept narrow
|
|
28
|
+
// so ordinary on-chain notes full of 0x… hashes do not all become blocking.
|
|
29
|
+
// Case-insensitive, and without the `g` flag, so `.test()` keeps no lastIndex
// state between the lines it is applied to.
const KEY_CONTEXT_RE = /(?:\bprivate[\s_-]?key\b|\bsecret[\s_-]?key\b|\bpriv[\s_-]?key\b|\bprivkey\b|\bprivatekey\b|\bmnemonic\b|\bseed[\s_-]?phrase\b|\bkeystore\b|\bwallet[\s_-]?key\b|\bdeployer[\s_-]?key\b|\bsigning[\s_-]?key\b|\bkey[\s_-]?material\b)/i;
|
|
30
|
+
|
|
31
|
+
/**
 * Shannon entropy of a string, in bits per symbol.
 *
 * @param {string} s - Input text; empty or falsy input yields 0.
 * @returns {number} 0 for a single repeated symbol, log2(k) for k equally
 *   frequent symbols.
 */
function entropy(s) {
  if (!s) return 0;
  const freq = new Map();
  // `for...of` walks code points, so the denominator must be the code-point
  // count too. `s.length` counts UTF-16 units and would halve the
  // probability of every astral character.
  let total = 0;
  for (const ch of s) {
    freq.set(ch, (freq.get(ch) || 0) + 1);
    total += 1;
  }
  let h = 0;
  for (const n of freq.values()) {
    const p = n / total;
    h -= p * Math.log2(p);
  }
  return h;
}
|
|
42
|
+
|
|
43
|
+
/**
 * Redacts a token for display in a hit sample.
 *
 * @param {*} token - Value to mask; coerced with String().
 * @returns {string} `***` for an empty token, the first character plus `***`
 *   for tokens of up to 8 characters, otherwise the first four and last two
 *   characters with the total length.
 */
function mask(token) {
  const t = String(token);
  // An empty token has no first character; avoid rendering "undefined***".
  if (t.length === 0) return '***';
  if (t.length <= 8) return `${t[0]}***`;
  return `${t.slice(0, 4)}…${t.slice(-2)} (len ${t.length})`;
}
|
|
48
|
+
|
|
49
|
+
// T2 — heuristic BIP39 mnemonic: 12 or 24 space-separated lowercase a-z words,
|
|
50
|
+
// each 3-8 chars, on one line (a leading label is stripped first).
|
|
51
|
+
/**
 * Heuristic seed-phrase detector. A line is flagged when, after removing an
 * optional leading label (e.g. "seed phrase:", "mnemonic ="), it consists of
 * exactly 12 or 24 whitespace-separated words of 3-8 lowercase a-z letters.
 * The label is optional: an unlabelled line of that shape is flagged as well.
 *
 * @param {string} text - Text to scan line by line.
 * @returns {Array<{type: string, confidence: string, sample: string}>} One
 *   high-confidence hit per matching line; the sample shows the word count
 *   plus the first and last word.
 */
function findMnemonics(text) {
  const labelPrefix = /^\s*(?:seed[\s_-]?phrase|mnemonic|recovery[\s_-]?phrase|secret[\s_-]?words?)\s*[:=\-]\s*/i;
  const looksLikeWord = (candidate) => /^[a-z]{3,8}$/.test(candidate);
  return text
    .split(/\r?\n/)
    .map((rawLine) => rawLine.replace(labelPrefix, '').trim().split(/\s+/))
    .filter((words) => [12, 24].includes(words.length) && words.every(looksLikeWord))
    .map((words) => ({
      type: 'possible-bip39-mnemonic',
      confidence: 'high',
      sample: `${words.length} words: ${words[0]} … ${words[words.length - 1]}`,
    }));
}
|
|
62
|
+
|
|
63
|
+
// T3 — context-aware 64-hex scan. A 0x-prefixed 64-hex is always a candidate; a
|
|
64
|
+
// bare 64-hex is only flagged when a key keyword is on the line. Key-context
|
|
65
|
+
// upgrades the hit to high (blocking).
|
|
66
|
+
/**
 * Context-aware scan for 64-hex-digit blobs.
 *
 * - A `0x`-prefixed 64-hex token is always reported: 'high' when the line also
 *   matches KEY_CONTEXT_RE, otherwise 'low'.
 * - A bare 64-hex token is only reported (as 'high') on a key-context line.
 *
 * Each distinct token produces one hit. If a token was first seen on a line
 * without key context and later reappears on a key-context line, the existing
 * hit is upgraded to 'high' rather than the repeat being ignored. Previously
 * the repeat was skipped, leaving a likely private key non-blocking.
 *
 * @param {string} text - Text to scan line by line.
 * @returns {Array<{type: string, confidence: string, sample: string}>}
 */
function findHexKeys(text) {
  const hits = [];
  // token -> the hit object already emitted for it (also serves as "seen").
  const byToken = new Map();
  const promote = (hit) => {
    if (hit.confidence === 'high') return;
    hit.type = 'eth-private-key';
    hit.confidence = 'high';
  };
  for (const line of text.split(/\r?\n/)) {
    const keyCtx = KEY_CONTEXT_RE.test(line);
    for (const m of line.matchAll(/\b0x[0-9a-fA-F]{64}\b/g)) {
      const tok = m[0];
      const earlier = byToken.get(tok);
      if (earlier) {
        if (keyCtx) promote(earlier);
        continue;
      }
      const hit = keyCtx
        ? { type: 'eth-private-key', confidence: 'high', sample: mask(tok) }
        : { type: 'hex64-0x', confidence: 'low', sample: mask(tok) };
      byToken.set(tok, hit);
      hits.push(hit);
    }
    if (keyCtx) {
      for (const m of line.matchAll(/\b[0-9a-fA-F]{64}\b/g)) {
        const tok = m[0];
        if (byToken.has(tok)) continue;
        // The same digits may already be known in 0x-prefixed form; in key
        // context that earlier hit must become blocking.
        const prefixed = byToken.get(`0x${tok}`);
        if (prefixed) {
          promote(prefixed);
          continue;
        }
        const hit = { type: 'hex64-key-context', confidence: 'high', sample: mask(tok) };
        byToken.set(tok, hit);
        hits.push(hit);
      }
    }
  }
  return hits;
}
|
|
90
|
+
|
|
91
|
+
// T4 — high-entropy long token scan (catches secrets without a known prefix).
|
|
92
|
+
/**
 * Flags long tokens that look random: runs of 32+ characters from
 * [A-Za-z0-9+/=_-] whose Shannon entropy is at least 4.0 bits per character
 * and which mix at least three of the four character classes (lowercase,
 * uppercase, digit, symbol). Each distinct token is reported once.
 *
 * @param {string} text - Text to scan.
 * @returns {Array<{type: string, confidence: string, sample: string, entropy: number}>}
 *   'medium'-confidence hits with the entropy rounded to two decimals.
 */
function findHighEntropy(text) {
  const classProbes = [/[a-z]/, /[A-Z]/, /[0-9]/, /[+/=_\-]/];
  // A Set keeps first-occurrence order while dropping repeats.
  const candidates = new Set(Array.from(text.matchAll(/[A-Za-z0-9+/=_\-]{32,}/g), (m) => m[0]));
  const hits = [];
  for (const tok of candidates) {
    const h = entropy(tok);
    const classes = classProbes.filter((probe) => probe.test(tok)).length;
    if (h < 4.0 || classes < 3 || tok.length < 32) continue;
    hits.push({
      type: 'high-entropy-token',
      confidence: 'medium',
      sample: mask(tok),
      entropy: Math.round(h * 100) / 100,
    });
  }
  return hits;
}
|
|
107
|
+
|
|
108
|
+
/**
 * Runs every detector over `text` and returns all hits in a fixed order:
 * token-shape patterns, then mnemonics, then 64-hex keys, then high-entropy
 * tokens.
 *
 * @param {*} text - Content to scan; falsy input is treated as empty.
 * @returns {Array<{type: string, confidence: string, sample: string}>} Hits
 *   with masked samples; an empty array for empty input.
 */
export function scanSecrets(text) {
  const src = String(text || '');
  if (src.length === 0) return [];
  const shapeHits = PATTERNS.flatMap(({ type, re, confidence }) =>
    Array.from(src.matchAll(re), (m) => ({ type, confidence, sample: mask(m[0]) })));
  return [
    ...shapeHits,
    ...findMnemonics(src),
    ...findHexKeys(src),
    ...findHighEntropy(src),
  ];
}
|
|
120
|
+
|
|
121
|
+
/**
 * Reads a file as UTF-8 and scans its contents for secrets.
 *
 * @param {string} absPath - Path of the file to scan.
 * @returns {{ok: boolean, readable: boolean, hits: Array}} `ok` is false only
 *   when a high-confidence hit was found. When the file cannot be read the
 *   result is `{ ok: true, readable: false, hits: [] }`.
 */
export function scanFileForSecrets(absPath) {
  let contents;
  try {
    contents = fs.readFileSync(absPath, 'utf8');
  } catch {
    // NOTE(review): an unreadable file reports ok: true, so callers that need
    // fail-closed behaviour must also check `readable` — confirm they do.
    return { ok: true, readable: false, hits: [] };
  }
  const hits = scanSecrets(contents);
  const hasHighConfidence = hits.some((hit) => hit.confidence === 'high');
  return { ok: !hasHighConfidence, readable: true, hits };
}
|
|
131
|
+
|
|
132
|
+
// T5 — a scan result is "blocking" iff it has any high-confidence hit. Low/medium
|
|
133
|
+
// hits are reported for human review but never auto-block (too noisy on crypto
|
|
134
|
+
// notes full of 0x hashes and base64 blobs).
|
|
135
|
+
/**
 * Tells whether a list of scan hits should block a write.
 *
 * @param {Array<{confidence: string}>|null|undefined} hits - Scan hits.
 * @returns {boolean} true when at least one hit has confidence 'high'; false
 *   for a missing or empty list.
 */
export function isBlocking(hits) {
  if (!hits) return false;
  return hits.some(({ confidence }) => confidence === 'high');
}
|
|
138
|
+
|
|
139
|
+
/**
 * Builds a one-line summary of hits grouped by type, e.g. "jwt×2, aws×1".
 * Types appear in order of first occurrence.
 *
 * @param {Array<{type: string}>|null|undefined} hits - Scan hits.
 * @returns {string} Comma-separated `type×count` pairs; '' when there are none.
 */
export function summarizeHits(hits) {
  const tally = (hits || []).reduce(
    (counts, { type }) => counts.set(type, (counts.get(type) || 0) + 1),
    new Map(),
  );
  return Array.from(tally, ([type, count]) => `${type}×${count}`).join(', ');
}
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
// Local, dependency-free, deterministic lexical-semantic layer for recall (E22).
|
|
2
|
+
//
|
|
3
|
+
// This is NOT a neural embedding model. It improves recall beyond exact-token by:
|
|
4
|
+
// 1. stemming morphological variants: fix / fixed / fixing / fixes -> fix
|
|
5
|
+
// 2. a synonym map resolved <-> fix, defect <-> bug, login <-> authentication
|
|
6
|
+
// 3. fuzzy overlap character-trigram Jaccard for typos and minor variants
|
|
7
|
+
//
|
|
8
|
+
// An embedding-provider interface lets a real vector model be plugged in later
|
|
9
|
+
// WITHOUT changing the canonical markdown or locking Neurain to one LLM. The
|
|
10
|
+
// default provider is `local-lexical`: fully deterministic, no model call, no
|
|
11
|
+
// external dependency, and it needs no separate index because the canonical
|
|
12
|
+
// markdown is scored directly.
|
|
13
|
+
|
|
14
|
+
// Lowercase function words that tokenize() discards, so they never count as
// query or document terms.
const STOPWORDS = new Set([
  'the', 'a', 'an', 'and', 'or', 'but', 'of', 'to', 'in', 'on', 'for', 'with', 'at', 'by', 'from',
  'is', 'are', 'was', 'were', 'be', 'been', 'being', 'this', 'that', 'these', 'those', 'it', 'its',
  'as', 'into', 'after', 'before', 'should', 'must', 'will', 'can', 'not', 'no', 'than', 'then',
]);
|
|
19
|
+
|
|
20
|
+
// Suffixes removed by stem(). ORDER MATTERS: stem() strips only the first
// entry that matches (and that leaves at least three characters), so longer
// suffixes are listed ahead of their shorter tails (e.g. 'ations' before
// 'ation', 'ers' before 'er', 'es' before 's').
const STEM_SUFFIXES = [
  'ization', 'isation', 'ational', 'fulness', 'iveness', 'ousness', 'ations', 'ation',
  'ingly', 'edly', 'ings', 'ies', 'ied', 'ment', 'ness', 'able', 'ible', 'ful', 'ous',
  'ive', 'ize', 'ise', 'ing', 'ers', 'er', 'est', 'ed', 'es', 'ly', 's',
];
|
|
25
|
+
|
|
26
|
+
// Each group lists surface forms that mean roughly the same thing. Every form's
|
|
27
|
+
// stem is mapped to the group's canonical id, so a query word and a stored word
|
|
28
|
+
// in the same group match even when they share no characters.
|
|
29
|
+
// Synonym table. The FIRST word of each group supplies the group's canonical
// id (its stem); the remaining words are alternative surface forms. When the
// same stem occurs in more than one place, the earliest occurrence wins when
// the lookup map is built.
const SYNONYM_GROUPS = [
  ['fix', 'fixed', 'fixes', 'fixing', 'resolve', 'resolved', 'resolving', 'repair', 'repaired', 'correct', 'corrected', 'patch', 'patched'],
  ['bug', 'bugs', 'defect', 'defects', 'error', 'errors', 'issue', 'issues', 'fault', 'faults', 'failure', 'failures', 'problem', 'problems'],
  ['login', 'logins', 'signin', 'authentication', 'authenticate', 'authenticated', 'credential', 'credentials'],
  ['delete', 'deleted', 'deletes', 'remove', 'removed', 'removal', 'drop', 'dropped', 'erase', 'erased'],
  ['add', 'added', 'create', 'created', 'creation', 'insert', 'inserted', 'new'],
  ['update', 'updated', 'modify', 'modified', 'change', 'changed', 'edit', 'edited', 'revise', 'revised', 'revision'],
  ['fast', 'faster', 'quick', 'quickly', 'rapid', 'rapidly', 'speed', 'speedup', 'performance', 'latency'],
  ['doc', 'docs', 'document', 'documents', 'documentation', 'readme', 'guide', 'guides'],
  ['config', 'configs', 'configuration', 'configure', 'configured', 'setting', 'settings', 'setup'],
  ['test', 'tests', 'testing', 'verify', 'verified', 'verification', 'validate', 'validated', 'validation'],
  ['rollback', 'revert', 'reverted', 'undo', 'restore', 'restored', 'restoration'],
  ['review', 'reviewed', 'reviews', 'audit', 'audited', 'inspect', 'inspected', 'inspection'],
  ['session', 'sessions', 'handoff', 'handoffs', 'continuity', 'resume', 'resumed'],
  ['memory', 'memories', 'recall', 'remember', 'remembered', 'retrieval', 'retrieve', 'retrieved'],
  ['lesson', 'lessons', 'learning', 'learned', 'insight', 'insights', 'takeaway'],
  ['car', 'cars', 'automobile', 'automobiles', 'vehicle', 'vehicles'],
  ['big', 'large', 'huge', 'massive', 'major'],
  ['small', 'tiny', 'minor', 'little'],
  ['start', 'started', 'begin', 'began', 'launch', 'launched', 'initiate', 'initiated'],
  ['stop', 'stopped', 'halt', 'halted', 'end', 'ended', 'terminate', 'terminated'],
  ['safe', 'safety', 'secure', 'security', 'protect', 'protected', 'protection'],
  ['private', 'confidential', 'sensitive', 'secret'],
  ['user', 'users', 'customer', 'customers', 'client', 'clients'],
  ['plan', 'planned', 'planning', 'roadmap', 'schedule', 'scheduled'],
  ['build', 'built', 'compile', 'compiled', 'assemble', 'assembled'],
  ['note', 'notes', 'memo', 'record', 'records', 'log', 'logged'],
];
|
|
57
|
+
|
|
58
|
+
// Lookup from a word's stem to the canonical id of its synonym group (the
// stem of the group's first word). A stem that is already mapped keeps its
// first assignment.
const SYNONYM_CANON = SYNONYM_GROUPS.reduce((canonByStem, group) => {
  const canon = stem(group[0]);
  group.forEach((word) => {
    const key = stem(word);
    if (canonByStem.has(key)) return;
    canonByStem.set(key, canon);
  });
  return canonByStem;
}, new Map());
|
|
69
|
+
|
|
70
|
+
/**
 * Reduces a word to a crude stem: lowercases it, removes apostrophes, strips
 * at most one suffix from STEM_SUFFIXES, then collapses a doubled final
 * consonant.
 *
 * @param {*} token - Word to stem; falsy input becomes ''.
 * @returns {string} The stem. Words of three characters or fewer are returned
 *   as normalised, without suffix stripping.
 */
export function stem(token) {
  const word = String(token || '').toLowerCase().replace(/['’]/g, '');
  if (word.length <= 3) return word;
  // Only the first suffix (in list order) that leaves at least three
  // characters is removed.
  const suffix = STEM_SUFFIXES.find((suf) => word.endsWith(suf) && word.length - suf.length >= 3);
  const base = suffix === undefined ? word : word.slice(0, -suffix.length);
  // collapse a doubled final consonant (running -> runn -> run) for stability
  return base.replace(/([bdfgklmnprt])\1$/, '$1');
}
|
|
83
|
+
|
|
84
|
+
/**
 * Splits text into lowercase tokens made of Unicode letters, digits, `_` and
 * `-`, dropping tokens shorter than two characters and stopwords.
 *
 * @param {*} text - Text to tokenize; falsy input yields [].
 * @returns {string[]} Tokens in order of appearance (duplicates kept).
 */
export function tokenize(text) {
  const raw = String(text || '').toLowerCase().match(/[\p{L}\p{N}_-]+/gu);
  if (raw === null) return [];
  return raw.filter((token) => !(token.length < 2 || STOPWORDS.has(token)));
}
|
|
87
|
+
|
|
88
|
+
/**
 * Describes a token by its lowercase surface form, its stem, and the canonical
 * id of its synonym group.
 *
 * @param {*} token - Token to expand.
 * @returns {{token: string, stem: string, canon: string|null}} `canon` is null
 *   when the stem belongs to no synonym group.
 */
export function expandToken(token) {
  const stemmed = stem(token);
  const surface = String(token || '').toLowerCase();
  const canon = SYNONYM_CANON.get(stemmed) || null;
  return { token: surface, stem: stemmed, canon };
}
|
|
92
|
+
|
|
93
|
+
// Reverse lookup: canonical id (stem of a group's first word) -> the full
// list of surface forms in that group.
const CANON_MEMBERS = new Map(SYNONYM_GROUPS.map((group) => [stem(group[0]), group]));
|
|
98
|
+
|
|
99
|
+
// Return a different surface form in the same synonym group (a deterministic lexical swap),
|
|
100
|
+
// or null if the token has no synonym. Used to derive paraphrase queries from real content.
|
|
101
|
+
/**
 * Picks a different word from the same synonym group as `token`.
 *
 * @param {*} token - Word to find an alternative for.
 * @returns {string|null} The first group member whose stem and lowercase form
 *   both differ from the token's, or null when the token has no synonym group
 *   or no such member exists.
 */
export function alternativeForm(token) {
  const tokenStem = stem(token);
  const canon = SYNONYM_CANON.get(tokenStem);
  if (!canon) return null;
  const surface = String(token || '').toLowerCase();
  const members = CANON_MEMBERS.get(canon) || [];
  for (const member of members) {
    if (stem(member) === tokenStem) continue;
    if (member.toLowerCase() === surface) continue;
    return member;
  }
  return null;
}
|
|
109
|
+
|
|
110
|
+
/**
 * Returns the set of character trigrams of a token after padding it with one
 * `#` on each side (so "ab" yields "#ab" and "ab#").
 *
 * @param {*} token - Token to split; falsy input is treated as ''.
 * @returns {Set<string>} Distinct trigrams; empty for an empty token.
 */
function charTrigrams(token) {
  const padded = `#${String(token || '')}#`;
  const windowCount = Math.max(0, padded.length - 2);
  return new Set(Array.from({ length: windowCount }, (_, start) => padded.slice(start, start + 3)));
}
|
|
116
|
+
|
|
117
|
+
/**
 * Jaccard similarity of the character-trigram sets of two strings.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} 1 for identical strings, 0 when either is empty/falsy or
 *   has no trigrams, otherwise |shared| / |union| of the trigram sets.
 */
export function fuzzyOverlap(a, b) {
  if (!a || !b) return 0;
  if (a === b) return 1;
  const left = charTrigrams(a);
  const right = charTrigrams(b);
  if (left.size === 0 || right.size === 0) return 0;
  const shared = [...left].filter((gram) => right.has(gram)).length;
  const union = left.size + right.size - shared;
  return shared / union;
}
|
|
127
|
+
|
|
128
|
+
// Deterministic lexical-semantic score of a query against a document body.
|
|
129
|
+
// Returns { score: 0..1 normalized by query length, matched_terms: [...] }.
|
|
130
|
+
/**
 * Scores a query against a document body without any model call.
 *
 * Each query term gets a weight: 1 when its stem occurs in the document,
 * 0.75 when only its synonym group occurs, otherwise its best trigram overlap
 * with any document stem (counted only when that overlap is at least 0.6).
 *
 * @param {string} query - Query text.
 * @param {string} docText - Document text.
 * @returns {{score: number, matched_terms: Array<{term: string, via: string, weight: number}>}}
 *   `score` is the mean term weight rounded to four decimals; both fields are
 *   empty/zero when the query or the document has no tokens.
 */
export function lexicalSemanticScore(query, docText) {
  const queryTerms = tokenize(query).map(expandToken);
  if (queryTerms.length === 0) return { score: 0, matched_terms: [] };
  const docTerms = tokenize(docText).map(expandToken);
  if (docTerms.length === 0) return { score: 0, matched_terms: [] };

  const docStems = new Set();
  const docCanons = new Set();
  for (const { stem: docStem, canon } of docTerms) {
    docStems.add(docStem);
    if (canon) docCanons.add(canon);
  }

  // Resolves one query term to its weight and the way it matched.
  const weigh = (term) => {
    if (docStems.has(term.stem)) return { weight: 1, via: 'exact' };
    if (term.canon && docCanons.has(term.canon)) return { weight: 0.75, via: 'synonym' };
    // fuzzy: best trigram overlap against any doc stem (typos / variants)
    let closest = 0;
    docStems.forEach((candidate) => {
      const overlap = fuzzyOverlap(term.stem, candidate);
      if (overlap > closest) closest = overlap;
    });
    return closest < 0.6 ? { weight: 0, via: '' } : { weight: closest, via: 'fuzzy' };
  };

  const matched = [];
  let total = 0;
  for (const term of queryTerms) {
    const { weight, via } = weigh(term);
    if (weight > 0) matched.push({ term: term.token, via, weight: Number(weight.toFixed(3)) });
    total += weight;
  }
  return { score: Number((total / queryTerms.length).toFixed(4)), matched_terms: matched };
}
|
|
157
|
+
|
|
158
|
+
// Registry of scoring providers, keyed by provider name.
const PROVIDERS = new Map();

/**
 * Adds a provider to the registry, replacing any provider already stored
 * under the same name.
 *
 * @param {string} name - Registry key.
 * @param {object} impl - Provider implementation.
 */
export function registerProvider(name, impl) {
  PROVIDERS.set(name, impl);
}

/**
 * Looks up a provider by name.
 *
 * @param {string} [name='local-lexical'] - Provider to fetch.
 * @returns {object|undefined} The named provider; when it is missing (or
 *   falsy) the 'local-lexical' provider, which is undefined if that one has
 *   not been registered either.
 */
export function getProvider(name = 'local-lexical') {
  const requested = PROVIDERS.get(name);
  if (requested) return requested;
  return PROVIDERS.get('local-lexical');
}

/**
 * @returns {string[]} Registered provider names in sorted order.
 */
export function listProviders() {
  return Array.from(PROVIDERS.keys()).sort();
}
|
|
171
|
+
|
|
172
|
+
// Default provider: deterministic local lexical-semantic. No model call, no
|
|
173
|
+
// external dependency, and no separate generated index (markdown stays canonical).
|
|
174
|
+
// Built-in provider, registered at module load. Its flags advertise that it
// makes no model or external call and needs no separate index.
const LOCAL_LEXICAL_PROVIDER = {
  name: 'local-lexical',
  kind: 'deterministic_lexical',
  model_call: false,
  external_call: false,
  requires_index: false,
  llm_locked: false,
  // Tokenizes the query and expands each token to { token, stem, canon }.
  expandQuery(query) {
    const tokens = tokenize(query);
    return tokens.map(expandToken);
  },
  // Delegates to the lexical-semantic scorer.
  score(query, docText) {
    return lexicalSemanticScore(query, docText);
  },
};

registerProvider('local-lexical', LOCAL_LEXICAL_PROVIDER);
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
// Read-only reader for the source-digest manifest (the capture/compile pipeline's
|
|
2
|
+
// high-value candidate index, owned by W-B). The view tools only READ it to
|
|
3
|
+
// surface compile and deep-compile candidates. Missing manifest -> empty result.
|
|
4
|
+
import path from 'node:path';
|
|
5
|
+
import { readJsonSafe } from './vault_state.mjs';
|
|
6
|
+
|
|
7
|
+
// Statuses (lowercase) that mark a manifest entry as already processed.
const COMPILED_STATUSES = ['compiled', 'flushed', 'archived', 'superseded'];

/**
 * Tells whether a manifest entry counts as compiled.
 *
 * @param {object|null|undefined} entry - Manifest entry.
 * @returns {boolean} true when `compiled_to` is a non-empty array or when the
 *   status (compared case-insensitively) is one of COMPILED_STATUSES.
 */
function isCompiled(entry) {
  if (Array.isArray(entry?.compiled_to) && entry.compiled_to.length > 0) return true;
  const status = String(entry?.status || '').toLowerCase();
  return COMPILED_STATUSES.includes(status);
}
|
|
14
|
+
|
|
15
|
+
/**
 * Reads the source-digest manifest and returns its file entries.
 *
 * @param {string} root - Vault root directory.
 * @param {object} vaultCfg - Vault config; `source_digest_manifest` is the
 *   manifest path relative to `root`.
 * @returns {object[]} The values of `manifest.files` that are objects; [] when
 *   the manifest is missing or has no usable `files` map.
 */
export function loadManifestEntries(root, vaultCfg) {
  const manifest = readJsonSafe(path.join(root, vaultCfg.source_digest_manifest), null);
  const files = manifest?.files;
  // `typeof null` is 'object', so null must be excluded explicitly or
  // Object.values(null) throws.
  if (files === null || typeof files !== 'object') return [];
  // Drop null / primitive entries: the readers of this list dereference
  // entry fields directly and would crash on them.
  return Object.values(files).filter((entry) => entry !== null && typeof entry === 'object');
}
|
|
20
|
+
|
|
21
|
+
// Compile candidates for the boot/status summary: uncompiled, non-private,
|
|
22
|
+
// high-value, ranked by score then recency.
|
|
23
|
+
/**
 * Summarises compile candidates: manifest entries that are not compiled, whose
 * sensitivity is 'public' or 'internal' (missing counts as 'internal'), and
 * whose high-value score is at least 2. Ordered by score, then by `mtime_ms`,
 * both descending.
 *
 * @param {string} root - Vault root directory.
 * @param {object} vaultCfg - Vault config passed through to the manifest loader.
 * @returns {{count: number, examples: Array<{path: string, source_id: string, sensitivity: string, score: number}>}}
 *   Total candidate count plus up to three top examples.
 */
export function compileCandidateSummary(root, vaultCfg) {
  const scoreOf = (entry) => Number(entry.high_value_score || 0);
  const modifiedAt = (entry) => Number(entry.mtime_ms || 0);
  const isShareable = (entry) => ['public', 'internal'].includes(String(entry.sensitivity || 'internal'));

  const candidates = loadManifestEntries(root, vaultCfg).filter(
    (entry) => !isCompiled(entry) && isShareable(entry) && scoreOf(entry) >= 2,
  );
  candidates.sort((a, b) => scoreOf(b) - scoreOf(a) || modifiedAt(b) - modifiedAt(a));

  const examples = candidates.slice(0, 3).map((entry) => ({
    path: entry.path,
    source_id: entry.source_id || '',
    sensitivity: entry.sensitivity || 'internal',
    score: scoreOf(entry),
  }));
  return { count: candidates.length, examples };
}
|
|
39
|
+
|
|
40
|
+
// Deep-compile candidates for the review desk: uncompiled with score >= 2.
|
|
41
|
+
/**
 * Lists deep-compile candidates: manifest entries that are not compiled and
 * have a high-value score of at least 2, ordered by score and then by
 * `size_bytes`, both descending. No sensitivity filter is applied here.
 *
 * @param {string} root - Vault root directory.
 * @param {object} vaultCfg - Vault config passed through to the manifest loader.
 * @param {object} [options]
 * @param {number} [options.top=12] - Maximum number of candidates returned.
 * @returns {object[]} One record per candidate with its path, ids, status,
 *   score, reasons and a fixed review hint.
 */
export function deepCompileCandidates(root, vaultCfg, { top = 12 } = {}) {
  const scoreOf = (entry) => Number(entry.high_value_score || 0);
  const sizeOf = (entry) => Number(entry.size_bytes || 0);

  const pending = [];
  for (const entry of loadManifestEntries(root, vaultCfg)) {
    if (isCompiled(entry)) continue;
    if (scoreOf(entry) >= 2) pending.push(entry);
  }
  pending.sort((a, b) => scoreOf(b) - scoreOf(a) || sizeOf(b) - sizeOf(a));

  return pending.slice(0, top).map((entry) => ({
    path: entry.path,
    source_id: entry.source_id,
    source_type: entry.source_type,
    sensitivity: entry.sensitivity,
    status: entry.status,
    high_value_score: entry.high_value_score,
    reasons: entry.high_value_reasons || [],
    suggested_review: 'Select this for Deep Compile if it will affect decisions, current status, or repeated future work.',
  }));
}
|