@levalicious/server-memory 0.0.13 → 0.0.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +79 -28
- package/dist/scripts/delete-document.js +91 -0
- package/dist/scripts/textrank-experiment.js +618 -0
- package/dist/server.js +127 -59
- package/dist/src/graphfile.js +118 -4
- package/dist/src/kb_load.js +396 -0
- package/dist/src/memoryfile.js +17 -0
- package/dist/src/merw.js +160 -0
- package/dist/src/stringtable.js +24 -6
- package/dist/tests/memory-server.test.js +129 -0
- package/dist/tests/test-utils.js +6 -0
- package/package.json +6 -2
|
@@ -0,0 +1,396 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* kb_load.ts — Load a plaintext document into the knowledge graph.
|
|
3
|
+
*
|
|
4
|
+
* Pipeline:
|
|
5
|
+
* 1. Normalize text
|
|
6
|
+
* 2. Split into observations (≤140 chars, word-boundary aligned)
|
|
7
|
+
* 3. Group observations into chunks (≤2 per entity)
|
|
8
|
+
* 4. Build chain: Document → starts_with/ends_with → chunks ↔ follows/preceded_by
|
|
9
|
+
* 5. Sentence TextRank: rank sentences by TF-IDF cosine PageRank
|
|
10
|
+
* 6. Build index entity: Document → has_index → Index → highlights → top chunks
|
|
11
|
+
*
|
|
12
|
+
* Returns arrays of entities and relations ready for createEntities/createRelations.
|
|
13
|
+
*/
|
|
14
|
+
import * as crypto from 'crypto';
|
|
15
|
+
import * as path from 'path';
|
|
16
|
+
// ─── Constants ──────────────────────────────────────────────────────
|
|
17
|
+
const MAX_OBS_LENGTH = 140;
|
|
18
|
+
const MAX_OBS_PER_ENTITY = 2;
|
|
19
|
+
const TEXTRANK_DAMPING = 0.85;
|
|
20
|
+
const TEXTRANK_MAX_ITER = 30000;
|
|
21
|
+
const TEXTRANK_CONVERGENCE = 1e-6;
|
|
22
|
+
const ALLOWED_EXTENSIONS = new Set([
|
|
23
|
+
'.txt', '.tex', '.md', '.markdown', '.rst', '.org', '.adoc',
|
|
24
|
+
'.asciidoc', '.html', '.htm', '.xml', '.json', '.yaml', '.yml',
|
|
25
|
+
'.toml', '.csv', '.tsv', '.log', '.cfg', '.ini', '.conf',
|
|
26
|
+
'.py', '.js', '.ts', '.c', '.h', '.cpp', '.hpp', '.java',
|
|
27
|
+
'.go', '.rs', '.rb', '.pl', '.sh', '.bash', '.zsh', '.fish',
|
|
28
|
+
'.el', '.lisp', '.clj', '.hs', '.ml', '.scala', '.kt',
|
|
29
|
+
'.r', '.m', '.swift', '.lua', '.vim', '.sql',
|
|
30
|
+
'.bib', '.sty', '.cls',
|
|
31
|
+
]);
|
|
32
|
+
// ─── Text Processing ────────────────────────────────────────────────
|
|
33
|
+
/**
 * Normalize raw document text: unify line endings, collapse whitespace
 * runs, and trim — the result is a single-spaced, single-line string.
 */
function normalize(text) {
    // The newline handling is kept for pipeline parity, although the final
    // split/join collapses all remaining whitespace to single spaces anyway.
    const unified = text
        .replace(/\r\n/g, '\n')
        .replace(/[ \t]+/g, ' ')
        .replace(/\n{3,}/g, '\n\n')
        .trim();
    return unified.split(/\s+/).join(' ');
}
|
|
40
|
+
/**
 * Tokenize space-separated text into word records carrying absolute
 * character offsets (relative to the whole normalized document).
 *
 * @param text   Single-spaced text to tokenize
 * @param offset Absolute offset of `text` within the document
 * @returns Array of { text, normalized, start, end }
 */
function labelWords(text, offset) {
    const words = [];
    // Only the plain space character separates words (text is normalized).
    const tokenRe = /[^ ]+/g;
    let match;
    while ((match = tokenRe.exec(text)) !== null) {
        const token = match[0];
        words.push({
            text: token,
            normalized: token.toLowerCase(),
            start: offset + match.index,
            end: offset + match.index + token.length,
        });
    }
    return words;
}
|
|
62
|
+
/**
 * Split normalized text into observations of at most MAX_OBS_LENGTH
 * characters, preferring to break on word boundaries (spaces). Falls
 * back to a hard split (surrogate-pair safe) for unbroken runs longer
 * than the limit.
 *
 * @param text Normalized document text (single-spaced, trimmed)
 * @returns Array of { text, start, end, words } observation records
 */
function splitIntoObservations(text) {
    const observations = [];
    let pos = 0;
    while (pos < text.length) {
        const remaining = text.slice(pos);
        // Tail fits entirely — emit it and stop.
        if (remaining.length <= MAX_OBS_LENGTH) {
            observations.push({
                text: remaining,
                start: pos,
                end: pos + remaining.length,
                words: labelWords(remaining, pos),
            });
            break;
        }
        // Find the last space whose prefix still fits the limit.
        // Fix: the prefix length IS the index — the original re-sliced the
        // string on every iteration (`remaining.slice(0, i).length`), making
        // this scan O(n²) per observation. A bounded index scan is O(limit).
        let splitAt = 0;
        const scanLimit = Math.min(remaining.length, MAX_OBS_LENGTH + 1);
        for (let i = 0; i < scanLimit; i++) {
            if (remaining[i] === ' ') {
                splitAt = i;
            }
        }
        if (splitAt === 0) {
            // No space fits — hard split without cutting a surrogate pair.
            let jsLen = 0;
            for (let i = 0; i < remaining.length; i++) {
                const charLen = remaining.codePointAt(i) > 0xFFFF ? 2 : 1;
                if (jsLen + charLen > MAX_OBS_LENGTH) {
                    splitAt = i;
                    break;
                }
                jsLen += charLen;
                if (charLen === 2)
                    i++; // skip the low surrogate we just accounted for
            }
            if (splitAt === 0)
                splitAt = remaining.length;
        }
        const obsText = remaining.slice(0, splitAt).trimEnd();
        observations.push({
            text: obsText,
            start: pos,
            end: pos + obsText.length,
            words: labelWords(obsText, pos),
        });
        pos += splitAt;
        // Skip the inter-observation spaces.
        while (pos < text.length && text[pos] === ' ')
            pos++;
    }
    return observations;
}
|
|
116
|
+
/**
 * Group observations into chunks of at most MAX_OBS_PER_ENTITY each.
 * Every chunk gets a fresh random hex id and a sequential index.
 */
function chunkObservations(observations) {
    const chunks = [];
    let cursor = 0;
    while (cursor < observations.length) {
        chunks.push({
            index: chunks.length,
            id: crypto.randomBytes(12).toString('hex'),
            observations: observations.slice(cursor, cursor + MAX_OBS_PER_ENTITY),
        });
        cursor += MAX_OBS_PER_ENTITY;
    }
    return chunks;
}
|
|
127
|
+
// ─── Sentence Splitting ─────────────────────────────────────────────
|
|
128
|
+
/**
 * Split normalized text into sentences at `.?!` boundaries.
 * Sentences with fewer than 3 words are discarded (too short to rank).
 *
 * @returns Array of { index, text, start, words } sentence records
 */
function splitSentences(normalizedText) {
    const sentences = [];
    // Shared validation/push logic for both boundary matches and the tail.
    const addSentence = (raw, start) => {
        const text = raw.trim();
        if (text.length === 0)
            return;
        const words = text.toLowerCase().split(/\s+/).filter((w) => w.length > 0);
        if (words.length >= 3) {
            sentences.push({ index: sentences.length, text, start, words });
        }
    };
    const boundary = /(?<=[.?!])\s+/g;
    let pos = 0;
    let match;
    while ((match = boundary.exec(normalizedText)) !== null) {
        addSentence(normalizedText.slice(pos, match.index + 1), pos);
        pos = match.index + match[0].length;
    }
    if (pos < normalizedText.length) {
        addSentence(normalizedText.slice(pos), pos);
    }
    return sentences;
}
|
|
154
|
+
// ─── TF-IDF ─────────────────────────────────────────────────────────
|
|
155
|
+
/**
 * Build the document's TF-IDF weight vector: raw term frequency per
 * normalized word, multiplied by the word's IDF (0 if unknown).
 *
 * @param allWords Labeled word records (uses `.normalized`)
 * @param idf      Map word → inverse document frequency
 * @returns Map word → tf·idf weight
 */
function buildWeightVector(allWords, idf) {
    // Term frequency per normalized word.
    const tf = new Map();
    for (const { normalized } of allWords) {
        tf.set(normalized, (tf.get(normalized) ?? 0) + 1);
    }
    // TF × IDF; words missing from the IDF map weigh zero.
    const weights = new Map();
    for (const [word, count] of tf) {
        weights.set(word, count * (idf.get(word) ?? 0));
    }
    return weights;
}
|
|
166
|
+
/**
 * Derive corpus-level document frequencies from the string table.
 * Each string-table entry counts `refcount` times (one per use in the
 * graph), and each unique lowercase word in an entry contributes its
 * entry's refcount to that word's document frequency.
 *
 * @param st StringTable exposing `entries()` → { text, refcount }
 * @returns { df: Map word → freq, corpusSize: total refcount }
 */
function deriveCorpusDocFreqs(st) {
    const df = new Map();
    let corpusSize = 0;
    for (const { text, refcount } of st.entries()) {
        corpusSize += refcount;
        const unique = new Set(text.toLowerCase().split(/\s+/).filter((w) => w.length > 0));
        for (const word of unique) {
            df.set(word, (df.get(word) ?? 0) + refcount);
        }
    }
    return { df, corpusSize };
}
|
|
178
|
+
/**
 * Build the IDF vector for the document's vocabulary using smoothed
 * inverse document frequency: idf = ln(N / (1 + df)) + 1.
 *
 * @param docVocab   Iterable of the document's unique normalized words
 * @param df         Map word → corpus document frequency
 * @param corpusSize Total corpus size (sum of refcounts)
 * @returns Map word → idf
 */
function buildIdfVector(docVocab, df, corpusSize) {
    const idf = new Map();
    for (const word of docVocab) {
        if (corpusSize === 0) {
            // Empty corpus: ln(0/(1+df)) would be -Infinity and poison every
            // downstream TF-IDF weight with NaN. Fall back to a neutral IDF
            // of 1 so ranking degrades to raw term frequency.
            idf.set(word, 1);
            continue;
        }
        const docFreq = df.get(word) ?? 0;
        idf.set(word, Math.log(corpusSize / (1 + docFreq)) + 1);
    }
    return idf;
}
|
|
186
|
+
// ─── Cosine Similarity ──────────────────────────────────────────────
|
|
187
|
+
/**
 * Cosine similarity between two word sets under a shared TF-IDF weight
 * vector. Because both sets use the same document-level weights, the
 * dot product of a shared word is its squared weight.
 *
 * @param weights Map word → tf·idf weight
 * @param keysA   Word set of sentence A
 * @param keysB   Word set of sentence B
 * @returns Similarity in [0, 1]; 0 when either norm vanishes
 */
function cosineSimilarity(weights, keysA, keysB) {
    const squaredWeight = (word) => {
        const w = weights.get(word) ?? 0;
        return w * w;
    };
    let dot = 0;
    for (const word of keysA) {
        if (keysB.has(word))
            dot += squaredWeight(word);
    }
    let normA = 0;
    for (const word of keysA)
        normA += squaredWeight(word);
    let normB = 0;
    for (const word of keysB)
        normB += squaredWeight(word);
    const denom = Math.sqrt(normA) * Math.sqrt(normB);
    return denom === 0 ? 0 : dot / denom;
}
|
|
208
|
+
// ─── PageRank ───────────────────────────────────────────────────────
|
|
209
|
+
/**
 * Damped PageRank over a dense weighted similarity matrix.
 * Rows are normalized by their sum (dangling rows contribute nothing);
 * iteration stops at TEXTRANK_MAX_ITER or when the L1 change drops
 * below TEXTRANK_CONVERGENCE.
 *
 * @param matrix n×n non-negative similarity matrix
 * @returns Array of n scores (empty for an empty matrix)
 */
function pageRank(matrix) {
    const n = matrix.length;
    if (n === 0)
        return [];
    // Out-weight per node: the row sum.
    const rowSums = matrix.map((row) => row.reduce((acc, v) => acc + v, 0));
    let scores = new Array(n).fill(1 / n);
    const base = (1 - TEXTRANK_DAMPING) / n;
    for (let iter = 0; iter < TEXTRANK_MAX_ITER; iter++) {
        const next = new Array(n);
        for (let i = 0; i < n; i++) {
            let incoming = 0;
            for (let j = 0; j < n; j++) {
                if (j === i || !(rowSums[j] > 0))
                    continue;
                incoming += (matrix[j][i] / rowSums[j]) * scores[j];
            }
            next[i] = base + TEXTRANK_DAMPING * incoming;
        }
        // L1 distance between successive iterates.
        let delta = 0;
        for (let i = 0; i < n; i++)
            delta += Math.abs(next[i] - scores[i]);
        scores = next;
        if (delta < TEXTRANK_CONVERGENCE)
            break;
    }
    return scores;
}
|
|
235
|
+
// ─── Sentence TextRank ──────────────────────────────────────────────
|
|
236
|
+
/**
 * Rank sentences with TextRank: build a symmetric TF-IDF cosine
 * similarity matrix (zero diagonal), run PageRank, and return the
 * sentences sorted by descending score.
 *
 * @param sentences Sentence records with `.words`
 * @param weights   Document TF-IDF weight vector
 * @returns Array of { sentence, score }, best first
 */
function sentenceTextRank(sentences, weights) {
    const n = sentences.length;
    const keySets = sentences.map((s) => new Set(s.words));
    const matrix = Array.from({ length: n }, () => new Array(n).fill(0));
    for (let a = 0; a < n; a++) {
        for (let b = a + 1; b < n; b++) {
            const sim = cosineSimilarity(weights, keySets[a], keySets[b]);
            matrix[a][b] = sim;
            matrix[b][a] = sim;
        }
    }
    const scores = pageRank(matrix);
    const ranked = sentences.map((sentence, i) => ({ sentence, score: scores[i] }));
    ranked.sort((x, y) => y.score - x.score);
    return ranked;
}
|
|
252
|
+
// ─── Sentence → Chunk mapping ───────────────────────────────────────
|
|
253
|
+
/**
 * Map a sentence to the chunk whose covered span contains the
 * sentence's starting offset. The span is [first obs start, last obs
 * end). Returns null when no chunk covers the offset.
 */
function sentenceToChunk(sentence, chunks) {
    const target = sentence.start;
    const hit = chunks.find((chunk) => {
        const spanStart = chunk.observations[0].start;
        const spanEnd = chunk.observations[chunk.observations.length - 1].end;
        return target >= spanStart && target < spanEnd;
    });
    return hit ?? null;
}
|
|
263
|
+
// ─── Public API ─────────────────────────────────────────────────────
|
|
264
|
+
/**
|
|
265
|
+
* Validate that a file path has a plaintext extension.
|
|
266
|
+
* Returns the extension if valid, throws if not.
|
|
267
|
+
*/
|
|
268
|
+
/**
 * Validate that a file path has a plaintext extension.
 *
 * @param filePath Path whose extension is checked (case-insensitive)
 * @returns The lowercased extension (including the dot) if allowed
 * @throws Error when the path has no extension or a non-plaintext one
 */
export function validateExtension(filePath) {
    const ext = path.extname(filePath).toLowerCase();
    if (ext === '') {
        throw new Error(`File has no extension: ${filePath}. Only plaintext files are accepted.`);
    }
    if (ALLOWED_EXTENSIONS.has(ext)) {
        return ext;
    }
    throw new Error(`Unsupported file extension "${ext}". Only plaintext formats are accepted ` +
        `(${[...ALLOWED_EXTENSIONS].slice(0, 10).join(', ')}, ...). ` +
        `For PDFs, use pdftotext first. For other binary formats, convert to text.`);
}
|
|
280
|
+
/**
|
|
281
|
+
* Load a plaintext document into the knowledge graph.
|
|
282
|
+
*
|
|
283
|
+
* @param text Raw document text
|
|
284
|
+
* @param title Document entity name (e.g. filename without extension)
|
|
285
|
+
* @param st StringTable for IDF corpus frequencies
|
|
286
|
+
* @param topK Number of sentences to highlight in the index (default: 15)
|
|
287
|
+
* @returns Entities and relations ready for createEntities/createRelations
|
|
288
|
+
*/
|
|
289
|
+
/**
 * Load a plaintext document into the knowledge graph.
 *
 * @param text  Raw document text
 * @param title Document entity name (e.g. filename without extension)
 * @param st    StringTable for IDF corpus frequencies
 * @param topK  Number of sentences to highlight in the index (default: 15)
 * @returns { entities, relations, stats } ready for createEntities/createRelations
 */
export function loadDocument(text, title, st, topK = 15) {
    // Stage 1: normalize the raw text and group it into observation chunks.
    const normalizedText = normalize(text);
    const observations = splitIntoObservations(normalizedText);
    const chunks = chunkObservations(observations);
    // Flatten every labeled word across all chunks.
    const allWords = [];
    for (const chunk of chunks) {
        for (const obs of chunk.observations) {
            allWords.push(...obs.words);
        }
    }
    const vocab = new Set(allWords.map((w) => w.normalized));
    // Stages 2–3: corpus-derived IDF, then the document TF-IDF weights.
    const { df, corpusSize } = deriveCorpusDocFreqs(st);
    const idf = buildIdfVector(vocab, df, corpusSize);
    const weights = buildWeightVector(allWords, idf);
    // Stage 4: rank sentences via TextRank over TF-IDF cosine similarity.
    const sentences = splitSentences(normalizedText);
    const rankedSentences = sentenceTextRank(sentences, weights);
    // Stage 5: map the top-K sentences back to chunks, one highlight per chunk.
    const highlights = [];
    const seenChunks = new Set();
    for (const { sentence, score } of rankedSentences.slice(0, topK)) {
        const chunk = sentenceToChunk(sentence, chunks);
        if (!chunk || seenChunks.has(chunk.id))
            continue;
        seenChunks.add(chunk.id);
        highlights.push({ chunk, sentence, score });
    }
    // Stage 6: pack compressed sentence previews into index observations,
    // respecting the per-entity observation budget.
    const indexId = `${title}__index`;
    const indexObs = [];
    let current = '';
    for (const { sentence } of highlights) {
        const preview = sentence.text.length > 60
            ? sentence.text.slice(0, 57) + '...'
            : sentence.text;
        const candidate = current === '' ? preview : current + ' | ' + preview;
        if (candidate.length <= MAX_OBS_LENGTH) {
            current = candidate;
            continue;
        }
        if (current !== '')
            indexObs.push(current);
        if (indexObs.length >= MAX_OBS_PER_ENTITY)
            break;
        current = preview.length <= MAX_OBS_LENGTH ? preview : preview.slice(0, MAX_OBS_LENGTH);
    }
    if (current !== '' && indexObs.length < MAX_OBS_PER_ENTITY)
        indexObs.push(current);
    // ─── Assemble entities ──────────────────────────────────────────
    const entities = [];
    const relations = [];
    // The Document node is a pure pointer — it carries no observations.
    entities.push({ name: title, entityType: 'Document', observations: [] });
    for (const chunk of chunks) {
        entities.push({
            name: chunk.id,
            entityType: 'TextChunk',
            observations: chunk.observations.map((o) => o.text),
        });
    }
    entities.push({
        name: indexId,
        entityType: 'DocumentIndex',
        observations: indexObs,
    });
    // ─── Assemble relations ─────────────────────────────────────────
    // Chain endpoints: Document ↔ first/last chunk.
    if (chunks.length > 0) {
        const head = chunks[0];
        relations.push({ from: title, to: head.id, relationType: 'starts_with' });
        relations.push({ from: head.id, to: title, relationType: 'belongs_to' });
        if (chunks.length > 1) {
            const tail = chunks[chunks.length - 1];
            relations.push({ from: title, to: tail.id, relationType: 'ends_with' });
            relations.push({ from: tail.id, to: title, relationType: 'belongs_to' });
        }
    }
    // Doubly-linked chunk chain: follows / preceded_by.
    for (let i = 0; i + 1 < chunks.length; i++) {
        relations.push({ from: chunks[i].id, to: chunks[i + 1].id, relationType: 'follows' });
        relations.push({ from: chunks[i + 1].id, to: chunks[i].id, relationType: 'preceded_by' });
    }
    // Document ↔ index.
    relations.push({ from: title, to: indexId, relationType: 'has_index' });
    relations.push({ from: indexId, to: title, relationType: 'indexes' });
    // Index ↔ highlighted chunks.
    for (const { chunk } of highlights) {
        relations.push({ from: indexId, to: chunk.id, relationType: 'highlights' });
        relations.push({ from: chunk.id, to: indexId, relationType: 'highlighted_by' });
    }
    return {
        entities,
        relations,
        stats: {
            chars: text.length,
            words: allWords.length,
            uniqueWords: vocab.size,
            chunks: chunks.length,
            sentences: sentences.length,
            indexHighlights: highlights.length,
        },
    };
}
|
package/dist/src/memoryfile.js
CHANGED
|
@@ -115,6 +115,23 @@ export class MemoryFile {
|
|
|
115
115
|
this.assertOpen();
|
|
116
116
|
return native.stats(this.handle);
|
|
117
117
|
}
|
|
118
|
+
/**
|
|
119
|
+
* Read the memfile version field (u32 at offset 4).
|
|
120
|
+
*/
|
|
121
|
+
getVersion() {
|
|
122
|
+
this.assertOpen();
|
|
123
|
+
const buf = native.read(this.handle, 4n, 4n);
|
|
124
|
+
return buf.readUInt32LE(0);
|
|
125
|
+
}
|
|
126
|
+
/**
|
|
127
|
+
* Write the memfile version field (u32 at offset 4).
|
|
128
|
+
*/
|
|
129
|
+
setVersion(version) {
|
|
130
|
+
this.assertOpen();
|
|
131
|
+
const buf = Buffer.alloc(4);
|
|
132
|
+
buf.writeUInt32LE(version, 0);
|
|
133
|
+
native.write(this.handle, 4n, buf);
|
|
134
|
+
}
|
|
118
135
|
/**
|
|
119
136
|
* Close the memory file. Syncs and unmaps.
|
|
120
137
|
* The instance is unusable after this.
|
package/dist/src/merw.js
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Maximum Entropy Random Walk (MERW) — dominant eigenvector computation
|
|
3
|
+
* via power iteration on the graph's adjacency matrix.
|
|
4
|
+
*
|
|
5
|
+
* MERW transition probabilities: S_ij = (A_ij / λ) * (ψ_j / ψ_i)
|
|
6
|
+
* Stationary distribution: ρ_i = ψ_i² / ‖ψ‖₂²
|
|
7
|
+
*
|
|
8
|
+
* We compute ψ (the dominant right eigenvector of A) using sparse power
|
|
9
|
+
* iteration directly on the GraphFile adjacency lists. No dense matrix
|
|
10
|
+
* is ever constructed.
|
|
11
|
+
*
|
|
12
|
+
* For directed graphs that may not be strongly connected, we add
|
|
13
|
+
* teleportation damping (like PageRank): at each step, follow an edge
|
|
14
|
+
* with probability `alpha`, or jump to a uniform random node with
|
|
15
|
+
* probability `(1 - alpha)`. This guarantees convergence to a unique
|
|
16
|
+
* positive eigenvector.
|
|
17
|
+
*/
|
|
18
|
+
import { DIR_FORWARD } from './graphfile.js';
|
|
19
|
+
const DEFAULT_ALPHA = 0.85;
|
|
20
|
+
const DEFAULT_MAX_ITER = 200;
|
|
21
|
+
const DEFAULT_TOL = 1e-8;
|
|
22
|
+
/**
|
|
23
|
+
* Compute the dominant eigenvector of the (damped) adjacency matrix
|
|
24
|
+
* via power iteration and write ψ_i into each entity record.
|
|
25
|
+
*
|
|
26
|
+
* Warm-starts from the ψ values already stored in the entity records.
|
|
27
|
+
* New nodes (psi === 0) are seeded with the mean of existing values.
|
|
28
|
+
* On a fresh graph (all zeros), falls back to uniform initialization.
|
|
29
|
+
*
|
|
30
|
+
* @param gf GraphFile to operate on
|
|
31
|
+
* @param alpha Damping factor (probability of following an edge). Default 0.85.
|
|
32
|
+
* @param maxIter Maximum iterations. Default 200.
|
|
33
|
+
* @param tol Convergence tolerance (L2 norm of change). Default 1e-8.
|
|
34
|
+
* @returns Number of iterations performed.
|
|
35
|
+
*/
|
|
36
|
+
/**
 * Compute the dominant eigenvector ψ of the (damped) adjacency matrix
 * via sparse power iteration and persist ψ_i into each entity record.
 *
 * Warm-starts from the ψ values already stored in the records; nodes
 * with ψ = 0 are seeded with the mean of the nonzero values. On a
 * fresh graph (all zeros) the start vector is uniform.
 *
 * @param gf      GraphFile to operate on
 * @param alpha   Damping factor (probability of following an edge). Default 0.85.
 * @param maxIter Maximum iterations. Default 200.
 * @param tol     Convergence tolerance (L2 norm of change). Default 1e-8.
 * @returns Number of iterations performed.
 */
export function computeMerwPsi(gf, alpha = DEFAULT_ALPHA, maxIter = DEFAULT_MAX_ITER, tol = DEFAULT_TOL) {
    const offsets = gf.getAllEntityOffsets();
    const n = offsets.length;
    if (n === 0)
        return 0;
    // O(1) file-offset → dense-index lookup.
    const indexOf = new Map();
    for (let i = 0; i < n; i++)
        indexOf.set(offsets[i], i);
    // Sparse forward adjacency as dense-index neighbor lists.
    const adj = new Array(n);
    for (let i = 0; i < n; i++) {
        const out = [];
        for (const edge of gf.getEdges(offsets[i])) {
            if (edge.direction !== DIR_FORWARD)
                continue;
            const j = indexOf.get(edge.targetOffset);
            if (j !== undefined)
                out.push(j);
        }
        adj[i] = out;
    }
    // Warm start: read the persisted ψ values.
    let psi = new Float64Array(n);
    let warmSum = 0;
    let warmCount = 0;
    for (let i = 0; i < n; i++) {
        const val = gf.getPsi(offsets[i]);
        psi[i] = val;
        if (val > 0) {
            warmSum += val;
            warmCount++;
        }
    }
    if (warmCount > 0) {
        // Seed unseen (zero) nodes with the mean of the known values.
        const mean = warmSum / warmCount;
        for (let i = 0; i < n; i++) {
            if (psi[i] <= 0)
                psi[i] = mean;
        }
    }
    else {
        // Cold start: uniform (already unit-norm for this fill value).
        psi.fill(1.0 / Math.sqrt(n));
    }
    // Normalize the start vector to unit L2.
    let startNorm = 0;
    for (let i = 0; i < n; i++)
        startNorm += psi[i] * psi[i];
    startNorm = Math.sqrt(startNorm);
    if (startNorm > 0) {
        for (let i = 0; i < n; i++)
            psi[i] /= startNorm;
    }
    let scratch = new Float64Array(n);
    const teleport = (1.0 - alpha) / n;
    let iter = 0;
    for (iter = 0; iter < maxIter; iter++) {
        // One power-iteration step:
        //   scratch_j = alpha · Σ_{i: i→j} ψ_i + teleport · Σ_k ψ_k
        // implemented as a scatter over forward edges plus a uniform
        // teleportation term (guarantees a unique positive eigenvector
        // even when the graph is not strongly connected).
        scratch.fill(0);
        let psiSum = 0;
        for (let i = 0; i < n; i++)
            psiSum += psi[i];
        const teleportContrib = teleport * psiSum;
        for (let i = 0; i < n; i++) {
            const contribution = alpha * psi[i];
            for (const j of adj[i]) {
                scratch[j] += contribution;
            }
        }
        for (let i = 0; i < n; i++) {
            scratch[i] += teleportContrib;
        }
        // Renormalize to unit L2.
        let norm = 0;
        for (let i = 0; i < n; i++)
            norm += scratch[i] * scratch[i];
        norm = Math.sqrt(norm);
        if (norm > 0) {
            for (let i = 0; i < n; i++)
                scratch[i] /= norm;
        }
        // L2 distance between successive iterates.
        let diff = 0;
        for (let i = 0; i < n; i++) {
            const d = scratch[i] - psi[i];
            diff += d * d;
        }
        diff = Math.sqrt(diff);
        // Double-buffer swap: scratch becomes the current ψ.
        const tmp = psi;
        psi = scratch;
        scratch = tmp;
        if (diff < tol) {
            iter++;
            break;
        }
    }
    // Perron–Frobenius guarantees a non-negative dominant eigenvector;
    // clamp the tiny negatives numerical noise can introduce.
    for (let i = 0; i < n; i++) {
        if (psi[i] < 0)
            psi[i] = 0;
    }
    // Persist ψ_i back into the entity records.
    for (let i = 0; i < n; i++) {
        gf.setPsi(offsets[i], psi[i]);
    }
    return iter;
}
|
package/dist/src/stringtable.js
CHANGED
|
@@ -43,7 +43,7 @@ const ENT_LEN = 8; // u16
|
|
|
43
43
|
const ENT_DATA = 10; // u8[len]
|
|
44
44
|
const ENT_HEADER_SIZE = 10;
|
|
45
45
|
// Hash index field offsets (relative to index block start)
|
|
46
|
-
const
|
|
46
|
+
const _IDX_BUCKET_COUNT = 0; // u32
|
|
47
47
|
const IDX_BUCKETS = 8; // u64[bucket_count]
|
|
48
48
|
const INITIAL_BUCKETS = 4096;
|
|
49
49
|
const LOAD_FACTOR_THRESHOLD = 0.7;
|
|
@@ -159,7 +159,7 @@ export class StringTable {
|
|
|
159
159
|
const data = Buffer.from(str, 'utf-8');
|
|
160
160
|
const hash = fnv1a(data);
|
|
161
161
|
const bucketCount = this.getBucketCount();
|
|
162
|
-
|
|
162
|
+
const bucket = hash % bucketCount;
|
|
163
163
|
// Linear probe to find existing or empty slot
|
|
164
164
|
for (let i = 0; i < bucketCount; i++) {
|
|
165
165
|
const slotIdx = (bucket + i) % bucketCount;
|
|
@@ -215,7 +215,7 @@ export class StringTable {
|
|
|
215
215
|
const data = Buffer.from(str, 'utf-8');
|
|
216
216
|
const hash = fnv1a(data);
|
|
217
217
|
const bucketCount = this.getBucketCount();
|
|
218
|
-
|
|
218
|
+
const bucket = hash % bucketCount;
|
|
219
219
|
for (let i = 0; i < bucketCount; i++) {
|
|
220
220
|
const slotIdx = (bucket + i) % bucketCount;
|
|
221
221
|
const entryOffset = this.getBucket(slotIdx);
|
|
@@ -264,10 +264,28 @@ export class StringTable {
|
|
|
264
264
|
get count() {
|
|
265
265
|
return this.getEntryCount();
|
|
266
266
|
}
|
|
267
|
+
/**
|
|
268
|
+
* Iterate over all live strings in the table.
|
|
269
|
+
* Yields { id, text, refcount } for each entry.
|
|
270
|
+
*/
|
|
271
|
+
*entries() {
|
|
272
|
+
const bucketCount = this.getBucketCount();
|
|
273
|
+
for (let i = 0; i < bucketCount; i++) {
|
|
274
|
+
const entryOffset = this.getBucket(i);
|
|
275
|
+
if (entryOffset === 0n)
|
|
276
|
+
continue;
|
|
277
|
+
const entry = this.readEntry(entryOffset);
|
|
278
|
+
yield {
|
|
279
|
+
id: entryOffset,
|
|
280
|
+
text: entry.data.toString('utf-8'),
|
|
281
|
+
refcount: entry.refcount,
|
|
282
|
+
};
|
|
283
|
+
}
|
|
284
|
+
}
|
|
267
285
|
// --- Hash index management ---
|
|
268
286
|
removeFromIndex(offset, hash) {
|
|
269
287
|
const bucketCount = this.getBucketCount();
|
|
270
|
-
|
|
288
|
+
const bucket = hash % bucketCount;
|
|
271
289
|
// Find the entry in the index
|
|
272
290
|
for (let i = 0; i < bucketCount; i++) {
|
|
273
291
|
const slotIdx = (bucket + i) % bucketCount;
|
|
@@ -305,7 +323,7 @@ export class StringTable {
|
|
|
305
323
|
slot = (slot + 1) % bucketCount;
|
|
306
324
|
}
|
|
307
325
|
}
|
|
308
|
-
needsRelocation(natural, empty, current,
|
|
326
|
+
needsRelocation(natural, empty, current, _size) {
|
|
309
327
|
// Is 'empty' between 'natural' and 'current' in the circular probe sequence?
|
|
310
328
|
if (natural <= current) {
|
|
311
329
|
return natural <= empty && empty < current;
|
|
@@ -343,7 +361,7 @@ export class StringTable {
|
|
|
343
361
|
continue;
|
|
344
362
|
// Read hash and insert into new index
|
|
345
363
|
const entry = this.readEntry(entryOffset);
|
|
346
|
-
|
|
364
|
+
const bucket = entry.hash % newBucketCount;
|
|
347
365
|
for (let j = 0; j < newBucketCount; j++) {
|
|
348
366
|
const slotIdx = (bucket + j) % newBucketCount;
|
|
349
367
|
const slotPos = newIndexOffset + BigInt(IDX_BUCKETS + slotIdx * 8);
|