@levalicious/server-memory 0.0.12 → 0.0.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +16 -0
- package/dist/scripts/textrank-experiment.js +618 -0
- package/dist/server.js +77 -3
- package/dist/src/kb_load.js +396 -0
- package/dist/src/memoryfile.js +15 -2
- package/dist/src/stringtable.js +24 -6
- package/dist/tests/memory-server.test.js +129 -0
- package/dist/tests/test-utils.js +6 -0
- package/native/binding.c +340 -0
- package/native/memoryfile.c +343 -0
- package/native/memoryfile.h +82 -0
- package/package.json +10 -3
package/dist/server.js
CHANGED
|
@@ -2,11 +2,13 @@
|
|
|
2
2
|
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
|
|
3
3
|
import { CallToolRequestSchema, ListToolsRequestSchema, } from "@modelcontextprotocol/sdk/types.js";
|
|
4
4
|
import { randomBytes } from 'crypto';
|
|
5
|
+
import fs from 'fs';
|
|
5
6
|
import path from 'path';
|
|
6
7
|
import { fileURLToPath } from 'url';
|
|
7
8
|
import { GraphFile, DIR_FORWARD, DIR_BACKWARD } from './src/graphfile.js';
|
|
8
9
|
import { StringTable } from './src/stringtable.js';
|
|
9
10
|
import { structuralSample } from './src/pagerank.js';
|
|
11
|
+
import { validateExtension, loadDocument } from './src/kb_load.js';
|
|
10
12
|
// Define memory file path using environment variable with fallback
|
|
11
13
|
const defaultMemoryPath = path.join(path.dirname(fileURLToPath(import.meta.url)), 'memory.json');
|
|
12
14
|
// If MEMORY_FILE_PATH is just a filename, put it in the same directory as the script
|
|
@@ -104,7 +106,7 @@ function paginateItems(items, cursor = 0, maxChars = MAX_CHARS) {
|
|
|
104
106
|
let i = cursor;
|
|
105
107
|
// Calculate overhead for wrapper: {"items":[],"nextCursor":null,"totalCount":123}
|
|
106
108
|
const wrapperTemplate = { items: [], nextCursor: null, totalCount: items.length };
|
|
107
|
-
|
|
109
|
+
const overhead = JSON.stringify(wrapperTemplate).length;
|
|
108
110
|
let charCount = overhead;
|
|
109
111
|
while (i < items.length) {
|
|
110
112
|
const itemJson = JSON.stringify(items[i]);
|
|
@@ -199,6 +201,13 @@ export class KnowledgeGraphManager {
|
|
|
199
201
|
this.gf.sync();
|
|
200
202
|
}
|
|
201
203
|
}
|
|
204
|
+
/**
 * Run the loadDocument pipeline under a shared (read) lock,
 * so the StringTable is consistent during IDF derivation.
 *
 * The pipeline itself only reads the string table (via entries());
 * insertion of the resulting entities/relations happens afterwards
 * in the caller, under its own locking.
 *
 * @param {string} text  Raw document text to load.
 * @param {string} title Document entity name.
 * @param {number} topK  Number of top-ranked sentences to highlight.
 * @returns {{entities: Array, relations: Array, stats: Object}} result of loadDocument
 */
prepareDocumentLoad(text, title, topK) {
    return this.withReadLock(() => loadDocument(text, title, this.st, topK));
}
|
|
202
211
|
// --- Locking helpers ---
|
|
203
212
|
/**
|
|
204
213
|
* Acquire a shared (read) lock, refresh mappings (in case another process
|
|
@@ -528,7 +537,7 @@ export class KnowledgeGraphManager {
|
|
|
528
537
|
try {
|
|
529
538
|
regex = new RegExp(query, 'i');
|
|
530
539
|
}
|
|
531
|
-
catch
|
|
540
|
+
catch {
|
|
532
541
|
throw new Error(`Invalid regex pattern: ${query}`);
|
|
533
542
|
}
|
|
534
543
|
return this.withReadLock(() => {
|
|
@@ -950,7 +959,7 @@ export function createServer(memoryFilePath) {
|
|
|
950
959
|
sizes: ["any"]
|
|
951
960
|
}
|
|
952
961
|
],
|
|
953
|
-
version: "0.0.
|
|
962
|
+
version: "0.0.14",
|
|
954
963
|
}, {
|
|
955
964
|
capabilities: {
|
|
956
965
|
tools: {},
|
|
@@ -1271,6 +1280,30 @@ Use this to build chains of reasoning that persist in the graph. Each thought ca
|
|
|
1271
1280
|
required: ["observations"],
|
|
1272
1281
|
},
|
|
1273
1282
|
},
|
|
1283
|
+
{
|
|
1284
|
+
name: "kb_load",
|
|
1285
|
+
description: `Load a plaintext document into the knowledge graph. Chunks the text into entities connected by a doubly-linked chain, runs sentence TextRank to identify the most important sentences, and creates an index entity that links directly to the chunks containing those sentences.
|
|
1286
|
+
|
|
1287
|
+
The file MUST be plaintext (.txt, .tex, .md, source code, etc.). For PDFs, use pdftotext first. For other binary formats, convert to text before calling this tool.`,
|
|
1288
|
+
inputSchema: {
|
|
1289
|
+
type: "object",
|
|
1290
|
+
properties: {
|
|
1291
|
+
filePath: {
|
|
1292
|
+
type: "string",
|
|
1293
|
+
description: "Absolute path to the plaintext file to load. Must have a plaintext extension (.txt, .tex, .md, .py, .ts, etc.).",
|
|
1294
|
+
},
|
|
1295
|
+
title: {
|
|
1296
|
+
type: "string",
|
|
1297
|
+
description: "Optional title for the document entity. Defaults to the filename without extension.",
|
|
1298
|
+
},
|
|
1299
|
+
topK: {
|
|
1300
|
+
type: "number",
|
|
1301
|
+
description: "Number of top-ranked sentences to highlight in the index (default: 15).",
|
|
1302
|
+
},
|
|
1303
|
+
},
|
|
1304
|
+
required: ["filePath"],
|
|
1305
|
+
},
|
|
1306
|
+
},
|
|
1274
1307
|
],
|
|
1275
1308
|
};
|
|
1276
1309
|
});
|
|
@@ -1351,6 +1384,47 @@ Use this to build chains of reasoning that persist in the graph. Each thought ca
|
|
|
1351
1384
|
const result = await knowledgeGraphManager.addThought(args.observations, args.previousCtxId);
|
|
1352
1385
|
return { content: [{ type: "text", text: JSON.stringify(result) }] };
|
|
1353
1386
|
}
|
|
1387
|
+
case "kb_load": {
|
|
1388
|
+
const filePath = args.filePath;
|
|
1389
|
+
// Validate extension
|
|
1390
|
+
validateExtension(filePath);
|
|
1391
|
+
// Read file
|
|
1392
|
+
let text;
|
|
1393
|
+
try {
|
|
1394
|
+
text = fs.readFileSync(filePath, 'utf-8');
|
|
1395
|
+
}
|
|
1396
|
+
catch (err) {
|
|
1397
|
+
throw new Error(`Failed to read file: ${err instanceof Error ? err.message : String(err)}`);
|
|
1398
|
+
}
|
|
1399
|
+
// Derive title
|
|
1400
|
+
const title = args.title ?? path.basename(filePath, path.extname(filePath));
|
|
1401
|
+
const topK = args.topK ?? 15;
|
|
1402
|
+
// Run the pipeline (reads string table under read lock)
|
|
1403
|
+
const loadResult = knowledgeGraphManager.prepareDocumentLoad(text, title, topK);
|
|
1404
|
+
// Insert into KB
|
|
1405
|
+
const entities = await knowledgeGraphManager.createEntities(loadResult.entities.map(e => ({
|
|
1406
|
+
name: e.name,
|
|
1407
|
+
entityType: e.entityType,
|
|
1408
|
+
observations: e.observations,
|
|
1409
|
+
})));
|
|
1410
|
+
const relations = await knowledgeGraphManager.createRelations(loadResult.relations.map(r => ({
|
|
1411
|
+
from: r.from,
|
|
1412
|
+
to: r.to,
|
|
1413
|
+
relationType: r.relationType,
|
|
1414
|
+
})));
|
|
1415
|
+
knowledgeGraphManager.resample();
|
|
1416
|
+
return {
|
|
1417
|
+
content: [{
|
|
1418
|
+
type: "text",
|
|
1419
|
+
text: JSON.stringify({
|
|
1420
|
+
document: title,
|
|
1421
|
+
stats: loadResult.stats,
|
|
1422
|
+
entitiesCreated: entities.length,
|
|
1423
|
+
relationsCreated: relations.length,
|
|
1424
|
+
}, null, 2),
|
|
1425
|
+
}],
|
|
1426
|
+
};
|
|
1427
|
+
}
|
|
1354
1428
|
default:
|
|
1355
1429
|
throw new Error(`Unknown tool: ${name}`);
|
|
1356
1430
|
}
|
|
@@ -0,0 +1,396 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* kb_load.ts — Load a plaintext document into the knowledge graph.
|
|
3
|
+
*
|
|
4
|
+
* Pipeline:
|
|
5
|
+
* 1. Normalize text
|
|
6
|
+
* 2. Split into observations (≤140 chars, word-boundary aligned)
|
|
7
|
+
* 3. Group observations into chunks (≤2 per entity)
|
|
8
|
+
* 4. Build chain: Document → starts_with/ends_with → chunks ↔ follows/preceded_by
|
|
9
|
+
* 5. Sentence TextRank: rank sentences by TF-IDF cosine PageRank
|
|
10
|
+
* 6. Build index entity: Document → has_index → Index → highlights → top chunks
|
|
11
|
+
*
|
|
12
|
+
* Returns arrays of entities and relations ready for createEntities/createRelations.
|
|
13
|
+
*/
|
|
14
|
+
import * as crypto from 'crypto';
|
|
15
|
+
import * as path from 'path';
|
|
16
|
+
// ─── Constants ──────────────────────────────────────────────────────
|
|
17
|
+
const MAX_OBS_LENGTH = 140;
|
|
18
|
+
const MAX_OBS_PER_ENTITY = 2;
|
|
19
|
+
const TEXTRANK_DAMPING = 0.85;
|
|
20
|
+
const TEXTRANK_MAX_ITER = 30000;
|
|
21
|
+
const TEXTRANK_CONVERGENCE = 1e-6;
|
|
22
|
+
const ALLOWED_EXTENSIONS = new Set([
|
|
23
|
+
'.txt', '.tex', '.md', '.markdown', '.rst', '.org', '.adoc',
|
|
24
|
+
'.asciidoc', '.html', '.htm', '.xml', '.json', '.yaml', '.yml',
|
|
25
|
+
'.toml', '.csv', '.tsv', '.log', '.cfg', '.ini', '.conf',
|
|
26
|
+
'.py', '.js', '.ts', '.c', '.h', '.cpp', '.hpp', '.java',
|
|
27
|
+
'.go', '.rs', '.rb', '.pl', '.sh', '.bash', '.zsh', '.fish',
|
|
28
|
+
'.el', '.lisp', '.clj', '.hs', '.ml', '.scala', '.kt',
|
|
29
|
+
'.r', '.m', '.swift', '.lua', '.vim', '.sql',
|
|
30
|
+
'.bib', '.sty', '.cls',
|
|
31
|
+
]);
|
|
32
|
+
// ─── Text Processing ────────────────────────────────────────────────
|
|
33
|
+
/**
 * Collapse all whitespace in `text` (spaces, tabs, CR/LF runs) into
 * single spaces and strip leading/trailing whitespace, yielding one
 * single-spaced line.
 *
 * The previous implementation performed three intermediate replace
 * passes (\r\n -> \n, tab/space runs, blank-line collapsing) before
 * the final split-on-\s+/join — but that final pass subsumes all of
 * them, so they were dead work. Output is byte-identical.
 *
 * @param {string} text Raw document text.
 * @returns {string} Normalized single-line text ('' for all-whitespace input).
 */
function normalize(text) {
    return text.trim().split(/\s+/).join(' ');
}
|
|
40
|
+
/**
 * Tokenize `text` on single spaces, recording each word's character
 * span (shifted by `offset` into document coordinates) plus a
 * lowercased form for frequency counting.
 *
 * @param {string} text   Space-separated text to tokenize.
 * @param {number} offset Base position added to every start/end.
 * @returns {{text: string, normalized: string, start: number, end: number}[]}
 */
function labelWords(text, offset) {
    const words = [];
    const n = text.length;
    let cursor = 0;
    while (cursor < n) {
        // Skip any run of spaces preceding the next word.
        while (cursor < n && text[cursor] === ' ') cursor++;
        if (cursor >= n) break;
        const wordStart = cursor;
        // Consume the word itself (up to the next space or end of text).
        while (cursor < n && text[cursor] !== ' ') cursor++;
        const raw = text.slice(wordStart, cursor);
        words.push({
            text: raw,
            normalized: raw.toLowerCase(),
            start: offset + wordStart,
            end: offset + cursor,
        });
    }
    return words;
}
|
|
62
|
+
/**
 * Split normalized text into observations of at most MAX_OBS_LENGTH
 * UTF-16 units, preferring word boundaries; falls back to a hard split
 * (never landing inside a surrogate pair) when a single word exceeds
 * the limit.
 *
 * Fix: the boundary scan previously tested
 * `remaining.slice(0, i).length <= MAX_OBS_LENGTH`, allocating a fresh
 * substring per space just to read its length — which is always `i`.
 * Replaced with `i` directly (same value, no per-space allocation).
 *
 * @param {string} text Normalized (single-spaced) document text.
 * @returns {{text: string, start: number, end: number, words: Array}[]}
 */
function splitIntoObservations(text) {
    const observations = [];
    let pos = 0;
    while (pos < text.length) {
        const remaining = text.slice(pos);
        if (remaining.length <= MAX_OBS_LENGTH) {
            // Tail fits entirely in one observation.
            observations.push({
                text: remaining,
                start: pos,
                end: pos + remaining.length,
                words: labelWords(remaining, pos),
            });
            break;
        }
        // Find the last space at or before the length limit.
        let splitAt = 0;
        for (let i = 0; i < remaining.length; i++) {
            if (remaining[i] === ' ') {
                if (i <= MAX_OBS_LENGTH) {
                    splitAt = i;
                }
                else {
                    break;
                }
            }
        }
        if (splitAt === 0) {
            // No space fits — hard split, counting astral code points as 2
            // UTF-16 units so the cut never lands inside a surrogate pair.
            let jsLen = 0;
            for (let i = 0; i < remaining.length; i++) {
                const charLen = remaining.codePointAt(i) > 0xFFFF ? 2 : 1;
                if (jsLen + charLen > MAX_OBS_LENGTH) {
                    splitAt = i;
                    break;
                }
                jsLen += charLen;
                if (charLen === 2)
                    i++;
            }
            if (splitAt === 0)
                splitAt = remaining.length;
        }
        const obsText = remaining.slice(0, splitAt).trimEnd();
        observations.push({
            text: obsText,
            start: pos,
            end: pos + obsText.length,
            words: labelWords(obsText, pos),
        });
        pos += splitAt;
        // Skip separator space(s) before the next observation.
        while (pos < text.length && text[pos] === ' ')
            pos++;
    }
    return observations;
}
|
|
116
|
+
/**
 * Group consecutive observations into chunks of at most
 * MAX_OBS_PER_ENTITY observations (the last chunk may be smaller).
 * Each chunk receives a sequential index and a random 24-hex-char id.
 *
 * @param {Array} observations Output of splitIntoObservations.
 * @returns {{index: number, id: string, observations: Array}[]}
 */
function chunkObservations(observations) {
    const chunks = [];
    let start = 0;
    while (start < observations.length) {
        chunks.push({
            index: chunks.length,
            id: crypto.randomBytes(12).toString('hex'),
            observations: observations.slice(start, start + MAX_OBS_PER_ENTITY),
        });
        start += MAX_OBS_PER_ENTITY;
    }
    return chunks;
}
|
|
127
|
+
// ─── Sentence Splitting ─────────────────────────────────────────────
|
|
128
|
+
/**
 * Split normalized text into sentences at whitespace following
 * terminal punctuation (. ? !). Sentences shorter than 3 words are
 * discarded. Each sentence records its start offset into the
 * normalized text and its lowercased word list.
 *
 * Fix: the trim/word-count/push logic was duplicated verbatim for the
 * final (unterminated) sentence; extracted into one shared helper.
 *
 * @param {string} normalizedText Single-spaced document text.
 * @returns {{index: number, text: string, start: number, words: string[]}[]}
 */
function splitSentences(normalizedText) {
    const sentences = [];
    // Record `raw` as a sentence (if it has at least 3 words).
    const addSentence = (raw, start) => {
        const text = raw.trim();
        if (text.length === 0)
            return;
        const words = text.toLowerCase().split(/\s+/).filter(w => w.length > 0);
        if (words.length >= 3) {
            sentences.push({ index: sentences.length, text, start, words });
        }
    };
    // Lookbehind: match whitespace immediately after terminal punctuation.
    const re = /(?<=[.?!])\s+/g;
    let pos = 0;
    let match;
    while ((match = re.exec(normalizedText)) !== null) {
        addSentence(normalizedText.slice(pos, match.index + 1), pos);
        pos = match.index + match[0].length;
    }
    // Trailing text with no terminal punctuation is still a candidate.
    if (pos < normalizedText.length) {
        addSentence(normalizedText.slice(pos), pos);
    }
    return sentences;
}
|
|
154
|
+
// ─── TF-IDF ─────────────────────────────────────────────────────────
|
|
155
|
+
/**
 * Build the document's TF-IDF weight vector: raw term frequency of
 * each normalized word times its IDF (0 for words missing from `idf`).
 *
 * @param {{normalized: string}[]} allWords Every word in the document.
 * @param {Map<string, number>} idf IDF per vocabulary word.
 * @returns {Map<string, number>} word -> tf * idf
 */
function buildWeightVector(allWords, idf) {
    const counts = new Map();
    for (const { normalized } of allWords) {
        counts.set(normalized, (counts.get(normalized) ?? 0) + 1);
    }
    const weights = new Map();
    counts.forEach((raw, word) => {
        weights.set(word, raw * (idf.get(word) ?? 0));
    });
    return weights;
}
|
|
166
|
+
/**
 * Derive corpus document frequencies from the string table.
 *
 * Each table entry is treated as `refcount` occurrences: corpusSize is
 * the total refcount, and each unique word in an entry's text adds the
 * entry's refcount to its document frequency.
 *
 * @param {{entries(): Iterable<{text: string, refcount: number}>}} st
 * @returns {{df: Map<string, number>, corpusSize: number}}
 */
function deriveCorpusDocFreqs(st) {
    const df = new Map();
    let corpusSize = 0;
    for (const { text, refcount } of st.entries()) {
        corpusSize += refcount;
        // Count each distinct word once per entry, weighted by refcount.
        const seen = new Set(text.toLowerCase().split(/\s+/).filter(w => w.length > 0));
        seen.forEach(word => {
            df.set(word, (df.get(word) ?? 0) + refcount);
        });
    }
    return { df, corpusSize };
}
|
|
178
|
+
/**
 * Build the IDF map for the document vocabulary:
 * idf(w) = ln(corpusSize / (1 + df(w))) + 1.
 *
 * Fix: with an empty corpus (corpusSize === 0 — e.g. the very first
 * document loaded into a fresh string table) the formula evaluated to
 * ln(0) = -Infinity for every word, poisoning all TF-IDF weights and
 * producing NaN cosine similarities downstream. Fall back to a neutral
 * IDF of 1 in that case so first-document loads still rank sensibly.
 *
 * @param {Iterable<string>} docVocab Unique normalized words in the document.
 * @param {Map<string, number>} df Corpus document frequencies.
 * @param {number} corpusSize Total weighted corpus size.
 * @returns {Map<string, number>} word -> idf
 */
function buildIdfVector(docVocab, df, corpusSize) {
    const idf = new Map();
    if (corpusSize === 0) {
        for (const word of docVocab) {
            idf.set(word, 1);
        }
        return idf;
    }
    for (const word of docVocab) {
        const docFreq = df.get(word) ?? 0;
        idf.set(word, Math.log(corpusSize / (1 + docFreq)) + 1);
    }
    return idf;
}
|
|
186
|
+
// ─── Cosine Similarity ──────────────────────────────────────────────
|
|
187
|
+
function cosineSimilarity(weights, keysA, keysB) {
|
|
188
|
+
let dot = 0;
|
|
189
|
+
for (const word of keysA) {
|
|
190
|
+
if (keysB.has(word)) {
|
|
191
|
+
const w = weights.get(word) ?? 0;
|
|
192
|
+
dot += w * w;
|
|
193
|
+
}
|
|
194
|
+
}
|
|
195
|
+
let normA = 0;
|
|
196
|
+
for (const word of keysA) {
|
|
197
|
+
const w = weights.get(word) ?? 0;
|
|
198
|
+
normA += w * w;
|
|
199
|
+
}
|
|
200
|
+
let normB = 0;
|
|
201
|
+
for (const word of keysB) {
|
|
202
|
+
const w = weights.get(word) ?? 0;
|
|
203
|
+
normB += w * w;
|
|
204
|
+
}
|
|
205
|
+
const denom = Math.sqrt(normA) * Math.sqrt(normB);
|
|
206
|
+
return denom === 0 ? 0 : dot / denom;
|
|
207
|
+
}
|
|
208
|
+
// ─── PageRank ───────────────────────────────────────────────────────
|
|
209
|
+
/**
 * Weighted PageRank via power iteration.
 *
 * `matrix[j][i]` is the non-negative edge weight from node j to node i
 * (callers pass a symmetric similarity matrix); each node's outgoing
 * weight is normalized by its row sum. Iterates until the L1 change
 * between successive score vectors drops below TEXTRANK_CONVERGENCE,
 * or TEXTRANK_MAX_ITER iterations have run.
 *
 * @param {number[][]} matrix Square (n x n) weight matrix.
 * @returns {number[]} Score per node; [] for an empty matrix.
 */
function pageRank(matrix) {
    const n = matrix.length;
    if (n === 0)
        return [];
    // Total outgoing weight per node, used to normalize contributions.
    const rowSums = matrix.map(row => row.reduce((a, b) => a + b, 0));
    // Start from the uniform distribution.
    let scores = new Array(n).fill(1 / n);
    for (let iter = 0; iter < TEXTRANK_MAX_ITER; iter++) {
        const next = new Array(n).fill(0);
        for (let i = 0; i < n; i++) {
            let sum = 0;
            for (let j = 0; j < n; j++) {
                // Skip self-contributions and dangling nodes (zero out-weight).
                if (j !== i && rowSums[j] > 0) {
                    sum += (matrix[j][i] / rowSums[j]) * scores[j];
                }
            }
            // Standard damped update: teleport term plus damped link mass.
            next[i] = (1 - TEXTRANK_DAMPING) / n + TEXTRANK_DAMPING * sum;
        }
        // L1 distance between successive score vectors.
        let delta = 0;
        for (let i = 0; i < n; i++)
            delta += Math.abs(next[i] - scores[i]);
        scores = next;
        if (delta < TEXTRANK_CONVERGENCE)
            break;
    }
    return scores;
}
|
|
235
|
+
// ─── Sentence TextRank ──────────────────────────────────────────────
|
|
236
|
+
/**
 * Rank sentences by TextRank: build a symmetric TF-IDF cosine
 * similarity matrix over all sentence pairs, run PageRank on it, and
 * return sentences paired with their scores, highest first.
 *
 * @param {Array} sentences Output of splitSentences.
 * @param {Map<string, number>} weights Document TF-IDF weights.
 * @returns {{sentence: Object, score: number}[]} Sorted descending by score.
 */
function sentenceTextRank(sentences, weights) {
    const n = sentences.length;
    const keySets = sentences.map(s => new Set(s.words));
    const matrix = Array.from({ length: n }, () => new Array(n).fill(0));
    // Fill the upper triangle and mirror it; the matrix is symmetric.
    for (let row = 0; row < n; row++) {
        for (let col = row + 1; col < n; col++) {
            const sim = cosineSimilarity(weights, keySets[row], keySets[col]);
            matrix[row][col] = sim;
            matrix[col][row] = sim;
        }
    }
    const scores = pageRank(matrix);
    const ranked = sentences.map((sentence, i) => ({ sentence, score: scores[i] }));
    ranked.sort((a, b) => b.score - a.score);
    return ranked;
}
|
|
252
|
+
// ─── Sentence → Chunk mapping ───────────────────────────────────────
|
|
253
|
+
/**
 * Find the chunk whose observation span contains the start position of
 * `sentence` (half-open: [first.start, last.end)). Returns null when no
 * chunk covers that position.
 *
 * @param {{start: number}} sentence Sentence with document-relative start.
 * @param {Array} chunks Output of chunkObservations.
 * @returns {Object|null} Containing chunk, or null.
 */
function sentenceToChunk(sentence, chunks) {
    const pos = sentence.start;
    const found = chunks.find(({ observations }) => {
        const first = observations[0];
        const last = observations[observations.length - 1];
        return pos >= first.start && pos < last.end;
    });
    return found ?? null;
}
|
|
263
|
+
// ─── Public API ─────────────────────────────────────────────────────
|
|
264
|
+
/**
 * Validate that a file path has a plaintext extension.
 *
 * The extension check is case-insensitive. Throws for missing or
 * unsupported extensions; the error message lists a sample of the
 * accepted formats.
 *
 * @param {string} filePath Path whose extension is validated.
 * @returns {string} The lowercased extension (including the dot).
 * @throws {Error} When the path has no extension or an unsupported one.
 */
export function validateExtension(filePath) {
    const ext = path.extname(filePath).toLowerCase();
    if (ext === '') {
        throw new Error(`File has no extension: ${filePath}. Only plaintext files are accepted.`);
    }
    if (ALLOWED_EXTENSIONS.has(ext)) {
        return ext;
    }
    throw new Error(`Unsupported file extension "${ext}". Only plaintext formats are accepted ` +
        `(${[...ALLOWED_EXTENSIONS].slice(0, 10).join(', ')}, ...). ` +
        `For PDFs, use pdftotext first. For other binary formats, convert to text.`);
}
|
|
280
|
+
/**
 * Load a plaintext document into the knowledge graph.
 *
 * Orchestrates the full pipeline: normalize -> split into observations
 * -> chunk -> TF-IDF (IDF derived from the string table corpus) ->
 * sentence TextRank -> map top sentences to their chunks -> assemble
 * entity/relation arrays (document node, chunk chain, index node with
 * highlight links).
 *
 * @param text Raw document text
 * @param title Document entity name (e.g. filename without extension)
 * @param st StringTable for IDF corpus frequencies
 * @param topK Number of sentences to highlight in the index (default: 15)
 * @returns Entities and relations ready for createEntities/createRelations,
 *          plus a stats summary.
 */
export function loadDocument(text, title, st, topK = 15) {
    // 1. Normalize and chunk
    const normalizedText = normalize(text);
    const observations = splitIntoObservations(normalizedText);
    const chunks = chunkObservations(observations);
    // Collect all words (start/end offsets are relative to normalizedText)
    const allWords = [];
    for (const chunk of chunks) {
        for (const obs of chunk.observations)
            allWords.push(...obs.words);
    }
    const vocab = new Set(allWords.map(w => w.normalized));
    // 2. IDF from corpus
    const { df, corpusSize } = deriveCorpusDocFreqs(st);
    const idf = buildIdfVector(vocab, df, corpusSize);
    // 3. TF-IDF weight vector
    const weights = buildWeightVector(allWords, idf);
    // 4. Sentence TextRank
    const sentences = splitSentences(normalizedText);
    const rankedSentences = sentenceTextRank(sentences, weights);
    // 5. Map top sentences to chunks (deduplicate)
    // A chunk is highlighted at most once even if several top sentences land in it.
    const topSents = rankedSentences.slice(0, topK);
    const highlights = [];
    const seenChunks = new Set();
    for (const { sentence, score } of topSents) {
        const chunk = sentenceToChunk(sentence, chunks);
        if (!chunk || seenChunks.has(chunk.id))
            continue;
        seenChunks.add(chunk.id);
        highlights.push({ chunk, sentence, score });
    }
    // 6. Build index observations (compressed sentence previews)
    // Pack ' | '-separated previews greedily into observations of at most
    // MAX_OBS_LENGTH chars, keeping at most MAX_OBS_PER_ENTITY of them.
    const indexId = `${title}__index`;
    const indexObs = [];
    let current = '';
    for (const { sentence } of highlights) {
        // Truncate long sentences to a 60-char preview with ellipsis.
        const preview = sentence.text.length > 60
            ? sentence.text.slice(0, 57) + '...'
            : sentence.text;
        const candidate = current ? current + ' | ' + preview : preview;
        if (candidate.length <= MAX_OBS_LENGTH) {
            current = candidate;
        }
        else {
            // Flush the filled observation and start a new one with this preview.
            if (current)
                indexObs.push(current);
            if (indexObs.length >= MAX_OBS_PER_ENTITY)
                break;
            current = preview.length <= MAX_OBS_LENGTH ? preview : preview.slice(0, MAX_OBS_LENGTH);
        }
    }
    // Flush the trailing partial observation, if room remains.
    if (current && indexObs.length < MAX_OBS_PER_ENTITY)
        indexObs.push(current);
    // ─── Assemble entities ──────────────────────────────────────────
    const entities = [];
    const relations = [];
    // Document entity (no observations — it's a pointer node)
    entities.push({ name: title, entityType: 'Document', observations: [] });
    // Chunk entities
    for (const chunk of chunks) {
        entities.push({
            name: chunk.id,
            entityType: 'TextChunk',
            observations: chunk.observations.map(o => o.text),
        });
    }
    // Index entity
    entities.push({
        name: indexId,
        entityType: 'DocumentIndex',
        observations: indexObs,
    });
    // ─── Assemble relations ─────────────────────────────────────────
    // Document → chain endpoints (first and, when distinct, last chunk)
    if (chunks.length > 0) {
        relations.push({ from: title, to: chunks[0].id, relationType: 'starts_with' });
        relations.push({ from: chunks[0].id, to: title, relationType: 'belongs_to' });
        if (chunks.length > 1) {
            relations.push({ from: title, to: chunks[chunks.length - 1].id, relationType: 'ends_with' });
            relations.push({ from: chunks[chunks.length - 1].id, to: title, relationType: 'belongs_to' });
        }
    }
    // Chain: follows/preceded_by — doubly linked list of chunks
    for (let i = 0; i < chunks.length - 1; i++) {
        relations.push({ from: chunks[i].id, to: chunks[i + 1].id, relationType: 'follows' });
        relations.push({ from: chunks[i + 1].id, to: chunks[i].id, relationType: 'preceded_by' });
    }
    // Document → index (bidirectional)
    relations.push({ from: title, to: indexId, relationType: 'has_index' });
    relations.push({ from: indexId, to: title, relationType: 'indexes' });
    // Index → highlighted chunks (bidirectional)
    for (const { chunk } of highlights) {
        relations.push({ from: indexId, to: chunk.id, relationType: 'highlights' });
        relations.push({ from: chunk.id, to: indexId, relationType: 'highlighted_by' });
    }
    return {
        entities,
        relations,
        stats: {
            chars: text.length,
            words: allWords.length,
            uniqueWords: vocab.size,
            chunks: chunks.length,
            sentences: sentences.length,
            indexHighlights: highlights.length,
        },
    };
}
|
package/dist/src/memoryfile.js
CHANGED
|
@@ -5,12 +5,25 @@
|
|
|
5
5
|
* Buffers passed to/from the native layer are Node Buffers.
|
|
6
6
|
*/
|
|
7
7
|
import { createRequire } from 'module';
|
|
8
|
+
import { existsSync } from 'fs';
|
|
8
9
|
import { dirname, join } from 'path';
|
|
9
10
|
import { fileURLToPath } from 'url';
|
|
10
11
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
11
12
|
const require = createRequire(import.meta.url);
|
|
12
|
-
//
|
|
13
|
-
|
|
13
|
+
// Walk up from __dirname to find the package root containing build/Release/memoryfile.node.
|
|
14
|
+
// Works from source (src/), compiled (dist/src/), and npx cache contexts.
|
|
15
|
+
// Walk up from __dirname to find the package root containing build/Release/memoryfile.node.
// Works from source (src/), compiled (dist/src/), and npx cache contexts.
function findNative() {
    let dir = __dirname;
    // Stop the walk at the filesystem root (e.g. '/' or 'C:\').
    const { root } = require('path').parse(dir);
    while (dir !== root) {
        const candidate = join(dir, 'build', 'Release', 'memoryfile.node');
        if (existsSync(candidate))
            return candidate;
        dir = dirname(dir);
    }
    throw new Error('Could not find native memoryfile.node — was the C addon built? Run: node-gyp rebuild');
}
// Resolved once at module load; a missing addon fails the import immediately
// rather than at first use.
const native = require(findNative());
|
|
14
27
|
export class MemoryFile {
|
|
15
28
|
handle;
|
|
16
29
|
closed = false;
|
package/dist/src/stringtable.js
CHANGED
|
@@ -43,7 +43,7 @@ const ENT_LEN = 8; // u16
|
|
|
43
43
|
const ENT_DATA = 10; // u8[len]
|
|
44
44
|
const ENT_HEADER_SIZE = 10;
|
|
45
45
|
// Hash index field offsets (relative to index block start)
|
|
46
|
-
const
|
|
46
|
+
const _IDX_BUCKET_COUNT = 0; // u32
|
|
47
47
|
const IDX_BUCKETS = 8; // u64[bucket_count]
|
|
48
48
|
const INITIAL_BUCKETS = 4096;
|
|
49
49
|
const LOAD_FACTOR_THRESHOLD = 0.7;
|
|
@@ -159,7 +159,7 @@ export class StringTable {
|
|
|
159
159
|
const data = Buffer.from(str, 'utf-8');
|
|
160
160
|
const hash = fnv1a(data);
|
|
161
161
|
const bucketCount = this.getBucketCount();
|
|
162
|
-
|
|
162
|
+
const bucket = hash % bucketCount;
|
|
163
163
|
// Linear probe to find existing or empty slot
|
|
164
164
|
for (let i = 0; i < bucketCount; i++) {
|
|
165
165
|
const slotIdx = (bucket + i) % bucketCount;
|
|
@@ -215,7 +215,7 @@ export class StringTable {
|
|
|
215
215
|
const data = Buffer.from(str, 'utf-8');
|
|
216
216
|
const hash = fnv1a(data);
|
|
217
217
|
const bucketCount = this.getBucketCount();
|
|
218
|
-
|
|
218
|
+
const bucket = hash % bucketCount;
|
|
219
219
|
for (let i = 0; i < bucketCount; i++) {
|
|
220
220
|
const slotIdx = (bucket + i) % bucketCount;
|
|
221
221
|
const entryOffset = this.getBucket(slotIdx);
|
|
@@ -264,10 +264,28 @@ export class StringTable {
|
|
|
264
264
|
get count() {
|
|
265
265
|
return this.getEntryCount();
|
|
266
266
|
}
|
|
267
|
+
    /**
     * Iterate over all live strings in the table.
     * Yields { id, text, refcount } for each entry, where `id` is the
     * entry's offset (BigInt) into the table file.
     *
     * NOTE(review): walks the hash-index buckets and assumes each live
     * entry is referenced by exactly one bucket slot (open addressing),
     * so no entry is yielded twice — confirm against the
     * index-maintenance code (removeFromIndex / rehash).
     */
    *entries() {
        const bucketCount = this.getBucketCount();
        for (let i = 0; i < bucketCount; i++) {
            const entryOffset = this.getBucket(i);
            // 0n marks an empty bucket slot.
            if (entryOffset === 0n)
                continue;
            const entry = this.readEntry(entryOffset);
            yield {
                id: entryOffset,
                text: entry.data.toString('utf-8'),
                refcount: entry.refcount,
            };
        }
    }
|
|
267
285
|
// --- Hash index management ---
|
|
268
286
|
removeFromIndex(offset, hash) {
|
|
269
287
|
const bucketCount = this.getBucketCount();
|
|
270
|
-
|
|
288
|
+
const bucket = hash % bucketCount;
|
|
271
289
|
// Find the entry in the index
|
|
272
290
|
for (let i = 0; i < bucketCount; i++) {
|
|
273
291
|
const slotIdx = (bucket + i) % bucketCount;
|
|
@@ -305,7 +323,7 @@ export class StringTable {
|
|
|
305
323
|
slot = (slot + 1) % bucketCount;
|
|
306
324
|
}
|
|
307
325
|
}
|
|
308
|
-
needsRelocation(natural, empty, current,
|
|
326
|
+
needsRelocation(natural, empty, current, _size) {
|
|
309
327
|
// Is 'empty' between 'natural' and 'current' in the circular probe sequence?
|
|
310
328
|
if (natural <= current) {
|
|
311
329
|
return natural <= empty && empty < current;
|
|
@@ -343,7 +361,7 @@ export class StringTable {
|
|
|
343
361
|
continue;
|
|
344
362
|
// Read hash and insert into new index
|
|
345
363
|
const entry = this.readEntry(entryOffset);
|
|
346
|
-
|
|
364
|
+
const bucket = entry.hash % newBucketCount;
|
|
347
365
|
for (let j = 0; j < newBucketCount; j++) {
|
|
348
366
|
const slotIdx = (bucket + j) % newBucketCount;
|
|
349
367
|
const slotPos = newIndexOffset + BigInt(IDX_BUCKETS + slotIdx * 8);
|