@levalicious/server-memory 0.0.13 → 0.0.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,618 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * textrank-experiment.ts — Chunking + Document Vectors
4
+ *
5
+ * Takes a document, produces:
6
+ * 1. TextChunk chain (same scheme as kb_load.py) — the structural backbone
7
+ * 2. TF-IDF weight vector for the WHOLE DOCUMENT
8
+ * 3. Cosine similarity function over subvectors (for chunk-chunk comparison)
9
+ *
10
+ * The weight vector is document-level. Chunks project into it via their word sets.
11
+ *
12
+ * Toggle USE_LOG_TF to switch between raw TF and log(1+TF).
13
+ *
14
+ * Usage:
15
+ * MEMORY_FILE_PATH=~/.local/share/memory/vscode.json \
16
+ * npx tsx scripts/textrank-experiment.ts <file>
17
+ */
18
+ import * as fs from 'fs';
19
+ import * as path from 'path';
20
+ import * as crypto from 'crypto';
21
+ import { StringTable } from '../src/stringtable.js';
22
// ─── Configuration ──────────────────────────────────────────────────
/** Flip this to switch between log(1 + rawCount) and rawCount for TF. */
const USE_LOG_TF = false;
// KB constraints (matching kb_load.py)
// Maximum UTF-16 length (JS string.length) of one observation string.
const MAX_OBS_LENGTH = 140;
// Maximum number of observations packed into a single entity/chunk.
const MAX_OBS_PER_ENTITY = 2;
28
// ─── Helpers ────────────────────────────────────────────────────────
/**
 * Length of a string as JavaScript measures it: UTF-16 code units.
 * Astral characters (e.g. emoji) therefore count as 2.
 *
 * @param {string} s - Input string.
 * @returns {number} UTF-16 code-unit count.
 */
function jsLength(s) {
    // `string.length` is already the UTF-16 code-unit count in JS.
    const { length } = s;
    return length;
}
34
/**
 * Locate every space-delimited word in `text`, reporting offsets shifted
 * by `offset` (so callers can label words relative to a larger document).
 *
 * Only the literal space character delimits words; other whitespace is
 * assumed to have been normalized away already.
 *
 * @param {string} text - Text to scan.
 * @param {number} offset - Base offset added to every start/end position.
 * @returns {Array<{text: string, normalized: string, start: number, end: number}>}
 */
function labelWords(text, offset) {
    const words = [];
    // One or more non-space characters = one word. A fresh regex literal
    // keeps lastIndex state local to this call.
    const wordRe = /[^ ]+/g;
    let match;
    while ((match = wordRe.exec(text)) !== null) {
        const raw = match[0];
        words.push({
            text: raw,
            normalized: raw.toLowerCase(),
            start: offset + match.index,
            end: offset + match.index + raw.length,
        });
    }
    return words;
}
59
// ─── Splitting ──────────────────────────────────────────────────────
/**
 * Split normalized text into observations of at most MAX_OBS_LENGTH JS
 * chars (UTF-16 code units), aligned to word boundaries when possible.
 *
 * @param {string} text - Space-normalized document text.
 * @returns {Array<{text: string, start: number, end: number, words: object[]}>}
 *   Observations with offsets into `text` and labeled words.
 */
function splitIntoObservations(text) {
    const observations = [];
    let pos = 0;
    while (pos < text.length) {
        const remaining = text.slice(pos);
        // Tail fits entirely: emit it and stop.
        if (jsLength(remaining) <= MAX_OBS_LENGTH) {
            observations.push({
                text: remaining,
                start: pos,
                end: pos + remaining.length,
                words: labelWords(remaining, pos),
            });
            break;
        }
        // Find the last space at index <= MAX_OBS_LENGTH. The prefix
        // remaining.slice(0, i) always has JS length exactly i, so compare
        // the index directly — the previous per-character slice made this
        // loop accidentally O(n^2) with one substring allocation per char.
        let splitAt = 0;
        for (let i = 0; i <= MAX_OBS_LENGTH && i < remaining.length; i++) {
            if (remaining[i] === ' ') {
                splitAt = i;
            }
        }
        if (splitAt === 0) {
            // No space fits — hard split at the length boundary, but never
            // in the middle of a surrogate pair.
            let jsLen = 0;
            for (let i = 0; i < remaining.length; i++) {
                // An astral code point occupies 2 UTF-16 units.
                const charLen = remaining.codePointAt(i) > 0xFFFF ? 2 : 1;
                if (jsLen + charLen > MAX_OBS_LENGTH) {
                    splitAt = i;
                    break;
                }
                jsLen += charLen;
                // Skip the low surrogate of an astral code point.
                if (charLen === 2)
                    i++;
            }
            if (splitAt === 0)
                splitAt = remaining.length;
        }
        const obsText = remaining.slice(0, splitAt).trimEnd();
        observations.push({
            text: obsText,
            start: pos,
            end: pos + obsText.length,
            words: labelWords(obsText, pos),
        });
        pos += splitAt;
        // Skip whitespace after the split point so the next observation
        // starts on a word.
        while (pos < text.length && text[pos] === ' ')
            pos++;
    }
    return observations;
}
118
/**
 * Group observations into chunks holding at most MAX_OBS_PER_ENTITY
 * observations each (one KB entity per chunk), preserving order.
 *
 * @param {Array<object>} observations - Ordered observation list.
 * @returns {Array<{index: number, observations: Array<object>}>}
 */
function chunkObservations(observations) {
    const chunks = [];
    let cursor = 0;
    while (cursor < observations.length) {
        chunks.push({
            index: chunks.length,
            observations: observations.slice(cursor, cursor + MAX_OBS_PER_ENTITY),
        });
        cursor += MAX_OBS_PER_ENTITY;
    }
    return chunks;
}
129
// ─── Accessors ──────────────────────────────────────────────────────
/**
 * Collect every labeled word across a chunk's observations, in order.
 *
 * @param {{observations: Array<{words: Array<object>}>}} chunk
 * @returns {Array<object>} Flat word list.
 */
function _chunkWords(chunk) {
    return chunk.observations.flatMap((obs) => obs.words);
}
138
/**
 * Full text of a chunk: its observation texts joined with single spaces.
 *
 * @param {{observations: Array<{text: string}>}} chunk
 * @returns {string}
 */
function _chunkText(chunk) {
    const parts = chunk.observations.map((obs) => obs.text);
    return parts.join(' ');
}
142
// ─── Pipeline ───────────────────────────────────────────────────────
/**
 * Normalize text the same way kb_load.py does: CRLF → LF, collapse
 * horizontal whitespace, cap blank-line runs, trim, then collapse ALL
 * remaining whitespace runs to single spaces.
 *
 * @param {string} text - Raw document text.
 * @returns {string} Single-line, single-spaced text.
 */
function normalize(text) {
    return text
        .replace(/\r\n/g, '\n')
        .replace(/[ \t]+/g, ' ')
        .replace(/\n{3,}/g, '\n\n')
        .trim()
        .split(/\s+/)
        .join(' ');
}
151
/**
 * Random 24-character lowercase-hex identifier (12 random bytes).
 *
 * @returns {string} Hex ID.
 */
function generateId() {
    const bytes = crypto.randomBytes(12);
    return bytes.toString('hex');
}
154
/**
 * Full pipeline: normalize → observations → chunks with labeled words
 * and freshly assigned hex IDs.
 *
 * @param {string} text - Raw document text.
 * @returns {{chunks: Array<object>, allWords: Array<object>}}
 *   The chunk list plus every labeled word across the whole document.
 */
export function processDocument(text) {
    const chunks = chunkObservations(splitIntoObservations(normalize(text)));
    // Each chunk becomes its own KB entity, so each needs a unique ID.
    for (const chunk of chunks) {
        chunk.id = generateId();
    }
    // Flatten every observation's words, preserving document order.
    const allWords = chunks.flatMap(
        (chunk) => chunk.observations.flatMap((obs) => obs.words));
    return { chunks, allWords };
}
172
// ─── TF-IDF Vector ──────────────────────────────────────────────────
/**
 * Build the document's TF-IDF weight vector: word → tf(word) * idf(word).
 *
 * TF is the raw in-document count, or log(1 + count) when USE_LOG_TF is
 * set. Words missing from `idf` get weight 0.
 *
 * @param {Array<{normalized: string}>} allWords - All labeled words.
 * @param {Map<string, number>} idf - Per-word IDF values.
 * @returns {Map<string, number>} word → TF-IDF weight.
 */
function buildWeightVector(allWords, idf) {
    // Tally raw occurrence counts per normalized word.
    const rawCounts = new Map();
    for (const { normalized } of allWords) {
        rawCounts.set(normalized, (rawCounts.get(normalized) ?? 0) + 1);
    }
    const weights = new Map();
    for (const [word, count] of rawCounts) {
        const tf = USE_LOG_TF ? Math.log(1 + count) : count;
        weights.set(word, tf * (idf.get(word) ?? 0));
    }
    return weights;
}
194
+ // ─── Cosine Similarity ──────────────────────────────────────────────
195
+ /**
196
+ * Cosine similarity between two subsets of the document weight vector.
197
+ *
198
+ * Given the full weight vector W and two sets of word keys A and B,
199
+ * computes cosine(W|_A, W|_B) where W|_X is the subvector restricted
200
+ * to the dimensions in X.
201
+ *
202
+ * The shared dimensions (A ∩ B) contribute to the dot product.
203
+ * Each side's norm is computed over its own dimensions only.
204
+ *
205
+ * Returns 0 if either subvector has zero norm.
206
+ */
207
+ function cosineSimilarity(weights, keysA, keysB) {
208
+ // Dot product: only shared dimensions
209
+ let dot = 0;
210
+ for (const word of keysA) {
211
+ if (keysB.has(word)) {
212
+ const w = weights.get(word) ?? 0;
213
+ dot += w * w; // same vector, so weight(word) appears on both sides
214
+ }
215
+ }
216
+ // Norms: each over its own dimensions
217
+ let normA = 0;
218
+ for (const word of keysA) {
219
+ const w = weights.get(word) ?? 0;
220
+ normA += w * w;
221
+ }
222
+ let normB = 0;
223
+ for (const word of keysB) {
224
+ const w = weights.get(word) ?? 0;
225
+ normB += w * w;
226
+ }
227
+ const denom = Math.sqrt(normA) * Math.sqrt(normB);
228
+ return denom === 0 ? 0 : dot / denom;
229
+ }
230
// ─── IDF Vector (from string table) ─────────────────────────────────
/**
 * Derive classic document-frequency statistics from the string table.
 *
 * Each interned string contributes its refcount (number of entities
 * referencing it = number of "documents") once per UNIQUE word it
 * contains. corpusSize is the sum of all refcounts.
 *
 * @param {{entries: () => Iterable<{text: string, refcount: number}>}} st
 *   String table exposing an entries() iterator.
 * @returns {{df: Map<string, number>, corpusSize: number}}
 */
function deriveCorpusDocFreqs(st) {
    const df = new Map();
    let corpusSize = 0;
    for (const { text, refcount } of st.entries()) {
        corpusSize += refcount;
        // Presence, not count: each word contributes at most once per string.
        const uniqueWords = new Set(
            text.toLowerCase().split(/\s+/).filter((w) => w.length > 0));
        for (const word of uniqueWords) {
            df.set(word, (df.get(word) ?? 0) + refcount);
        }
    }
    return { df, corpusSize };
}
258
/**
 * Build the IDF vector for a document vocabulary using classic
 * document-frequency IDF: IDF(word) = log(N / (1 + df(word))) + 1.
 *
 * @param {Iterable<string>} docVocab - Unique document words.
 * @param {Map<string, number>} df - Corpus document frequencies.
 * @param {number} corpusSize - N, the total reference count.
 * @returns {Map<string, number>} word → IDF value.
 */
function buildIdfVector(docVocab, df, corpusSize) {
    const idf = new Map();
    for (const word of docVocab) {
        const docFreq = df.get(word) ?? 0;
        const value = Math.log(corpusSize / (1 + docFreq)) + 1;
        idf.set(word, value);
    }
    return idf;
}
270
// ─── TextRank Core ──────────────────────────────────────────────────
const TEXTRANK_DAMPING = 0.85;
const TEXTRANK_ITERATIONS = 30000;
const TEXTRANK_CONVERGENCE = 1e-6;
/**
 * Generic weighted PageRank over a square weight matrix.
 *
 * Iterates until the L1 change between rounds drops below
 * TEXTRANK_CONVERGENCE (or the iteration cap is hit). Nodes with zero
 * outgoing weight distribute nothing.
 *
 * @param {number[][]} matrix - Square edge-weight matrix.
 * @returns {number[]} Score per node (empty for an empty matrix).
 */
function pageRank(matrix) {
    const n = matrix.length;
    if (n === 0)
        return [];
    // Total outgoing weight per node, used to normalize contributions.
    const outWeight = matrix.map((row) => row.reduce((acc, v) => acc + v, 0));
    let current = new Array(n).fill(1 / n);
    for (let iter = 0; iter < TEXTRANK_ITERATIONS; iter++) {
        const next = current.map((_, i) => {
            let incoming = 0;
            for (let j = 0; j < n; j++) {
                // Self-edges and weightless nodes contribute nothing.
                if (j === i || !(outWeight[j] > 0))
                    continue;
                incoming += (matrix[j][i] / outWeight[j]) * current[j];
            }
            return (1 - TEXTRANK_DAMPING) / n + TEXTRANK_DAMPING * incoming;
        });
        const delta = next.reduce(
            (acc, v, i) => acc + Math.abs(v - current[i]), 0);
        current = next;
        if (delta < TEXTRANK_CONVERGENCE)
            break;
    }
    return current;
}
305
// ─── TextRank for Keywords (word co-occurrence graph) ───────────────
const COOCCURRENCE_WINDOW = 5;
/**
 * TextRank keyword extraction over a word co-occurrence graph.
 *
 * Nodes are unique normalized words; the edge weight between two words
 * counts how often they appear within COOCCURRENCE_WINDOW positions of
 * each other in the word sequence.
 *
 * @param {Array<{normalized: string}>} allWords - Document word sequence.
 * @returns {Array<{word: string, score: number}>} Sorted best-first.
 */
function wordTextRank(allWords) {
    // Assign each unique word a stable index in first-seen order.
    const vocabIndex = new Map();
    for (const { normalized } of allWords) {
        if (!vocabIndex.has(normalized)) {
            vocabIndex.set(normalized, vocabIndex.size);
        }
    }
    const vocabList = [...vocabIndex.keys()];
    const n = vocabList.length;
    const matrix = Array.from({ length: n }, () => new Array(n).fill(0));
    // Slide the co-occurrence window over the word sequence.
    for (let i = 0; i < allWords.length; i++) {
        const a = vocabIndex.get(allWords[i].normalized);
        const windowEnd = Math.min(i + COOCCURRENCE_WINDOW, allWords.length);
        for (let j = i + 1; j < windowEnd; j++) {
            const b = vocabIndex.get(allWords[j].normalized);
            if (a === b)
                continue;
            matrix[a][b] += 1;
            matrix[b][a] += 1;
        }
    }
    return pageRank(matrix)
        .map((score, i) => ({ word: vocabList[i], score }))
        .sort((x, y) => y.score - x.score);
}
341
/**
 * Split normalized document text into sentences on . ? !
 * (Crude but functional for this experiment.)
 *
 * Sentences shorter than 3 words are dropped. Each sentence records its
 * start offset into the normalized text and its lowercase word list.
 *
 * @param {string} normalizedText - Single-line normalized text.
 * @returns {Array<{index: number, text: string, start: number, words: string[]}>}
 */
function splitSentences(normalizedText) {
    const sentences = [];
    // Admit a candidate sentence if non-empty and at least 3 words long.
    const pushSentence = (rawText, start) => {
        const text = rawText.trim();
        if (text.length === 0)
            return;
        const words = text.toLowerCase().split(/\s+/).filter(w => w.length > 0);
        if (words.length >= 3) {
            sentences.push({ index: sentences.length, text, start, words });
        }
    };
    // Match whitespace that directly follows sentence-ending punctuation;
    // the punctuation itself sits at match.index - 1, so slicing up to
    // match.index (exclusive) already includes it. (The previous
    // `match.index + 1` dragged in one whitespace char that trim() then
    // had to remove.)
    const re = /(?<=[.?!])\s+/g;
    let pos = 0;
    let match;
    while ((match = re.exec(normalizedText)) !== null) {
        pushSentence(normalizedText.slice(pos, match.index), pos);
        pos = match.index + match[0].length;
    }
    // Remainder after the last sentence break.
    if (pos < normalizedText.length) {
        pushSentence(normalizedText.slice(pos), pos);
    }
    return sentences;
}
373
/**
 * TextRank over sentences.
 *
 * Nodes are sentences; the edge weight between two sentences is the
 * cosine similarity of their TF-IDF subvectors (projected from the
 * shared document weight vector via each sentence's word set).
 *
 * @param {Array<{words: string[]}>} sentences - Sentence list.
 * @param {Map<string, number>} weights - Document TF-IDF vector.
 * @returns {Array<{sentence: object, score: number}>} Sorted best-first.
 */
function sentenceTextRank(sentences, weights) {
    const n = sentences.length;
    const keySets = sentences.map((s) => new Set(s.words));
    const matrix = Array.from({ length: n }, () => new Array(n).fill(0));
    // Similarity is symmetric, so compute each pair once and mirror it.
    for (let i = 0; i < n; i++) {
        for (let j = 0; j < i; j++) {
            const sim = cosineSimilarity(weights, keySets[j], keySets[i]);
            matrix[j][i] = sim;
            matrix[i][j] = sim;
        }
    }
    return pageRank(matrix)
        .map((score, i) => ({ sentence: sentences[i], score }))
        .sort((a, b) => b.score - a.score);
}
393
/**
 * Build the chunk-chain relation structure for a document:
 * title ↔ first/last chunk plus bidirectional follows/preceded_by links
 * between consecutive chunks.
 *
 * @param {string} title - Document title (entity name).
 * @param {Array<{id: string}>} chunks - Ordered chunk list.
 * @returns {{title: string, chunks: Array<object>, relations: Array<object>}}
 */
function buildChain(title, chunks) {
    const relations = [];
    if (chunks.length === 0)
        return { title, chunks, relations };
    const link = (from, to, relationType) =>
        relations.push({ from, to, relationType });
    const first = chunks[0];
    const last = chunks[chunks.length - 1];
    // Title <-> first chunk
    link(title, first.id, 'starts_with');
    link(first.id, title, 'belongs_to');
    // Title <-> last chunk (only when it differs from the first)
    if (chunks.length > 1) {
        link(title, last.id, 'ends_with');
        link(last.id, title, 'belongs_to');
    }
    // Chain consecutive chunks both ways.
    for (let i = 0; i + 1 < chunks.length; i++) {
        link(chunks[i].id, chunks[i + 1].id, 'follows');
        link(chunks[i + 1].id, chunks[i].id, 'preceded_by');
    }
    return { title, chunks, relations };
}
413
/**
 * Find the chunk whose span contains the start offset of a sentence.
 *
 * Chunks tile the normalized text via their observations' start/end
 * offsets; a chunk's span runs from its first observation's start to
 * its last observation's end (exclusive).
 *
 * @param {{start: number}} sentence - Sentence with a start offset.
 * @param {Array<{observations: Array<{start: number, end: number}>}>} chunks
 * @returns {object|null} Containing chunk, or null when none matches.
 */
function sentenceToChunk(sentence, chunks) {
    const target = sentence.start;
    const containsTarget = (chunk) => {
        const spanStart = chunk.observations[0].start;
        const spanEnd = chunk.observations[chunk.observations.length - 1].end;
        return target >= spanStart && target < spanEnd;
    };
    return chunks.find(containsTarget) ?? null;
}
429
/**
 * Build the index entity for a document.
 *
 * - Takes the top-K ranked sentences
 * - Maps each to its containing chunk
 * - Deduplicates chunk references
 * - Packs top sentence previews into the index entity's 2 observations
 * - Creates relations: title → has_index → index, index → highlights → chunk
 *
 * @param {string} title - Document title; the index ID is `${title}__index`.
 * @param {Array<object>} chunks - The document's TextChunks.
 * @param {Array<{sentence: object, score: number}>} rankedSentences - Best-first.
 * @param {number} topK - How many top sentences to consider.
 * @returns {{indexId: string, observations: string[], relations: object[], references: object[]}}
 */
function buildIndex(title, chunks, rankedSentences, topK) {
    const indexId = `${title}__index`;
    const topSents = rankedSentences.slice(0, topK);
    // Map sentences to chunks, deduplicate: keep only the first (highest-
    // scoring) sentence per chunk.
    const references = [];
    const seenChunks = new Set();
    for (const { sentence, score } of topSents) {
        const chunk = sentenceToChunk(sentence, chunks);
        // Sentence start may fall outside every chunk span (e.g. in skipped
        // whitespace); such sentences are silently dropped.
        if (!chunk)
            continue;
        if (seenChunks.has(chunk.id))
            continue;
        seenChunks.add(chunk.id);
        references.push({
            chunkId: chunk.id,
            chunkIndex: chunk.index,
            sentence: sentence.text,
            score,
        });
    }
    // Pack top sentence previews into observations (max 2 × 140 chars)
    // Truncate sentences to fit, separate with " | "
    const observations = [];
    let current = '';   // the observation currently being filled
    for (const ref of references) {
        // Each preview is capped at 60 chars (57 + ellipsis).
        const preview = ref.sentence.length > 60
            ? ref.sentence.slice(0, 57) + '...'
            : ref.sentence;
        const candidate = current ? current + ' | ' + preview : preview;
        if (candidate.length <= MAX_OBS_LENGTH) {
            // Preview still fits in the current observation; extend it.
            current = candidate;
        }
        else {
            // Current observation is full: flush it and start a new one
            // with this preview (hard-truncated if it alone exceeds the cap).
            if (current)
                observations.push(current);
            if (observations.length >= MAX_OBS_PER_ENTITY)
                break;
            current = preview.length <= MAX_OBS_LENGTH ? preview : preview.slice(0, MAX_OBS_LENGTH);
        }
    }
    // Flush the final partially-filled observation if there is room.
    if (current && observations.length < MAX_OBS_PER_ENTITY) {
        observations.push(current);
    }
    // Relations
    const relations = [];
    // title → index (and back)
    relations.push({ from: title, to: indexId, relationType: 'has_index' });
    relations.push({ from: indexId, to: title, relationType: 'indexes' });
    // index → highlighted chunks (and back)
    for (const ref of references) {
        relations.push({ from: indexId, to: ref.chunkId, relationType: 'highlights' });
        relations.push({ from: ref.chunkId, to: indexId, relationType: 'highlighted_by' });
    }
    return { indexId, observations, relations, references };
}
493
// ─── Main ───────────────────────────────────────────────────────────
/**
 * CLI entry point: read a document (argument, --text, or stdin), chunk
 * it, derive IDF from the KB string table, run word and sentence
 * TextRank, build the index entity, and print a report.
 *
 * Requires MEMORY_FILE_PATH to point at the KB JSON file; the string
 * table is expected beside it as `<base>.strings`. Exits(1) on missing
 * input, missing env var, or missing string table.
 */
function main() {
    const args = process.argv.slice(2);
    let _verbose = false;
    const filtered = [];
    // Separate flags from positional arguments.
    for (const arg of args) {
        if (arg === '-v' || arg === '--verbose')
            _verbose = true;
        else
            filtered.push(arg);
    }
    // Read document: inline --text, a file path, or piped stdin.
    let text;
    let title = 'untitled';
    if (filtered[0] === '--text' && filtered[1]) {
        text = filtered[1];
        title = filtered[2] ?? 'untitled';
    }
    else if (filtered[0] && filtered[0] !== '-') {
        text = fs.readFileSync(filtered[0], 'utf-8');
        // Title = file name without its extension.
        title = path.basename(filtered[0], path.extname(filtered[0]));
    }
    else if (!process.stdin.isTTY) {
        // fd 0 = stdin.
        text = fs.readFileSync(0, 'utf-8');
    }
    else {
        console.error('Usage: npx tsx scripts/textrank-experiment.ts <file>');
        process.exit(1);
    }
    // Process document
    const { chunks, allWords } = processDocument(text);
    // Build chain
    const chain = buildChain(title, chunks);
    // Open KB string table for IDF (required)
    const memPath = process.env.MEMORY_FILE_PATH;
    if (!memPath) {
        console.error('MEMORY_FILE_PATH must be set.');
        process.exit(1);
    }
    // String table lives next to the memory file, with a .strings suffix.
    const dir = path.dirname(memPath);
    const base = path.basename(memPath, path.extname(memPath));
    const strPath = path.join(dir, `${base}.strings`);
    if (!fs.existsSync(strPath)) {
        console.error(`String table not found at ${strPath}`);
        process.exit(1);
    }
    const st = new StringTable(strPath);
    console.error(`String table loaded: ${strPath} (${st.count} entries)`);
    const { df, corpusSize } = deriveCorpusDocFreqs(st);
    // Vocab = all unique words in this document
    const vocab = new Set(allWords.map(w => w.normalized));
    const idf = buildIdfVector(vocab, df, corpusSize);
    // Build the single document weight vector
    const weights = buildWeightVector(allWords, idf);
    // ─── Output ─────────────────────────────────────────────────────
    const tfMode = USE_LOG_TF ? 'log(1+count)' : 'raw count';
    console.log(`Document: "${title}" [TF mode: ${tfMode}]`);
    console.log(` ${text.length} chars, ${allWords.length} words, ${vocab.size} unique`);
    console.log(` ${chain.chunks.length} chunks, ${chain.relations.length} relations`);
    console.log(` Corpus: N=${corpusSize} entity-string refs, ${df.size} unique words`);
    console.log();
    // ─── Word TextRank (co-occurrence graph) ────────────────────────
    // Progress goes to stderr so stdout stays a clean report.
    console.error('Running word TextRank...');
    const rankedWords = wordTextRank(allWords);
    console.log('=== Word TextRank (top 40 keywords) ===');
    for (const { word, score } of rankedWords.slice(0, 40)) {
        const idfW = idf.get(word) ?? 0;
        const tfidf = weights.get(word) ?? 0;
        console.log(` ${score.toFixed(6)} ${word.padEnd(25)} TF-IDF: ${tfidf.toFixed(2).padStart(7)} IDF: ${idfW.toFixed(2).padStart(7)}`);
    }
    console.log();
    // ─── Sentence TextRank ──────────────────────────────────────────
    // Sentence offsets must be relative to the normalized text, so
    // re-normalize here (same transform processDocument applied).
    const normalized = normalize(text);
    const sentences = splitSentences(normalized);
    console.error(`Running sentence TextRank (${sentences.length} sentences)...`);
    const rankedSentences = sentenceTextRank(sentences, weights);
    // ─── Build Index Entity ─────────────────────────────────────────
    const index = buildIndex(title, chain.chunks, rankedSentences, 15);
    // ─── Output: Full Graph Description ─────────────────────────────
    // Collect all entities
    const entities = [];
    // Document entity
    entities.push({ name: title, type: 'Document', observations: [] });
    // Chain chunks
    for (const chunk of chain.chunks) {
        entities.push({
            name: chunk.id,
            type: 'TextChunk',
            observations: chunk.observations.map(o => o.text),
        });
    }
    // Index entity
    entities.push({
        name: index.indexId,
        type: 'DocumentIndex',
        observations: index.observations,
    });
    // Collect all relations
    const allRelations = [...chain.relations, ...index.relations];
    console.log(`=== Graph Structure ===`);
    console.log(` Entities: ${entities.length} (1 Document + ${chain.chunks.length} TextChunks + 1 DocumentIndex)`);
    console.log(` Relations: ${allRelations.length} (${chain.relations.length} chain + ${index.relations.length} index)`);
    console.log();
    // Index details
    console.log(`=== Index: "${index.indexId}" ===`);
    console.log(` Observations:`);
    for (const obs of index.observations) {
        console.log(` "${obs}"`);
    }
    console.log(` Highlights ${index.references.length} chunks:`);
    for (const ref of index.references) {
        const preview = ref.sentence.length > 80 ? ref.sentence.slice(0, 77) + '...' : ref.sentence;
        console.log(` chunk[${String(ref.chunkIndex).padStart(3)}] (score: ${ref.score.toFixed(4)}) ${preview}`);
    }
    console.log();
    // Top sentences for reference
    console.log(`=== Sentence TextRank (top 15) ===`);
    for (const { sentence, score } of rankedSentences.slice(0, 15)) {
        const preview = sentence.text.length > 100 ? sentence.text.slice(0, 97) + '...' : sentence.text;
        console.log(` (${score.toFixed(6)}) ${preview}`);
    }
    console.log();
    // Cleanup: release the string table handle.
    st.close();
}
main();