@mastra/rag 1.2.3-alpha.0 → 1.2.3-alpha.2
This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- package/CHANGELOG.md +18 -0
- package/package.json +19 -6
- package/.turbo/turbo-build.log +0 -4
- package/docker-compose.yaml +0 -22
- package/eslint.config.js +0 -6
- package/src/document/document.test.ts +0 -2975
- package/src/document/document.ts +0 -335
- package/src/document/extractors/base.ts +0 -30
- package/src/document/extractors/index.ts +0 -5
- package/src/document/extractors/keywords.test.ts +0 -125
- package/src/document/extractors/keywords.ts +0 -126
- package/src/document/extractors/questions.test.ts +0 -120
- package/src/document/extractors/questions.ts +0 -111
- package/src/document/extractors/summary.test.ts +0 -107
- package/src/document/extractors/summary.ts +0 -122
- package/src/document/extractors/title.test.ts +0 -121
- package/src/document/extractors/title.ts +0 -185
- package/src/document/extractors/types.ts +0 -40
- package/src/document/index.ts +0 -2
- package/src/document/prompts/base.ts +0 -77
- package/src/document/prompts/format.ts +0 -9
- package/src/document/prompts/index.ts +0 -15
- package/src/document/prompts/prompt.ts +0 -60
- package/src/document/prompts/types.ts +0 -29
- package/src/document/schema/index.ts +0 -3
- package/src/document/schema/node.ts +0 -187
- package/src/document/schema/types.ts +0 -40
- package/src/document/transformers/character.ts +0 -267
- package/src/document/transformers/html.ts +0 -346
- package/src/document/transformers/json.ts +0 -536
- package/src/document/transformers/latex.ts +0 -11
- package/src/document/transformers/markdown.ts +0 -239
- package/src/document/transformers/semantic-markdown.ts +0 -227
- package/src/document/transformers/sentence.ts +0 -314
- package/src/document/transformers/text.ts +0 -158
- package/src/document/transformers/token.ts +0 -137
- package/src/document/transformers/transformer.ts +0 -5
- package/src/document/types.ts +0 -145
- package/src/document/validation.ts +0 -158
- package/src/graph-rag/index.test.ts +0 -235
- package/src/graph-rag/index.ts +0 -306
- package/src/index.ts +0 -8
- package/src/rerank/index.test.ts +0 -150
- package/src/rerank/index.ts +0 -198
- package/src/rerank/relevance/cohere/index.ts +0 -56
- package/src/rerank/relevance/index.ts +0 -3
- package/src/rerank/relevance/mastra-agent/index.ts +0 -32
- package/src/rerank/relevance/zeroentropy/index.ts +0 -26
- package/src/tools/README.md +0 -153
- package/src/tools/document-chunker.ts +0 -34
- package/src/tools/graph-rag.test.ts +0 -115
- package/src/tools/graph-rag.ts +0 -157
- package/src/tools/index.ts +0 -3
- package/src/tools/types.ts +0 -126
- package/src/tools/vector-query-database-config.test.ts +0 -190
- package/src/tools/vector-query.test.ts +0 -477
- package/src/tools/vector-query.ts +0 -171
- package/src/utils/convert-sources.ts +0 -43
- package/src/utils/default-settings.ts +0 -38
- package/src/utils/index.ts +0 -3
- package/src/utils/tool-schemas.ts +0 -38
- package/src/utils/vector-prompts.ts +0 -832
- package/src/utils/vector-search.ts +0 -130
- package/tsconfig.build.json +0 -9
- package/tsconfig.json +0 -5
- package/tsup.config.ts +0 -17
- package/vitest.config.ts +0 -8
package/src/document/document.test.ts
@@ -1,2975 +0,0 @@
-import { createOpenAI } from '@ai-sdk/openai';
-import { embedMany } from 'ai';
-import { describe, it, expect, vi } from 'vitest';
-
-import { MDocument } from './document';
-import { Language } from './types';
-
-const sampleMarkdown = `
-# Complete Guide to Modern Web Development
-## Introduction
-Welcome to our comprehensive guide on modern web development. This resource covers essential concepts, best practices, and tools that every developer should know in 2024.
-
-### Who This Guide Is For
-- Beginning developers looking to establish a solid foundation
-- Intermediate developers wanting to modernize their skillset
-- Senior developers seeking a refresher on current best practices
-`;
-
-const openai = createOpenAI({
-  apiKey: process.env.OPENAI_API_KEY,
-});
-
-vi.setConfig({ testTimeout: 100_000, hookTimeout: 100_000 });
-
-describe('MDocument', () => {
-  describe('basics', () => {
-    let chunks: MDocument['chunks'];
-    let doc: MDocument;
-    it('initialization', () => {
-      const doc = new MDocument({ docs: [{ text: 'test' }], type: 'text' });
-      expect(doc.getDocs()).toHaveLength(1);
-      expect(doc.getText()?.[0]).toBe('test');
-    });
-
-    it('initialization with array', () => {
-      doc = new MDocument({ docs: [{ text: 'test' }, { text: 'test2' }], type: 'text' });
-      expect(doc.getDocs()).toHaveLength(2);
-      expect(doc.getDocs()[0]?.text).toBe('test');
-      expect(doc.getDocs()[1]?.text).toBe('test2');
-    });
-
-    it('chunk - metadata title', async () => {
-      const doc = MDocument.fromMarkdown(sampleMarkdown);
-
-      chunks = await doc.chunk({
-        maxSize: 1500,
-        overlap: 0,
-        extract: {
-          keywords: true,
-        },
-      });
-
-      expect(doc.getMetadata()?.[0]).toBeTruthy();
-      expect(chunks).toBeInstanceOf(Array);
-    }, 15000);
-
-    it('embed - create embedding from chunk', async () => {
-      const embeddings = await embedMany({
-        values: chunks.map(chunk => chunk.text),
-        model: openai.embedding('text-embedding-3-small'),
-      });
-
-      expect(embeddings).toBeDefined();
-    }, 15000);
-  });
-
-  describe('chunkCharacter', () => {
-    it('should split text on simple separator', async () => {
-      const text = 'Hello world\n\nHow are you\n\nI am fine';
-
-      const doc = MDocument.fromText(text, { meta: 'data' });
-
-      await doc.chunk({
-        strategy: 'character',
-        separator: '\n\n',
-        isSeparatorRegex: false,
-        maxSize: 50,
-        overlap: 5,
-      });
-
-      const chunks = doc.getDocs();
-
-      expect(chunks).toHaveLength(3);
-      expect(chunks?.[0]?.text).toBe('Hello world');
-      expect(chunks?.[1]?.text).toBe('How are you');
-      expect(chunks?.[2]?.text).toBe('I am fine');
-    });
-
-    it('should handle regex separator', async () => {
-      const text = 'Hello world\n\nHow are you';
-
-      const doc = MDocument.fromText(text, { meta: 'data' });
-
-      await doc.chunk({
-        strategy: 'character',
-        separator: '\\s+',
-        isSeparatorRegex: true,
-        maxSize: 50,
-        overlap: 5,
-      });
-
-      expect(doc.getText().join(' ')).toBe('Hello world How are you');
-    });
-
-    it('should keep separator when specified', async () => {
-      const text = 'Hello\n\nWorld';
-
-      const doc = MDocument.fromText(text, { meta: 'data' });
-
-      await doc.chunk({
-        strategy: 'character',
-        separator: '\n\n',
-        isSeparatorRegex: false,
-        maxSize: 50,
-        overlap: 5,
-        keepSeparator: 'end',
-      });
-      const chunks = doc.getText();
-
-      expect(chunks[0]).toBe('Hello\n\n');
-      expect(chunks[1]).toBe('World');
-    });
-
-    describe('separator handling', () => {
-      it('should keep separator at end when specified', async () => {
-        const text = 'Hello\n\nWorld';
-
-        const doc = MDocument.fromText(text, { meta: 'data' });
-
-        await doc.chunk({
-          strategy: 'character',
-          separator: '\n\n',
-          isSeparatorRegex: false,
-          maxSize: 50,
-          overlap: 5,
-          keepSeparator: 'end',
-        });
-
-        const chunks = doc.getText();
-
-        expect(chunks).toHaveLength(2);
-        expect(chunks[0]).toBe('Hello\n\n');
-        expect(chunks[1]).toBe('World');
-      });
-
-      it('should keep separator at start when specified', async () => {
-        const text = 'Hello\n\nWorld\n\nTest';
-
-        const doc = MDocument.fromText(text, { meta: 'data' });
-
-        await doc.chunk({
-          strategy: 'character',
-          separator: '\n\n',
-          isSeparatorRegex: false,
-          maxSize: 50,
-          overlap: 5,
-          keepSeparator: 'start',
-        });
-
-        const chunks = doc.getText();
-
-        expect(chunks).toHaveLength(3);
-        expect(chunks[0]).toBe('Hello');
-        expect(chunks[1]).toBe('\n\nWorld');
-        expect(chunks[2]).toBe('\n\nTest');
-      });
-
-      it('should handle multiple consecutive separators', async () => {
-        const text = 'Hello\n\n\n\nWorld';
-
-        const doc = MDocument.fromText(text, { meta: 'data' });
-
-        await doc.chunk({
-          strategy: 'character',
-          separator: '\n\n',
-          isSeparatorRegex: false,
-          maxSize: 50,
-          overlap: 5,
-          keepSeparator: 'end',
-        });
-
-        const chunks = doc.getText();
-
-        expect(chunks.length).toBeGreaterThan(0);
-        expect(chunks.join('')).toBe(text);
-      });
-
-      it('should handle text ending with separator', async () => {
-        const text = 'Hello\n\nWorld\n\n';
-
-        const doc = MDocument.fromText(text, { meta: 'data' });
-
-        await doc.chunk({
-          strategy: 'character',
-          separator: '\n\n',
-          isSeparatorRegex: false,
-          maxSize: 50,
-          overlap: 5,
-          keepSeparator: 'end',
-        });
-
-        const chunks = doc.getText();
-
-        expect(chunks.length).toBeGreaterThan(0);
-        expect(chunks.join('')).toBe(text);
-      });
-
-      it('should handle text starting with separator', async () => {
-        const text = '\n\nHello\n\nWorld';
-
-        const doc = MDocument.fromText(text, { meta: 'data' });
-
-        await doc.chunk({
-          strategy: 'character',
-          separator: '\n\n',
-          isSeparatorRegex: false,
-          maxSize: 50,
-          overlap: 5,
-          keepSeparator: 'start',
-        });
-
-        const chunks = doc.getText();
-
-        expect(chunks.length).toBeGreaterThan(0);
-        expect(chunks.join('')).toBe(text);
-      });
-    });
-    it('should properly implement overlap in character chunking', async () => {
-      // Test basic overlap functionality
-      const text = 'a'.repeat(500) + 'b'.repeat(500) + 'c'.repeat(500);
-      const chunkSize = 600;
-      const overlap = 100;
-      const doc = MDocument.fromText(text);
-
-      const result = await doc.chunk({
-        strategy: 'character',
-        maxSize: chunkSize,
-        overlap,
-      });
-
-      // Verify overlap between chunks
-      for (let i = 1; i < result.length; i++) {
-        const prevChunk = result[i - 1]?.text;
-        const currentChunk = result[i]?.text;
-
-        if (prevChunk && currentChunk) {
-          // Get the end of the previous chunk and start of current chunk
-          const prevEnd = prevChunk.slice(-overlap);
-          const currentStart = currentChunk.slice(0, overlap);
-
-          // There should be a common substring of length >= min(overlap, chunk length)
-          const commonSubstring = findCommonSubstring(prevEnd, currentStart);
-          expect(commonSubstring.length).toBeGreaterThan(0);
-        }
-      }
-    });
-
-    it('should ensure character chunks never exceed size limit', async () => {
-      // Create text with varying content to test size limits
-      const text = 'a'.repeat(50) + 'b'.repeat(100) + 'c'.repeat(30);
-      const chunkSize = 50;
-      const overlap = 10;
-
-      const doc = MDocument.fromText(text);
-      const chunks = await doc.chunk({
-        strategy: 'character',
-        maxSize: chunkSize,
-        overlap,
-      });
-
-      chunks.forEach((chunk, i) => {
-        if (i > 0) {
-          const prevChunk = chunks[i - 1]?.text;
-          const actualOverlap = chunk.text.slice(0, overlap);
-          const expectedOverlap = prevChunk?.slice(-overlap);
-          expect(actualOverlap).toBe(expectedOverlap);
-        }
-      });
-
-      // Verify each chunk's size
-      let allChunksValid = true;
-      for (const chunk of chunks) {
-        if (chunk.text.length > chunkSize) {
-          allChunksValid = false;
-        }
-      }
-      expect(allChunksValid).toBe(true);
-
-      // Verify overlaps between consecutive chunks
-      for (let i = 1; i < chunks.length; i++) {
-        const prevChunk = chunks[i - 1]!;
-        const currentChunk = chunks[i]!;
-
-        // The end of the previous chunk should match the start of the current chunk
-        const prevEnd = prevChunk.text.slice(-overlap);
-        const currentStart = currentChunk.text.slice(0, overlap);
-
-        expect(currentStart).toBe(prevEnd);
-        expect(currentStart.length).toBeLessThanOrEqual(overlap);
-      }
-    });
-
-    it('should handle end chunks properly in character chunking', async () => {
-      const text = 'This is a test document that needs to be split into chunks with proper handling of the end.';
-      const chunkSize = 20;
-      const overlap = 5;
-
-      const testDoc = MDocument.fromText(text);
-      const chunks = await testDoc.chunk({
-        strategy: 'character',
-        maxSize: chunkSize,
-        overlap,
-      });
-
-      // Verify no tiny fragments at the end
-      const lastChunk = chunks[chunks.length - 1]?.text;
-      expect(lastChunk?.length).toBeGreaterThan(5);
-
-      // Verify each chunk respects size limit
-      let allChunksValid = true;
-      for (const chunk of chunks) {
-        if (chunk.text.length > chunkSize) {
-          allChunksValid = false;
-        }
-      }
-      expect(allChunksValid).toBe(true);
-
-      // Verify the size of each chunk explicitly
-      for (const chunk of chunks) {
-        expect(chunk.text.length).toBeLessThanOrEqual(chunkSize);
-      }
-
-      // Verify overlaps between consecutive chunks
-      for (let i = 1; i < chunks.length; i++) {
-        const prevChunk = chunks[i - 1]!;
-        const currentChunk = chunks[i]!;
-
-        // The end of the previous chunk should match the start of the current chunk
-        const prevEnd = prevChunk.text.slice(-overlap);
-        const currentStart = currentChunk.text.slice(0, overlap);
-
-        expect(currentStart).toBe(prevEnd);
-        expect(currentStart.length).toBeLessThanOrEqual(overlap);
-      }
-    });
-    it('should not create tiny chunks at the end', async () => {
-      const text = 'ABCDEFGHIJ'; // 10 characters
-      const chunkSize = 4;
-      const overlap = 2;
-
-      const doc = MDocument.fromText(text);
-      const chunks = await doc.chunk({
-        strategy: 'character',
-        maxSize: chunkSize,
-        overlap,
-      });
-
-      // Verify we don't have tiny chunks
-      chunks.forEach(chunk => {
-        // Each chunk should be either:
-        // 1. Full size (chunkSize)
-        // 2. Or at least half the chunk maxSize if it's the last chunk
-        const minSize = chunk === chunks[chunks.length - 1] ? Math.floor(chunkSize / 2) : chunkSize;
-        expect(chunk.text.length).toBeGreaterThanOrEqual(minSize);
-      });
-
-      // Verify overlaps are maintained
-      for (let i = 1; i < chunks.length; i++) {
-        const prevChunk = chunks[i - 1]!;
-        const currentChunk = chunks[i]!;
-        const actualOverlap = currentChunk.text.slice(0, overlap);
-        const expectedOverlap = prevChunk.text.slice(-overlap);
-        expect(actualOverlap).toBe(expectedOverlap);
-      }
-    });
-  });
-
-  describe('text transformer overlap', () => {
-    it('should properly implement overlap in text splitting', async () => {
-      // Create a text with distinct sections that will be split
-      const text = 'Section1'.repeat(100) + '\n\n' + 'Section2'.repeat(100) + '\n\n' + 'Section3'.repeat(100);
-      const size = 300;
-      const overlapSize = 50;
-      const doc = MDocument.fromText(text, { meta: 'data' });
-
-      await doc.chunk({
-        strategy: 'recursive',
-        maxSize: size,
-        overlap: overlapSize,
-        separators: ['\n\n'], // Split on double newlines
-      });
-
-      const docs = doc.getDocs();
-      expect(docs.length).toBeGreaterThan(1); // Should create multiple chunks
-
-      for (let i = 1; i < docs.length; i++) {
-        const prevChunk = docs[i - 1]?.text;
-        const currentChunk = docs[i]?.text;
-
-        if (prevChunk && currentChunk) {
-          // Check if there's some overlap between chunks
-          // We should find some common text between the end of the previous chunk
-          // and the beginning of the current chunk
-          const commonText = findCommonSubstring(prevChunk, currentChunk);
-          expect(commonText.length).toBeGreaterThan(0);
-        }
-      }
-    });
-  });
-
-  describe('chunkRecursive', () => {
-    it('chunkRecursive', async () => {
-      const text =
-        'Hello world.\n\nThis is a test of the recursive splitting system.\nIt should handle multiple lines and different separators appropriately.';
-
-      const doc = MDocument.fromText(text, { meta: 'data' });
-
-      await doc.chunk({
-        strategy: 'recursive',
-        separators: ['\n\n', '\n', ' ', ''],
-        isSeparatorRegex: false,
-        maxSize: 50,
-        overlap: 5,
-      });
-
-      expect(doc.getDocs()?.length).toBeGreaterThan(1);
-
-      doc.getText()?.forEach(t => {
-        expect(t.length).toBeLessThanOrEqual(50);
-      });
-    });
-
-    it('chunkRecursive - language options', async () => {
-      const tsCode = `
-interface User {
-  name: string;
-  age: number;
-}
-
-function greet(user: User) {
-  console.log(`Hello ${user.name}`);
-}
-`;
-
-      const doc = MDocument.fromText(tsCode, { meta: 'data' });
-
-      await doc.chunk({
-        maxSize: 50,
-        overlap: 5,
-        language: Language.TS,
-      });
-
-      expect(doc.getDocs().length).toBeGreaterThan(1);
-      expect(doc.getText().some(chunk => chunk.includes('interface'))).toBe(true);
-      expect(doc.getText().some(chunk => chunk.includes('function'))).toBe(true);
-    });
-
-    it('should throw error for unsupported language', async () => {
-      const doc = MDocument.fromText('tsCode', { meta: 'data' });
-
-      await expect(
-        doc.chunk({
-          maxSize: 50,
-          overlap: 5,
-          language: 'invalid-language' as any,
-        }),
-      ).rejects.toThrow();
-    });
-
-    it('should maintain context with overlap', async () => {
-      // Create a longer text that will definitely be split into multiple chunks
-      const text =
-        'This is a test paragraph. '.repeat(50) +
-        '\n\n' +
-        'This is a second paragraph with different content. '.repeat(50) +
-        '\n\n' +
-        'This is a third paragraph with more unique content. '.repeat(50);
-      const doc = MDocument.fromText(text, { meta: 'data' });
-      const overlapSize = 20; // Explicit overlap size
-
-      await doc.chunk({
-        strategy: 'recursive',
-        maxSize: 500, // Smaller chunk maxSize to ensure multiple chunks
-        overlap: overlapSize,
-      });
-
-      const docs = doc.getDocs();
-
-      // Ensure we have multiple chunks to test overlap
-      expect(docs.length).toBeGreaterThan(1);
-
-      for (let i = 1; i < docs.length; i++) {
-        const prevChunk = docs[i - 1]?.text;
-        const currentChunk = docs[i]?.text;
-
-        if (prevChunk && currentChunk) {
-          // Test using two methods:
-
-          // 1. Check for shared words (original test)
-          const hasWordOverlap = prevChunk.split(' ').some(word => word.length > 1 && currentChunk.includes(word));
-
-          // 2. Check for shared character sequences
-          const commonText = findCommonSubstring(prevChunk, currentChunk);
-
-          // At least one of these overlap detection methods should succeed
-          expect(hasWordOverlap || commonText.length > 5).toBe(true);
-        }
-      }
-    });
-
-    it('should respect the specified overlap size', async () => {
-      const text = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'.repeat(10); // Long repeating text
-      const chunkSize = 50;
-      const overlapSize = 20;
-      const doc = MDocument.fromText(text, { meta: 'data' });
-
-      await doc.chunk({
-        strategy: 'recursive',
-        maxSize: chunkSize,
-        overlap: overlapSize,
-      });
-
-      const docs = doc.getDocs();
-      // Skip first chunk as it doesn't have a previous chunk to overlap with
-      for (let i = 1; i < docs.length; i++) {
-        const prevChunk = docs[i - 1]?.text;
-        const currentChunk = docs[i]?.text;
-
-        if (prevChunk && currentChunk) {
-          // Get the end of the previous chunk
-          const prevEnd = prevChunk.slice(-overlapSize);
-          // Get the start of the current chunk
-          const currentStart = currentChunk.slice(0, overlapSize);
-
-          // There should be some overlap between the end of the previous chunk
-          // and the start of the current chunk
-          expect(prevEnd).toContain(currentStart.slice(0, 5));
-          // The overlap shouldn't be the entire chunk
-          expect(prevChunk).not.toBe(currentChunk);
-        }
-      }
-    });
-  });
-
-  describe('chunkHTML', () => {
-    it('should split HTML with headers correctly', async () => {
-      const html = `
-<html>
-<body>
-<h1>Main Title</h1>
-<p>Main content.</p>
-<h2>Section 1</h2>
-<p>Section 1 content.</p>
-<h3>Subsection 1.1</h3>
-<p>Subsection content.</p>
-</body>
-</html>
-`;
-
-      const doc = MDocument.fromHTML(html, { meta: 'data' });
-
-      await doc.chunk({
-        strategy: 'html',
-        headers: [
-          ['h1', 'Header 1'],
-          ['h2', 'Header 2'],
-          ['h3', 'Header 3'],
-        ],
-      });
-
-      const docs = doc.getDocs();
-      expect(docs.length).toBeGreaterThan(1);
-      expect(docs?.[0]?.metadata?.['Header 1']).toBe('Main Title');
-      expect(docs?.[1]?.metadata?.['Header 2']).toBe('Section 1');
-    });
-
-    it('should handle nested content', async () => {
-      const html = `
-<html>
-<body>
-<h1>Title</h1>
-<div>
-<p>Nested content.</p>
-<div>
-<p>Deeply nested content.</p>
-</div>
-</div>
-</body>
-</html>
-`;
-
-      const doc = MDocument.fromHTML(html, { meta: 'data' });
-
-      await doc.chunk({
-        strategy: 'html',
-        headers: [
-          ['h1', 'Header 1'],
-          ['h2', 'Header 2'],
-          ['h3', 'Header 3'],
-        ],
-      });
-
-      const docs = doc.getDocs();
-      const mainSection = docs.find(doc => doc.metadata?.['Header 1'] === 'Title');
-      expect(mainSection?.text).toContain('Nested content');
-      expect(mainSection?.text).toContain('Deeply nested content');
-    });
-
-    it('should respect returnEachElement option', async () => {
-      const html = `
-<html>
-<body>
-<h1>Title</h1>
-<p>Paragraph 1</p>
-<h1>Title</h1>
-<p>Paragraph 2</p>
-<h1>Title</h1>
-<p>Paragraph 3</p>
-</body>
-</html>
-`;
-
-      const doc = MDocument.fromHTML(html, { meta: 'data' });
-
-      await doc.chunk({
-        strategy: 'html',
-
-        returnEachLine: true,
-        headers: [
-          ['h1', 'Header 1'],
-          ['h2', 'Header 2'],
-          ['h3', 'Header 3'],
-        ],
-      });
-
-      const docs = doc.getDocs();
-
-      expect(docs.length).toBeGreaterThan(2);
-      docs.forEach(doc => {
-        expect(doc.metadata?.['Header 1']).toBe('Title');
-      });
-    });
-
-    it('should split HTML into sections', async () => {
-      const html = `
-<html>
-<body>
-<h1>Document Title</h1>
-<p>Introduction text.</p>
-<h2>First Section</h2>
-<p>First section content.</p>
-<h2>Second Section</h2>
-<p>Second section content.</p>
-</body>
-</html>
-`;
-
-      const doc = MDocument.fromHTML(html, { meta: 'data' });
-
-      await doc.chunk({
-        strategy: 'html',
-        sections: [
-          ['h1', 'Header 1'],
-          ['h2', 'Header 2'],
-        ],
-      });
-      const docs = doc.getDocs();
-
-      expect(docs.length).toBe(3);
-      expect(docs?.[0]?.metadata?.['Header 1']).toBe('Document Title');
-      expect(docs?.[1]?.metadata?.['Header 2']).toBe('First Section');
-    });
-
-    it('should properly merge metadata', async () => {
-      const doc = new MDocument({
-        docs: [
-          {
-            text: `
-<h1>Title 1</h1>
-<p>Content 1</p>
-`,
-            metadata: { source: 'doc1' },
-          },
-          {
-            text: `
-<h1>Title 2</h1>
-<p>Content 2</p>
-`,
-            metadata: { source: 'doc2' },
-          },
-        ],
-        type: 'html',
-      });
-
-      await doc.chunk({
-        strategy: 'html',
-        sections: [
-          ['h1', 'Header 1'],
-          ['h2', 'Header 2'],
-        ],
-      });
-
-      doc.getDocs().forEach(doc => {
-        expect(doc?.metadata).toHaveProperty('source');
-        expect(doc?.metadata).toHaveProperty('Header 1');
-      });
-    });
-
-    it('should handle empty or invalid HTML', async () => {
-      const emptyHtml = '';
-      const invalidHtml = '<unclosed>test';
-      const noHeadersHtml = '<div>test</div>';
-
-      const doc1 = MDocument.fromHTML(emptyHtml, { meta: 'data' });
-      const doc2 = MDocument.fromHTML(invalidHtml, { meta: 'data' });
-      const doc3 = MDocument.fromHTML(noHeadersHtml, { meta: 'data' });
-
-      await doc1.chunk({
-        strategy: 'html',
-        headers: [
-          ['h1', 'Header 1'],
-          ['h2', 'Header 2'],
-        ],
-      });
-
-      await doc2.chunk({
-        strategy: 'html',
-        headers: [
-          ['h1', 'Header 1'],
-          ['h2', 'Header 2'],
-        ],
-      });
-
-      await doc3.chunk({
-        strategy: 'html',
-        headers: [
-          ['h1', 'Header 1'],
-          ['h2', 'Header 2'],
-        ],
-      });
-
-      expect(doc1.getDocs()).toHaveLength(0);
-      expect(doc2.getDocs()).toHaveLength(0);
-      expect(doc3.getDocs()).toHaveLength(0);
-    });
-
-    it('should handle complex nested header hierarchies', async () => {
-      const html = `
-<html>
-<body>
-<h1>Main Title</h1>
-<p>Main content</p>
-<h2>Section 1</h2>
-<p>Section 1 content</p>
-<h3>Subsection 1.1</h3>
-<p>Subsection 1.1 content</p>
-<h2>Section 2</h2>
-<h3>Subsection 2.1</h3>
-<p>Subsection 2.1 content</p>
-</body>
-</html>
-`;
-
-      const doc = MDocument.fromHTML(html, { meta: 'data' });
-      await doc.chunk({
-        strategy: 'html',
-        headers: [
-          ['h1', 'Header 1'],
-          ['h2', 'Header 2'],
-          ['h3', 'Header 3'],
-        ],
-      });
-
-      const docs = doc.getDocs();
-      expect(docs.length).toBeGreaterThan(3);
-      expect(docs.some(d => d.metadata?.['Header 1'] === 'Main Title')).toBe(true);
-      expect(docs.some(d => d.metadata?.['Header 2'] === 'Section 1')).toBe(true);
-      expect(docs.some(d => d.metadata?.['Header 3'] === 'Subsection 1.1')).toBe(true);
-    });
-
-    it('should handle headers with mixed content and special characters', async () => {
-      const html = `
-<html>
-<body>
-<h1>Title with <strong>bold</strong> & <em>emphasis</em></h1>
-<p>Content 1</p>
-<h2>Section with <tags> & symbols</h2>
-<p>Content 2</p>
-</body>
-</html>
-`;
-
-      const doc = MDocument.fromHTML(html, { meta: 'data' });
-      await doc.chunk({
-        strategy: 'html',
-        headers: [
-          ['h1', 'Header 1'],
-          ['h2', 'Header 2'],
-        ],
-      });
-
-      const docs = doc.getDocs();
-      expect(docs.length).toBeGreaterThan(1);
-      expect(docs[0]?.metadata?.['Header 1']).toContain('bold');
-      expect(docs[0]?.metadata?.['Header 1']).toContain('&');
-      expect(docs[0]?.metadata?.['Header 1']).toContain('emphasis');
-      expect(docs[1]?.metadata?.['Header 2']).toContain('<tags>');
-    });
-
-    it('should handle headers with no content or whitespace content', async () => {
-      const html = `
-<html>
-<body>
-<h1>Empty Section</h1>
-<h2>Whitespace Section</h2>
-
-<h2>Valid Section</h2>
-<p>Content</p>
-</body>
-</html>
-`;
-
-      const doc = MDocument.fromHTML(html, { meta: 'data' });
-      await doc.chunk({
-        strategy: 'html',
-        headers: [
-          ['h1', 'Header 1'],
-          ['h2', 'Header 2'],
-        ],
-      });
-
-      const docs = doc.getDocs();
-      expect(docs.some(d => d.metadata?.['Header 1'] === 'Empty Section')).toBe(true);
-      expect(docs.some(d => d.metadata?.['Header 2'] === 'Valid Section')).toBe(true);
-      expect(docs.find(d => d.metadata?.['Header 2'] === 'Valid Section')?.text).toContain('Content');
-    });
-
-    it('should generate correct XPaths for deeply nested elements', async () => {
-      const html = `
-<html>
-<body>
-<div class="container">
-<section id="main">
-<div>
-<h1>Deeply Nested Title</h1>
-<p>Content</p>
-</div>
-<div>
-<h1>Second Title</h1>
-<p>More Content</p>
-</div>
-</section>
-</div>
-</body>
-</html>
-`;
-
-      const doc = MDocument.fromHTML(html, { meta: 'data' });
-      await doc.chunk({
-        strategy: 'html',
-        headers: [['h1', 'Header 1']],
-      });
-
-      const docs = doc.getDocs();
-      expect(docs).toHaveLength(2);
-
-      // First h1
-      expect(docs[0]?.metadata?.['Header 1']).toBe('Deeply Nested Title');
-      const xpath1 = docs[0]?.metadata?.xpath as string;
-      expect(xpath1).toBeDefined();
-      expect(xpath1).toMatch(/^\/html\[1\]\/body\[1\]\/div\[1\]\/section\[1\]\/div\[1\]\/h1\[1\]$/);
-
-      // Second h1
-      expect(docs[1]?.metadata?.['Header 1']).toBe('Second Title');
-      const xpath2 = docs[1]?.metadata?.xpath as string;
-      expect(xpath2).toBeDefined();
-      expect(xpath2).toMatch(/^\/html\[1\]\/body\[1\]\/div\[1\]\/section\[1\]\/div\[2\]\/h1\[1\]$/);
-    });
-  });
-
-  describe('chunkJson', () => {
-    describe('Unicode handling', () => {
-      it('should handle Unicode characters correctly', async () => {
-        const input = {
-          key1: '你好',
-          key2: '世界',
-        };
-
-        const doc = MDocument.fromJSON(JSON.stringify(input), { meta: 'data' });
-
-        await doc.chunk({
-          strategy: 'json',
-          maxSize: 50,
-          minSize: 50,
-          ensureAscii: true,
-        });
-
-        expect(doc.getText().some(chunk => chunk.includes('\\u'))).toBe(true);
-
-        const combined = doc
-          .getText()
-          .map(chunk => {
-            const c = JSON.parse(chunk);
-            const retVal: Record<string, string> = {};
-            Object.entries(c).forEach(([key, value]) => {
-              retVal[key] = JSON.parse(`"${value as string}"`);
-            });
-
-            return retVal;
-          })
-          .reduce((acc, curr) => ({ ...acc, ...curr }), {});
-
-        expect(combined?.key1?.charCodeAt(0)).toBe('你'.charCodeAt(0));
-        expect(combined?.key1?.charCodeAt(1)).toBe('好'.charCodeAt(0));
-        expect(combined?.key2?.charCodeAt(0)).toBe('世'.charCodeAt(0));
-        expect(combined?.key2?.charCodeAt(1)).toBe('界'.charCodeAt(0));
-
-        expect(combined?.key1).toBe('你好');
-        expect(combined?.key2).toBe('世界');
-      });
-
-      it('should handle non-ASCII without escaping when ensureAscii is false', async () => {
-        const input = {
-          key1: '你好',
-          key2: '世界',
-        };
-
-        const doc = MDocument.fromJSON(JSON.stringify(input), { meta: 'data' });
-
-        await doc.chunk({
-          strategy: 'json',
-          maxSize: 50,
-          ensureAscii: false,
-        });
-
-        expect(doc.getText().some(chunk => chunk.includes('你好'))).toBe(true);
-
-        const combined = doc
-          .getText()
-          .map(chunk => JSON.parse(chunk))
-          .reduce((acc, curr) => ({ ...acc, ...curr }), {});
-
-        expect(combined.key1).toBe('你好');
-        expect(combined.key2).toBe('世界');
-      });
-    });
-
-    describe('JSON structure handling', () => {
-      it('should handle flat objects', async () => {
-        const flatJson = {
-          name: 'John',
-          age: 30,
-          email: 'john@example.com',
-        };
-
-        const doc = MDocument.fromJSON(JSON.stringify(flatJson), { meta: 'data' });
-        await doc.chunk({
-          strategy: 'json',
-          maxSize: 50,
-          minSize: 10,
-        });
-
-        const chunks = doc.getText();
-        expect(chunks.length).toBeGreaterThan(0);
-
-        // Verify all data is preserved
-        const reconstructed = chunks.map(chunk => JSON.parse(chunk)).reduce((acc, curr) => ({ ...acc, ...curr }), {});
-        expect(reconstructed).toEqual(flatJson);
-      });
-
-      it('should handle nested objects', async () => {
-        const nestedJson = {
-          user: {
-            name: 'John',
-            contact: {
-              email: 'john@example.com',
-              phone: '123-456-7890',
-            },
-          },
-        };
-
-        const doc = MDocument.fromJSON(JSON.stringify(nestedJson), { meta: 'data' });
-        await doc.chunk({
-          strategy: 'json',
-          maxSize: 50,
-          minSize: 10,
-        });
-
-        const chunks = doc.getText();
-        expect(chunks.length).toBeGreaterThan(0);
-
-        // Verify nested structure is maintained
-        chunks.forEach(chunk => {
-          const parsed = JSON.parse(chunk);
-          expect(parsed).toHaveProperty('user');
-        });
-      });
-
-      it('should handle arrays of objects', async () => {
-        const arrayJson = [
-          { id: 1, value: 'first' },
-          { id: 2, value: 'second' },
-        ];
-
-        const doc = MDocument.fromJSON(JSON.stringify(arrayJson), { meta: 'data' });
-        await doc.chunk({
-          strategy: 'json',
-          maxSize: 50,
-          minSize: 10,
-        });
-
-        const chunks = doc.getText();
-        expect(chunks.length).toBe(2);
-        chunks.forEach((chunk, index) => {
-          const parsed = JSON.parse(chunk);
-          expect(parsed[index]).toEqual(arrayJson[index]);
-        });
-      });
-
-      it('should handle mixed types', async () => {
-        const mixedJson = {
-          string: 'hello',
-          number: 123,
-          boolean: true,
-          array: [1, 2, 3],
-          object: {
-            nested: 'value',
-          },
-        };
-
-        const doc = MDocument.fromJSON(JSON.stringify(mixedJson), { meta: 'data' });
-        await doc.chunk({
-          strategy: 'json',
-          maxSize: 50,
-          minSize: 10,
-        });
-
-        const chunks = doc.getText();
-        const reconstructed = chunks.map(chunk => JSON.parse(chunk)).reduce((acc, curr) => ({ ...acc, ...curr }), {});
-
-        expect(reconstructed).toEqual(mixedJson);
-      });
-
-      it('should properly split long string values', async () => {
-        const longStringJson = {
-          title: 'Short title',
-          description:
-            'This is a very long description that should definitely exceed our maxSize limit of 128 characters. It contains multiple sentences and should be split into multiple chunks while maintaining proper structure.',
-        };
-
-        const doc = MDocument.fromJSON(JSON.stringify(longStringJson), { meta: 'data' });
-        await doc.chunk({
-          strategy: 'json',
-          maxSize: 50,
-          minSize: 10,
-        });
-
-        const chunks = doc.getText();
-
-        // Verify the short field is kept intact
-        expect(
-          chunks.some(chunk => {
-            const parsed = JSON.parse(chunk);
-            return parsed.title === 'Short title';
-          }),
-        ).toBe(true);
-
-        // Verify the long field is split
-        const descriptionChunks = chunks
-          .map(chunk => JSON.parse(chunk))
-          .filter(parsed => parsed.description)
-          .map(parsed => parsed.description);
-
-        expect(descriptionChunks.length).toBeGreaterThan(1);
-        expect(descriptionChunks.join('')).toBe(longStringJson.description);
-      });
-
-      it('should respect maxSize in all chunks', async () => {
-        const doc = MDocument.fromJSON(
-          JSON.stringify({
-            key: 'x'.repeat(200), // Deliberately exceed maxSize
-          }),
-          { meta: 'data' },
-        );
-
-        await doc.chunk({
-          strategy: 'json',
-          maxSize: 50,
-          minSize: 10,
-        });
-
-        const chunks = doc.getText();
-        chunks.forEach(chunk => {
-          expect(chunk.length).toBeLessThanOrEqual(50);
-        });
-      });
-
-      it('should properly group array items when possible', async () => {
-        const arrayData = [
-          { id: 1, name: 'Item 1', description: 'Short desc' },
-          { id: 2, name: 'Item 2', description: 'Short desc' },
-          {
-            id: 3,
-            name: 'Item 3',
-            description: 'This is a much longer description that should cause this item to be in its own chunk',
-          },
-          { id: 4, name: 'Item 4', description: 'Short desc' },
-        ];
-
-        const doc = MDocument.fromJSON(JSON.stringify({ items: arrayData }));
-        await doc.chunk({
-          strategy: 'json',
-          maxSize: 100,
-          minSize: 10,
-        });
-
-        const chunks = doc.getText().map(chunk => JSON.parse(chunk));
-
-        // Change expectation: No items should be grouped when maxSize is too small
-        expect(chunks.every(chunk => !chunk.items || !Array.isArray(chunk.items) || chunk.items.length === 1)).toBe(
-          true,
-        );
-      });
-
-      it('should group items with larger maxSize', async () => {
-        const arrayData = [
-          { id: 1, name: 'Item 1', description: 'Short desc' },
-          { id: 2, name: 'Item 2', description: 'Short desc' },
-          {
-            id: 3,
-            name: 'Item 3',
-            description: 'This is a much longer description that should cause this item to be in its own chunk',
-          },
-          { id: 4, name: 'Item 4', description: 'Short desc' },
-        ];
-
-        const doc = MDocument.fromJSON(JSON.stringify({ items: arrayData }));
-        await doc.chunk({
-          strategy: 'json',
-          maxSize: 150, // Larger maxSize to allow grouping
-          minSize: 10,
-        });
-
-        const chunks = doc.getText().map(chunk => JSON.parse(chunk));
-
-        // Should group first two items
-        expect(
-          chunks.some(
-            chunk =>
-              chunk.items &&
-              Array.isArray(chunk.items) &&
-              chunk.items.length === 2 &&
-              chunk.items[0].id === 1 &&
-              chunk.items[1].id === 2,
-          ),
-        ).toBe(true);
-
-        // Long item should still be separate
-        expect(
-          chunks.some(
-            chunk => chunk.items && Array.isArray(chunk.items) && chunk.items.length === 1 && chunk.items[0].id === 3,
-          ),
-        ).toBe(true);
-      });
-
-      it('should group smaller items within maxSize limit', async () => {
-        const arrayData = [
-          { id: 1, name: 'A', desc: 'x' }, // Minimal items
-          { id: 2, name: 'B', desc: 'y' },
-          { id: 3, name: 'C', desc: 'This is the long one' },
-          { id: 4, name: 'D', desc: 'z' },
-          { id: 5, name: 'E', desc: 'w' }, // Added fifth item
-        ];
-
-        const doc = MDocument.fromJSON(JSON.stringify({ items: arrayData }));
-        await doc.chunk({
-          strategy: 'json',
-          maxSize: 100,
-          minSize: 10,
-        });
-
-        const chunks = doc.getText().map(chunk => JSON.parse(chunk));
-
-        // Change expectation: Should group 2 items (not 3)
-        expect(
-          chunks.some(
-            chunk => chunk.items && Array.isArray(chunk.items) && chunk.items.length === 2, // Changed from >= 3
-          ),
-        ).toBe(true);
-      });
-
-      it('should handle convertLists option', async () => {
-        const data = {
-          items: [1, 2, 3],
-          nested: {
-            list: ['a', 'b', 'c'],
-          },
-        };
-
-        const doc = MDocument.fromJSON(JSON.stringify(data));
-        await doc.chunk({
-          strategy: 'json',
-          maxSize: 50,
-          minSize: 10,
-          convertLists: true,
-        });
-
-        const chunks = doc.getText().map(chunk => JSON.parse(chunk));
-
-        // Check that arrays were converted to objects with numeric keys
-        expect(
-          chunks.some(chunk => chunk.items && typeof chunk.items === 'object' && !Array.isArray(chunk.items)),
-        ).toBe(true);
-      });
-
-      it('should handle ensureAscii option', async () => {
-        const data = {
-          text: 'Hello café world 🌍',
-        };
-
-        const doc = MDocument.fromJSON(JSON.stringify(data));
-
-        // With ensureAscii true
-        await doc.chunk({
-          strategy: 'json',
-          maxSize: 50,
-          minSize: 10,
-          ensureAscii: true,
-        });
-
-        const asciiChunks = doc.getText();
-        expect(asciiChunks[0]).not.toMatch(/[^\x00-\x7F]/);
-
-        // With ensureAscii false
-        await doc.chunk({
-          strategy: 'json',
-          maxSize: 50,
-          minSize: 10,
-          ensureAscii: false,
-        });
-
-        const unicodeChunks = doc.getText();
-        expect(JSON.parse(unicodeChunks[0]).text).toMatch(/[^\x00-\x7F]/);
-      });
-
-      it('should handle deeply nested structures', async () => {
-        const deepData = {
-          level1: {
-            level2: {
-              level3: {
-                level4: {
-                  value: 'deep',
-                },
-              },
-            },
-          },
-        };
-
-        const doc = MDocument.fromJSON(JSON.stringify(deepData));
-        await doc.chunk({
-          strategy: 'json',
-          maxSize: 50,
-          minSize: 10,
-        });
-
-        const chunks = doc.getText().map(chunk => JSON.parse(chunk));
-        // Verify we can still access deeply nested value
-        chunks.forEach(chunk => {
-          expect(chunk).toHaveProperty('level1');
-        });
-        const hasDeepValue = chunks.some(chunk => {
-          try {
-            return chunk.level1?.level2?.level3?.level4?.value === 'deep';
-          } catch {
-            return false;
-          }
-        });
-        expect(hasDeepValue).toBe(true);
-      });
-
-      it('should handle complex deeply nested structures with mixed types', async () => {
-        const complexData = {
-          organization: {
-            name: 'TechCorp',
-            departments: {
-              engineering: {
-                teams: [
-                  {
-                    name: 'Frontend',
-                    projects: {
-                      main: {
-                        title: 'Website Redesign',
-                        status: 'active',
-                        tasks: [
-                          { id: 1, description: 'Update homepage', status: 'done' },
-                          { id: 2, description: 'Refactor CSS', status: 'in-progress' },
-                        ],
-                        metrics: {
-                          performance: {
-                            loadTime: '1.2s',
-                            score: 95,
-                            details: {
-                              mobile: { score: 90, issues: ['image optimization'] },
-                              desktop: { score: 98, issues: [] },
-                            },
-                          },
-                        },
-                      },
-                    },
-                    members: [
-                      { id: 1, name: 'Alice', role: 'Lead' },
-                      { id: 2, name: 'Bob', role: 'Senior Dev' },
-                    ],
-                  },
-                ],
-              },
-            },
-          },
-        };
-
-        const doc = MDocument.fromJSON(JSON.stringify(complexData));
-        await doc.chunk({
-          strategy: 'json',
-          maxSize: 500, // Increased to more realistic size for JSON structures
-          minSize: 50, // Increased to account for JSON path overhead
-        });
-
-        const chunks = doc.getText().map(chunk => JSON.parse(chunk));
-
-        // Test complete objects are kept together when possible
-        expect(
-          chunks.some(chunk => {
-            const members = chunk.organization?.departments?.engineering?.teams?.[0]?.members;
-            return Array.isArray(members) && members.length === 2; // Both members should be in same chunk
-          }),
-        ).toBe(true);
-
-        // Test large nested objects are split appropriately
-        expect(
-          chunks.some(
-            chunk =>
-              chunk.organization?.departments?.engineering?.teams?.[0]?.projects?.main?.metrics?.performance
-                ?.loadTime === '1.2s',
-          ),
-        ).toBe(true);
-
-        // Test array items are handled properly
-        const taskChunks = chunks.filter(chunk => {
-          const tasks = chunk.organization?.departments?.engineering?.teams?.[0]?.projects?.main?.tasks;
-          return Array.isArray(tasks) || (tasks && typeof tasks === 'object');
-        });
-        expect(taskChunks.length).toBeGreaterThan(0);
-
-        // Test that related data stays together when under maxSize
-        expect(
-          chunks.some(chunk => {
-            const mobile =
-              chunk.organization?.departments?.engineering?.teams?.[0]?.projects?.main?.metrics?.performance?.details
-                ?.mobile;
-            return mobile && mobile.score === 90 && Array.isArray(mobile.issues);
-          }),
-        ).toBe(true);
-      });
-    });
-  });
-
-  describe('chunkToken', () => {
-    it('should handle different encodings', async () => {
-      const text = 'This is a test text for different encodings.';
-      const doc = MDocument.fromText(text, { meta: 'data' });
-
-      await doc.chunk({
-        strategy: 'token',
-        encodingName: 'cl100k_base',
-        maxSize: 10,
-        overlap: 2,
-      });
-
-      const chunks = doc.getText();
-
-      expect(chunks.length).toBeGreaterThan(0);
-      expect(chunks.join(' ').trim()).toBe(text);
-    });
-
-    it('should handle special tokens correctly', async () => {
-      const text = 'Test text <|endoftext|> more text';
-
-      const doc = MDocument.fromText(text, { meta: 'data' });
-
-      await doc.chunk({
-        strategy: 'token',
-        encodingName: 'gpt2',
-        maxSize: 10,
-        disallowedSpecial: new Set(),
-        allowedSpecial: new Set(['<|endoftext|>']),
-        overlap: 2,
-      });
-
-      const chunks = doc.getText();
-
-      expect(chunks.join(' ').includes('<|endoftext|>')).toBe(true);
-    });
-
-    it('should strip whitespace when configured', async () => {
-      const text = ' This has whitespace ';
-
-      const doc = MDocument.fromText(text, { meta: 'data' });
-
-      await doc.chunk({
-        strategy: 'token',
-        encodingName: 'gpt2',
-        maxSize: 10,
-        disallowedSpecial: new Set(),
-        allowedSpecial: new Set(['<|endoftext|>']),
-        overlap: 2,
-      });
-
-      const chunks = doc.getText();
-
-      chunks.forEach(chunk => {
-        expect(chunk).not.toMatch(/^\s+|\s+$/);
-      });
-    });
-
-    describe('Error cases', () => {
-      it('should throw error for invalid chunk maxSize and overlap', async () => {
-        const text = ' This has whitespace ';
-        const doc = MDocument.fromText(text, { meta: 'data' });
-
-        await expect(
-          doc.chunk({
-            strategy: 'token',
-            maxSize: 100,
-            overlap: 150, // overlap larger than chunk maxSize
-          }),
-        ).rejects.toThrow();
-      });
-
-      it('should handle invalid encoding name', async () => {
-        const text = ' This has whitespace ';
-        const doc = MDocument.fromText(text, { meta: 'data' });
-
-        await expect(
-          doc.chunk({
-            strategy: 'token',
-            encodingName: 'invalid-encoding' as any,
-            maxSize: 100,
-            overlap: 150, // overlap larger than chunk maxSize
-          }),
-        ).rejects.toThrow();
-      });
-    });
-  });
-
-  describe('chunkMarkdown', () => {
-    it('should split markdown text correctly', async () => {
-      const text = `# Header 1
-
-This is some text under header 1.
-
-## Header 2
-
-This is some text under header 2.
-
-### Header 3
-
-- List item 1
-- List item 2`;
-
-      const doc = MDocument.fromMarkdown(text, { meta: 'data' });
-
-      await doc.chunk({
-        strategy: 'markdown',
-        maxSize: 100,
-        overlap: 10,
-      });
-
-      const chunks = doc.getText();
-      expect(chunks.length).toBeGreaterThan(1);
-      expect(chunks[0]).toContain('# Header 1');
-    });
-
-    it('should handle code blocks', async () => {
-      const text = `# Code Example
-
-```javascript
-function hello() {
-  console.log('Hello, World!');
-}
-```
-
-Regular text after code block.`;
-
-      const doc = MDocument.fromMarkdown(text, { meta: 'data' });
-
-      await doc.chunk({
-        strategy: 'markdown',
-        maxSize: 100,
-        overlap: 10,
-      });
-
-      const chunks = doc.getText();
-      expect(chunks.some(chunk => chunk.includes('```javascript'))).toBe(true);
-    });
-  });
-
-  describe('chunkLaTeX', () => {
-    it('should split LaTeX text correctly based on sections', async () => {
-      const text = `\\section{Introduction}
-
-This is the introduction section.
-
-\\subsection{Background}
-
-Some background information.
-
-\\subsubsection{Details}
-
-Even more detailed explanation.
-
-\\section{Conclusion}
-
-Final thoughts here.`;
-
-      const doc = MDocument.fromText(text, { meta: 'data' });
-
-      await doc.chunk({
-        strategy: 'latex',
-        maxSize: 100,
-        overlap: 10,
-        keepSeparator: 'start',
-      });
-
-      const chunks = doc.getText();
-      expect(chunks.length).toBeGreaterThan(1);
-      expect(chunks[0]).toContain('\\section{Introduction}');
-    });
-
-    it('should handle environments like equations or itemize', async () => {
-      const text = `\\section{Math Section}
-
-Here is an equation:
-
-\\[
-E = mc^2
-\\]
-
-\\begin{itemize}
-\\item First item
-\\item Second item
-\\end{itemize}
-
-End of the section.`;
-
-      const doc = MDocument.fromText(text, { meta: 'data' });
-
-      await doc.chunk({
-        strategy: 'latex',
-        maxSize: 100,
-        overlap: 10,
-        keepSeparator: 'start',
-      });
-
-      const chunks = doc.getText();
-      expect(chunks.some(chunk => chunk.includes('\\begin{itemize}'))).toBe(true);
-      expect(chunks.some(chunk => chunk.includes('E = mc^2'))).toBe(true);
-    });
-
-    it('should split with keepSeparator at end', async () => {
-      const text = `Intro text here.
-\\section{First}
-Content A.
-
-\\section{Second}
-Content B.`;
-
-      const doc = MDocument.fromText(text, { meta: 'data' });
-
-      await doc.chunk({
-        strategy: 'latex',
-        maxSize: 50,
-        overlap: 0,
-        keepSeparator: 'end',
-      });
-
-      const chunks = doc.getText();
-      expect(chunks.length).toBe(3);
-      expect(chunks[0].trimEnd().includes('\\section{')).toBe(true);
-      expect(chunks[1].trimEnd().includes('\\section{')).toBe(true);
-    });
-
-    it('should strip whitespace correctly', async () => {
-      const text = `\\section{Whitespace}
-
-Content with leading and trailing whitespace.
-`;
-
-      const doc = MDocument.fromText(text, { meta: 'data' });
-
-      await doc.chunk({
-        strategy: 'latex',
-        maxSize: 100,
-        overlap: 0,
-        stripWhitespace: true,
-      });
-
-      const chunks = doc.getText();
-      expect(chunks.every(chunk => chunk === chunk.trim())).toBe(true);
|
|
1609
|
-
});
|
|
1610
|
-
});
|
|
1611
|
-
|
|
1612
|
-
describe('MarkdownHeader', () => {
|
|
1613
|
-
it('should split on headers and preserve metadata', async () => {
|
|
1614
|
-
const text = `# Main Title
|
|
1615
|
-
|
|
1616
|
-
Some content here.
|
|
1617
|
-
|
|
1618
|
-
## Section 1
|
|
1619
|
-
|
|
1620
|
-
Section 1 content.
|
|
1621
|
-
|
|
1622
|
-
### Subsection 1.1
|
|
1623
|
-
|
|
1624
|
-
Subsection content.
|
|
1625
|
-
|
|
1626
|
-
## Section 2
|
|
1627
|
-
|
|
1628
|
-
Final content.`;
|
|
1629
|
-
|
|
1630
|
-
const doc = MDocument.fromMarkdown(text);
|
|
1631
|
-
|
|
1632
|
-
await doc.chunk({
|
|
1633
|
-
strategy: 'markdown',
|
|
1634
|
-
headers: [
|
|
1635
|
-
['#', 'Header 1'],
|
|
1636
|
-
['##', 'Header 2'],
|
|
1637
|
-
['###', 'Header 3'],
|
|
1638
|
-
],
|
|
1639
|
-
});
|
|
1640
|
-
|
|
1641
|
-
const docs = doc.getDocs();
|
|
1642
|
-
|
|
1643
|
-
expect(docs.length).toBeGreaterThan(1);
|
|
1644
|
-
expect(docs?.[0]?.metadata?.['Header 1']).toBe('Main Title');
|
|
1645
|
-
|
|
1646
|
-
const section1 = docs.find(doc => doc?.metadata?.['Header 2'] === 'Section 1');
|
|
1647
|
-
expect(section1).toBeDefined();
|
|
1648
|
-
expect(section1?.text).toContain('Section 1 content');
|
|
1649
|
-
});
|
|
1650
|
-
|
|
1651
|
-
it('should handle nested headers correctly', async () => {
|
|
1652
|
-
const text = `# Top Level
|
|
1653
|
-
|
|
1654
|
-
## Section A
|
|
1655
|
-
Content A
|
|
1656
|
-
|
|
1657
|
-
### Subsection A1
|
|
1658
|
-
Content A1
|
|
1659
|
-
|
|
1660
|
-
## Section B
|
|
1661
|
-
Content B`;
|
|
1662
|
-
|
|
1663
|
-
const doc = MDocument.fromMarkdown(text, { meta: 'data' });
|
|
1664
|
-
|
|
1665
|
-
await doc.chunk({
|
|
1666
|
-
strategy: 'markdown',
|
|
1667
|
-
headers: [
|
|
1668
|
-
['#', 'Header 1'],
|
|
1669
|
-
['##', 'Header 2'],
|
|
1670
|
-
['###', 'Header 3'],
|
|
1671
|
-
],
|
|
1672
|
-
});
|
|
1673
|
-
|
|
1674
|
-
const subsectionDoc = doc.getDocs().find(doc => doc?.metadata?.['Header 3'] === 'Subsection A1');
|
|
1675
|
-
expect(subsectionDoc).toBeDefined();
|
|
1676
|
-
expect(subsectionDoc?.metadata?.['Header 1']).toBe('Top Level');
|
|
1677
|
-
expect(subsectionDoc?.metadata?.['Header 2']).toBe('Section A');
|
|
1678
|
-
});
|
|
1679
|
-
|
|
1680
|
-
it('should handle code blocks without splitting them', async () => {
|
|
1681
|
-
const text = `# Code Section
|
|
1682
|
-
|
|
1683
|
-
\`\`\`python
|
|
1684
|
-
def hello():
|
|
1685
|
-
print("Hello World")
|
|
1686
|
-
\`\`\`
|
|
1687
|
-
|
|
1688
|
-
## Next Section`;
|
|
1689
|
-
|
|
1690
|
-
const doc = MDocument.fromMarkdown(text, { meta: 'data' });
|
|
1691
|
-
|
|
1692
|
-
await doc.chunk({
|
|
1693
|
-
strategy: 'markdown',
|
|
1694
|
-
headers: [
|
|
1695
|
-
['#', 'Header 1'],
|
|
1696
|
-
['##', 'Header 2'],
|
|
1697
|
-
['###', 'Header 3'],
|
|
1698
|
-
],
|
|
1699
|
-
});
|
|
1700
|
-
|
|
1701
|
-
const codeDoc = doc.getDocs().find(doc => doc?.text?.includes('```python'));
|
|
1702
|
-
expect(codeDoc?.text).toContain('print("Hello World")');
|
|
1703
|
-
});
|
|
1704
|
-
|
|
1705
|
-
it('should respect returnEachLine option', async () => {
|
|
1706
|
-
const text = `# Title
|
|
1707
|
-
|
|
1708
|
-
Line 1
|
|
1709
|
-
Line 2
|
|
1710
|
-
Line 3`;
|
|
1711
|
-
|
|
1712
|
-
const doc = MDocument.fromMarkdown(text, { meta: 'data' });
|
|
1713
|
-
|
|
1714
|
-
await doc.chunk({
|
|
1715
|
-
strategy: 'markdown',
|
|
1716
|
-
headers: [['#', 'Header 1']],
|
|
1717
|
-
returnEachLine: true,
|
|
1718
|
-
stripHeaders: false,
|
|
1719
|
-
});
|
|
1720
|
-
|
|
1721
|
-
expect(doc.getDocs().length).toBe(4); // Title + 3 lines
|
|
1722
|
-
doc
|
|
1723
|
-
.getDocs()
|
|
1724
|
-
.slice(1)
|
|
1725
|
-
.forEach(doc => {
|
|
1726
|
-
expect(doc.metadata?.['Header 1']).toBe('Title');
|
|
1727
|
-
});
|
|
1728
|
-
});
|
|
1729
|
-
|
|
1730
|
-
it('should handle stripHeaders option', async () => {
|
|
1731
|
-
const text = `# Title
|
|
1732
|
-
|
|
1733
|
-
Content`;
|
|
1734
|
-
|
|
1735
|
-
const doc = MDocument.fromMarkdown(text, { meta: 'data' });
|
|
1736
|
-
|
|
1737
|
-
await doc.chunk({
|
|
1738
|
-
strategy: 'markdown',
|
|
1739
|
-
headers: [['#', 'Header 1']],
|
|
1740
|
-
returnEachLine: false,
|
|
1741
|
-
stripHeaders: false,
|
|
1742
|
-
});
|
|
1743
|
-
|
|
1744
|
-
const docs = doc.getDocs();
|
|
1745
|
-
expect(docs?.[0]?.text).toContain('# Title');
|
|
1746
|
-
});
|
|
1747
|
-
|
|
1748
|
-
it('should remove headers when stripHeaders: true is set in markdown chunker', async () => {
|
|
1749
|
-
const markdown = [
|
|
1750
|
-
'# H1 Title',
|
|
1751
|
-
'Some intro text.',
|
|
1752
|
-
'## H2 Subtitle',
|
|
1753
|
-
'More details.',
|
|
1754
|
-
'### H3 Section',
|
|
1755
|
-
'Final content.',
|
|
1756
|
-
].join('\n');
|
|
1757
|
-
|
|
1758
|
-
const doc = MDocument.fromMarkdown(markdown);
|
|
1759
|
-
const chunks = await doc.chunk({
|
|
1760
|
-
strategy: 'markdown',
|
|
1761
|
-
maxSize: 500,
|
|
1762
|
-
overlap: 0,
|
|
1763
|
-
headers: [
|
|
1764
|
-
['#', 'h1'],
|
|
1765
|
-
['##', 'h2'],
|
|
1766
|
-
['###', 'h3'],
|
|
1767
|
-
],
|
|
1768
|
-
stripHeaders: true,
|
|
1769
|
-
});
|
|
1770
|
-
// None of the chunk texts should start with the header patterns
|
|
1771
|
-
const headerPatterns = [/^#\s/, /^##\s/, /^###\s/];
|
|
1772
|
-
for (const chunk of chunks) {
|
|
1773
|
-
for (const pattern of headerPatterns) {
|
|
1774
|
-
expect(pattern.test(chunk.text)).toBe(false);
|
|
1775
|
-
}
|
|
1776
|
-
}
|
|
1777
|
-
});
|
|
1778
|
-
|
|
1779
|
-
it('should support custom header prefixes', async () => {
|
|
1780
|
-
const text = `!!! Important\nThis is important.\n--- Section\nSection content.`;
|
|
1781
|
-
const doc = MDocument.fromMarkdown(text);
|
|
1782
|
-
await doc.chunk({
|
|
1783
|
-
strategy: 'markdown',
|
|
1784
|
-
headers: [
|
|
1785
|
-
['!!!', 'important'],
|
|
1786
|
-
['---', 'section'],
|
|
1787
|
-
],
|
|
1788
|
-
stripHeaders: true,
|
|
1789
|
-
});
|
|
1790
|
-
const texts = doc.getText();
|
|
1791
|
-
expect(texts.some(t => t.startsWith('!!!'))).toBe(false);
|
|
1792
|
-
expect(texts.some(t => t.startsWith('---'))).toBe(false);
|
|
1793
|
-
});
|
|
1794
|
-
|
|
1795
|
-
it('should attach correct metadata for nested headers', async () => {
|
|
1796
|
-
const text = `# H1\n## H2\n### H3\nContent`;
|
|
1797
|
-
const doc = MDocument.fromMarkdown(text);
|
|
1798
|
-
await doc.chunk({
|
|
1799
|
-
strategy: 'markdown',
|
|
1800
|
-
headers: [
|
|
1801
|
-
['#', 'h1'],
|
|
1802
|
-
['##', 'h2'],
|
|
1803
|
-
['###', 'h3'],
|
|
1804
|
-
],
|
|
1805
|
-
stripHeaders: true,
|
|
1806
|
-
});
|
|
1807
|
-
const chunk = doc.getDocs().find(c => c.text.includes('Content'));
|
|
1808
|
-
expect(chunk?.metadata?.h1).toBe('H1');
|
|
1809
|
-
expect(chunk?.metadata?.h2).toBe('H2');
|
|
1810
|
-
expect(chunk?.metadata?.h3).toBe('H3');
|
|
1811
|
-
});
|
|
1812
|
-
|
|
1813
|
-
it('should include header lines as chunks if stripHeaders is false', async () => {
|
|
1814
|
-
const text = `# H1\nContent`;
|
|
1815
|
-
const doc = MDocument.fromMarkdown(text);
|
|
1816
|
-
await doc.chunk({
|
|
1817
|
-
strategy: 'markdown',
|
|
1818
|
-
headers: [['#', 'h1']],
|
|
1819
|
-
stripHeaders: false,
|
|
1820
|
-
});
|
|
1821
|
-
const texts = doc.getText();
|
|
1822
|
-
expect(texts.some(t => t.startsWith('# H1'))).toBe(true);
|
|
1823
|
-
});
|
|
1824
|
-
|
|
1825
|
-
it('should handle multiple adjacent headers correctly', async () => {
|
|
1826
|
-
const text = `# H1\n## H2\n### H3\nContent`;
|
|
1827
|
-
const doc = MDocument.fromMarkdown(text);
|
|
1828
|
-
await doc.chunk({
|
|
1829
|
-
strategy: 'markdown',
|
|
1830
|
-
headers: [
|
|
1831
|
-
['#', 'h1'],
|
|
1832
|
-
['##', 'h2'],
|
|
1833
|
-
['###', 'h3'],
|
|
1834
|
-
],
|
|
1835
|
-
stripHeaders: true,
|
|
1836
|
-
});
|
|
1837
|
-
const texts = doc.getText();
|
|
1838
|
-
expect(texts.some(t => t === 'Content')).toBe(true);
|
|
1839
|
-
expect(texts.some(t => t === '')).toBe(false);
|
|
1840
|
-
});
|
|
1841
|
-
|
|
1842
|
-
it('should handle content before any header', async () => {
|
|
1843
|
-
const text = `Intro before header\n# H1\nContent`;
|
|
1844
|
-
const doc = MDocument.fromMarkdown(text);
|
|
1845
|
-
await doc.chunk({
|
|
1846
|
-
strategy: 'markdown',
|
|
1847
|
-
headers: [['#', 'h1']],
|
|
1848
|
-
stripHeaders: true,
|
|
1849
|
-
});
|
|
1850
|
-
const preHeaderChunk = doc.getDocs().find(c => c.text.includes('Intro before header'));
|
|
1851
|
-
expect(preHeaderChunk?.metadata?.h1).toBeUndefined();
|
|
1852
|
-
});
|
|
1853
|
-
|
|
1854
|
-
it('should not treat headers inside code blocks as headers', async () => {
|
|
1855
|
-
const text = ['# Real Header', '```', '# Not a header', '```', 'Content'].join('\n');
|
|
1856
|
-
const doc = MDocument.fromMarkdown(text);
|
|
1857
|
-
await doc.chunk({
|
|
1858
|
-
strategy: 'markdown',
|
|
1859
|
-
headers: [['#', 'h1']],
|
|
1860
|
-
stripHeaders: true,
|
|
1861
|
-
});
|
|
1862
|
-
const texts = doc.getText();
|
|
1863
|
-
expect(texts.some(t => t.includes('# Not a header'))).toBe(true);
|
|
1864
|
-
expect(texts.some(t => t.startsWith('# Real Header'))).toBe(false);
|
|
1865
|
-
});
|
|
1866
|
-
});
|
|
1867
|
-
|
|
1868
|
-
describe('metadata extraction', () => {
|
|
1869
|
-
it('should extract metadata with default settings', async () => {
|
|
1870
|
-
const doc = MDocument.fromMarkdown(
|
|
1871
|
-
'# AI and Machine Learning\n\nThis is a test document about artificial intelligence and machine learning.',
|
|
1872
|
-
);
|
|
1873
|
-
|
|
1874
|
-
const chunks = await doc.chunk({
|
|
1875
|
-
strategy: 'markdown',
|
|
1876
|
-
extract: {
|
|
1877
|
-
title: true,
|
|
1878
|
-
summary: true,
|
|
1879
|
-
keywords: true,
|
|
1880
|
-
},
|
|
1881
|
-
});
|
|
1882
|
-
|
|
1883
|
-
const metadata = chunks[0].metadata;
|
|
1884
|
-
expect(metadata).toBeDefined();
|
|
1885
|
-
expect(metadata.documentTitle).toBeDefined();
|
|
1886
|
-
expect(metadata.sectionSummary).toBeDefined();
|
|
1887
|
-
expect(metadata.excerptKeywords).toMatch(/^KEYWORDS: .*/);
|
|
1888
|
-
}, 15000);
|
|
1889
|
-
|
|
1890
|
-
it('should extract metadata with custom settings', async () => {
|
|
1891
|
-
const doc = MDocument.fromMarkdown(
|
|
1892
|
-
'# AI and Machine Learning\n\nThis is a test document about artificial intelligence and machine learning.',
|
|
1893
|
-
);
|
|
1894
|
-
|
|
1895
|
-
const chunks = await doc.chunk({
|
|
1896
|
-
strategy: 'markdown',
|
|
1897
|
-
extract: {
|
|
1898
|
-
title: {
|
|
1899
|
-
nodes: 2,
|
|
1900
|
-
nodeTemplate: 'Generate a title for this: {context}',
|
|
1901
|
-
combineTemplate: 'Combine these titles: {context}',
|
|
1902
|
-
},
|
|
1903
|
-
summary: {
|
|
1904
|
-
summaries: ['self'],
|
|
1905
|
-
promptTemplate: 'Summarize this: {context}',
|
|
1906
|
-
},
|
|
1907
|
-
questions: {
|
|
1908
|
-
questions: 2,
|
|
1909
|
-
promptTemplate: 'Generate {numQuestions} questions about: {context}',
|
|
1910
|
-
},
|
|
1911
|
-
keywords: {
|
|
1912
|
-
keywords: 3,
|
|
1913
|
-
promptTemplate: 'Extract {maxKeywords} key terms from: {context}',
|
|
1914
|
-
},
|
|
1915
|
-
},
|
|
1916
|
-
});
|
|
1917
|
-
|
|
1918
|
-
const metadata = chunks[0].metadata;
|
|
1919
|
-
expect(metadata).toBeDefined();
|
|
1920
|
-
expect(metadata.documentTitle).toBeDefined();
|
|
1921
|
-
expect(metadata.sectionSummary).toBeDefined();
|
|
1922
|
-
const qStr = metadata.questionsThisExcerptCanAnswer;
|
|
1923
|
-
expect(qStr).toMatch(/1\..*\?/s);
|
|
1924
|
-
expect(qStr).toMatch(/2\..*\?/s);
|
|
1925
|
-
expect((qStr.match(/\?/g) || []).length).toBeGreaterThanOrEqual(2);
|
|
1926
|
-
expect(metadata.excerptKeywords).toMatch(/^1\. .*\n2\. .*\n3\. .*$/);
|
|
1927
|
-
}, 15000);
|
|
1928
|
-
|
|
1929
|
-
it('should handle invalid summary types', async () => {
|
|
1930
|
-
const doc = MDocument.fromText('Test document');
|
|
1931
|
-
|
|
1932
|
-
await expect(
|
|
1933
|
-
doc.chunk({
|
|
1934
|
-
extract: {
|
|
1935
|
-
summary: {
|
|
1936
|
-
summaries: ['invalid'],
|
|
1937
|
-
},
|
|
1938
|
-
},
|
|
1939
|
-
}),
|
|
1940
|
-
).rejects.toThrow("Summaries must be one of 'self', 'prev', 'next'");
|
|
1941
|
-
}, 15000);
|
|
1942
|
-
});
|
|
1943
|
-
|
|
1944
|
-
describe('metadata preservation', () => {
|
|
1945
|
-
const baseText = 'This is a test document for metadata extraction.';
|
|
1946
|
-
const baseMetadata = { source: 'unit-test', customField: 123 };
|
|
1947
|
-
|
|
1948
|
-
it('preserves metadata with KeywordExtractor', async () => {
|
|
1949
|
-
const doc = MDocument.fromText(baseText, { ...baseMetadata });
|
|
1950
|
-
const chunks = await doc.chunk({ extract: { keywords: true } });
|
|
1951
|
-
const metadata = chunks[0].metadata;
|
|
1952
|
-
expect(metadata.source).toBe('unit-test');
|
|
1953
|
-
expect(metadata.customField).toBe(123);
|
|
1954
|
-
expect(metadata.excerptKeywords).toBeDefined();
|
|
1955
|
-
});
|
|
1956
|
-
|
|
1957
|
-
it('preserves metadata with SummaryExtractor', async () => {
|
|
1958
|
-
const doc = MDocument.fromText(baseText, { ...baseMetadata });
|
|
1959
|
-
const chunks = await doc.chunk({ extract: { summary: true } });
|
|
1960
|
-
const metadata = chunks[0].metadata;
|
|
1961
|
-
expect(metadata.source).toBe('unit-test');
|
|
1962
|
-
expect(metadata.customField).toBe(123);
|
|
1963
|
-
expect(metadata.sectionSummary).toBeDefined();
|
|
1964
|
-
});
|
|
1965
|
-
|
|
1966
|
-
it('preserves metadata with QuestionsAnsweredExtractor', async () => {
|
|
1967
|
-
const doc = MDocument.fromText(baseText, { ...baseMetadata });
|
|
1968
|
-
const chunks = await doc.chunk({ extract: { questions: true } });
|
|
1969
|
-
const metadata = chunks[0].metadata;
|
|
1970
|
-
expect(metadata.source).toBe('unit-test');
|
|
1971
|
-
expect(metadata.customField).toBe(123);
|
|
1972
|
-
expect(metadata.questionsThisExcerptCanAnswer).toBeDefined();
|
|
1973
|
-
});
|
|
1974
|
-
|
|
1975
|
-
it('preserves metadata with TitleExtractor', async () => {
|
|
1976
|
-
const doc = MDocument.fromText(baseText, { ...baseMetadata });
|
|
1977
|
-
const chunks = await doc.chunk({ extract: { title: true } });
|
|
1978
|
-
const metadata = chunks[0].metadata;
|
|
1979
|
-
expect(metadata.source).toBe('unit-test');
|
|
1980
|
-
expect(metadata.customField).toBe(123);
|
|
1981
|
-
expect(metadata.documentTitle).toBeDefined();
|
|
1982
|
-
});
|
|
1983
|
-
|
|
1984
|
-
it('preserves metadata with multiple extractors', async () => {
|
|
1985
|
-
const doc = MDocument.fromText(baseText, { ...baseMetadata });
|
|
1986
|
-
const chunks = await doc.chunk({
|
|
1987
|
-
extract: {
|
|
1988
|
-
keywords: true,
|
|
1989
|
-
summary: true,
|
|
1990
|
-
questions: true,
|
|
1991
|
-
title: true,
|
|
1992
|
-
},
|
|
1993
|
-
});
|
|
1994
|
-
const metadata = chunks[0].metadata;
|
|
1995
|
-
expect(metadata.source).toBe('unit-test');
|
|
1996
|
-
expect(metadata.customField).toBe(123);
|
|
1997
|
-
expect(metadata.excerptKeywords).toBeDefined();
|
|
1998
|
-
expect(metadata.sectionSummary).toBeDefined();
|
|
1999
|
-
expect(metadata.questionsThisExcerptCanAnswer).toBeDefined();
|
|
2000
|
-
expect(metadata.documentTitle).toBeDefined();
|
|
2001
|
-
});
|
|
2002
|
-
it('preserves metadata on all chunks when multiple are created', async () => {
|
|
2003
|
-
const text = 'Chunk one.\n\nChunk two.\n\nChunk three.';
|
|
2004
|
-
const doc = MDocument.fromText(text, { source: 'multi-chunk', customField: 42 });
|
|
2005
|
-
const chunks = await doc.chunk({
|
|
2006
|
-
strategy: 'character',
|
|
2007
|
-
separator: '\n\n',
|
|
2008
|
-
maxSize: 20,
|
|
2009
|
-
overlap: 0,
|
|
2010
|
-
extract: { keywords: true },
|
|
2011
|
-
});
|
|
2012
|
-
expect(chunks.length).toBeGreaterThan(1);
|
|
2013
|
-
for (const chunk of chunks) {
|
|
2014
|
-
const metadata = chunk.metadata;
|
|
2015
|
-
expect(metadata.source).toBe('multi-chunk');
|
|
2016
|
-
expect(metadata.customField).toBe(42);
|
|
2017
|
-
expect(metadata.excerptKeywords).toBeDefined();
|
|
2018
|
-
}
|
|
2019
|
-
});
|
|
2020
|
-
|
|
2021
|
-
it('overwrites only the matching metadata field with extractor output', async () => {
|
|
2022
|
-
const doc = MDocument.fromText('Test for overwrite', {
|
|
2023
|
-
excerptKeywords: 'original,keywords',
|
|
2024
|
-
unrelatedField: 'should stay',
|
|
2025
|
-
source: 'unit-test',
|
|
2026
|
-
});
|
|
2027
|
-
const chunks = await doc.chunk({ extract: { keywords: true } });
|
|
2028
|
-
const metadata = chunks[0].metadata;
|
|
2029
|
-
expect(metadata.source).toBe('unit-test');
|
|
2030
|
-
expect(metadata.unrelatedField).toBe('should stay');
|
|
2031
|
-
expect(metadata.excerptKeywords).not.toBe('original,keywords'); // Should be new keywords
|
|
2032
|
-
});
|
|
2033
|
-
});
|
|
2034
|
-
describe('MDocument TitleExtractor document grouping integration', () => {
|
|
2035
|
-
it('groups chunks by docId for title extraction (integration)', async () => {
|
|
2036
|
-
const doc = new MDocument({
|
|
2037
|
-
docs: [
|
|
2038
|
-
{ text: 'Alpha chunk 1', metadata: { docId: 'docA' } },
|
|
2039
|
-
{ text: 'Alpha chunk 2', metadata: { docId: 'docA' } },
|
|
2040
|
-
{ text: 'Beta chunk 1', metadata: { docId: 'docB' } },
|
|
2041
|
-
],
|
|
2042
|
-
type: 'text',
|
|
2043
|
-
});
|
|
2044
|
-
|
|
2045
|
-
await doc.extractMetadata({ title: true });
|
|
2046
|
-
const chunks = doc.getDocs();
|
|
2047
|
-
|
|
2048
|
-
const titleA1 = chunks[0].metadata.documentTitle;
|
|
2049
|
-
const titleA2 = chunks[1].metadata.documentTitle;
|
|
2050
|
-
const titleB = chunks[2].metadata.documentTitle;
|
|
2051
|
-
|
|
2052
|
-
expect(titleA1).toBeDefined();
|
|
2053
|
-
expect(titleA2).toBeDefined();
|
|
2054
|
-
expect(titleB).toBeDefined();
|
|
2055
|
-
expect(titleA1).toBe(titleA2);
|
|
2056
|
-
expect(titleA1).not.toBe(titleB);
|
|
2057
|
-
});
|
|
2058
|
-
});
|
|
2059
|
-
|
|
2060
|
-
describe('chunkSentence', () => {
|
|
2061
|
-
it('should preserve sentence structure and avoid mid-sentence breaks', async () => {
|
|
2062
|
-
const text =
|
|
2063
|
-
'A dynamic concert scene captures an energetic, vibrant atmosphere, with a densely packed crowd silhouetted against bright stage lights. The image features beams of white light radiating from multiple projectors, creating dramatic patterns across a darkened room. The audience, comprised of numerous people with raised hands, exudes excitement and engagement, enhancing the lively mood. The setting suggests a large indoor venue, possibly a music or worship event, with text visible on a screen in the background, adding to an immersive experience. The overall composition emphasizes a sense of community and shared enthusiasm, ideal for promoting entertainment events, live concerts, or communal gatherings. The high-contrast lighting and slight haze effect imbue the scene with a modern, electrifying quality.';
|
|
2064
|
-
|
|
2065
|
-
const doc = MDocument.fromText(text);
|
|
2066
|
-
|
|
2067
|
-
const chunks = await doc.chunk({
|
|
2068
|
-
strategy: 'sentence',
|
|
2069
|
-
minSize: 50,
|
|
2070
|
-
maxSize: 450,
|
|
2071
|
-
overlap: 0,
|
|
2072
|
-
sentenceEnders: ['.'],
|
|
2073
|
-
keepSeparator: true,
|
|
2074
|
-
});
|
|
2075
|
-
|
|
2076
|
-
expect(chunks.length).toBeGreaterThan(1);
|
|
2077
|
-
|
|
2078
|
-
chunks.forEach(chunk => {
|
|
2079
|
-
expect(chunk.text.length).toBeGreaterThanOrEqual(50);
|
|
2080
|
-
expect(chunk.text.length).toBeLessThanOrEqual(450);
|
|
2081
|
-
|
|
2082
|
-
expect(chunk.text.startsWith('.')).toBe(false);
|
|
2083
|
-
expect(chunk.text.startsWith(' .')).toBe(false);
|
|
2084
|
-
|
|
2085
|
-
expect(chunk.text.endsWith('.')).toBe(true);
|
|
2086
|
-
});
|
|
2087
|
-
});
|
|
2088
|
-
|
|
2089
|
-
it('should require maxSize parameter', async () => {
|
|
2090
|
-
const doc = MDocument.fromText('Short text.');
|
|
2091
|
-
|
|
2092
|
-
await expect(
|
|
2093
|
-
doc.chunk({
|
|
2094
|
-
strategy: 'sentence',
|
|
2095
|
-
minSize: 50,
|
|
2096
|
-
} as any),
|
|
2097
|
-
).rejects.toThrow('Invalid parameters for sentence strategy: maxSize: Required');
|
|
2098
|
-
});
|
|
2099
|
-
|
|
2100
|
-
it('should handle custom sentence enders', async () => {
|
|
2101
|
-
const text =
|
|
2102
|
-
'First sentence with more content to make it longer. Second sentence with additional content! Third sentence with even more text? Fourth sentence with final content.';
|
|
2103
|
-
|
|
2104
|
-
const doc = MDocument.fromText(text);
|
|
2105
|
-
|
|
2106
|
-
const chunks = await doc.chunk({
|
|
2107
|
-
strategy: 'sentence',
|
|
2108
|
-
maxSize: 100,
|
|
2109
|
-
sentenceEnders: ['.', '!', '?'],
|
|
2110
|
-
keepSeparator: true,
|
|
2111
|
-
});
|
|
2112
|
-
|
|
2113
|
-
expect(chunks.length).toBeGreaterThan(1);
|
|
2114
|
-
|
|
2115
|
-
chunks.forEach(chunk => {
|
|
2116
|
-
const endsWithValidSeparator = chunk.text.endsWith('.') || chunk.text.endsWith('!') || chunk.text.endsWith('?');
|
|
2117
|
-
expect(endsWithValidSeparator).toBe(true);
|
|
2118
|
-
});
|
|
2119
|
-
});
|
|
2120
|
-
|
|
2121
|
-
it('should handle overlap with complete sentences', async () => {
|
|
2122
|
-
const text =
|
|
2123
|
-
'First sentence with some content that makes it quite long. Second sentence with different content that also makes it lengthy. Third sentence with more content to ensure multiple chunks. Fourth sentence with final content to complete the test.';
|
|
2124
|
-
|
|
2125
|
-
const doc = MDocument.fromText(text);
|
|
2126
|
-
|
|
2127
|
-
const chunks = await doc.chunk({
|
|
2128
|
-
strategy: 'sentence',
|
|
2129
|
-
maxSize: 120,
|
|
2130
|
-
overlap: 50,
|
|
2131
|
-
sentenceEnders: ['.'],
|
|
2132
|
-
keepSeparator: true,
|
|
2133
|
-
});
|
|
2134
|
-
|
|
2135
|
-
expect(chunks.length).toBeGreaterThan(1);
|
|
2136
|
-
|
|
2137
|
-
// Check that overlapping chunks share some content
|
|
2138
|
-
if (chunks.length > 1) {
|
|
2139
|
-
for (let i = 1; i < chunks.length; i++) {
|
|
2140
|
-
const currentChunk = chunks[i].text;
|
|
2141
|
-
|
|
2142
|
-
// With overlap, current chunk should start with some content from previous chunk
|
|
2143
|
-
// Just verify that overlap is being applied (chunk 2 starts with overlap from chunk 1)
|
|
2144
|
-
expect(currentChunk.length).toBeGreaterThan(50); // Should include overlap content
|
|
2145
|
-
}
|
|
2146
|
-
}
|
|
2147
|
-
});
|
|
2148
|
-
|
|
2149
|
-
it('should fallback to word splitting for oversized sentences', async () => {
|
|
2150
|
-
const longSentence =
|
|
2151
|
-
'This is an extremely long sentence that ' +
|
|
2152
|
-
'word '.repeat(50) +
|
|
2153
|
-
'and should be split into smaller chunks when it exceeds the maximum size limit.';
|
|
2154
|
-
|
|
2155
|
-
const doc = MDocument.fromText(longSentence);
|
|
2156
|
-
|
|
2157
|
-
const chunks = await doc.chunk({
|
|
2158
|
-
strategy: 'sentence',
|
|
2159
|
-
maxSize: 100,
|
|
2160
|
-
fallbackToWords: true,
|
|
2161
|
-
});
|
|
2162
|
-
|
|
2163
|
-
expect(chunks.length).toBeGreaterThan(1);
|
|
2164
|
-
|
|
2165
|
-
chunks.forEach(chunk => {
|
|
2166
|
-
expect(chunk.text.length).toBeLessThanOrEqual(100);
|
|
2167
|
-
});
|
|
2168
|
-
});
|
|
2169
|
-
|
|
2170
|
-
it('should handle short text appropriately', async () => {
|
|
2171
|
-
const text = 'Short sentence.';
|
|
2172
|
-
|
|
2173
|
-
const doc = MDocument.fromText(text);
|
|
2174
|
-
|
|
2175
|
-
const chunks = await doc.chunk({
|
|
2176
|
-
strategy: 'sentence',
|
|
2177
|
-
minSize: 5,
|
|
2178
|
-
maxSize: 100,
|
|
2179
|
-
sentenceEnders: ['.'],
|
|
2180
|
-
keepSeparator: true,
|
|
2181
|
-
});
|
|
2182
|
-
|
|
2183
|
-
expect(chunks.length).toBe(1);
|
|
2184
|
-
expect(chunks[0].text).toBe(text);
|
|
2185
|
-
});
|
|
2186
|
-
|
|
2187
|
-
it('should group multiple sentences when they fit within target size', async () => {
|
|
2188
|
-
const text = 'Short one. Another short. Third short. Fourth sentence. Fifth one.';
|
|
2189
|
-
|
|
2190
|
-
const doc = MDocument.fromText(text);
|
|
2191
|
-
|
|
2192
|
-
const chunks = await doc.chunk({
|
|
2193
|
-
strategy: 'sentence',
|
|
2194
|
-
minSize: 10,
|
|
2195
|
-
maxSize: 100,
|
|
2196
|
-
targetSize: 40,
|
|
2197
|
-
sentenceEnders: ['.'],
|
|
2198
|
-
keepSeparator: true,
|
|
2199
|
-
});
|
|
2200
|
-
|
|
2201
|
-
// Should group multiple short sentences together
|
|
2202
|
-
expect(chunks.length).toBeLessThan(5); // Less than the number of sentences
|
|
2203
|
-
|
|
2204
|
-
chunks.forEach(chunk => {
|
|
2205
|
-
// Each chunk should contain multiple sentences when possible
|
|
2206
|
-
expect(chunk.text.length).toBeLessThanOrEqual(100);
|
|
2207
|
-
});
|
|
2208
|
-
});
|
|
2209
|
-
|
|
2210
|
-
it('should preserve metadata across chunks', async () => {
|
|
2211
|
-
const text =
|
|
2212
|
-
'First sentence with enough content to make it longer than fifty characters. Second sentence with additional content to ensure multiple chunks. Third sentence with final content.';
|
|
2213
|
-
const metadata = { source: 'test', author: 'jest' };
|
|
2214
|
-
|
|
2215
|
-
const doc = MDocument.fromText(text, metadata);
|
|
2216
|
-
|
|
2217
|
-
const chunks = await doc.chunk({
|
|
2218
|
-
strategy: 'sentence',
|
|
2219
|
-
maxSize: 100,
|
|
2220
|
-
sentenceEnders: ['.'],
|
|
2221
|
-
keepSeparator: true,
|
|
2222
|
-
});
|
|
2223
|
-
|
|
2224
|
-
expect(chunks.length).toBeGreaterThan(1);
|
|
2225
|
-
|
|
2226
|
-
chunks.forEach(chunk => {
|
|
2227
|
-
expect(chunk.metadata.source).toBe('test');
|
|
2228
|
-
expect(chunk.metadata.author).toBe('jest');
|
|
2229
|
-
});
|
|
2230
|
-
});
|
|
2231
|
-
|
|
2232
|
-
it('should handle abbreviations without false sentence breaks', async () => {
|
|
2233
|
-
const text =
|
|
2234
|
-
'Dr. Smith went to the U.S.A. at 3:30 a.m. on Monday. He met with Prof. Johnson at the U.N. headquarters.';
|
|
2235
|
-
|
|
2236
|
-
const doc = MDocument.fromText(text);
|
|
2237
|
-
const chunks = await doc.chunk({
|
|
2238
|
-
strategy: 'sentence',
|
|
2239
|
-
maxSize: 200,
|
|
2240
|
-
sentenceEnders: ['.'],
|
|
2241
|
-
keepSeparator: true,
|
|
2242
|
-
});
|
|
2243
|
-
|
|
2244
|
-
expect(chunks.length).toBeGreaterThanOrEqual(1);
|
|
2245
|
-
expect(chunks.length).toBeLessThanOrEqual(2);
|
|
2246
|
-
|
|
2247
|
-
const allText = chunks.map(c => c.text).join(' ');
|
|
2248
|
-
expect(allText).toContain('Dr. Smith'); // Should keep Dr. together
|
|
2249
|
-
expect(allText).toContain('U.S.A.'); // Should keep U.S.A. together
|
|
2250
|
-
expect(allText).toContain('a.m.'); // Should keep a.m. together
|
|
2251
|
-
expect(allText).toContain('Prof. Johnson'); // Should keep Prof. together
|
|
2252
|
-
expect(allText).toContain('U.N.'); // Should keep U.N. together
|
|
2253
|
-
|
|
2254
|
-
expect(allText).not.toContain('Dr '); // No broken Dr.
|
|
2255
|
-
expect(allText).not.toContain('Prof '); // No broken Prof.
|
|
2256
|
-
});
|
|
2257
|
-
|
|
2258
|
-
it('should respect fallbackToCharacters setting', async () => {
|
|
2259
|
-
const oversizedWord = 'supercalifragilisticexpialidocious'.repeat(5);
|
|
2260
|
-
const text = `Short sentence. ${oversizedWord}.`;
|
|
2261
|
-
|
|
2262
|
-
const doc1 = MDocument.fromText(text);
|
|
2263
|
-
const chunksWithFallback = await doc1.chunk({
|
|
2264
|
-
strategy: 'sentence',
|
|
2265
|
-
maxSize: 50,
|
|
2266
|
-
fallbackToWords: true,
|
|
2267
|
-
fallbackToCharacters: true,
|
|
2268
|
-
});
|
|
2269
|
-
|
|
2270
|
-
// Should split the oversized word
|
|
2271
|
-
expect(chunksWithFallback.length).toBeGreaterThan(2);
|
|
2272
|
-
|
|
2273
|
-
const doc2 = MDocument.fromText(text);
|
|
2274
|
-
const chunksWithoutFallback = await doc2.chunk({
|
|
2275
|
-
strategy: 'sentence',
|
|
2276
|
-
maxSize: 50,
|
|
2277
|
-
fallbackToWords: true,
|
|
2278
|
-
fallbackToCharacters: false,
|
|
2279
|
-
});
|
|
2280
|
-
|
|
2281
|
-
// Should have fewer chunks (oversized word kept intact)
|
|
2282
|
-
expect(chunksWithoutFallback.length).toBeLessThan(chunksWithFallback.length);
|
|
2283
|
-
|
|
2284
|
-
// Verify fallback disabled keeps oversized content
|
|
2285
|
-
const oversizedChunk = chunksWithoutFallback.find(chunk => chunk.text.length > 50);
|
|
2286
|
-
expect(oversizedChunk).toBeDefined();
|
|
2287
|
-
});
|
|
2288
|
-
|
|
2289
|
-
it('should handle complex punctuation and edge cases', async () => {
|
|
2290
|
-
const text =
|
|
2291
|
-
'Version 2.0 was released. The score was 3.14159. Mr. & Mrs. Smith arrived at 12:30 p.m. What happened next?';
|
|
2292
|
-
|
|
2293
|
-
const doc = MDocument.fromText(text);
|
|
2294
|
-
const chunks = await doc.chunk({
|
|
2295
|
-
strategy: 'sentence',
|
|
2296
|
-
maxSize: 200,
|
|
2297
|
-
sentenceEnders: ['.', '?'],
|
|
2298
|
-
keepSeparator: true,
|
|
2299
|
-
});
|
|
2300
|
-
|
|
2301
|
-
expect(chunks.length).toBeGreaterThanOrEqual(1);
|
|
2302
|
-
expect(chunks.length).toBeLessThanOrEqual(4);
|
|
2303
|
-
|
|
2304
|
-
const allText = chunks.map(c => c.text).join(' ');
|
|
2305
|
-
expect(allText).toContain('2.0'); // Should keep version numbers intact
|
|
2306
|
-
expect(allText).toContain('3.14159'); // Should keep decimals intact
|
|
2307
|
-
expect(allText).toContain('p.m.'); // Should keep time abbreviations intact
|
|
2308
|
-
expect(allText).toContain('What happened next?'); // Should end with question
|
|
2309
|
-
|
|
2310
|
-
// Should not break on decimals or version numbers
|
|
2311
|
-
expect(allText).not.toContain('2 '); // No broken version number
|
|
2312
|
-
expect(allText).not.toContain('3 '); // No broken decimal
|
|
2313
|
-
});
|
|
2314
|
-
});
|
|
2315
|
-
|
|
2316
|
-
describe('chunkSemanticMarkdown', () => {
|
|
2317
|
-
it('should merge small sections based on token threshold', async () => {
|
|
2318
|
-
const text = `# Introduction
|
|
2319
|
-
Brief intro paragraph.
|
|
2320
|
-
|
|
2321
|
-
## Setup Guide
|
|
2322
|
-
Short setup instructions.
|
|
2323
|
-
|
|
2324
|
-
### Prerequisites
|
|
2325
|
-
Very short list.
|
|
2326
|
-
|
|
2327
|
-
### Installation Steps
|
|
2328
|
-
Very detailed installation process with code examples and explanations that would normally be quite long but in this test we'll keep it moderate length for testing purposes.
|
|
2329
|
-
|
|
2330
|
-
## Advanced Configuration
|
|
2331
|
-
Another section with moderate content for testing the merging algorithm.`;
|
|
2332
|
-
|
|
2333
|
-
const doc = MDocument.fromMarkdown(text);
|
|
2334
|
-
|
|
2335
|
-
await doc.chunk({
|
|
2336
|
-
strategy: 'semantic-markdown',
|
|
2337
|
-
joinThreshold: 200,
|
|
2338
|
-
});
|
|
2339
|
-
|
|
2340
|
-
const chunks = doc.getText();
|
|
2341
|
-
const docs = doc.getDocs();
|
|
2342
|
-
|
|
2343
|
-
expect(chunks.length).toBeLessThan(6);
|
|
2344
|
-
|
|
2345
|
-
expect(docs[0]?.metadata?.tokenCount).toBeDefined();
|
|
2346
|
-
expect(typeof docs[0]?.metadata?.tokenCount).toBe('number');
|
|
2347
|
-
expect(docs[0]?.metadata?.tokenCount).toBeGreaterThan(0);
|
|
2348
|
-
});
|
|
2349
|
-
|
|
2350
|
-
it('should respect sibling/parent relationships in merging', async () => {
|
|
2351
|
-
const text = `# Main Document
|
|
2352
|
-
|
|
2353
|
-
## Section A
|
|
2354
|
-
Content for section A that is moderately long to ensure we have enough tokens for testing the semantic merging algorithm properly.
|
|
2355
|
-
|
|
2356
|
-
### Subsection A1
|
|
2357
|
-
This subsection has more content than the previous version to test the hierarchical merging behavior.
|
|
2358
|
-
|
|
2359
|
-
### Subsection A2
|
|
2360
|
-
Another subsection with substantial content to verify proper semantic boundary handling.
|
|
2361
|
-
|
|
2362
|
-
## Section B
|
|
2363
|
-
Content for section B that is also moderately sized with meaningful text to test cross-section merging behavior.
|
|
2364
|
-
|
|
2365
|
-
### Subsection B1
|
|
2366
|
-
This final subsection contains enough content to test the bottom-up merging algorithm effectively.`;
|
|
2367
|
-
|
|
2368
|
-
const doc = MDocument.fromMarkdown(text);
|
|
2369
|
-
|
|
2370
|
-
await doc.chunk({
|
|
2371
|
-
strategy: 'semantic-markdown',
|
|
2372
|
-
joinThreshold: 100, // Threshold that allows some merging but not everything
|
|
2373
|
-
});
|
|
2374
|
-
|
|
2375
|
-
const chunks = doc.getText();
|
|
2376
|
-
const docs = doc.getDocs();
|
|
2377
|
-
|
|
2378
|
-
// Should create fewer chunks than original sections due to merging
|
|
2379
|
-
expect(chunks.length).toBeLessThan(7);
|
|
2380
|
-
expect(chunks.length).toBeGreaterThanOrEqual(1);
|
|
2381
|
-
|
|
2382
|
-
// Verify sections maintain semantic coherence
|
|
2383
|
-
const hasSection = chunks.some(chunk => chunk.includes('Section A') || chunk.includes('Subsection A1'));
|
|
2384
|
-
expect(hasSection).toBe(true);
|
|
2385
|
-
|
|
2386
|
-
expect(docs[0]?.metadata?.tokenCount).toBeDefined();
|
|
2387
|
-
expect(docs[0]?.metadata?.tokenCount).toBeGreaterThan(0);
|
|
2388
|
-
});
|
|
2389
|
-
|
|
2390
|
-
it('should correctly chunk a controlled test document', async () => {
|
|
2391
|
-
const controlledTestMarkdown = `# My Test Document
|
|
2392
|
-
|
|
2393
|
-
This is a short preamble to test how content before the first header is handled. It should be merged with the first section if that section is small enough.
|
|
2394
|
-
|
|
2395
|
-
## Chapter 1: The Small Sections
|
|
2396
|
-
|
|
2397
|
-
This is the introduction to Chapter 1. It contains several small subsections that are perfect candidates for merging.
|
|
2398
|
-
|
|
2399
|
-
### Section 1.1: A Tiny Topic
|
|
2400
|
-
|
|
2401
|
-
Just a few words here.
|
|
2402
|
-
|
|
2403
|
-
### Section 1.2: Another Tiny Topic
|
|
2404
|
-
|
|
2405
|
-
A few more words to make up a small paragraph.
|
|
2406
|
-
|
|
2407
|
-
## Chapter 2: The Big Section
|
|
2408
|
-
|
|
2409
|
-
This chapter has a very large section that should NOT be merged with its sibling because it is over the token limit all by itself.
|
|
2410
|
-
|
|
2411
|
-
\`\`\`python
|
|
2412
|
-
# This is a large block of Python code.
|
|
2413
|
-
# It is designed to have a high token count to test the merging threshold.
|
|
2414
|
-
import os
|
|
2415
|
-
import sys
|
|
2416
|
-
|
|
2417
|
-
class DataProcessor:
|
|
2418
|
-
def __init__(self, data):
|
|
2419
|
-
self.data = data
|
|
2420
|
-
self.length = len(data)
|
|
2421
|
-
|
|
2422
|
-
def process(self):
|
|
2423
|
-
"""
|
|
2424
|
-
This is a long docstring to add even more tokens to the count.
|
|
2425
|
-
We will iterate through the data and perform some kind of mock processing.
|
|
2426
|
-
The goal is to exceed the joinThreshold of 250 tokens easily.
|
|
2427
|
-
Let's add more lines to be sure.
|
|
2428
|
-
Line 1
|
|
2429
|
-
Line 2
|
|
2430
|
-
Line 3
|
|
2431
|
-
Line 4
|
|
2432
|
-
Line 5
|
|
2433
|
-
...and so on.
|
|
2434
|
-
"""
|
|
2435
|
-
results = []
|
|
2436
|
-
for i, item in enumerate(self.data):
|
|
2437
|
-
# A mock calculation
|
|
2438
|
-
processed_item = (item * i) + self.length
|
|
2439
|
-
results.append(processed_item)
|
|
2440
|
-
return results
|
|
2441
|
-
|
|
2442
|
-
# Let's make sure this section is large enough.
|
|
2443
|
-
# More comments and code will help.
|
|
2444
|
-
def another_function_to_add_tokens():
|
|
2445
|
-
"""Another long docstring for good measure."""
|
|
2446
|
-
x = 1
|
|
2447
|
-
y = 2
|
|
2448
|
-
z = x + y
|
|
2449
|
-
print(f"The result is {z}")
|
|
2450
|
-
# End of function
|
|
2451
|
-
\`\`\`
|
|
2452
|
-
|
|
2453
|
-
## Chapter 3: The Mixed Bag
|
|
2454
|
-
|
|
2455
|
-
This chapter contains a mix of small and medium sections.
|
|
2456
|
-
|
|
2457
|
-
### Section 3.1: A Medium Section
|
|
2458
|
-
|
|
2459
|
-
This section is moderately sized. It's not huge, but it has enough content to be a meaningful chunk on its own. We'll aim for about 150 tokens here so it can potentially merge with a small sibling.
|
|
2460
|
-
|
|
2461
|
-
### Section 3.2: A Final Small Section
|
|
2462
|
-
|
|
2463
|
-
This final section is very small and should definitely be merged into its predecessor, Section 3.1, because their combined total will be under the threshold.
|
|
2464
|
-
`;
|
|
2465
|
-
|
|
2466
|
-
const doc = MDocument.fromMarkdown(controlledTestMarkdown);
|
|
2467
|
-
await doc.chunk({
|
|
2468
|
-
strategy: 'semantic-markdown',
|
|
2469
|
-
joinThreshold: 250,
|
|
2470
|
-
modelName: 'gpt-3.5-turbo',
|
|
2471
|
-
});
|
|
2472
|
-
|
|
2473
|
-
const chunks = doc.getText();
|
|
2474
|
-
expect(chunks).toHaveLength(3);
|
|
2475
|
-
expect(chunks[0]).toContain('# My Test Document');
|
|
2476
|
-
expect(chunks[0]).toContain('### Section 1.2: Another Tiny Topic');
|
|
2477
|
-
expect(chunks[1]).toContain('## Chapter 2: The Big Section');
|
|
2478
|
-
expect(chunks[2]).toContain('## Chapter 3: The Mixed Bag');
|
|
2479
|
-
expect(chunks[2]).toContain('### Section 3.2: A Final Small Section');
|
|
2480
|
-
});
|
|
2481
|
-
|
|
2482
|
-
it('should preserve code blocks during merging', async () => {
|
|
2483
|
-
const text = `# Code Example
|
|
2484
|
-
|
|
2485
|
-
## Installation
|
|
2486
|
-
Install the package:
|
|
2487
|
-
|
|
2488
|
-
\`\`\`bash
|
|
2489
|
-
npm install example-package
|
|
2490
|
-
\`\`\`
|
|
2491
|
-
|
|
2492
|
-
## Usage
|
|
2493
|
-
Here's how to use it:
|
|
2494
|
-
|
|
2495
|
-
\`\`\`javascript
|
|
2496
|
-
const example = require('example-package');
|
|
2497
|
-
example.doSomething();
|
|
2498
|
-
\`\`\`
|
|
2499
|
-
|
|
2500
|
-
## Configuration
|
|
2501
|
-
Set up your config file.`;
|
|
2502
|
-
|
|
2503
|
-
const doc = MDocument.fromMarkdown(text);
|
|
2504
|
-
|
|
2505
|
-
await doc.chunk({
|
|
2506
|
-
strategy: 'semantic-markdown',
|
|
2507
|
-
joinThreshold: 300,
|
|
2508
|
-
});
|
|
2509
|
-
|
|
2510
|
-
const chunks = doc.getText();
|
|
2511
|
-
|
|
2512
|
-
// Code blocks should be preserved intact
|
|
2513
|
-
expect(chunks.some(chunk => chunk.includes('```bash'))).toBe(true);
|
|
2514
|
-
expect(chunks.some(chunk => chunk.includes('```javascript'))).toBe(true);
|
|
2515
|
-
|
|
2516
|
-
// Should not split within code blocks
|
|
2517
|
-
const bashChunk = chunks.find(chunk => chunk.includes('npm install'));
|
|
2518
|
-
expect(bashChunk).toBeDefined();
|
|
2519
|
-
expect(bashChunk).toContain('```bash');
|
|
2520
|
-
});
|
|
2521
|
-
|
|
2522
|
-
it('should work with different tiktoken models', async () => {
|
|
2523
|
-
const text = `# Test Document
|
|
2524
|
-
|
|
2525
|
-
## Section 1
|
|
2526
|
-
Some content for testing different tiktoken models and their token counting accuracy.
|
|
2527
|
-
|
|
2528
|
-
## Section 2
|
|
2529
|
-
More content to verify the token counting works correctly across different model encodings.`;
|
|
2530
|
-
|
|
2531
|
-
const doc = MDocument.fromMarkdown(text);
|
|
2532
|
-
|
|
2533
|
-
await doc.chunk({
|
|
2534
|
-
strategy: 'semantic-markdown',
|
|
2535
|
-
joinThreshold: 100,
|
|
2536
|
-
modelName: 'gpt-4',
|
|
2537
|
-
});
|
|
2538
|
-
|
|
2539
|
-
const chunks = doc.getText();
|
|
2540
|
-
const docs = doc.getDocs();
|
|
2541
|
-
|
|
2542
|
-
expect(chunks.length).toBeGreaterThan(0);
|
|
2543
|
-
expect(docs[0]?.metadata?.tokenCount).toBeDefined();
|
|
2544
|
-
expect(typeof docs[0]?.metadata?.tokenCount).toBe('number');
|
|
2545
|
-
});
|
|
2546
|
-
|
|
2547
|
-
it('should handle documents with no headers', async () => {
|
|
2548
|
-
const text = `This is a document with no markdown headers.
|
|
2549
|
-
|
|
2550
|
-
Just regular paragraphs of text that should be processed as a single semantic unit since there are no headers to split on.
|
|
2551
|
-
|
|
2552
|
-
More paragraphs here to test the behavior.`;
|
|
2553
|
-
|
|
2554
|
-
const doc = MDocument.fromMarkdown(text);
|
|
2555
|
-
|
|
2556
|
-
await doc.chunk({
|
|
2557
|
-
strategy: 'semantic-markdown',
|
|
2558
|
-
joinThreshold: 200,
|
|
2559
|
-
});
|
|
2560
|
-
|
|
2561
|
-
const chunks = doc.getText();
|
|
2562
|
-
|
|
2563
|
-
// Should return single chunk since no headers to split on
|
|
2564
|
-
expect(chunks.length).toBe(1);
|
|
2565
|
-
expect(chunks[0]).toContain('This is a document with no markdown headers');
|
|
2566
|
-
});
|
|
2567
|
-
|
|
2568
|
-
it('should handle empty sections correctly', async () => {
|
|
2569
|
-
const text = `# Document
|
|
2570
|
-
|
|
2571
|
-
## Empty Section
|
|
2572
|
-
|
|
2573
|
-
## Another Section
|
|
2574
|
-
Some content here.
|
|
2575
|
-
|
|
2576
|
-
## Final Empty Section
|
|
2577
|
-
|
|
2578
|
-
`;
|
|
2579
|
-
|
|
2580
|
-
const doc = MDocument.fromMarkdown(text);
|
|
2581
|
-
|
|
2582
|
-
await doc.chunk({
|
|
2583
|
-
strategy: 'semantic-markdown',
|
|
2584
|
-
joinThreshold: 100,
|
|
2585
|
-
});
|
|
2586
|
-
|
|
2587
|
-
const chunks = doc.getText();
|
|
2588
|
-
|
|
2589
|
-
// Should handle empty sections gracefully
|
|
2590
|
-
expect(chunks.length).toBeGreaterThan(0);
|
|
2591
|
-
expect(chunks.some(chunk => chunk.includes('Some content here'))).toBe(true);
|
|
2592
|
-
});
|
|
2593
|
-
|
|
2594
|
-
it('should maintain bottom-up merging order (deepest first)', async () => {
|
|
2595
|
-
const text = `# Root
|
|
2596
|
-
|
|
2597
|
-
## Level 2A
|
|
2598
|
-
Content 2A
|
|
2599
|
-
|
|
2600
|
-
### Level 3A
|
|
2601
|
-
Short content 3A
|
|
2602
|
-
|
|
2603
|
-
#### Level 4A
|
|
2604
|
-
Short content 4A
|
|
2605
|
-
|
|
2606
|
-
### Level 3B
|
|
2607
|
-
Short content 3B
|
|
2608
|
-
|
|
2609
|
-
## Level 2B
|
|
2610
|
-
Content 2B`;
|
|
2611
|
-
|
|
2612
|
-
const doc = MDocument.fromMarkdown(text);
|
|
2613
|
-
|
|
2614
|
-
await doc.chunk({
|
|
2615
|
-
strategy: 'semantic-markdown',
|
|
2616
|
-
joinThreshold: 200,
|
|
2617
|
-
});
|
|
2618
|
-
|
|
2619
|
-
const chunks = doc.getText();
|
|
2620
|
-
|
|
2621
|
-
// The algorithm should merge from deepest level first
|
|
2622
|
-
// Level 4 should merge with Level 3, then Level 3s might merge with Level 2
|
|
2623
|
-
expect(chunks.length).toBeLessThan(7); // Less than original 7 sections
|
|
2624
|
-
|
|
2625
|
-
// Verify deep nesting is preserved in merged content
|
|
2626
|
-
const deepChunk = chunks.find(chunk => chunk.includes('Level 4A') && chunk.includes('Level 3A'));
|
|
2627
|
-
expect(deepChunk).toBeDefined();
|
|
2628
|
-
});
|
|
2629
|
-
|
|
2630
|
-
it('should compare token accuracy vs character-based sizing', async () => {
|
|
2631
|
-
// Use text with unicode and varying token densities
|
|
2632
|
-
const text = `# Test Document
|
|
2633
|
-
|
|
2634
|
-
## Unicode Section
|
|
2635
|
-
This section contains unicode characters: café, naïve, résumé, 中文, العربية
|
|
2636
|
-
|
|
2637
|
-
## Code Section
|
|
2638
|
-
\`\`\`python
|
|
2639
|
-
def function_with_long_name_and_parameters(param1, param2, param3):
|
|
2640
|
-
return param1 + param2 + param3
|
|
2641
|
-
\`\`\`
|
|
2642
|
-
|
|
2643
|
-
## Regular Section
|
|
2644
|
-
Regular English text without special characters.`;
|
|
2645
|
-
|
|
2646
|
-
const doc = MDocument.fromMarkdown(text);
|
|
2647
|
-
|
|
2648
|
-
await doc.chunk({
|
|
2649
|
-
strategy: 'semantic-markdown',
|
|
2650
|
-
joinThreshold: 150, // Token-based threshold
|
|
2651
|
-
});
|
|
2652
|
-
|
|
2653
|
-
const docs = doc.getDocs();
|
|
2654
|
-
|
|
2655
|
-
// Verify token counts are provided in metadata
|
|
2656
|
-
docs.forEach(doc => {
|
|
2657
|
-
expect(doc.metadata.tokenCount).toBeDefined();
|
|
2658
|
-
expect(typeof doc.metadata.tokenCount).toBe('number');
|
|
2659
|
-
expect(doc.metadata.tokenCount).toBeGreaterThan(0);
|
|
2660
|
-
});
|
|
2661
|
-
|
|
2662
|
-
// Token count should be different from character count for unicode text
|
|
2663
|
-
const unicodeDoc = docs.find(doc => doc.text.includes('café'));
|
|
2664
|
-
if (unicodeDoc) {
|
|
2665
|
-
const charCount = unicodeDoc.text.length;
|
|
2666
|
-
const tokenCount = unicodeDoc.metadata.tokenCount;
|
|
2667
|
-
|
|
2668
|
-
// For text with unicode, token count is often different from char count
|
|
2669
|
-
expect(tokenCount).toBeDefined();
|
|
2670
|
-
expect(tokenCount).not.toBe(charCount);
|
|
2671
|
-
}
|
|
2672
|
-
});
|
|
2673
|
-
|
|
2674
|
-
it('should handle documents with only deep headers (no top-level sections)', async () => {
|
|
2675
|
-
const text = `### Deep Section 1
|
|
2676
|
-
Short content for deep section 1.
|
|
2677
|
-
|
|
2678
|
-
#### Very Deep Section 1.1
|
|
2679
|
-
Even shorter content.
|
|
2680
|
-
|
|
2681
|
-
#### Very Deep Section 1.2
|
|
2682
|
-
Another short subsection.
|
|
2683
|
-
|
|
2684
|
-
### Deep Section 2
|
|
2685
|
-
Short content for deep section 2.
|
|
2686
|
-
|
|
2687
|
-
#### Very Deep Section 2.1
|
|
2688
|
-
Final short content.`;
|
|
2689
|
-
|
|
2690
|
-
const doc = MDocument.fromMarkdown(text);
|
|
2691
|
-
|
|
2692
|
-
await doc.chunk({
|
|
2693
|
-
strategy: 'semantic-markdown',
|
|
2694
|
-
joinThreshold: 200,
|
|
2695
|
-
});
|
|
2696
|
-
|
|
2697
|
-
const chunks = doc.getText();
|
|
2698
|
-
const docs = doc.getDocs();
|
|
2699
|
-
|
|
2700
|
-
// Should merge the small deep sections together
|
|
2701
|
-
expect(chunks.length).toBeLessThan(5);
|
|
2702
|
-
expect(chunks.length).toBeGreaterThan(0);
|
|
2703
|
-
|
|
2704
|
-
// Verify deep headers are preserved in merged content
|
|
2705
|
-
const deepChunk = chunks.find(
|
|
2706
|
-
chunk => chunk.includes('### Deep Section 1') && chunk.includes('#### Very Deep Section'),
|
|
2707
|
-
);
|
|
2708
|
-
expect(deepChunk).toBeDefined();
|
|
2709
|
-
|
|
2710
|
-
expect(docs[0]?.metadata?.tokenCount).toBeDefined();
|
|
2711
|
-
});
|
|
2712
|
-
|
|
2713
|
-
it('should leave very large individual sections intact (exceeding joinThreshold)', async () => {
|
|
2714
|
-
const largeContent = 'This is a very long section. '.repeat(50); // ~1500 tokens
|
|
2715
|
-
const text = `# Document Title
|
|
2716
|
-
|
|
2717
|
-
## Small Section
|
|
2718
|
-
Small content here.
|
|
2719
|
-
|
|
2720
|
-
## Oversized Section
|
|
2721
|
-
${largeContent}
|
|
2722
|
-
|
|
2723
|
-
\`\`\`javascript
|
|
2724
|
-
// Adding code to make it even larger
|
|
2725
|
-
function processData(data) {
|
|
2726
|
-
const results = [];
|
|
2727
|
-
for (let i = 0; i < data.length; i++) {
|
|
2728
|
-
const processed = data[i] * 2 + Math.random();
|
|
2729
|
-
results.push(processed);
|
|
2730
|
-
console.log(\`Processed item \${i}: \${processed}\`);
|
|
2731
|
-
}
|
|
2732
|
-
return results;
|
|
2733
|
-
}
|
|
2734
|
-
|
|
2735
|
-
// More code to ensure we exceed the threshold
|
|
2736
|
-
class DataManager {
|
|
2737
|
-
constructor(initialData) {
|
|
2738
|
-
this.data = initialData;
|
|
2739
|
-
this.processedCount = 0;
|
|
2740
|
-
}
|
|
2741
|
-
|
|
2742
|
-
process() {
|
|
2743
|
-
this.data.forEach((item, index) => {
|
|
2744
|
-
// Process each item
|
|
2745
|
-
this.processedCount++;
|
|
2746
|
-
});
|
|
2747
|
-
}
|
|
2748
|
-
}
|
|
2749
|
-
\`\`\`
|
|
2750
|
-
|
|
2751
|
-
## Another Small Section
|
|
2752
|
-
More small content.`;
|
|
2753
|
-
|
|
2754
|
-
const doc = MDocument.fromMarkdown(text);
|
|
2755
|
-
|
|
2756
|
-
await doc.chunk({
|
|
2757
|
-
strategy: 'semantic-markdown',
|
|
2758
|
-
joinThreshold: 300, // Much smaller than the oversized section
|
|
2759
|
-
});
|
|
2760
|
-
|
|
2761
|
-
const chunks = doc.getText();
|
|
2762
|
-
const docs = doc.getDocs();
|
|
2763
|
-
|
|
2764
|
-
expect(chunks.length).toBeGreaterThan(1);
|
|
2765
|
-
|
|
2766
|
-
// The oversized section should be left as its own chunk
|
|
2767
|
-
const oversizedChunk = chunks.find(chunk => chunk.includes('Oversized Section'));
|
|
2768
|
-
expect(oversizedChunk).toBeDefined();
|
|
2769
|
-
expect(oversizedChunk).toContain('This is a very long section.');
|
|
2770
|
-
|
|
2771
|
-
// Verify the oversized chunk exceeds the threshold
|
|
2772
|
-
const oversizedDoc = docs.find(doc => doc.text.includes('Oversized Section'));
|
|
2773
|
-
expect(oversizedDoc?.metadata?.tokenCount).toBeGreaterThan(300);
|
|
2774
|
-
|
|
2775
|
-
// Small sections should still be merged where possible
|
|
2776
|
-
const smallChunk = chunks.find(chunk => chunk.includes('Small Section') && !chunk.includes('Oversized'));
|
|
2777
|
-
expect(smallChunk).toBeDefined();
|
|
2778
|
-
});
|
|
2779
|
-
|
|
2780
|
-
it('should handle mixed header levels with gaps (skipping levels)', async () => {
|
|
2781
|
-
const text = `# Top Level
|
|
2782
|
-
|
|
2783
|
-
#### Deep Level A (skipped H2 and H3)
|
|
2784
|
-
Content for deep level A that is moderately sized with enough text to make it substantial. This section needs to have sufficient content to test the merging behavior properly when header levels are skipped. Let's add more content to ensure we have enough tokens to work with.
|
|
2785
|
-
|
|
2786
|
-
## Middle Level
|
|
2787
|
-
Content for middle level section that also needs to be substantial enough to test the algorithm. This section should have enough content to be meaningful when testing the semantic markdown chunking with mixed header levels.
|
|
2788
|
-
|
|
2789
|
-
##### Very Deep Level (skipped H3 and H4)
|
|
2790
|
-
Short content for very deep level that should still be substantial enough for testing. Even though this is marked as short, we need enough content to make the test meaningful.
|
|
2791
|
-
|
|
2792
|
-
# Another Top Level
|
|
2793
|
-
|
|
2794
|
-
This second top-level section should definitely create a boundary that prevents everything from merging into a single chunk. We need substantial content here to ensure proper separation.
|
|
2795
|
-
|
|
2796
|
-
### Medium Deep Level (skipped H2)
|
|
2797
|
-
Final content for testing header level gaps. This section also needs substantial content to ensure we're testing the algorithm properly with realistic content sizes.`;
|
|
2798
|
-
|
|
2799
|
-
const doc = MDocument.fromMarkdown(text);
|
|
2800
|
-
|
|
2801
|
-
await doc.chunk({
|
|
2802
|
-
strategy: 'semantic-markdown',
|
|
2803
|
-
joinThreshold: 150, // Smaller threshold to encourage more chunks
|
|
2804
|
-
});
|
|
2805
|
-
|
|
2806
|
-
const chunks = doc.getText();
|
|
2807
|
-
|
|
2808
|
-
// Should handle the gaps gracefully - expect at least 2 chunks due to the second top-level section
|
|
2809
|
-
expect(chunks.length).toBeGreaterThanOrEqual(1);
|
|
2810
|
-
|
|
2811
|
-
// Verify headers with gaps are preserved
|
|
2812
|
-
expect(chunks.some(chunk => chunk.includes('#### Deep Level A'))).toBe(true);
|
|
2813
|
-
expect(chunks.some(chunk => chunk.includes('##### Very Deep Level'))).toBe(true);
|
|
2814
|
-
expect(chunks.some(chunk => chunk.includes('### Medium Deep Level'))).toBe(true);
|
|
2815
|
-
|
|
2816
|
-
// Verify both top-level sections are present
|
|
2817
|
-
expect(chunks.some(chunk => chunk.includes('# Top Level'))).toBe(true);
|
|
2818
|
-
expect(chunks.some(chunk => chunk.includes('# Another Top Level'))).toBe(true);
|
|
2819
|
-
});
|
|
2820
|
-
|
|
2821
|
-
it('should handle large documents efficiently (performance test)', async () => {
|
|
2822
|
-
const sections: string[] = [];
|
|
2823
|
-
for (let i = 1; i <= 100; i++) {
|
|
2824
|
-
sections.push(`## Section ${i}`);
|
|
2825
|
-
sections.push(`This is content for section ${i}. `.repeat(10)); // ~100 tokens each
|
|
2826
|
-
|
|
2827
|
-
for (let j = 1; j <= 3; j++) {
|
|
2828
|
-
sections.push(`### Subsection ${i}.${j}`);
|
|
2829
|
-
sections.push(`This is subsection content ${i}.${j}. `.repeat(5)); // ~50 tokens each
|
|
2830
|
-
}
|
|
2831
|
-
}
|
|
2832
|
-
|
|
2833
|
-
const largeText = `# Large Test Document\n\n${sections.join('\n\n')}`;
|
|
2834
|
-
|
|
2835
|
-
const doc = MDocument.fromMarkdown(largeText);
|
|
2836
|
-
|
|
2837
|
-
const startTime = Date.now();
|
|
2838
|
-
|
|
2839
|
-
await doc.chunk({
|
|
2840
|
-
strategy: 'semantic-markdown',
|
|
2841
|
-
joinThreshold: 300,
|
|
2842
|
-
});
|
|
2843
|
-
|
|
2844
|
-
const duration = Date.now() - startTime;
|
|
2845
|
-
const chunks = doc.getText();
|
|
2846
|
-
const docs = doc.getDocs();
|
|
2847
|
-
|
|
2848
|
-
expect(duration).toBeLessThan(5000);
|
|
2849
|
-
|
|
2850
|
-
expect(chunks.length).toBeGreaterThan(10);
|
|
2851
|
-
expect(chunks.length).toBeLessThan(400);
|
|
2852
|
-
|
|
2853
|
-
docs.forEach(doc => {
|
|
2854
|
-
expect(doc.metadata.tokenCount).toBeDefined();
|
|
2855
|
-
expect(doc.metadata.tokenCount).toBeGreaterThan(0);
|
|
2856
|
-
});
|
|
2857
|
-
}, 10000);
-
-    it('should maintain semantic coherence with very small joinThreshold', async () => {
-      const text = `# Document
-
-This is a substantial preamble section that should have enough content to be meaningful in token counting. We need sufficient content here to test the algorithm properly.
-
-## Section A
-Brief content for section A that needs to be expanded to ensure we have meaningful token counts for testing the semantic markdown chunking algorithm with a very small threshold.
-
-### Sub A1
-More substantial content here for subsection A1. This content needs to be long enough to have a reasonable token count that will affect the merging decisions in our semantic chunking algorithm.
-
-### Sub A2
-Even more substantial content for subsection A2. Again, we need enough tokens here to make the test meaningful and to properly exercise the algorithm's decision-making process.
-
-## Section B
-Another section with substantial content for section B. This section should also have enough content to be meaningful in our token-based chunking strategy testing.
-
-### Sub B1
-Final substantial content for subsection B1. This content should complete our test document with enough tokens to properly test the small threshold behavior.`;
-
-      const doc = MDocument.fromMarkdown(text);
-
-      await doc.chunk({
-        strategy: 'semantic-markdown',
-        joinThreshold: 30, // Even smaller threshold to force separation
-      });
-
-      const chunks = doc.getText();
-
-      // With a very small threshold, we should get at least some separation
-      expect(chunks.length).toBeGreaterThanOrEqual(1);
-
-      // Verify all chunks have meaningful content
-      chunks.forEach(chunk => {
-        expect(chunk.trim().length).toBeGreaterThan(0);
-        expect(chunk.trim().length).toBeGreaterThan(10);
-      });
-
-      // Verify we have the main document structure preserved
-      const allText = chunks.join(' ');
-      expect(allText).toContain('# Document');
-      expect(allText).toContain('## Section A');
-      expect(allText).toContain('## Section B');
-    });
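
The two joinThreshold values used above (300 for the performance run, 30 here) exercise the same rule: adjacent sibling sections are merged while their combined token count stays at or below the threshold. The actual logic shipped in src/document/transformers/semantic-markdown.ts, which this diff removes; the sketch below is only an illustration of that rule, and the Section shape and mergeSections name are hypothetical.

```ts
// Hypothetical sketch of threshold-based merging; this is not the removed
// semantic-markdown.ts implementation.
interface Section {
  text: string;
  tokenCount: number;
}

function mergeSections(sections: Section[], joinThreshold: number): Section[] {
  const merged: Section[] = [];
  for (const section of sections) {
    const last = merged[merged.length - 1];
    if (last && last.tokenCount + section.tokenCount <= joinThreshold) {
      // Small neighbours are joined into one chunk.
      last.text += `\n\n${section.text}`;
      last.tokenCount += section.tokenCount;
    } else {
      // Anything that would cross the threshold starts a new chunk.
      merged.push({ ...section });
    }
  }
  return merged;
}
```

Under such a rule, joinThreshold: 30 leaves nearly every section as its own chunk, while joinThreshold: 300 lets most ~50-token subsections fold into their parents, which lines up with the chunk-count bounds these tests assert.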
-
-    it('should not treat headers inside code blocks as headers for splitting', async () => {
-      const text = `# Real Header
-
-Some introductory text explaining code examples.
-
-\`\`\`markdown
-# This is not a real header
-It is inside a code block and should be ignored for chunking.
-
-## This is also not a real header
-It should be treated as plain text content, not a section boundary.
-
-### Even deeper fake headers
-Should also be ignored completely.
-\`\`\`
-
-## A Real Second Header
-This content comes after the code block.
-
-### A Real Subsection
-With some additional content to test the hierarchy.`;
-
-      const doc = MDocument.fromMarkdown(text);
-
-      await doc.chunk({
-        strategy: 'semantic-markdown',
-        joinThreshold: 25, // Low threshold to force separation into 2 or more chunks
-      });
-
-      const chunks = doc.getText();
-
-      // With a low threshold, we should get exactly 2 chunks:
-      // 1. "# Real Header" section (with the code block as content)
-      // 2. "## A Real Second Header" section (with its subsection)
-      // If fake headers were processed, we'd get more than 2 chunks
-      expect(chunks.length).toBe(2);
-
-      const firstChunk = chunks[0];
-      const secondChunk = chunks[1];
-
-      expect(firstChunk).toContain('# Real Header');
-      expect(firstChunk).toContain('Some introductory text explaining code examples');
-      expect(firstChunk).toContain('```markdown');
-      expect(firstChunk).toContain('# This is not a real header');
-      expect(firstChunk).toContain('## This is also not a real header');
-      expect(firstChunk).toContain('### Even deeper fake headers');
-      expect(firstChunk).not.toContain('## A Real Second Header');
-
-      expect(secondChunk).toContain('## A Real Second Header');
-      expect(secondChunk).toContain('### A Real Subsection');
-      expect(secondChunk).not.toContain('# Real Header');
-      expect(secondChunk).not.toContain('# This is not a real header');
-    });
-  });
-});
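
The last test above pins down the other key behavior: ATX headers inside fenced code blocks must not open new sections. A splitter can get this by tracking fence state line by line; the helper below is a hypothetical illustration of that idea, not the removed transformer's code.

````ts
// Hypothetical illustration of fence-aware header detection; the real logic
// shipped in the now-removed semantic-markdown transformer.
function findHeaderLines(markdown: string): number[] {
  const headerLines: number[] = [];
  let insideFence = false;
  markdown.split('\n').forEach((line, index) => {
    if (line.trimStart().startsWith('```')) {
      // An opening or closing fence flips state and is never itself a header.
      insideFence = !insideFence;
    } else if (!insideFence && /^#{1,6}\s/.test(line)) {
      headerLines.push(index);
    }
  });
  return headerLines;
}
````

The test's expectation of exactly two chunks holds only if every fake header inside the fence is skipped this way.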
-
-// Helper function to find the longest common substring between two strings
-function findCommonSubstring(str1: string, str2: string): string {
-  let longest = '';
-
-  // Check for substrings of str1 in str2
-  for (let i = 0; i < str1.length; i++) {
-    for (let j = i + 1; j <= str1.length; j++) {
-      const substring = str1.substring(i, j);
-      if (substring.length > longest.length && str2.includes(substring)) {
-        longest = substring;
-      }
-    }
-  }
-
-  return longest;
-}
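
For reference, the findCommonSubstring helper above brute-forces every substring of str1 and keeps the longest one that also occurs in str2, which is roughly cubic in string length and fine only at test scale. A quick illustrative call, assuming the helper as defined above:

```ts
// Illustrative usage of the helper above (not part of the removed file).
const overlap = findCommonSubstring(
  'semantic-markdown chunking',
  'markdown chunking strategy',
);
console.log(overlap); // -> "markdown chunking"
```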