@mastra/rag 1.0.6 → 1.0.7-alpha.1
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +1 -1
- package/CHANGELOG.md +25 -0
- package/dist/document/document.d.ts +10 -9
- package/dist/document/document.d.ts.map +1 -1
- package/dist/document/extractors/base.d.ts +1 -1
- package/dist/document/extractors/index.d.ts +5 -5
- package/dist/document/extractors/keywords.d.ts +4 -4
- package/dist/document/extractors/questions.d.ts +4 -4
- package/dist/document/extractors/summary.d.ts +4 -4
- package/dist/document/extractors/title.d.ts +4 -4
- package/dist/document/extractors/types.d.ts +1 -1
- package/dist/document/index.d.ts +2 -2
- package/dist/document/prompts/base.d.ts +1 -1
- package/dist/document/prompts/index.d.ts +3 -3
- package/dist/document/prompts/prompt.d.ts +1 -1
- package/dist/document/schema/index.d.ts +3 -3
- package/dist/document/schema/node.d.ts +2 -2
- package/dist/document/transformers/character.d.ts +6 -28
- package/dist/document/transformers/character.d.ts.map +1 -1
- package/dist/document/transformers/html.d.ts +9 -4
- package/dist/document/transformers/html.d.ts.map +1 -1
- package/dist/document/transformers/json.d.ts +5 -5
- package/dist/document/transformers/json.d.ts.map +1 -1
- package/dist/document/transformers/latex.d.ts +3 -9
- package/dist/document/transformers/latex.d.ts.map +1 -1
- package/dist/document/transformers/markdown.d.ts +4 -10
- package/dist/document/transformers/markdown.d.ts.map +1 -1
- package/dist/document/transformers/sentence.d.ts +31 -0
- package/dist/document/transformers/sentence.d.ts.map +1 -0
- package/dist/document/transformers/text.d.ts +5 -5
- package/dist/document/transformers/text.d.ts.map +1 -1
- package/dist/document/transformers/token.d.ts +5 -16
- package/dist/document/transformers/token.d.ts.map +1 -1
- package/dist/document/transformers/transformer.d.ts +1 -1
- package/dist/document/types.d.ts +86 -15
- package/dist/document/types.d.ts.map +1 -1
- package/dist/document/validation.d.ts +3 -0
- package/dist/document/validation.d.ts.map +1 -0
- package/dist/index.cjs +414 -80
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.ts +8 -8
- package/dist/index.js +414 -80
- package/dist/index.js.map +1 -1
- package/dist/rerank/relevance/index.d.ts +3 -3
- package/dist/tools/document-chunker.d.ts +1 -1
- package/dist/tools/document-chunker.d.ts.map +1 -1
- package/dist/tools/graph-rag.d.ts +2 -2
- package/dist/tools/index.d.ts +3 -3
- package/dist/tools/types.d.ts +1 -1
- package/dist/tools/vector-query.d.ts +2 -2
- package/dist/utils/convert-sources.d.ts +2 -2
- package/dist/utils/index.d.ts +3 -3
- package/dist/utils/vector-search.d.ts +1 -1
- package/package.json +8 -7
- package/src/document/document.test.ts +294 -39
- package/src/document/document.ts +69 -41
- package/src/document/transformers/character.ts +15 -43
- package/src/document/transformers/html.ts +9 -9
- package/src/document/transformers/json.ts +8 -3
- package/src/document/transformers/latex.ts +3 -11
- package/src/document/transformers/markdown.ts +3 -11
- package/src/document/transformers/sentence.ts +314 -0
- package/src/document/transformers/text.ts +10 -10
- package/src/document/transformers/token.ts +6 -17
- package/src/document/types.ts +66 -15
- package/src/document/validation.ts +147 -0
- package/src/tools/document-chunker.ts +12 -8
- package/tsup.config.ts +2 -7
|
@@ -43,9 +43,8 @@ describe('MDocument', () => {
|
|
|
43
43
|
const doc = MDocument.fromMarkdown(sampleMarkdown);
|
|
44
44
|
|
|
45
45
|
chunks = await doc.chunk({
|
|
46
|
-
|
|
46
|
+
maxSize: 1500,
|
|
47
47
|
overlap: 0,
|
|
48
|
-
separator: `\n`,
|
|
49
48
|
extract: {
|
|
50
49
|
keywords: true,
|
|
51
50
|
},
|
|
@@ -75,7 +74,7 @@ describe('MDocument', () => {
|
|
|
75
74
|
strategy: 'character',
|
|
76
75
|
separator: '\n\n',
|
|
77
76
|
isSeparatorRegex: false,
|
|
78
|
-
|
|
77
|
+
maxSize: 50,
|
|
79
78
|
overlap: 5,
|
|
80
79
|
});
|
|
81
80
|
|
|
@@ -96,7 +95,7 @@ describe('MDocument', () => {
|
|
|
96
95
|
strategy: 'character',
|
|
97
96
|
separator: '\\s+',
|
|
98
97
|
isSeparatorRegex: true,
|
|
99
|
-
|
|
98
|
+
maxSize: 50,
|
|
100
99
|
overlap: 5,
|
|
101
100
|
});
|
|
102
101
|
|
|
@@ -112,7 +111,7 @@ describe('MDocument', () => {
|
|
|
112
111
|
strategy: 'character',
|
|
113
112
|
separator: '\n\n',
|
|
114
113
|
isSeparatorRegex: false,
|
|
115
|
-
|
|
114
|
+
maxSize: 50,
|
|
116
115
|
overlap: 5,
|
|
117
116
|
keepSeparator: 'end',
|
|
118
117
|
});
|
|
@@ -132,7 +131,7 @@ describe('MDocument', () => {
|
|
|
132
131
|
strategy: 'character',
|
|
133
132
|
separator: '\n\n',
|
|
134
133
|
isSeparatorRegex: false,
|
|
135
|
-
|
|
134
|
+
maxSize: 50,
|
|
136
135
|
overlap: 5,
|
|
137
136
|
keepSeparator: 'end',
|
|
138
137
|
});
|
|
@@ -153,7 +152,7 @@ describe('MDocument', () => {
|
|
|
153
152
|
strategy: 'character',
|
|
154
153
|
separator: '\n\n',
|
|
155
154
|
isSeparatorRegex: false,
|
|
156
|
-
|
|
155
|
+
maxSize: 50,
|
|
157
156
|
overlap: 5,
|
|
158
157
|
keepSeparator: 'start',
|
|
159
158
|
});
|
|
@@ -175,7 +174,7 @@ describe('MDocument', () => {
|
|
|
175
174
|
strategy: 'character',
|
|
176
175
|
separator: '\n\n',
|
|
177
176
|
isSeparatorRegex: false,
|
|
178
|
-
|
|
177
|
+
maxSize: 50,
|
|
179
178
|
overlap: 5,
|
|
180
179
|
keepSeparator: 'end',
|
|
181
180
|
});
|
|
@@ -195,7 +194,7 @@ describe('MDocument', () => {
|
|
|
195
194
|
strategy: 'character',
|
|
196
195
|
separator: '\n\n',
|
|
197
196
|
isSeparatorRegex: false,
|
|
198
|
-
|
|
197
|
+
maxSize: 50,
|
|
199
198
|
overlap: 5,
|
|
200
199
|
keepSeparator: 'end',
|
|
201
200
|
});
|
|
@@ -215,7 +214,7 @@ describe('MDocument', () => {
|
|
|
215
214
|
strategy: 'character',
|
|
216
215
|
separator: '\n\n',
|
|
217
216
|
isSeparatorRegex: false,
|
|
218
|
-
|
|
217
|
+
maxSize: 50,
|
|
219
218
|
overlap: 5,
|
|
220
219
|
keepSeparator: 'start',
|
|
221
220
|
});
|
|
@@ -235,7 +234,7 @@ describe('MDocument', () => {
|
|
|
235
234
|
|
|
236
235
|
const result = await doc.chunk({
|
|
237
236
|
strategy: 'character',
|
|
238
|
-
|
|
237
|
+
maxSize: chunkSize,
|
|
239
238
|
overlap,
|
|
240
239
|
});
|
|
241
240
|
|
|
@@ -265,7 +264,7 @@ describe('MDocument', () => {
|
|
|
265
264
|
const doc = MDocument.fromText(text);
|
|
266
265
|
const chunks = await doc.chunk({
|
|
267
266
|
strategy: 'character',
|
|
268
|
-
|
|
267
|
+
maxSize: chunkSize,
|
|
269
268
|
overlap,
|
|
270
269
|
});
|
|
271
270
|
|
|
@@ -309,7 +308,7 @@ describe('MDocument', () => {
|
|
|
309
308
|
const testDoc = MDocument.fromText(text);
|
|
310
309
|
const chunks = await testDoc.chunk({
|
|
311
310
|
strategy: 'character',
|
|
312
|
-
|
|
311
|
+
maxSize: chunkSize,
|
|
313
312
|
overlap,
|
|
314
313
|
});
|
|
315
314
|
|
|
@@ -326,7 +325,7 @@ describe('MDocument', () => {
|
|
|
326
325
|
}
|
|
327
326
|
expect(allChunksValid).toBe(true);
|
|
328
327
|
|
|
329
|
-
// Verify each chunk size explicitly
|
|
328
|
+
// Verify the size of each chunk explicitly
|
|
330
329
|
for (const chunk of chunks) {
|
|
331
330
|
expect(chunk.text.length).toBeLessThanOrEqual(chunkSize);
|
|
332
331
|
}
|
|
@@ -352,7 +351,7 @@ describe('MDocument', () => {
|
|
|
352
351
|
const doc = MDocument.fromText(text);
|
|
353
352
|
const chunks = await doc.chunk({
|
|
354
353
|
strategy: 'character',
|
|
355
|
-
|
|
354
|
+
maxSize: chunkSize,
|
|
356
355
|
overlap,
|
|
357
356
|
});
|
|
358
357
|
|
|
@@ -360,7 +359,7 @@ describe('MDocument', () => {
|
|
|
360
359
|
chunks.forEach(chunk => {
|
|
361
360
|
// Each chunk should be either:
|
|
362
361
|
// 1. Full size (chunkSize)
|
|
363
|
-
// 2. Or at least half the chunk size if it's the last chunk
|
|
362
|
+
// 2. Or at least half the chunk maxSize if it's the last chunk
|
|
364
363
|
const minSize = chunk === chunks[chunks.length - 1] ? Math.floor(chunkSize / 2) : chunkSize;
|
|
365
364
|
expect(chunk.text.length).toBeGreaterThanOrEqual(minSize);
|
|
366
365
|
});
|
|
@@ -386,9 +385,9 @@ describe('MDocument', () => {
|
|
|
386
385
|
|
|
387
386
|
await doc.chunk({
|
|
388
387
|
strategy: 'recursive',
|
|
389
|
-
size,
|
|
388
|
+
maxSize: size,
|
|
390
389
|
overlap: overlapSize,
|
|
391
|
-
|
|
390
|
+
separators: ['\n\n'], // Split on double newlines
|
|
392
391
|
});
|
|
393
392
|
|
|
394
393
|
const docs = doc.getDocs();
|
|
@@ -420,7 +419,7 @@ describe('MDocument', () => {
|
|
|
420
419
|
strategy: 'recursive',
|
|
421
420
|
separators: ['\n\n', '\n', ' ', ''],
|
|
422
421
|
isSeparatorRegex: false,
|
|
423
|
-
|
|
422
|
+
maxSize: 50,
|
|
424
423
|
overlap: 5,
|
|
425
424
|
});
|
|
426
425
|
|
|
@@ -446,7 +445,7 @@ describe('MDocument', () => {
|
|
|
446
445
|
const doc = MDocument.fromText(tsCode, { meta: 'data' });
|
|
447
446
|
|
|
448
447
|
await doc.chunk({
|
|
449
|
-
|
|
448
|
+
maxSize: 50,
|
|
450
449
|
overlap: 5,
|
|
451
450
|
language: Language.TS,
|
|
452
451
|
});
|
|
@@ -461,7 +460,7 @@ describe('MDocument', () => {
|
|
|
461
460
|
|
|
462
461
|
await expect(
|
|
463
462
|
doc.chunk({
|
|
464
|
-
|
|
463
|
+
maxSize: 50,
|
|
465
464
|
overlap: 5,
|
|
466
465
|
language: 'invalid-language' as any,
|
|
467
466
|
}),
|
|
@@ -481,7 +480,7 @@ describe('MDocument', () => {
|
|
|
481
480
|
|
|
482
481
|
await doc.chunk({
|
|
483
482
|
strategy: 'recursive',
|
|
484
|
-
|
|
483
|
+
maxSize: 500, // Smaller chunk maxSize to ensure multiple chunks
|
|
485
484
|
overlap: overlapSize,
|
|
486
485
|
});
|
|
487
486
|
|
|
@@ -517,7 +516,7 @@ describe('MDocument', () => {
|
|
|
517
516
|
|
|
518
517
|
await doc.chunk({
|
|
519
518
|
strategy: 'recursive',
|
|
520
|
-
|
|
519
|
+
maxSize: chunkSize,
|
|
521
520
|
overlap: overlapSize,
|
|
522
521
|
});
|
|
523
522
|
|
|
@@ -1373,7 +1372,7 @@ describe('MDocument', () => {
|
|
|
1373
1372
|
await doc.chunk({
|
|
1374
1373
|
strategy: 'token',
|
|
1375
1374
|
encodingName: 'cl100k_base',
|
|
1376
|
-
|
|
1375
|
+
maxSize: 10,
|
|
1377
1376
|
overlap: 2,
|
|
1378
1377
|
});
|
|
1379
1378
|
|
|
@@ -1391,7 +1390,7 @@ describe('MDocument', () => {
|
|
|
1391
1390
|
await doc.chunk({
|
|
1392
1391
|
strategy: 'token',
|
|
1393
1392
|
encodingName: 'gpt2',
|
|
1394
|
-
|
|
1393
|
+
maxSize: 10,
|
|
1395
1394
|
disallowedSpecial: new Set(),
|
|
1396
1395
|
allowedSpecial: new Set(['<|endoftext|>']),
|
|
1397
1396
|
overlap: 2,
|
|
@@ -1410,7 +1409,7 @@ describe('MDocument', () => {
|
|
|
1410
1409
|
await doc.chunk({
|
|
1411
1410
|
strategy: 'token',
|
|
1412
1411
|
encodingName: 'gpt2',
|
|
1413
|
-
|
|
1412
|
+
maxSize: 10,
|
|
1414
1413
|
disallowedSpecial: new Set(),
|
|
1415
1414
|
allowedSpecial: new Set(['<|endoftext|>']),
|
|
1416
1415
|
overlap: 2,
|
|
@@ -1424,15 +1423,15 @@ describe('MDocument', () => {
|
|
|
1424
1423
|
});
|
|
1425
1424
|
|
|
1426
1425
|
describe('Error cases', () => {
|
|
1427
|
-
it('should throw error for invalid chunk size and overlap', async () => {
|
|
1426
|
+
it('should throw error for invalid chunk maxSize and overlap', async () => {
|
|
1428
1427
|
const text = ' This has whitespace ';
|
|
1429
1428
|
const doc = MDocument.fromText(text, { meta: 'data' });
|
|
1430
1429
|
|
|
1431
1430
|
await expect(
|
|
1432
1431
|
doc.chunk({
|
|
1433
1432
|
strategy: 'token',
|
|
1434
|
-
|
|
1435
|
-
overlap: 150, // overlap larger than chunk size
|
|
1433
|
+
maxSize: 100,
|
|
1434
|
+
overlap: 150, // overlap larger than chunk maxSize
|
|
1436
1435
|
}),
|
|
1437
1436
|
).rejects.toThrow();
|
|
1438
1437
|
});
|
|
@@ -1445,8 +1444,8 @@ describe('MDocument', () => {
|
|
|
1445
1444
|
doc.chunk({
|
|
1446
1445
|
strategy: 'token',
|
|
1447
1446
|
encodingName: 'invalid-encoding' as any,
|
|
1448
|
-
|
|
1449
|
-
overlap: 150, // overlap larger than chunk size
|
|
1447
|
+
maxSize: 100,
|
|
1448
|
+
overlap: 150, // overlap larger than chunk maxSize
|
|
1450
1449
|
}),
|
|
1451
1450
|
).rejects.toThrow();
|
|
1452
1451
|
});
|
|
@@ -1472,7 +1471,7 @@ describe('MDocument', () => {
|
|
|
1472
1471
|
|
|
1473
1472
|
await doc.chunk({
|
|
1474
1473
|
strategy: 'markdown',
|
|
1475
|
-
|
|
1474
|
+
maxSize: 100,
|
|
1476
1475
|
overlap: 10,
|
|
1477
1476
|
});
|
|
1478
1477
|
|
|
@@ -1496,7 +1495,7 @@ describe('MDocument', () => {
|
|
|
1496
1495
|
|
|
1497
1496
|
await doc.chunk({
|
|
1498
1497
|
strategy: 'markdown',
|
|
1499
|
-
|
|
1498
|
+
maxSize: 100,
|
|
1500
1499
|
overlap: 10,
|
|
1501
1500
|
});
|
|
1502
1501
|
|
|
@@ -1527,7 +1526,7 @@ describe('MDocument', () => {
|
|
|
1527
1526
|
|
|
1528
1527
|
await doc.chunk({
|
|
1529
1528
|
strategy: 'latex',
|
|
1530
|
-
|
|
1529
|
+
maxSize: 100,
|
|
1531
1530
|
overlap: 10,
|
|
1532
1531
|
keepSeparator: 'start',
|
|
1533
1532
|
});
|
|
@@ -1557,7 +1556,7 @@ describe('MDocument', () => {
|
|
|
1557
1556
|
|
|
1558
1557
|
await doc.chunk({
|
|
1559
1558
|
strategy: 'latex',
|
|
1560
|
-
|
|
1559
|
+
maxSize: 100,
|
|
1561
1560
|
overlap: 10,
|
|
1562
1561
|
keepSeparator: 'start',
|
|
1563
1562
|
});
|
|
@@ -1579,7 +1578,7 @@ describe('MDocument', () => {
|
|
|
1579
1578
|
|
|
1580
1579
|
await doc.chunk({
|
|
1581
1580
|
strategy: 'latex',
|
|
1582
|
-
|
|
1581
|
+
maxSize: 50,
|
|
1583
1582
|
overlap: 0,
|
|
1584
1583
|
keepSeparator: 'end',
|
|
1585
1584
|
});
|
|
@@ -1600,7 +1599,7 @@ describe('MDocument', () => {
|
|
|
1600
1599
|
|
|
1601
1600
|
await doc.chunk({
|
|
1602
1601
|
strategy: 'latex',
|
|
1603
|
-
|
|
1602
|
+
maxSize: 100,
|
|
1604
1603
|
overlap: 0,
|
|
1605
1604
|
stripWhitespace: true,
|
|
1606
1605
|
});
|
|
@@ -1759,7 +1758,7 @@ describe('MDocument', () => {
|
|
|
1759
1758
|
const doc = MDocument.fromMarkdown(markdown);
|
|
1760
1759
|
const chunks = await doc.chunk({
|
|
1761
1760
|
strategy: 'markdown',
|
|
1762
|
-
|
|
1761
|
+
maxSize: 500,
|
|
1763
1762
|
overlap: 0,
|
|
1764
1763
|
headers: [
|
|
1765
1764
|
['#', 'h1'],
|
|
@@ -2006,7 +2005,7 @@ describe('MDocument', () => {
|
|
|
2006
2005
|
const chunks = await doc.chunk({
|
|
2007
2006
|
strategy: 'character',
|
|
2008
2007
|
separator: '\n\n',
|
|
2009
|
-
|
|
2008
|
+
maxSize: 20,
|
|
2010
2009
|
overlap: 0,
|
|
2011
2010
|
extract: { keywords: true },
|
|
2012
2011
|
});
|
|
@@ -2057,6 +2056,262 @@ describe('MDocument', () => {
|
|
|
2057
2056
|
expect(titleA1).not.toBe(titleB);
|
|
2058
2057
|
});
|
|
2059
2058
|
});
|
|
2059
|
+
|
|
2060
|
+
describe('chunkSentence', () => {
|
|
2061
|
+
it('should preserve sentence structure and avoid mid-sentence breaks', async () => {
|
|
2062
|
+
const text =
|
|
2063
|
+
'A dynamic concert scene captures an energetic, vibrant atmosphere, with a densely packed crowd silhouetted against bright stage lights. The image features beams of white light radiating from multiple projectors, creating dramatic patterns across a darkened room. The audience, comprised of numerous people with raised hands, exudes excitement and engagement, enhancing the lively mood. The setting suggests a large indoor venue, possibly a music or worship event, with text visible on a screen in the background, adding to an immersive experience. The overall composition emphasizes a sense of community and shared enthusiasm, ideal for promoting entertainment events, live concerts, or communal gatherings. The high-contrast lighting and slight haze effect imbue the scene with a modern, electrifying quality.';
|
|
2064
|
+
|
|
2065
|
+
const doc = MDocument.fromText(text);
|
|
2066
|
+
|
|
2067
|
+
const chunks = await doc.chunk({
|
|
2068
|
+
strategy: 'sentence',
|
|
2069
|
+
minSize: 50,
|
|
2070
|
+
maxSize: 450,
|
|
2071
|
+
overlap: 0,
|
|
2072
|
+
sentenceEnders: ['.'],
|
|
2073
|
+
keepSeparator: true,
|
|
2074
|
+
});
|
|
2075
|
+
|
|
2076
|
+
expect(chunks.length).toBeGreaterThan(1);
|
|
2077
|
+
|
|
2078
|
+
chunks.forEach(chunk => {
|
|
2079
|
+
expect(chunk.text.length).toBeGreaterThanOrEqual(50);
|
|
2080
|
+
expect(chunk.text.length).toBeLessThanOrEqual(450);
|
|
2081
|
+
|
|
2082
|
+
expect(chunk.text.startsWith('.')).toBe(false);
|
|
2083
|
+
expect(chunk.text.startsWith(' .')).toBe(false);
|
|
2084
|
+
|
|
2085
|
+
expect(chunk.text.endsWith('.')).toBe(true);
|
|
2086
|
+
});
|
|
2087
|
+
});
|
|
2088
|
+
|
|
2089
|
+
it('should require maxSize parameter', async () => {
|
|
2090
|
+
const doc = MDocument.fromText('Short text.');
|
|
2091
|
+
|
|
2092
|
+
await expect(
|
|
2093
|
+
doc.chunk({
|
|
2094
|
+
strategy: 'sentence',
|
|
2095
|
+
minSize: 50,
|
|
2096
|
+
} as any),
|
|
2097
|
+
).rejects.toThrow('Invalid parameters for sentence strategy: maxSize: Required');
|
|
2098
|
+
});
|
|
2099
|
+
|
|
2100
|
+
it('should handle custom sentence enders', async () => {
|
|
2101
|
+
const text =
|
|
2102
|
+
'First sentence with more content to make it longer. Second sentence with additional content! Third sentence with even more text? Fourth sentence with final content.';
|
|
2103
|
+
|
|
2104
|
+
const doc = MDocument.fromText(text);
|
|
2105
|
+
|
|
2106
|
+
const chunks = await doc.chunk({
|
|
2107
|
+
strategy: 'sentence',
|
|
2108
|
+
maxSize: 100,
|
|
2109
|
+
sentenceEnders: ['.', '!', '?'],
|
|
2110
|
+
keepSeparator: true,
|
|
2111
|
+
});
|
|
2112
|
+
|
|
2113
|
+
expect(chunks.length).toBeGreaterThan(1);
|
|
2114
|
+
|
|
2115
|
+
chunks.forEach(chunk => {
|
|
2116
|
+
const endsWithValidSeparator = chunk.text.endsWith('.') || chunk.text.endsWith('!') || chunk.text.endsWith('?');
|
|
2117
|
+
expect(endsWithValidSeparator).toBe(true);
|
|
2118
|
+
});
|
|
2119
|
+
});
|
|
2120
|
+
|
|
2121
|
+
it('should handle overlap with complete sentences', async () => {
|
|
2122
|
+
const text =
|
|
2123
|
+
'First sentence with some content that makes it quite long. Second sentence with different content that also makes it lengthy. Third sentence with more content to ensure multiple chunks. Fourth sentence with final content to complete the test.';
|
|
2124
|
+
|
|
2125
|
+
const doc = MDocument.fromText(text);
|
|
2126
|
+
|
|
2127
|
+
const chunks = await doc.chunk({
|
|
2128
|
+
strategy: 'sentence',
|
|
2129
|
+
maxSize: 120,
|
|
2130
|
+
overlap: 50,
|
|
2131
|
+
sentenceEnders: ['.'],
|
|
2132
|
+
keepSeparator: true,
|
|
2133
|
+
});
|
|
2134
|
+
|
|
2135
|
+
expect(chunks.length).toBeGreaterThan(1);
|
|
2136
|
+
|
|
2137
|
+
// Check that overlapping chunks share some content
|
|
2138
|
+
if (chunks.length > 1) {
|
|
2139
|
+
for (let i = 1; i < chunks.length; i++) {
|
|
2140
|
+
const currentChunk = chunks[i].text;
|
|
2141
|
+
|
|
2142
|
+
// With overlap, current chunk should start with some content from previous chunk
|
|
2143
|
+
// Just verify that overlap is being applied (chunk 2 starts with overlap from chunk 1)
|
|
2144
|
+
expect(currentChunk.length).toBeGreaterThan(50); // Should include overlap content
|
|
2145
|
+
}
|
|
2146
|
+
}
|
|
2147
|
+
});
|
|
2148
|
+
|
|
2149
|
+
it('should fallback to word splitting for oversized sentences', async () => {
|
|
2150
|
+
const longSentence =
|
|
2151
|
+
'This is an extremely long sentence that ' +
|
|
2152
|
+
'word '.repeat(50) +
|
|
2153
|
+
'and should be split into smaller chunks when it exceeds the maximum size limit.';
|
|
2154
|
+
|
|
2155
|
+
const doc = MDocument.fromText(longSentence);
|
|
2156
|
+
|
|
2157
|
+
const chunks = await doc.chunk({
|
|
2158
|
+
strategy: 'sentence',
|
|
2159
|
+
maxSize: 100,
|
|
2160
|
+
fallbackToWords: true,
|
|
2161
|
+
});
|
|
2162
|
+
|
|
2163
|
+
expect(chunks.length).toBeGreaterThan(1);
|
|
2164
|
+
|
|
2165
|
+
chunks.forEach(chunk => {
|
|
2166
|
+
expect(chunk.text.length).toBeLessThanOrEqual(100);
|
|
2167
|
+
});
|
|
2168
|
+
});
|
|
2169
|
+
|
|
2170
|
+
it('should handle short text appropriately', async () => {
|
|
2171
|
+
const text = 'Short sentence.';
|
|
2172
|
+
|
|
2173
|
+
const doc = MDocument.fromText(text);
|
|
2174
|
+
|
|
2175
|
+
const chunks = await doc.chunk({
|
|
2176
|
+
strategy: 'sentence',
|
|
2177
|
+
minSize: 5,
|
|
2178
|
+
maxSize: 100,
|
|
2179
|
+
sentenceEnders: ['.'],
|
|
2180
|
+
keepSeparator: true,
|
|
2181
|
+
});
|
|
2182
|
+
|
|
2183
|
+
expect(chunks.length).toBe(1);
|
|
2184
|
+
expect(chunks[0].text).toBe(text);
|
|
2185
|
+
});
|
|
2186
|
+
|
|
2187
|
+
it('should group multiple sentences when they fit within target size', async () => {
|
|
2188
|
+
const text = 'Short one. Another short. Third short. Fourth sentence. Fifth one.';
|
|
2189
|
+
|
|
2190
|
+
const doc = MDocument.fromText(text);
|
|
2191
|
+
|
|
2192
|
+
const chunks = await doc.chunk({
|
|
2193
|
+
strategy: 'sentence',
|
|
2194
|
+
minSize: 10,
|
|
2195
|
+
maxSize: 100,
|
|
2196
|
+
targetSize: 40,
|
|
2197
|
+
sentenceEnders: ['.'],
|
|
2198
|
+
keepSeparator: true,
|
|
2199
|
+
});
|
|
2200
|
+
|
|
2201
|
+
// Should group multiple short sentences together
|
|
2202
|
+
expect(chunks.length).toBeLessThan(5); // Less than the number of sentences
|
|
2203
|
+
|
|
2204
|
+
chunks.forEach(chunk => {
|
|
2205
|
+
// Each chunk should contain multiple sentences when possible
|
|
2206
|
+
expect(chunk.text.length).toBeLessThanOrEqual(100);
|
|
2207
|
+
});
|
|
2208
|
+
});
|
|
2209
|
+
|
|
2210
|
+
it('should preserve metadata across chunks', async () => {
|
|
2211
|
+
const text =
|
|
2212
|
+
'First sentence with enough content to make it longer than fifty characters. Second sentence with additional content to ensure multiple chunks. Third sentence with final content.';
|
|
2213
|
+
const metadata = { source: 'test', author: 'jest' };
|
|
2214
|
+
|
|
2215
|
+
const doc = MDocument.fromText(text, metadata);
|
|
2216
|
+
|
|
2217
|
+
const chunks = await doc.chunk({
|
|
2218
|
+
strategy: 'sentence',
|
|
2219
|
+
maxSize: 100,
|
|
2220
|
+
sentenceEnders: ['.'],
|
|
2221
|
+
keepSeparator: true,
|
|
2222
|
+
});
|
|
2223
|
+
|
|
2224
|
+
expect(chunks.length).toBeGreaterThan(1);
|
|
2225
|
+
|
|
2226
|
+
chunks.forEach(chunk => {
|
|
2227
|
+
expect(chunk.metadata.source).toBe('test');
|
|
2228
|
+
expect(chunk.metadata.author).toBe('jest');
|
|
2229
|
+
});
|
|
2230
|
+
});
|
|
2231
|
+
|
|
2232
|
+
it('should handle abbreviations without false sentence breaks', async () => {
|
|
2233
|
+
const text =
|
|
2234
|
+
'Dr. Smith went to the U.S.A. at 3:30 a.m. on Monday. He met with Prof. Johnson at the U.N. headquarters.';
|
|
2235
|
+
|
|
2236
|
+
const doc = MDocument.fromText(text);
|
|
2237
|
+
const chunks = await doc.chunk({
|
|
2238
|
+
strategy: 'sentence',
|
|
2239
|
+
maxSize: 200,
|
|
2240
|
+
sentenceEnders: ['.'],
|
|
2241
|
+
keepSeparator: true,
|
|
2242
|
+
});
|
|
2243
|
+
|
|
2244
|
+
expect(chunks.length).toBeGreaterThanOrEqual(1);
|
|
2245
|
+
expect(chunks.length).toBeLessThanOrEqual(2);
|
|
2246
|
+
|
|
2247
|
+
const allText = chunks.map(c => c.text).join(' ');
|
|
2248
|
+
expect(allText).toContain('Dr. Smith'); // Should keep Dr. together
|
|
2249
|
+
expect(allText).toContain('U.S.A.'); // Should keep U.S.A. together
|
|
2250
|
+
expect(allText).toContain('a.m.'); // Should keep a.m. together
|
|
2251
|
+
expect(allText).toContain('Prof. Johnson'); // Should keep Prof. together
|
|
2252
|
+
expect(allText).toContain('U.N.'); // Should keep U.N. together
|
|
2253
|
+
|
|
2254
|
+
expect(allText).not.toContain('Dr '); // No broken Dr.
|
|
2255
|
+
expect(allText).not.toContain('Prof '); // No broken Prof.
|
|
2256
|
+
});
|
|
2257
|
+
|
|
2258
|
+
it('should respect fallbackToCharacters setting', async () => {
|
|
2259
|
+
const oversizedWord = 'supercalifragilisticexpialidocious'.repeat(5);
|
|
2260
|
+
const text = `Short sentence. ${oversizedWord}.`;
|
|
2261
|
+
|
|
2262
|
+
const doc1 = MDocument.fromText(text);
|
|
2263
|
+
const chunksWithFallback = await doc1.chunk({
|
|
2264
|
+
strategy: 'sentence',
|
|
2265
|
+
maxSize: 50,
|
|
2266
|
+
fallbackToWords: true,
|
|
2267
|
+
fallbackToCharacters: true,
|
|
2268
|
+
});
|
|
2269
|
+
|
|
2270
|
+
// Should split the oversized word
|
|
2271
|
+
expect(chunksWithFallback.length).toBeGreaterThan(2);
|
|
2272
|
+
|
|
2273
|
+
const doc2 = MDocument.fromText(text);
|
|
2274
|
+
const chunksWithoutFallback = await doc2.chunk({
|
|
2275
|
+
strategy: 'sentence',
|
|
2276
|
+
maxSize: 50,
|
|
2277
|
+
fallbackToWords: true,
|
|
2278
|
+
fallbackToCharacters: false,
|
|
2279
|
+
});
|
|
2280
|
+
|
|
2281
|
+
// Should have fewer chunks (oversized word kept intact)
|
|
2282
|
+
expect(chunksWithoutFallback.length).toBeLessThan(chunksWithFallback.length);
|
|
2283
|
+
|
|
2284
|
+
// Verify fallback disabled keeps oversized content
|
|
2285
|
+
const oversizedChunk = chunksWithoutFallback.find(chunk => chunk.text.length > 50);
|
|
2286
|
+
expect(oversizedChunk).toBeDefined();
|
|
2287
|
+
});
|
|
2288
|
+
|
|
2289
|
+
it('should handle complex punctuation and edge cases', async () => {
|
|
2290
|
+
const text =
|
|
2291
|
+
'Version 2.0 was released. The score was 3.14159. Mr. & Mrs. Smith arrived at 12:30 p.m. What happened next?';
|
|
2292
|
+
|
|
2293
|
+
const doc = MDocument.fromText(text);
|
|
2294
|
+
const chunks = await doc.chunk({
|
|
2295
|
+
strategy: 'sentence',
|
|
2296
|
+
maxSize: 200,
|
|
2297
|
+
sentenceEnders: ['.', '?'],
|
|
2298
|
+
keepSeparator: true,
|
|
2299
|
+
});
|
|
2300
|
+
|
|
2301
|
+
expect(chunks.length).toBeGreaterThanOrEqual(1);
|
|
2302
|
+
expect(chunks.length).toBeLessThanOrEqual(4);
|
|
2303
|
+
|
|
2304
|
+
const allText = chunks.map(c => c.text).join(' ');
|
|
2305
|
+
expect(allText).toContain('2.0'); // Should keep version numbers intact
|
|
2306
|
+
expect(allText).toContain('3.14159'); // Should keep decimals intact
|
|
2307
|
+
expect(allText).toContain('p.m.'); // Should keep time abbreviations intact
|
|
2308
|
+
expect(allText).toContain('What happened next?'); // Should end with question
|
|
2309
|
+
|
|
2310
|
+
// Should not break on decimals or version numbers
|
|
2311
|
+
expect(allText).not.toContain('2 '); // No broken version number
|
|
2312
|
+
expect(allText).not.toContain('3 '); // No broken decimal
|
|
2313
|
+
});
|
|
2314
|
+
});
|
|
2060
2315
|
});
|
|
2061
2316
|
|
|
2062
2317
|
// Helper function to find the longest common substring between two strings
|