@mastra/rag 1.0.6 → 1.0.7-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. package/.turbo/turbo-build.log +1 -1
  2. package/CHANGELOG.md +25 -0
  3. package/dist/document/document.d.ts +10 -9
  4. package/dist/document/document.d.ts.map +1 -1
  5. package/dist/document/extractors/base.d.ts +1 -1
  6. package/dist/document/extractors/index.d.ts +5 -5
  7. package/dist/document/extractors/keywords.d.ts +4 -4
  8. package/dist/document/extractors/questions.d.ts +4 -4
  9. package/dist/document/extractors/summary.d.ts +4 -4
  10. package/dist/document/extractors/title.d.ts +4 -4
  11. package/dist/document/extractors/types.d.ts +1 -1
  12. package/dist/document/index.d.ts +2 -2
  13. package/dist/document/prompts/base.d.ts +1 -1
  14. package/dist/document/prompts/index.d.ts +3 -3
  15. package/dist/document/prompts/prompt.d.ts +1 -1
  16. package/dist/document/schema/index.d.ts +3 -3
  17. package/dist/document/schema/node.d.ts +2 -2
  18. package/dist/document/transformers/character.d.ts +6 -28
  19. package/dist/document/transformers/character.d.ts.map +1 -1
  20. package/dist/document/transformers/html.d.ts +9 -4
  21. package/dist/document/transformers/html.d.ts.map +1 -1
  22. package/dist/document/transformers/json.d.ts +5 -5
  23. package/dist/document/transformers/json.d.ts.map +1 -1
  24. package/dist/document/transformers/latex.d.ts +3 -9
  25. package/dist/document/transformers/latex.d.ts.map +1 -1
  26. package/dist/document/transformers/markdown.d.ts +4 -10
  27. package/dist/document/transformers/markdown.d.ts.map +1 -1
  28. package/dist/document/transformers/sentence.d.ts +31 -0
  29. package/dist/document/transformers/sentence.d.ts.map +1 -0
  30. package/dist/document/transformers/text.d.ts +5 -5
  31. package/dist/document/transformers/text.d.ts.map +1 -1
  32. package/dist/document/transformers/token.d.ts +5 -16
  33. package/dist/document/transformers/token.d.ts.map +1 -1
  34. package/dist/document/transformers/transformer.d.ts +1 -1
  35. package/dist/document/types.d.ts +86 -15
  36. package/dist/document/types.d.ts.map +1 -1
  37. package/dist/document/validation.d.ts +3 -0
  38. package/dist/document/validation.d.ts.map +1 -0
  39. package/dist/index.cjs +414 -80
  40. package/dist/index.cjs.map +1 -1
  41. package/dist/index.d.ts +8 -8
  42. package/dist/index.js +414 -80
  43. package/dist/index.js.map +1 -1
  44. package/dist/rerank/relevance/index.d.ts +3 -3
  45. package/dist/tools/document-chunker.d.ts +1 -1
  46. package/dist/tools/document-chunker.d.ts.map +1 -1
  47. package/dist/tools/graph-rag.d.ts +2 -2
  48. package/dist/tools/index.d.ts +3 -3
  49. package/dist/tools/types.d.ts +1 -1
  50. package/dist/tools/vector-query.d.ts +2 -2
  51. package/dist/utils/convert-sources.d.ts +2 -2
  52. package/dist/utils/index.d.ts +3 -3
  53. package/dist/utils/vector-search.d.ts +1 -1
  54. package/package.json +8 -7
  55. package/src/document/document.test.ts +294 -39
  56. package/src/document/document.ts +69 -41
  57. package/src/document/transformers/character.ts +15 -43
  58. package/src/document/transformers/html.ts +9 -9
  59. package/src/document/transformers/json.ts +8 -3
  60. package/src/document/transformers/latex.ts +3 -11
  61. package/src/document/transformers/markdown.ts +3 -11
  62. package/src/document/transformers/sentence.ts +314 -0
  63. package/src/document/transformers/text.ts +10 -10
  64. package/src/document/transformers/token.ts +6 -17
  65. package/src/document/types.ts +66 -15
  66. package/src/document/validation.ts +147 -0
  67. package/src/tools/document-chunker.ts +12 -8
  68. package/tsup.config.ts +2 -7
@@ -43,9 +43,8 @@ describe('MDocument', () => {
43
43
  const doc = MDocument.fromMarkdown(sampleMarkdown);
44
44
 
45
45
  chunks = await doc.chunk({
46
- size: 1500,
46
+ maxSize: 1500,
47
47
  overlap: 0,
48
- separator: `\n`,
49
48
  extract: {
50
49
  keywords: true,
51
50
  },
@@ -75,7 +74,7 @@ describe('MDocument', () => {
75
74
  strategy: 'character',
76
75
  separator: '\n\n',
77
76
  isSeparatorRegex: false,
78
- size: 50,
77
+ maxSize: 50,
79
78
  overlap: 5,
80
79
  });
81
80
 
@@ -96,7 +95,7 @@ describe('MDocument', () => {
96
95
  strategy: 'character',
97
96
  separator: '\\s+',
98
97
  isSeparatorRegex: true,
99
- size: 50,
98
+ maxSize: 50,
100
99
  overlap: 5,
101
100
  });
102
101
 
@@ -112,7 +111,7 @@ describe('MDocument', () => {
112
111
  strategy: 'character',
113
112
  separator: '\n\n',
114
113
  isSeparatorRegex: false,
115
- size: 50,
114
+ maxSize: 50,
116
115
  overlap: 5,
117
116
  keepSeparator: 'end',
118
117
  });
@@ -132,7 +131,7 @@ describe('MDocument', () => {
132
131
  strategy: 'character',
133
132
  separator: '\n\n',
134
133
  isSeparatorRegex: false,
135
- size: 50,
134
+ maxSize: 50,
136
135
  overlap: 5,
137
136
  keepSeparator: 'end',
138
137
  });
@@ -153,7 +152,7 @@ describe('MDocument', () => {
153
152
  strategy: 'character',
154
153
  separator: '\n\n',
155
154
  isSeparatorRegex: false,
156
- size: 50,
155
+ maxSize: 50,
157
156
  overlap: 5,
158
157
  keepSeparator: 'start',
159
158
  });
@@ -175,7 +174,7 @@ describe('MDocument', () => {
175
174
  strategy: 'character',
176
175
  separator: '\n\n',
177
176
  isSeparatorRegex: false,
178
- size: 50,
177
+ maxSize: 50,
179
178
  overlap: 5,
180
179
  keepSeparator: 'end',
181
180
  });
@@ -195,7 +194,7 @@ describe('MDocument', () => {
195
194
  strategy: 'character',
196
195
  separator: '\n\n',
197
196
  isSeparatorRegex: false,
198
- size: 50,
197
+ maxSize: 50,
199
198
  overlap: 5,
200
199
  keepSeparator: 'end',
201
200
  });
@@ -215,7 +214,7 @@ describe('MDocument', () => {
215
214
  strategy: 'character',
216
215
  separator: '\n\n',
217
216
  isSeparatorRegex: false,
218
- size: 50,
217
+ maxSize: 50,
219
218
  overlap: 5,
220
219
  keepSeparator: 'start',
221
220
  });
@@ -235,7 +234,7 @@ describe('MDocument', () => {
235
234
 
236
235
  const result = await doc.chunk({
237
236
  strategy: 'character',
238
- size: chunkSize,
237
+ maxSize: chunkSize,
239
238
  overlap,
240
239
  });
241
240
 
@@ -265,7 +264,7 @@ describe('MDocument', () => {
265
264
  const doc = MDocument.fromText(text);
266
265
  const chunks = await doc.chunk({
267
266
  strategy: 'character',
268
- size: chunkSize,
267
+ maxSize: chunkSize,
269
268
  overlap,
270
269
  });
271
270
 
@@ -309,7 +308,7 @@ describe('MDocument', () => {
309
308
  const testDoc = MDocument.fromText(text);
310
309
  const chunks = await testDoc.chunk({
311
310
  strategy: 'character',
312
- size: chunkSize,
311
+ maxSize: chunkSize,
313
312
  overlap,
314
313
  });
315
314
 
@@ -326,7 +325,7 @@ describe('MDocument', () => {
326
325
  }
327
326
  expect(allChunksValid).toBe(true);
328
327
 
329
- // Verify each chunk size explicitly
328
+ // Verify the size of each chunk explicitly
330
329
  for (const chunk of chunks) {
331
330
  expect(chunk.text.length).toBeLessThanOrEqual(chunkSize);
332
331
  }
@@ -352,7 +351,7 @@ describe('MDocument', () => {
352
351
  const doc = MDocument.fromText(text);
353
352
  const chunks = await doc.chunk({
354
353
  strategy: 'character',
355
- size: chunkSize,
354
+ maxSize: chunkSize,
356
355
  overlap,
357
356
  });
358
357
 
@@ -360,7 +359,7 @@ describe('MDocument', () => {
360
359
  chunks.forEach(chunk => {
361
360
  // Each chunk should be either:
362
361
  // 1. Full size (chunkSize)
363
- // 2. Or at least half the chunk size if it's the last chunk
362
+ // 2. Or at least half the chunk maxSize if it's the last chunk
364
363
  const minSize = chunk === chunks[chunks.length - 1] ? Math.floor(chunkSize / 2) : chunkSize;
365
364
  expect(chunk.text.length).toBeGreaterThanOrEqual(minSize);
366
365
  });
@@ -386,9 +385,9 @@ describe('MDocument', () => {
386
385
 
387
386
  await doc.chunk({
388
387
  strategy: 'recursive',
389
- size,
388
+ maxSize: size,
390
389
  overlap: overlapSize,
391
- separator: '\n\n', // Split on double newlines
390
+ separators: ['\n\n'], // Split on double newlines
392
391
  });
393
392
 
394
393
  const docs = doc.getDocs();
@@ -420,7 +419,7 @@ describe('MDocument', () => {
420
419
  strategy: 'recursive',
421
420
  separators: ['\n\n', '\n', ' ', ''],
422
421
  isSeparatorRegex: false,
423
- size: 50,
422
+ maxSize: 50,
424
423
  overlap: 5,
425
424
  });
426
425
 
@@ -446,7 +445,7 @@ describe('MDocument', () => {
446
445
  const doc = MDocument.fromText(tsCode, { meta: 'data' });
447
446
 
448
447
  await doc.chunk({
449
- size: 50,
448
+ maxSize: 50,
450
449
  overlap: 5,
451
450
  language: Language.TS,
452
451
  });
@@ -461,7 +460,7 @@ describe('MDocument', () => {
461
460
 
462
461
  await expect(
463
462
  doc.chunk({
464
- size: 50,
463
+ maxSize: 50,
465
464
  overlap: 5,
466
465
  language: 'invalid-language' as any,
467
466
  }),
@@ -481,7 +480,7 @@ describe('MDocument', () => {
481
480
 
482
481
  await doc.chunk({
483
482
  strategy: 'recursive',
484
- size: 500, // Smaller chunk size to ensure multiple chunks
483
+ maxSize: 500, // Smaller chunk maxSize to ensure multiple chunks
485
484
  overlap: overlapSize,
486
485
  });
487
486
 
@@ -517,7 +516,7 @@ describe('MDocument', () => {
517
516
 
518
517
  await doc.chunk({
519
518
  strategy: 'recursive',
520
- size: chunkSize,
519
+ maxSize: chunkSize,
521
520
  overlap: overlapSize,
522
521
  });
523
522
 
@@ -1373,7 +1372,7 @@ describe('MDocument', () => {
1373
1372
  await doc.chunk({
1374
1373
  strategy: 'token',
1375
1374
  encodingName: 'cl100k_base',
1376
- size: 10,
1375
+ maxSize: 10,
1377
1376
  overlap: 2,
1378
1377
  });
1379
1378
 
@@ -1391,7 +1390,7 @@ describe('MDocument', () => {
1391
1390
  await doc.chunk({
1392
1391
  strategy: 'token',
1393
1392
  encodingName: 'gpt2',
1394
- size: 10,
1393
+ maxSize: 10,
1395
1394
  disallowedSpecial: new Set(),
1396
1395
  allowedSpecial: new Set(['<|endoftext|>']),
1397
1396
  overlap: 2,
@@ -1410,7 +1409,7 @@ describe('MDocument', () => {
1410
1409
  await doc.chunk({
1411
1410
  strategy: 'token',
1412
1411
  encodingName: 'gpt2',
1413
- size: 10,
1412
+ maxSize: 10,
1414
1413
  disallowedSpecial: new Set(),
1415
1414
  allowedSpecial: new Set(['<|endoftext|>']),
1416
1415
  overlap: 2,
@@ -1424,15 +1423,15 @@ describe('MDocument', () => {
1424
1423
  });
1425
1424
 
1426
1425
  describe('Error cases', () => {
1427
- it('should throw error for invalid chunk size and overlap', async () => {
1426
+ it('should throw error for invalid chunk maxSize and overlap', async () => {
1428
1427
  const text = ' This has whitespace ';
1429
1428
  const doc = MDocument.fromText(text, { meta: 'data' });
1430
1429
 
1431
1430
  await expect(
1432
1431
  doc.chunk({
1433
1432
  strategy: 'token',
1434
- size: 100,
1435
- overlap: 150, // overlap larger than chunk size
1433
+ maxSize: 100,
1434
+ overlap: 150, // overlap larger than chunk maxSize
1436
1435
  }),
1437
1436
  ).rejects.toThrow();
1438
1437
  });
@@ -1445,8 +1444,8 @@ describe('MDocument', () => {
1445
1444
  doc.chunk({
1446
1445
  strategy: 'token',
1447
1446
  encodingName: 'invalid-encoding' as any,
1448
- size: 100,
1449
- overlap: 150, // overlap larger than chunk size
1447
+ maxSize: 100,
1448
+ overlap: 150, // overlap larger than chunk maxSize
1450
1449
  }),
1451
1450
  ).rejects.toThrow();
1452
1451
  });
@@ -1472,7 +1471,7 @@ describe('MDocument', () => {
1472
1471
 
1473
1472
  await doc.chunk({
1474
1473
  strategy: 'markdown',
1475
- size: 100,
1474
+ maxSize: 100,
1476
1475
  overlap: 10,
1477
1476
  });
1478
1477
 
@@ -1496,7 +1495,7 @@ describe('MDocument', () => {
1496
1495
 
1497
1496
  await doc.chunk({
1498
1497
  strategy: 'markdown',
1499
- size: 100,
1498
+ maxSize: 100,
1500
1499
  overlap: 10,
1501
1500
  });
1502
1501
 
@@ -1527,7 +1526,7 @@ describe('MDocument', () => {
1527
1526
 
1528
1527
  await doc.chunk({
1529
1528
  strategy: 'latex',
1530
- size: 100,
1529
+ maxSize: 100,
1531
1530
  overlap: 10,
1532
1531
  keepSeparator: 'start',
1533
1532
  });
@@ -1557,7 +1556,7 @@ describe('MDocument', () => {
1557
1556
 
1558
1557
  await doc.chunk({
1559
1558
  strategy: 'latex',
1560
- size: 100,
1559
+ maxSize: 100,
1561
1560
  overlap: 10,
1562
1561
  keepSeparator: 'start',
1563
1562
  });
@@ -1579,7 +1578,7 @@ describe('MDocument', () => {
1579
1578
 
1580
1579
  await doc.chunk({
1581
1580
  strategy: 'latex',
1582
- size: 50,
1581
+ maxSize: 50,
1583
1582
  overlap: 0,
1584
1583
  keepSeparator: 'end',
1585
1584
  });
@@ -1600,7 +1599,7 @@ describe('MDocument', () => {
1600
1599
 
1601
1600
  await doc.chunk({
1602
1601
  strategy: 'latex',
1603
- size: 100,
1602
+ maxSize: 100,
1604
1603
  overlap: 0,
1605
1604
  stripWhitespace: true,
1606
1605
  });
@@ -1759,7 +1758,7 @@ describe('MDocument', () => {
1759
1758
  const doc = MDocument.fromMarkdown(markdown);
1760
1759
  const chunks = await doc.chunk({
1761
1760
  strategy: 'markdown',
1762
- size: 500,
1761
+ maxSize: 500,
1763
1762
  overlap: 0,
1764
1763
  headers: [
1765
1764
  ['#', 'h1'],
@@ -2006,7 +2005,7 @@ describe('MDocument', () => {
2006
2005
  const chunks = await doc.chunk({
2007
2006
  strategy: 'character',
2008
2007
  separator: '\n\n',
2009
- size: 20,
2008
+ maxSize: 20,
2010
2009
  overlap: 0,
2011
2010
  extract: { keywords: true },
2012
2011
  });
@@ -2057,6 +2056,262 @@ describe('MDocument', () => {
2057
2056
  expect(titleA1).not.toBe(titleB);
2058
2057
  });
2059
2058
  });
2059
+
2060
+ describe('chunkSentence', () => {
2061
+ it('should preserve sentence structure and avoid mid-sentence breaks', async () => {
2062
+ const text =
2063
+ 'A dynamic concert scene captures an energetic, vibrant atmosphere, with a densely packed crowd silhouetted against bright stage lights. The image features beams of white light radiating from multiple projectors, creating dramatic patterns across a darkened room. The audience, comprised of numerous people with raised hands, exudes excitement and engagement, enhancing the lively mood. The setting suggests a large indoor venue, possibly a music or worship event, with text visible on a screen in the background, adding to an immersive experience. The overall composition emphasizes a sense of community and shared enthusiasm, ideal for promoting entertainment events, live concerts, or communal gatherings. The high-contrast lighting and slight haze effect imbue the scene with a modern, electrifying quality.';
2064
+
2065
+ const doc = MDocument.fromText(text);
2066
+
2067
+ const chunks = await doc.chunk({
2068
+ strategy: 'sentence',
2069
+ minSize: 50,
2070
+ maxSize: 450,
2071
+ overlap: 0,
2072
+ sentenceEnders: ['.'],
2073
+ keepSeparator: true,
2074
+ });
2075
+
2076
+ expect(chunks.length).toBeGreaterThan(1);
2077
+
2078
+ chunks.forEach(chunk => {
2079
+ expect(chunk.text.length).toBeGreaterThanOrEqual(50);
2080
+ expect(chunk.text.length).toBeLessThanOrEqual(450);
2081
+
2082
+ expect(chunk.text.startsWith('.')).toBe(false);
2083
+ expect(chunk.text.startsWith(' .')).toBe(false);
2084
+
2085
+ expect(chunk.text.endsWith('.')).toBe(true);
2086
+ });
2087
+ });
2088
+
2089
+ it('should require maxSize parameter', async () => {
2090
+ const doc = MDocument.fromText('Short text.');
2091
+
2092
+ await expect(
2093
+ doc.chunk({
2094
+ strategy: 'sentence',
2095
+ minSize: 50,
2096
+ } as any),
2097
+ ).rejects.toThrow('Invalid parameters for sentence strategy: maxSize: Required');
2098
+ });
2099
+
2100
+ it('should handle custom sentence enders', async () => {
2101
+ const text =
2102
+ 'First sentence with more content to make it longer. Second sentence with additional content! Third sentence with even more text? Fourth sentence with final content.';
2103
+
2104
+ const doc = MDocument.fromText(text);
2105
+
2106
+ const chunks = await doc.chunk({
2107
+ strategy: 'sentence',
2108
+ maxSize: 100,
2109
+ sentenceEnders: ['.', '!', '?'],
2110
+ keepSeparator: true,
2111
+ });
2112
+
2113
+ expect(chunks.length).toBeGreaterThan(1);
2114
+
2115
+ chunks.forEach(chunk => {
2116
+ const endsWithValidSeparator = chunk.text.endsWith('.') || chunk.text.endsWith('!') || chunk.text.endsWith('?');
2117
+ expect(endsWithValidSeparator).toBe(true);
2118
+ });
2119
+ });
2120
+
2121
+ it('should handle overlap with complete sentences', async () => {
2122
+ const text =
2123
+ 'First sentence with some content that makes it quite long. Second sentence with different content that also makes it lengthy. Third sentence with more content to ensure multiple chunks. Fourth sentence with final content to complete the test.';
2124
+
2125
+ const doc = MDocument.fromText(text);
2126
+
2127
+ const chunks = await doc.chunk({
2128
+ strategy: 'sentence',
2129
+ maxSize: 120,
2130
+ overlap: 50,
2131
+ sentenceEnders: ['.'],
2132
+ keepSeparator: true,
2133
+ });
2134
+
2135
+ expect(chunks.length).toBeGreaterThan(1);
2136
+
2137
+ // Check that overlapping chunks share some content
2138
+ if (chunks.length > 1) {
2139
+ for (let i = 1; i < chunks.length; i++) {
2140
+ const currentChunk = chunks[i].text;
2141
+
2142
+ // With overlap, current chunk should start with some content from previous chunk
2143
+ // Just verify that overlap is being applied (chunk 2 starts with overlap from chunk 1)
2144
+ expect(currentChunk.length).toBeGreaterThan(50); // Should include overlap content
2145
+ }
2146
+ }
2147
+ });
2148
+
2149
+ it('should fallback to word splitting for oversized sentences', async () => {
2150
+ const longSentence =
2151
+ 'This is an extremely long sentence that ' +
2152
+ 'word '.repeat(50) +
2153
+ 'and should be split into smaller chunks when it exceeds the maximum size limit.';
2154
+
2155
+ const doc = MDocument.fromText(longSentence);
2156
+
2157
+ const chunks = await doc.chunk({
2158
+ strategy: 'sentence',
2159
+ maxSize: 100,
2160
+ fallbackToWords: true,
2161
+ });
2162
+
2163
+ expect(chunks.length).toBeGreaterThan(1);
2164
+
2165
+ chunks.forEach(chunk => {
2166
+ expect(chunk.text.length).toBeLessThanOrEqual(100);
2167
+ });
2168
+ });
2169
+
2170
+ it('should handle short text appropriately', async () => {
2171
+ const text = 'Short sentence.';
2172
+
2173
+ const doc = MDocument.fromText(text);
2174
+
2175
+ const chunks = await doc.chunk({
2176
+ strategy: 'sentence',
2177
+ minSize: 5,
2178
+ maxSize: 100,
2179
+ sentenceEnders: ['.'],
2180
+ keepSeparator: true,
2181
+ });
2182
+
2183
+ expect(chunks.length).toBe(1);
2184
+ expect(chunks[0].text).toBe(text);
2185
+ });
2186
+
2187
+ it('should group multiple sentences when they fit within target size', async () => {
2188
+ const text = 'Short one. Another short. Third short. Fourth sentence. Fifth one.';
2189
+
2190
+ const doc = MDocument.fromText(text);
2191
+
2192
+ const chunks = await doc.chunk({
2193
+ strategy: 'sentence',
2194
+ minSize: 10,
2195
+ maxSize: 100,
2196
+ targetSize: 40,
2197
+ sentenceEnders: ['.'],
2198
+ keepSeparator: true,
2199
+ });
2200
+
2201
+ // Should group multiple short sentences together
2202
+ expect(chunks.length).toBeLessThan(5); // Less than the number of sentences
2203
+
2204
+ chunks.forEach(chunk => {
2205
+ // Each chunk should contain multiple sentences when possible
2206
+ expect(chunk.text.length).toBeLessThanOrEqual(100);
2207
+ });
2208
+ });
2209
+
2210
+ it('should preserve metadata across chunks', async () => {
2211
+ const text =
2212
+ 'First sentence with enough content to make it longer than fifty characters. Second sentence with additional content to ensure multiple chunks. Third sentence with final content.';
2213
+ const metadata = { source: 'test', author: 'jest' };
2214
+
2215
+ const doc = MDocument.fromText(text, metadata);
2216
+
2217
+ const chunks = await doc.chunk({
2218
+ strategy: 'sentence',
2219
+ maxSize: 100,
2220
+ sentenceEnders: ['.'],
2221
+ keepSeparator: true,
2222
+ });
2223
+
2224
+ expect(chunks.length).toBeGreaterThan(1);
2225
+
2226
+ chunks.forEach(chunk => {
2227
+ expect(chunk.metadata.source).toBe('test');
2228
+ expect(chunk.metadata.author).toBe('jest');
2229
+ });
2230
+ });
2231
+
2232
+ it('should handle abbreviations without false sentence breaks', async () => {
2233
+ const text =
2234
+ 'Dr. Smith went to the U.S.A. at 3:30 a.m. on Monday. He met with Prof. Johnson at the U.N. headquarters.';
2235
+
2236
+ const doc = MDocument.fromText(text);
2237
+ const chunks = await doc.chunk({
2238
+ strategy: 'sentence',
2239
+ maxSize: 200,
2240
+ sentenceEnders: ['.'],
2241
+ keepSeparator: true,
2242
+ });
2243
+
2244
+ expect(chunks.length).toBeGreaterThanOrEqual(1);
2245
+ expect(chunks.length).toBeLessThanOrEqual(2);
2246
+
2247
+ const allText = chunks.map(c => c.text).join(' ');
2248
+ expect(allText).toContain('Dr. Smith'); // Should keep Dr. together
2249
+ expect(allText).toContain('U.S.A.'); // Should keep U.S.A. together
2250
+ expect(allText).toContain('a.m.'); // Should keep a.m. together
2251
+ expect(allText).toContain('Prof. Johnson'); // Should keep Prof. together
2252
+ expect(allText).toContain('U.N.'); // Should keep U.N. together
2253
+
2254
+ expect(allText).not.toContain('Dr '); // No broken Dr.
2255
+ expect(allText).not.toContain('Prof '); // No broken Prof.
2256
+ });
2257
+
2258
+ it('should respect fallbackToCharacters setting', async () => {
2259
+ const oversizedWord = 'supercalifragilisticexpialidocious'.repeat(5);
2260
+ const text = `Short sentence. ${oversizedWord}.`;
2261
+
2262
+ const doc1 = MDocument.fromText(text);
2263
+ const chunksWithFallback = await doc1.chunk({
2264
+ strategy: 'sentence',
2265
+ maxSize: 50,
2266
+ fallbackToWords: true,
2267
+ fallbackToCharacters: true,
2268
+ });
2269
+
2270
+ // Should split the oversized word
2271
+ expect(chunksWithFallback.length).toBeGreaterThan(2);
2272
+
2273
+ const doc2 = MDocument.fromText(text);
2274
+ const chunksWithoutFallback = await doc2.chunk({
2275
+ strategy: 'sentence',
2276
+ maxSize: 50,
2277
+ fallbackToWords: true,
2278
+ fallbackToCharacters: false,
2279
+ });
2280
+
2281
+ // Should have fewer chunks (oversized word kept intact)
2282
+ expect(chunksWithoutFallback.length).toBeLessThan(chunksWithFallback.length);
2283
+
2284
+ // Verify fallback disabled keeps oversized content
2285
+ const oversizedChunk = chunksWithoutFallback.find(chunk => chunk.text.length > 50);
2286
+ expect(oversizedChunk).toBeDefined();
2287
+ });
2288
+
2289
+ it('should handle complex punctuation and edge cases', async () => {
2290
+ const text =
2291
+ 'Version 2.0 was released. The score was 3.14159. Mr. & Mrs. Smith arrived at 12:30 p.m. What happened next?';
2292
+
2293
+ const doc = MDocument.fromText(text);
2294
+ const chunks = await doc.chunk({
2295
+ strategy: 'sentence',
2296
+ maxSize: 200,
2297
+ sentenceEnders: ['.', '?'],
2298
+ keepSeparator: true,
2299
+ });
2300
+
2301
+ expect(chunks.length).toBeGreaterThanOrEqual(1);
2302
+ expect(chunks.length).toBeLessThanOrEqual(4);
2303
+
2304
+ const allText = chunks.map(c => c.text).join(' ');
2305
+ expect(allText).toContain('2.0'); // Should keep version numbers intact
2306
+ expect(allText).toContain('3.14159'); // Should keep decimals intact
2307
+ expect(allText).toContain('p.m.'); // Should keep time abbreviations intact
2308
+ expect(allText).toContain('What happened next?'); // Should end with question
2309
+
2310
+ // Should not break on decimals or version numbers
2311
+ expect(allText).not.toContain('2 '); // No broken version number
2312
+ expect(allText).not.toContain('3 '); // No broken decimal
2313
+ });
2314
+ });
2060
2315
  });
2061
2316
 
2062
2317
  // Helper function to find the longest common substring between two strings