@mastra/rag 1.0.7 → 1.0.8-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +1 -1
- package/CHANGELOG.md +11 -0
- package/dist/document/document.d.ts +4 -1
- package/dist/document/document.d.ts.map +1 -1
- package/dist/document/transformers/semantic-markdown.d.ts +25 -0
- package/dist/document/transformers/semantic-markdown.d.ts.map +1 -0
- package/dist/document/types.d.ts +13 -1
- package/dist/document/types.d.ts.map +1 -1
- package/dist/document/validation.d.ts.map +1 -1
- package/dist/index.cjs +197 -11
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +197 -11
- package/dist/index.js.map +1 -1
- package/package.json +4 -4
- package/src/document/document.test.ts +644 -1
- package/src/document/document.ts +32 -12
- package/src/document/transformers/semantic-markdown.ts +227 -0
- package/src/document/types.ts +21 -2
- package/src/document/validation.ts +11 -0
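The bulk of this release is a new `semantic-markdown` chunking strategy for `MDocument` (the new transformer in `semantic-markdown.ts`, plus type and validation updates), exercised at length by the tests added to `document.test.ts`. As orientation for the diff below, here is a minimal usage sketch assembled from the calls those tests make; the option names (`strategy: 'semantic-markdown'`, `joinThreshold`, `modelName`) and the `tokenCount` metadata field come straight from the tests, while the import path and everything else in the sketch should be read as assumptions rather than documented API.

```ts
// Sketch based on the test suite below; not authoritative API documentation.
import { MDocument } from '@mastra/rag'; // import path assumed

const doc = MDocument.fromMarkdown(`# Guide

## Install
A short install section.

## Usage
A short usage section.`);

// Per the tests, 'semantic-markdown' splits on real markdown headers (ignoring
// headers inside fenced code blocks) and merges small neighboring sections
// bottom-up until a merged chunk would exceed joinThreshold tokens, counted
// with a tiktoken encoding selected via modelName.
await doc.chunk({
  strategy: 'semantic-markdown',
  joinThreshold: 200,
  modelName: 'gpt-3.5-turbo',
});

const chunks = doc.getText(); // merged section texts
const docs = doc.getDocs();   // chunk objects carrying metadata.tokenCount

console.log(chunks.length, docs[0]?.metadata?.tokenCount);
```

The threshold is expressed in tokens rather than characters; the tests verify that `tokenCount` diverges from character length for unicode-heavy text and that sections already larger than the threshold are left intact as single chunks.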
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@mastra/rag",
-  "version": "1.0.7",
+  "version": "1.0.8-alpha.0",
   "description": "",
   "type": "module",
   "main": "dist/index.js",
@@ -45,9 +45,9 @@
     "tsup": "^8.5.0",
     "typescript": "^5.8.3",
     "vitest": "^3.2.4",
-    "@
-    "@internal/types-builder": "0.0.
-    "@
+    "@mastra/core": "0.13.2-alpha.0",
+    "@internal/types-builder": "0.0.3",
+    "@internal/lint": "0.0.28"
   },
   "keywords": [
     "rag",

package/src/document/document.test.ts
CHANGED

@@ -61,7 +61,7 @@ describe('MDocument', () => {
       });

       expect(embeddings).toBeDefined();
-    });
+    }, 15000);
   });

   describe('chunkCharacter', () => {
@@ -2312,6 +2312,649 @@ describe('MDocument', () => {
       expect(allText).not.toContain('3 '); // No broken decimal
     });
   });
+
+  describe('chunkSemanticMarkdown', () => {
+    it('should merge small sections based on token threshold', async () => {
+      const text = `# Introduction
+Brief intro paragraph.
+
+## Setup Guide
+Short setup instructions.
+
+### Prerequisites
+Very short list.
+
+### Installation Steps
+Very detailed installation process with code examples and explanations that would normally be quite long but in this test we'll keep it moderate length for testing purposes.
+
+## Advanced Configuration
+Another section with moderate content for testing the merging algorithm.`;
+
+      const doc = MDocument.fromMarkdown(text);
+
+      await doc.chunk({
+        strategy: 'semantic-markdown',
+        joinThreshold: 200,
+      });
+
+      const chunks = doc.getText();
+      const docs = doc.getDocs();
+
+      expect(chunks.length).toBeLessThan(6);
+
+      expect(docs[0]?.metadata?.tokenCount).toBeDefined();
+      expect(typeof docs[0]?.metadata?.tokenCount).toBe('number');
+      expect(docs[0]?.metadata?.tokenCount).toBeGreaterThan(0);
+    });
+
+    it('should respect sibling/parent relationships in merging', async () => {
+      const text = `# Main Document
+
+## Section A
+Content for section A that is moderately long to ensure we have enough tokens for testing the semantic merging algorithm properly.
+
+### Subsection A1
+This subsection has more content than the previous version to test the hierarchical merging behavior.
+
+### Subsection A2
+Another subsection with substantial content to verify proper semantic boundary handling.
+
+## Section B
+Content for section B that is also moderately sized with meaningful text to test cross-section merging behavior.
+
+### Subsection B1
+This final subsection contains enough content to test the bottom-up merging algorithm effectively.`;
+
+      const doc = MDocument.fromMarkdown(text);
+
+      await doc.chunk({
+        strategy: 'semantic-markdown',
+        joinThreshold: 100, // Threshold that allows some merging but not everything
+      });
+
+      const chunks = doc.getText();
+      const docs = doc.getDocs();
+
+      // Should create fewer chunks than original sections due to merging
+      expect(chunks.length).toBeLessThan(7);
+      expect(chunks.length).toBeGreaterThanOrEqual(1);
+
+      // Verify sections maintain semantic coherence
+      const hasSection = chunks.some(chunk => chunk.includes('Section A') || chunk.includes('Subsection A1'));
+      expect(hasSection).toBe(true);
+
+      expect(docs[0]?.metadata?.tokenCount).toBeDefined();
+      expect(docs[0]?.metadata?.tokenCount).toBeGreaterThan(0);
+    });
+
+    it('should correctly chunk a controlled test document', async () => {
+      const controlledTestMarkdown = `# My Test Document
+
+This is a short preamble to test how content before the first header is handled. It should be merged with the first section if that section is small enough.
+
+## Chapter 1: The Small Sections
+
+This is the introduction to Chapter 1. It contains several small subsections that are perfect candidates for merging.
+
+### Section 1.1: A Tiny Topic
+
+Just a few words here.
+
+### Section 1.2: Another Tiny Topic
+
+A few more words to make up a small paragraph.
+
+## Chapter 2: The Big Section
+
+This chapter has a very large section that should NOT be merged with its sibling because it is over the token limit all by itself.
+
+\`\`\`python
+# This is a large block of Python code.
+# It is designed to have a high token count to test the merging threshold.
+import os
+import sys
+
+class DataProcessor:
+    def __init__(self, data):
+        self.data = data
+        self.length = len(data)
+
+    def process(self):
+        """
+        This is a long docstring to add even more tokens to the count.
+        We will iterate through the data and perform some kind of mock processing.
+        The goal is to exceed the joinThreshold of 250 tokens easily.
+        Let's add more lines to be sure.
+        Line 1
+        Line 2
+        Line 3
+        Line 4
+        Line 5
+        ...and so on.
+        """
+        results = []
+        for i, item in enumerate(self.data):
+            # A mock calculation
+            processed_item = (item * i) + self.length
+            results.append(processed_item)
+        return results
+
+# Let's make sure this section is large enough.
+# More comments and code will help.
+def another_function_to_add_tokens():
+    """Another long docstring for good measure."""
+    x = 1
+    y = 2
+    z = x + y
+    print(f"The result is {z}")
+    # End of function
+\`\`\`
+
+## Chapter 3: The Mixed Bag
+
+This chapter contains a mix of small and medium sections.
+
+### Section 3.1: A Medium Section
+
+This section is moderately sized. It's not huge, but it has enough content to be a meaningful chunk on its own. We'll aim for about 150 tokens here so it can potentially merge with a small sibling.
+
+### Section 3.2: A Final Small Section
+
+This final section is very small and should definitely be merged into its predecessor, Section 3.1, because their combined total will be under the threshold.
+`;
+
+      const doc = MDocument.fromMarkdown(controlledTestMarkdown);
+      await doc.chunk({
+        strategy: 'semantic-markdown',
+        joinThreshold: 250,
+        modelName: 'gpt-3.5-turbo',
+      });
+
+      const chunks = doc.getText();
+      expect(chunks).toHaveLength(3);
+      expect(chunks[0]).toContain('# My Test Document');
+      expect(chunks[0]).toContain('### Section 1.2: Another Tiny Topic');
+      expect(chunks[1]).toContain('## Chapter 2: The Big Section');
+      expect(chunks[2]).toContain('## Chapter 3: The Mixed Bag');
+      expect(chunks[2]).toContain('### Section 3.2: A Final Small Section');
+    });
+
+    it('should preserve code blocks during merging', async () => {
+      const text = `# Code Example
+
+## Installation
+Install the package:
+
+\`\`\`bash
+npm install example-package
+\`\`\`
+
+## Usage
+Here's how to use it:
+
+\`\`\`javascript
+const example = require('example-package');
+example.doSomething();
+\`\`\`
+
+## Configuration
+Set up your config file.`;
+
+      const doc = MDocument.fromMarkdown(text);
+
+      await doc.chunk({
+        strategy: 'semantic-markdown',
+        joinThreshold: 300,
+      });
+
+      const chunks = doc.getText();
+
+      // Code blocks should be preserved intact
+      expect(chunks.some(chunk => chunk.includes('```bash'))).toBe(true);
+      expect(chunks.some(chunk => chunk.includes('```javascript'))).toBe(true);
+
+      // Should not split within code blocks
+      const bashChunk = chunks.find(chunk => chunk.includes('npm install'));
+      expect(bashChunk).toBeDefined();
+      expect(bashChunk).toContain('```bash');
+    });
+
+    it('should work with different tiktoken models', async () => {
+      const text = `# Test Document
+
+## Section 1
+Some content for testing different tiktoken models and their token counting accuracy.
+
+## Section 2
+More content to verify the token counting works correctly across different model encodings.`;
+
+      const doc = MDocument.fromMarkdown(text);
+
+      await doc.chunk({
+        strategy: 'semantic-markdown',
+        joinThreshold: 100,
+        modelName: 'gpt-4',
+      });
+
+      const chunks = doc.getText();
+      const docs = doc.getDocs();
+
+      expect(chunks.length).toBeGreaterThan(0);
+      expect(docs[0]?.metadata?.tokenCount).toBeDefined();
+      expect(typeof docs[0]?.metadata?.tokenCount).toBe('number');
+    });
+
+    it('should handle documents with no headers', async () => {
+      const text = `This is a document with no markdown headers.
+
+Just regular paragraphs of text that should be processed as a single semantic unit since there are no headers to split on.
+
+More paragraphs here to test the behavior.`;
+
+      const doc = MDocument.fromMarkdown(text);
+
+      await doc.chunk({
+        strategy: 'semantic-markdown',
+        joinThreshold: 200,
+      });
+
+      const chunks = doc.getText();
+
+      // Should return single chunk since no headers to split on
+      expect(chunks.length).toBe(1);
+      expect(chunks[0]).toContain('This is a document with no markdown headers');
+    });
+
+    it('should handle empty sections correctly', async () => {
+      const text = `# Document
+
+## Empty Section
+
+## Another Section
+Some content here.
+
+## Final Empty Section
+
+`;
+
+      const doc = MDocument.fromMarkdown(text);
+
+      await doc.chunk({
+        strategy: 'semantic-markdown',
+        joinThreshold: 100,
+      });
+
+      const chunks = doc.getText();
+
+      // Should handle empty sections gracefully
+      expect(chunks.length).toBeGreaterThan(0);
+      expect(chunks.some(chunk => chunk.includes('Some content here'))).toBe(true);
+    });
+
+    it('should maintain bottom-up merging order (deepest first)', async () => {
+      const text = `# Root
+
+## Level 2A
+Content 2A
+
+### Level 3A
+Short content 3A
+
+#### Level 4A
+Short content 4A
+
+### Level 3B
+Short content 3B
+
+## Level 2B
+Content 2B`;
+
+      const doc = MDocument.fromMarkdown(text);
+
+      await doc.chunk({
+        strategy: 'semantic-markdown',
+        joinThreshold: 200,
+      });
+
+      const chunks = doc.getText();
+
+      // The algorithm should merge from deepest level first
+      // Level 4 should merge with Level 3, then Level 3s might merge with Level 2
+      expect(chunks.length).toBeLessThan(7); // Less than original 7 sections
+
+      // Verify deep nesting is preserved in merged content
+      const deepChunk = chunks.find(chunk => chunk.includes('Level 4A') && chunk.includes('Level 3A'));
+      expect(deepChunk).toBeDefined();
+    });
+
+    it('should compare token accuracy vs character-based sizing', async () => {
+      // Use text with unicode and varying token densities
+      const text = `# Test Document
+
+## Unicode Section
+This section contains unicode characters: café, naïve, résumé, 中文, العربية
+
+## Code Section
+\`\`\`python
+def function_with_long_name_and_parameters(param1, param2, param3):
+    return param1 + param2 + param3
+\`\`\`
+
+## Regular Section
+Regular English text without special characters.`;
+
+      const doc = MDocument.fromMarkdown(text);
+
+      await doc.chunk({
+        strategy: 'semantic-markdown',
+        joinThreshold: 150, // Token-based threshold
+      });
+
+      const docs = doc.getDocs();
+
+      // Verify token counts are provided in metadata
+      docs.forEach(doc => {
+        expect(doc.metadata.tokenCount).toBeDefined();
+        expect(typeof doc.metadata.tokenCount).toBe('number');
+        expect(doc.metadata.tokenCount).toBeGreaterThan(0);
+      });
+
+      // Token count should be different from character count for unicode text
+      const unicodeDoc = docs.find(doc => doc.text.includes('café'));
+      if (unicodeDoc) {
+        const charCount = unicodeDoc.text.length;
+        const tokenCount = unicodeDoc.metadata.tokenCount;
+
+        // For text with unicode, token count is often different from char count
+        expect(tokenCount).toBeDefined();
+        expect(tokenCount).not.toBe(charCount);
+      }
+    });
+
+    it('should handle documents with only deep headers (no top-level sections)', async () => {
+      const text = `### Deep Section 1
+Short content for deep section 1.
+
+#### Very Deep Section 1.1
+Even shorter content.
+
+#### Very Deep Section 1.2
+Another short subsection.
+
+### Deep Section 2
+Short content for deep section 2.
+
+#### Very Deep Section 2.1
+Final short content.`;
+
+      const doc = MDocument.fromMarkdown(text);
+
+      await doc.chunk({
+        strategy: 'semantic-markdown',
+        joinThreshold: 200,
+      });
+
+      const chunks = doc.getText();
+      const docs = doc.getDocs();
+
+      // Should merge the small deep sections together
+      expect(chunks.length).toBeLessThan(5);
+      expect(chunks.length).toBeGreaterThan(0);
+
+      // Verify deep headers are preserved in merged content
+      const deepChunk = chunks.find(
+        chunk => chunk.includes('### Deep Section 1') && chunk.includes('#### Very Deep Section'),
+      );
+      expect(deepChunk).toBeDefined();
+
+      expect(docs[0]?.metadata?.tokenCount).toBeDefined();
+    });
+
+    it('should leave very large individual sections intact (exceeding joinThreshold)', async () => {
+      const largeContent = 'This is a very long section. '.repeat(50); // ~1500 tokens
+      const text = `# Document Title
+
+## Small Section
+Small content here.
+
+## Oversized Section
+${largeContent}
+
+\`\`\`javascript
+// Adding code to make it even larger
+function processData(data) {
+  const results = [];
+  for (let i = 0; i < data.length; i++) {
+    const processed = data[i] * 2 + Math.random();
+    results.push(processed);
+    console.log(\`Processed item \${i}: \${processed}\`);
+  }
+  return results;
+}
+
+// More code to ensure we exceed the threshold
+class DataManager {
+  constructor(initialData) {
+    this.data = initialData;
+    this.processedCount = 0;
+  }
+
+  process() {
+    this.data.forEach((item, index) => {
+      // Process each item
+      this.processedCount++;
+    });
+  }
+}
+\`\`\`
+
+## Another Small Section
+More small content.`;
+
+      const doc = MDocument.fromMarkdown(text);
+
+      await doc.chunk({
+        strategy: 'semantic-markdown',
+        joinThreshold: 300, // Much smaller than the oversized section
+      });
+
+      const chunks = doc.getText();
+      const docs = doc.getDocs();
+
+      expect(chunks.length).toBeGreaterThan(1);
+
+      // The oversized section should be left as its own chunk
+      const oversizedChunk = chunks.find(chunk => chunk.includes('Oversized Section'));
+      expect(oversizedChunk).toBeDefined();
+      expect(oversizedChunk).toContain('This is a very long section.');
+
+      // Verify the oversized chunk exceeds the threshold
+      const oversizedDoc = docs.find(doc => doc.text.includes('Oversized Section'));
+      expect(oversizedDoc?.metadata?.tokenCount).toBeGreaterThan(300);
+
+      // Small sections should still be merged where possible
+      const smallChunk = chunks.find(chunk => chunk.includes('Small Section') && !chunk.includes('Oversized'));
+      expect(smallChunk).toBeDefined();
+    });
+
+    it('should handle mixed header levels with gaps (skipping levels)', async () => {
+      const text = `# Top Level
+
+#### Deep Level A (skipped H2 and H3)
+Content for deep level A that is moderately sized with enough text to make it substantial. This section needs to have sufficient content to test the merging behavior properly when header levels are skipped. Let's add more content to ensure we have enough tokens to work with.
+
+## Middle Level
+Content for middle level section that also needs to be substantial enough to test the algorithm. This section should have enough content to be meaningful when testing the semantic markdown chunking with mixed header levels.
+
+##### Very Deep Level (skipped H3 and H4)
+Short content for very deep level that should still be substantial enough for testing. Even though this is marked as short, we need enough content to make the test meaningful.
+
+# Another Top Level
+
+This second top-level section should definitely create a boundary that prevents everything from merging into a single chunk. We need substantial content here to ensure proper separation.
+
+### Medium Deep Level (skipped H2)
+Final content for testing header level gaps. This section also needs substantial content to ensure we're testing the algorithm properly with realistic content sizes.`;
+
+      const doc = MDocument.fromMarkdown(text);
+
+      await doc.chunk({
+        strategy: 'semantic-markdown',
+        joinThreshold: 150, // Smaller threshold to encourage more chunks
+      });
+
+      const chunks = doc.getText();
+
+      // Should handle the gaps gracefully - expect at least 2 chunks due to the second top-level section
+      expect(chunks.length).toBeGreaterThanOrEqual(1);
+
+      // Verify headers with gaps are preserved
+      expect(chunks.some(chunk => chunk.includes('#### Deep Level A'))).toBe(true);
+      expect(chunks.some(chunk => chunk.includes('##### Very Deep Level'))).toBe(true);
+      expect(chunks.some(chunk => chunk.includes('### Medium Deep Level'))).toBe(true);
+
+      // Verify both top-level sections are present
+      expect(chunks.some(chunk => chunk.includes('# Top Level'))).toBe(true);
+      expect(chunks.some(chunk => chunk.includes('# Another Top Level'))).toBe(true);
+    });
+
+    it('should handle large documents efficiently (performance test)', async () => {
+      const sections: string[] = [];
+      for (let i = 1; i <= 100; i++) {
+        sections.push(`## Section ${i}`);
+        sections.push(`This is content for section ${i}. `.repeat(10)); // ~100 tokens each
+
+        for (let j = 1; j <= 3; j++) {
+          sections.push(`### Subsection ${i}.${j}`);
+          sections.push(`This is subsection content ${i}.${j}. `.repeat(5)); // ~50 tokens each
+        }
+      }
+
+      const largeText = `# Large Test Document\n\n${sections.join('\n\n')}`;
+
+      const doc = MDocument.fromMarkdown(largeText);
+
+      const startTime = Date.now();
+
+      await doc.chunk({
+        strategy: 'semantic-markdown',
+        joinThreshold: 300,
+      });
+
+      const duration = Date.now() - startTime;
+      const chunks = doc.getText();
+      const docs = doc.getDocs();
+
+      expect(duration).toBeLessThan(5000);
+
+      expect(chunks.length).toBeGreaterThan(10);
+      expect(chunks.length).toBeLessThan(400);
+
+      docs.forEach(doc => {
+        expect(doc.metadata.tokenCount).toBeDefined();
+        expect(doc.metadata.tokenCount).toBeGreaterThan(0);
+      });
+    }, 10000);
+
+    it('should maintain semantic coherence with very small joinThreshold', async () => {
+      const text = `# Document
+
+This is a substantial preamble section that should have enough content to be meaningful in token counting. We need sufficient content here to test the algorithm properly.
+
+## Section A
+Brief content for section A that needs to be expanded to ensure we have meaningful token counts for testing the semantic markdown chunking algorithm with a very small threshold.
+
+### Sub A1
+More substantial content here for subsection A1. This content needs to be long enough to have a reasonable token count that will affect the merging decisions in our semantic chunking algorithm.
+
+### Sub A2
+Even more substantial content for subsection A2. Again, we need enough tokens here to make the test meaningful and to properly exercise the algorithm's decision-making process.
+
+## Section B
+Another section with substantial content for section B. This section should also have enough content to be meaningful in our token-based chunking strategy testing.
+
+### Sub B1
+Final substantial content for subsection B1. This content should complete our test document with enough tokens to properly test the small threshold behavior.`;
+
+      const doc = MDocument.fromMarkdown(text);
+
+      await doc.chunk({
+        strategy: 'semantic-markdown',
+        joinThreshold: 30, // Even smaller threshold to force separation
+      });
+
+      const chunks = doc.getText();
+
+      // With a very small threshold, we should get at least some separation
+      expect(chunks.length).toBeGreaterThanOrEqual(1);
+
+      // Verify all chunks have meaningful content
+      chunks.forEach(chunk => {
+        expect(chunk.trim().length).toBeGreaterThan(0);
+        expect(chunk.trim().length).toBeGreaterThan(10);
+      });
+
+      // Verify we have the main document structure preserved
+      const allText = chunks.join(' ');
+      expect(allText).toContain('# Document');
+      expect(allText).toContain('## Section A');
+      expect(allText).toContain('## Section B');
+    });
+
+    it('should not treat headers inside code blocks as headers for splitting', async () => {
+      const text = `# Real Header
+
+Some introductory text explaining code examples.
+
+\`\`\`markdown
+# This is not a real header
+It is inside a code block and should be ignored for chunking.
+
+## This is also not a real header
+It should be treated as plain text content, not a section boundary.
+
+### Even deeper fake headers
+Should also be ignored completely.
+\`\`\`
+
+## A Real Second Header
+This content comes after the code block.
+
+### A Real Subsection
+With some additional content to test the hierarchy.`;
+
+      const doc = MDocument.fromMarkdown(text);
+
+      await doc.chunk({
+        strategy: 'semantic-markdown',
+        joinThreshold: 25, // Low threshold to force separation into 2 or more chunks
+      });
+
+      const chunks = doc.getText();
+
+      // With a low threshold, we should get exactly 2 chunks:
+      // 1. "# Real Header" section (with the code block as content)
+      // 2. "## A Real Second Header" section (with its subsection)
+      // If fake headers were processed, we'd get more than 2 chunks
+      expect(chunks.length).toBe(2);
+
+      const firstChunk = chunks[0];
+      const secondChunk = chunks[1];
+
+      expect(firstChunk).toContain('# Real Header');
+      expect(firstChunk).toContain('Some introductory text explaining code examples');
+      expect(firstChunk).toContain('```markdown');
+      expect(firstChunk).toContain('# This is not a real header');
+      expect(firstChunk).toContain('## This is also not a real header');
+      expect(firstChunk).toContain('### Even deeper fake headers');
+      expect(firstChunk).not.toContain('## A Real Second Header');
+
+      expect(secondChunk).toContain('## A Real Second Header');
+      expect(secondChunk).toContain('### A Real Subsection');
+      expect(secondChunk).not.toContain('# Real Header');
+      expect(secondChunk).not.toContain('# This is not a real header');
+    });
+  });
 });

 // Helper function to find the longest common substring between two strings