@mastra/rag 1.0.7 → 1.0.8-alpha.0

This diff shows the publicly available contents of the two package versions as released to their public registry; it is provided for informational purposes only.
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@mastra/rag",
-  "version": "1.0.7",
+  "version": "1.0.8-alpha.0",
   "description": "",
   "type": "module",
   "main": "dist/index.js",
@@ -45,9 +45,9 @@
     "tsup": "^8.5.0",
     "typescript": "^5.8.3",
     "vitest": "^3.2.4",
-    "@internal/lint": "0.0.27",
-    "@internal/types-builder": "0.0.2",
-    "@mastra/core": "0.13.0"
+    "@mastra/core": "0.13.2-alpha.0",
+    "@internal/types-builder": "0.0.3",
+    "@internal/lint": "0.0.28"
   },
   "keywords": [
     "rag",
@@ -61,7 +61,7 @@ describe('MDocument', () => {
       });

       expect(embeddings).toBeDefined();
-    });
+    }, 15000);
   });

   describe('chunkCharacter', () => {
@@ -2312,6 +2312,649 @@ describe('MDocument', () => {
       expect(allText).not.toContain('3 '); // No broken decimal
     });
   });
+
+  describe('chunkSemanticMarkdown', () => {
+    it('should merge small sections based on token threshold', async () => {
+      const text = `# Introduction
+Brief intro paragraph.
+
+## Setup Guide
+Short setup instructions.
+
+### Prerequisites
+Very short list.
+
+### Installation Steps
+Very detailed installation process with code examples and explanations that would normally be quite long but in this test we'll keep it moderate length for testing purposes.
+
+## Advanced Configuration
+Another section with moderate content for testing the merging algorithm.`;
+
+      const doc = MDocument.fromMarkdown(text);
+
+      await doc.chunk({
+        strategy: 'semantic-markdown',
+        joinThreshold: 200,
+      });
+
+      const chunks = doc.getText();
+      const docs = doc.getDocs();
+
+      expect(chunks.length).toBeLessThan(6);
+
+      expect(docs[0]?.metadata?.tokenCount).toBeDefined();
+      expect(typeof docs[0]?.metadata?.tokenCount).toBe('number');
+      expect(docs[0]?.metadata?.tokenCount).toBeGreaterThan(0);
+    });
+
+    it('should respect sibling/parent relationships in merging', async () => {
+      const text = `# Main Document
+
+## Section A
+Content for section A that is moderately long to ensure we have enough tokens for testing the semantic merging algorithm properly.
+
+### Subsection A1
+This subsection has more content than the previous version to test the hierarchical merging behavior.
+
+### Subsection A2
+Another subsection with substantial content to verify proper semantic boundary handling.
+
+## Section B
+Content for section B that is also moderately sized with meaningful text to test cross-section merging behavior.
+
+### Subsection B1
+This final subsection contains enough content to test the bottom-up merging algorithm effectively.`;
+
+      const doc = MDocument.fromMarkdown(text);
+
+      await doc.chunk({
+        strategy: 'semantic-markdown',
+        joinThreshold: 100, // Threshold that allows some merging but not everything
+      });
+
+      const chunks = doc.getText();
+      const docs = doc.getDocs();
+
+      // Should create fewer chunks than original sections due to merging
+      expect(chunks.length).toBeLessThan(7);
+      expect(chunks.length).toBeGreaterThanOrEqual(1);
+
+      // Verify sections maintain semantic coherence
+      const hasSection = chunks.some(chunk => chunk.includes('Section A') || chunk.includes('Subsection A1'));
+      expect(hasSection).toBe(true);
+
+      expect(docs[0]?.metadata?.tokenCount).toBeDefined();
+      expect(docs[0]?.metadata?.tokenCount).toBeGreaterThan(0);
+    });
+
+    it('should correctly chunk a controlled test document', async () => {
+      const controlledTestMarkdown = `# My Test Document
+
+This is a short preamble to test how content before the first header is handled. It should be merged with the first section if that section is small enough.
+
+## Chapter 1: The Small Sections
+
+This is the introduction to Chapter 1. It contains several small subsections that are perfect candidates for merging.
+
+### Section 1.1: A Tiny Topic
+
+Just a few words here.
+
+### Section 1.2: Another Tiny Topic
+
+A few more words to make up a small paragraph.
+
+## Chapter 2: The Big Section
+
+This chapter has a very large section that should NOT be merged with its sibling because it is over the token limit all by itself.
+
+\`\`\`python
+# This is a large block of Python code.
+# It is designed to have a high token count to test the merging threshold.
+import os
+import sys
+
+class DataProcessor:
+    def __init__(self, data):
+        self.data = data
+        self.length = len(data)
+
+    def process(self):
+        """
+        This is a long docstring to add even more tokens to the count.
+        We will iterate through the data and perform some kind of mock processing.
+        The goal is to exceed the joinThreshold of 250 tokens easily.
+        Let's add more lines to be sure.
+        Line 1
+        Line 2
+        Line 3
+        Line 4
+        Line 5
+        ...and so on.
+        """
+        results = []
+        for i, item in enumerate(self.data):
+            # A mock calculation
+            processed_item = (item * i) + self.length
+            results.append(processed_item)
+        return results
+
+# Let's make sure this section is large enough.
+# More comments and code will help.
+def another_function_to_add_tokens():
+    """Another long docstring for good measure."""
+    x = 1
+    y = 2
+    z = x + y
+    print(f"The result is {z}")
+    # End of function
+\`\`\`
+
+## Chapter 3: The Mixed Bag
+
+This chapter contains a mix of small and medium sections.
+
+### Section 3.1: A Medium Section
+
+This section is moderately sized. It's not huge, but it has enough content to be a meaningful chunk on its own. We'll aim for about 150 tokens here so it can potentially merge with a small sibling.
+
+### Section 3.2: A Final Small Section
+
+This final section is very small and should definitely be merged into its predecessor, Section 3.1, because their combined total will be under the threshold.
+`;
+
+      const doc = MDocument.fromMarkdown(controlledTestMarkdown);
+      await doc.chunk({
+        strategy: 'semantic-markdown',
+        joinThreshold: 250,
+        modelName: 'gpt-3.5-turbo',
+      });
+
+      const chunks = doc.getText();
+      expect(chunks).toHaveLength(3);
+      expect(chunks[0]).toContain('# My Test Document');
+      expect(chunks[0]).toContain('### Section 1.2: Another Tiny Topic');
+      expect(chunks[1]).toContain('## Chapter 2: The Big Section');
+      expect(chunks[2]).toContain('## Chapter 3: The Mixed Bag');
+      expect(chunks[2]).toContain('### Section 3.2: A Final Small Section');
+    });
+
+    it('should preserve code blocks during merging', async () => {
+      const text = `# Code Example
+
+## Installation
+Install the package:
+
+\`\`\`bash
+npm install example-package
+\`\`\`
+
+## Usage
+Here's how to use it:
+
+\`\`\`javascript
+const example = require('example-package');
+example.doSomething();
+\`\`\`
+
+## Configuration
+Set up your config file.`;
+
+      const doc = MDocument.fromMarkdown(text);
+
+      await doc.chunk({
+        strategy: 'semantic-markdown',
+        joinThreshold: 300,
+      });
+
+      const chunks = doc.getText();
+
+      // Code blocks should be preserved intact
+      expect(chunks.some(chunk => chunk.includes('```bash'))).toBe(true);
+      expect(chunks.some(chunk => chunk.includes('```javascript'))).toBe(true);
+
+      // Should not split within code blocks
+      const bashChunk = chunks.find(chunk => chunk.includes('npm install'));
+      expect(bashChunk).toBeDefined();
+      expect(bashChunk).toContain('```bash');
+    });
+
+    it('should work with different tiktoken models', async () => {
+      const text = `# Test Document
+
+## Section 1
+Some content for testing different tiktoken models and their token counting accuracy.
+
+## Section 2
+More content to verify the token counting works correctly across different model encodings.`;
+
+      const doc = MDocument.fromMarkdown(text);
+
+      await doc.chunk({
+        strategy: 'semantic-markdown',
+        joinThreshold: 100,
+        modelName: 'gpt-4',
+      });
+
+      const chunks = doc.getText();
+      const docs = doc.getDocs();
+
+      expect(chunks.length).toBeGreaterThan(0);
+      expect(docs[0]?.metadata?.tokenCount).toBeDefined();
+      expect(typeof docs[0]?.metadata?.tokenCount).toBe('number');
+    });
+
+    it('should handle documents with no headers', async () => {
+      const text = `This is a document with no markdown headers.
+
+Just regular paragraphs of text that should be processed as a single semantic unit since there are no headers to split on.
+
+More paragraphs here to test the behavior.`;
+
+      const doc = MDocument.fromMarkdown(text);
+
+      await doc.chunk({
+        strategy: 'semantic-markdown',
+        joinThreshold: 200,
+      });
+
+      const chunks = doc.getText();
+
+      // Should return single chunk since no headers to split on
+      expect(chunks.length).toBe(1);
+      expect(chunks[0]).toContain('This is a document with no markdown headers');
+    });
+
+    it('should handle empty sections correctly', async () => {
+      const text = `# Document
+
+## Empty Section
+
+## Another Section
+Some content here.
+
+## Final Empty Section
+
+`;
+
+      const doc = MDocument.fromMarkdown(text);
+
+      await doc.chunk({
+        strategy: 'semantic-markdown',
+        joinThreshold: 100,
+      });
+
+      const chunks = doc.getText();
+
+      // Should handle empty sections gracefully
+      expect(chunks.length).toBeGreaterThan(0);
+      expect(chunks.some(chunk => chunk.includes('Some content here'))).toBe(true);
+    });
+
+    it('should maintain bottom-up merging order (deepest first)', async () => {
+      const text = `# Root
+
+## Level 2A
+Content 2A
+
+### Level 3A
+Short content 3A
+
+#### Level 4A
+Short content 4A
+
+### Level 3B
+Short content 3B
+
+## Level 2B
+Content 2B`;
+
+      const doc = MDocument.fromMarkdown(text);
+
+      await doc.chunk({
+        strategy: 'semantic-markdown',
+        joinThreshold: 200,
+      });
+
+      const chunks = doc.getText();
+
+      // The algorithm should merge from the deepest level first:
+      // Level 4 should merge with Level 3, then Level 3s might merge with Level 2
+      expect(chunks.length).toBeLessThan(7); // Fewer than the original 7 sections
+
+      // Verify deep nesting is preserved in merged content
+      const deepChunk = chunks.find(chunk => chunk.includes('Level 4A') && chunk.includes('Level 3A'));
+      expect(deepChunk).toBeDefined();
+    });
+
+    it('should compare token accuracy vs character-based sizing', async () => {
+      // Use text with unicode and varying token densities
+      const text = `# Test Document
+
+## Unicode Section
+This section contains unicode characters: café, naïve, résumé, 中文, العربية
+
+## Code Section
+\`\`\`python
+def function_with_long_name_and_parameters(param1, param2, param3):
+    return param1 + param2 + param3
+\`\`\`
+
+## Regular Section
+Regular English text without special characters.`;
+
+      const doc = MDocument.fromMarkdown(text);
+
+      await doc.chunk({
+        strategy: 'semantic-markdown',
+        joinThreshold: 150, // Token-based threshold
+      });
+
+      const docs = doc.getDocs();
+
+      // Verify token counts are provided in metadata
+      docs.forEach(doc => {
+        expect(doc.metadata.tokenCount).toBeDefined();
+        expect(typeof doc.metadata.tokenCount).toBe('number');
+        expect(doc.metadata.tokenCount).toBeGreaterThan(0);
+      });
+
+      // Token count should be different from character count for unicode text
+      const unicodeDoc = docs.find(doc => doc.text.includes('café'));
+      if (unicodeDoc) {
+        const charCount = unicodeDoc.text.length;
+        const tokenCount = unicodeDoc.metadata.tokenCount;
+
+        // For text with unicode, token count is often different from char count
+        expect(tokenCount).toBeDefined();
+        expect(tokenCount).not.toBe(charCount);
+      }
+    });
+
+    it('should handle documents with only deep headers (no top-level sections)', async () => {
+      const text = `### Deep Section 1
+Short content for deep section 1.
+
+#### Very Deep Section 1.1
+Even shorter content.
+
+#### Very Deep Section 1.2
+Another short subsection.
+
+### Deep Section 2
+Short content for deep section 2.
+
+#### Very Deep Section 2.1
+Final short content.`;
+
+      const doc = MDocument.fromMarkdown(text);
+
+      await doc.chunk({
+        strategy: 'semantic-markdown',
+        joinThreshold: 200,
+      });
+
+      const chunks = doc.getText();
+      const docs = doc.getDocs();
+
+      // Should merge the small deep sections together
+      expect(chunks.length).toBeLessThan(5);
+      expect(chunks.length).toBeGreaterThan(0);
+
+      // Verify deep headers are preserved in merged content
+      const deepChunk = chunks.find(
+        chunk => chunk.includes('### Deep Section 1') && chunk.includes('#### Very Deep Section'),
+      );
+      expect(deepChunk).toBeDefined();
+
+      expect(docs[0]?.metadata?.tokenCount).toBeDefined();
+    });
+
+    it('should leave very large individual sections intact (exceeding joinThreshold)', async () => {
+      const largeContent = 'This is a very long section. '.repeat(50); // ~1500 tokens
+      const text = `# Document Title
+
+## Small Section
+Small content here.
+
+## Oversized Section
+${largeContent}
+
+\`\`\`javascript
+// Adding code to make it even larger
+function processData(data) {
+  const results = [];
+  for (let i = 0; i < data.length; i++) {
+    const processed = data[i] * 2 + Math.random();
+    results.push(processed);
+    console.log(\`Processed item \${i}: \${processed}\`);
+  }
+  return results;
+}
+
+// More code to ensure we exceed the threshold
+class DataManager {
+  constructor(initialData) {
+    this.data = initialData;
+    this.processedCount = 0;
+  }
+
+  process() {
+    this.data.forEach((item, index) => {
+      // Process each item
+      this.processedCount++;
+    });
+  }
+}
+\`\`\`
+
+## Another Small Section
+More small content.`;
+
+      const doc = MDocument.fromMarkdown(text);
+
+      await doc.chunk({
+        strategy: 'semantic-markdown',
+        joinThreshold: 300, // Much smaller than the oversized section
+      });
+
+      const chunks = doc.getText();
+      const docs = doc.getDocs();
+
+      expect(chunks.length).toBeGreaterThan(1);
+
+      // The oversized section should be left as its own chunk
+      const oversizedChunk = chunks.find(chunk => chunk.includes('Oversized Section'));
+      expect(oversizedChunk).toBeDefined();
+      expect(oversizedChunk).toContain('This is a very long section.');
+
+      // Verify the oversized chunk exceeds the threshold
+      const oversizedDoc = docs.find(doc => doc.text.includes('Oversized Section'));
+      expect(oversizedDoc?.metadata?.tokenCount).toBeGreaterThan(300);
+
+      // Small sections should still be merged where possible
+      const smallChunk = chunks.find(chunk => chunk.includes('Small Section') && !chunk.includes('Oversized'));
+      expect(smallChunk).toBeDefined();
+    });
+
+    it('should handle mixed header levels with gaps (skipping levels)', async () => {
+      const text = `# Top Level
+
+#### Deep Level A (skipped H2 and H3)
+Content for deep level A that is moderately sized with enough text to make it substantial. This section needs to have sufficient content to test the merging behavior properly when header levels are skipped. Let's add more content to ensure we have enough tokens to work with.
+
+## Middle Level
+Content for middle level section that also needs to be substantial enough to test the algorithm. This section should have enough content to be meaningful when testing the semantic markdown chunking with mixed header levels.
+
+##### Very Deep Level (skipped H3 and H4)
+Short content for very deep level that should still be substantial enough for testing. Even though this is marked as short, we need enough content to make the test meaningful.
+
+# Another Top Level
+
+This second top-level section should definitely create a boundary that prevents everything from merging into a single chunk. We need substantial content here to ensure proper separation.
+
+### Medium Deep Level (skipped H2)
+Final content for testing header level gaps. This section also needs substantial content to ensure we're testing the algorithm properly with realistic content sizes.`;
+
+      const doc = MDocument.fromMarkdown(text);
+
+      await doc.chunk({
+        strategy: 'semantic-markdown',
+        joinThreshold: 150, // Smaller threshold to encourage more chunks
+      });
+
+      const chunks = doc.getText();
+
+      // Should handle the gaps gracefully - expect at least 2 chunks due to the second top-level section
+      expect(chunks.length).toBeGreaterThanOrEqual(1);
+
+      // Verify headers with gaps are preserved
+      expect(chunks.some(chunk => chunk.includes('#### Deep Level A'))).toBe(true);
+      expect(chunks.some(chunk => chunk.includes('##### Very Deep Level'))).toBe(true);
+      expect(chunks.some(chunk => chunk.includes('### Medium Deep Level'))).toBe(true);
+
+      // Verify both top-level sections are present
+      expect(chunks.some(chunk => chunk.includes('# Top Level'))).toBe(true);
+      expect(chunks.some(chunk => chunk.includes('# Another Top Level'))).toBe(true);
+    });
+
+    it('should handle large documents efficiently (performance test)', async () => {
+      const sections: string[] = [];
+      for (let i = 1; i <= 100; i++) {
+        sections.push(`## Section ${i}`);
+        sections.push(`This is content for section ${i}. `.repeat(10)); // ~100 tokens each
+
+        for (let j = 1; j <= 3; j++) {
+          sections.push(`### Subsection ${i}.${j}`);
+          sections.push(`This is subsection content ${i}.${j}. `.repeat(5)); // ~50 tokens each
+        }
+      }
+
+      const largeText = `# Large Test Document\n\n${sections.join('\n\n')}`;
+
+      const doc = MDocument.fromMarkdown(largeText);
+
+      const startTime = Date.now();
+
+      await doc.chunk({
+        strategy: 'semantic-markdown',
+        joinThreshold: 300,
+      });
+
+      const duration = Date.now() - startTime;
+      const chunks = doc.getText();
+      const docs = doc.getDocs();
+
+      expect(duration).toBeLessThan(5000);
+
+      expect(chunks.length).toBeGreaterThan(10);
+      expect(chunks.length).toBeLessThan(400);
+
+      docs.forEach(doc => {
+        expect(doc.metadata.tokenCount).toBeDefined();
+        expect(doc.metadata.tokenCount).toBeGreaterThan(0);
+      });
+    }, 10000);
+
+    it('should maintain semantic coherence with very small joinThreshold', async () => {
+      const text = `# Document
+
+This is a substantial preamble section that should have enough content to be meaningful in token counting. We need sufficient content here to test the algorithm properly.
+
+## Section A
+Brief content for section A that needs to be expanded to ensure we have meaningful token counts for testing the semantic markdown chunking algorithm with a very small threshold.
+
+### Sub A1
+More substantial content here for subsection A1. This content needs to be long enough to have a reasonable token count that will affect the merging decisions in our semantic chunking algorithm.
+
+### Sub A2
+Even more substantial content for subsection A2. Again, we need enough tokens here to make the test meaningful and to properly exercise the algorithm's decision-making process.
+
+## Section B
+Another section with substantial content for section B. This section should also have enough content to be meaningful in our token-based chunking strategy testing.
+
+### Sub B1
+Final substantial content for subsection B1. This content should complete our test document with enough tokens to properly test the small threshold behavior.`;
+
+      const doc = MDocument.fromMarkdown(text);
+
+      await doc.chunk({
+        strategy: 'semantic-markdown',
+        joinThreshold: 30, // Even smaller threshold to force separation
+      });
+
+      const chunks = doc.getText();
+
+      // With a very small threshold, we should get at least some separation
+      expect(chunks.length).toBeGreaterThanOrEqual(1);
+
+      // Verify all chunks have meaningful content
+      chunks.forEach(chunk => {
+        expect(chunk.trim().length).toBeGreaterThan(0);
+        expect(chunk.trim().length).toBeGreaterThan(10);
+      });
+
+      // Verify the main document structure is preserved
+      const allText = chunks.join(' ');
+      expect(allText).toContain('# Document');
+      expect(allText).toContain('## Section A');
+      expect(allText).toContain('## Section B');
+    });
+
+    it('should not treat headers inside code blocks as headers for splitting', async () => {
+      const text = `# Real Header
+
+Some introductory text explaining code examples.
+
+\`\`\`markdown
+# This is not a real header
+It is inside a code block and should be ignored for chunking.
+
+## This is also not a real header
+It should be treated as plain text content, not a section boundary.
+
+### Even deeper fake headers
+Should also be ignored completely.
+\`\`\`
+
+## A Real Second Header
+This content comes after the code block.
+
+### A Real Subsection
+With some additional content to test the hierarchy.`;
+
+      const doc = MDocument.fromMarkdown(text);
+
+      await doc.chunk({
+        strategy: 'semantic-markdown',
+        joinThreshold: 25, // Low threshold to force separation into 2 or more chunks
+      });
+
+      const chunks = doc.getText();
+
+      // With a low threshold, we should get exactly 2 chunks:
+      // 1. "# Real Header" section (with the code block as content)
+      // 2. "## A Real Second Header" section (with its subsection)
+      // If fake headers were processed, we'd get more than 2 chunks
+      expect(chunks.length).toBe(2);
+
+      const firstChunk = chunks[0];
+      const secondChunk = chunks[1];
+
+      expect(firstChunk).toContain('# Real Header');
+      expect(firstChunk).toContain('Some introductory text explaining code examples');
+      expect(firstChunk).toContain('```markdown');
+      expect(firstChunk).toContain('# This is not a real header');
+      expect(firstChunk).toContain('## This is also not a real header');
+      expect(firstChunk).toContain('### Even deeper fake headers');
+      expect(firstChunk).not.toContain('## A Real Second Header');
+
+      expect(secondChunk).toContain('## A Real Second Header');
+      expect(secondChunk).toContain('### A Real Subsection');
+      expect(secondChunk).not.toContain('# Real Header');
+      expect(secondChunk).not.toContain('# This is not a real header');
+    });
+  });
 });

 // Helper function to find the longest common substring between two strings
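
As the new tests describe it, the `semantic-markdown` strategy splits an `MDocument` on real markdown headers (headers inside fenced code blocks are ignored), merges small sibling sections bottom-up until a merge would exceed `joinThreshold` tokens, and records a `tokenCount` in each chunk's metadata, with an optional `modelName` selecting the tiktoken encoding. A minimal usage sketch assembled only from the calls visible in the tests above; the import path is an assumption, since the tests import `MDocument` from the package source:

```ts
// Sketch based on the new tests; assumes MDocument is exported
// from the package root (@mastra/rag).
import { MDocument } from '@mastra/rag';

const doc = MDocument.fromMarkdown(`# Guide

## Install
Short install notes.

## Usage
Short usage notes.`);

// Merge small sibling sections until a merged section would exceed
// ~200 tokens; modelName picks the tiktoken encoding used for counting.
await doc.chunk({
  strategy: 'semantic-markdown',
  joinThreshold: 200,
  modelName: 'gpt-4',
});

console.log(doc.getText());                          // chunk texts
console.log(doc.getDocs()[0]?.metadata?.tokenCount); // tokens per chunk
```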