@mastra/rag 0.2.0-alpha.1 → 0.10.1-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,23 +1,23 @@
1
1
 
2
- > @mastra/rag@0.2.0-alpha.1 build /home/runner/work/mastra/mastra/packages/rag
2
+ > @mastra/rag@0.10.1-alpha.0 build /home/runner/work/mastra/mastra/packages/rag
3
3
  > tsup src/index.ts --format esm,cjs --experimental-dts --clean --treeshake=smallest --splitting
4
4
 
5
5
  CLI Building entry: src/index.ts
6
6
  CLI Using tsconfig: tsconfig.json
7
7
  CLI tsup v8.4.0
8
8
  TSC Build start
9
- TSC ⚡️ Build success in 13960ms
9
+ TSC ⚡️ Build success in 17241ms
10
10
  DTS Build start
11
11
  CLI Target: es2022
12
12
  Analysis will use the bundled TypeScript version 5.8.3
13
13
  Writing package typings: /home/runner/work/mastra/mastra/packages/rag/dist/_tsup-dts-rollup.d.ts
14
14
  Analysis will use the bundled TypeScript version 5.8.3
15
15
  Writing package typings: /home/runner/work/mastra/mastra/packages/rag/dist/_tsup-dts-rollup.d.cts
16
- DTS ⚡️ Build success in 14823ms
16
+ DTS ⚡️ Build success in 16153ms
17
17
  CLI Cleaning output folder
18
18
  ESM Build start
19
19
  CJS Build start
20
- ESM dist/index.js 238.92 KB
21
- ESM ⚡️ Build success in 4442ms
22
- CJS dist/index.cjs 240.69 KB
23
- CJS ⚡️ Build success in 4451ms
20
+ CJS dist/index.cjs 240.87 KB
21
+ CJS ⚡️ Build success in 4013ms
22
+ ESM dist/index.js 239.11 KB
23
+ ESM ⚡️ Build success in 4026ms
package/CHANGELOG.md CHANGED
@@ -1,5 +1,44 @@
1
1
  # @mastra/rag
2
2
 
3
+ ## 0.10.1-alpha.0
4
+
5
+ ### Patch Changes
6
+
7
+ - 8784cef: Changed stripHeaders for markdown chunking to strip headers correctly from output when true
8
+ - Updated dependencies [6d16390]
9
+ - Updated dependencies [1e4a421]
10
+ - @mastra/core@0.10.1-alpha.0
11
+
12
+ ## 0.10.0
13
+
14
+ ### Minor Changes
15
+
16
+ - 83da932: Move @mastra/core to peerdeps
17
+
18
+ ### Patch Changes
19
+
20
+ - 4424836: [MASTRA-2591] Rag Tool Return Types
21
+ - 8cdd799: [MASTRA-3078] added sources to return for vector query tool
22
+ - 4f62987: update rerank weight sum to use big.js
23
+ - Updated dependencies [b3a3d63]
24
+ - Updated dependencies [344f453]
25
+ - Updated dependencies [0a3ae6d]
26
+ - Updated dependencies [95911be]
27
+ - Updated dependencies [f53a6ac]
28
+ - Updated dependencies [5eb5a99]
29
+ - Updated dependencies [7e632c5]
30
+ - Updated dependencies [1e9fbfa]
31
+ - Updated dependencies [eabdcd9]
32
+ - Updated dependencies [90be034]
33
+ - Updated dependencies [99f050a]
34
+ - Updated dependencies [d0ee3c6]
35
+ - Updated dependencies [b2ae5aa]
36
+ - Updated dependencies [23f258c]
37
+ - Updated dependencies [a7292b0]
38
+ - Updated dependencies [0dcb9f0]
39
+ - Updated dependencies [2672a05]
40
+ - @mastra/core@0.10.0
41
+
3
42
  ## 0.2.0-alpha.1
4
43
 
5
44
  ### Minor Changes
package/dist/index.cjs CHANGED
@@ -5459,10 +5459,12 @@ var MarkdownHeaderTransformer = class {
5459
5459
  }
5460
5460
  const aggregatedChunks = [];
5461
5461
  for (const line of lines) {
5462
+ const lastLine = aggregatedChunks[aggregatedChunks.length - 1]?.content?.split("\n")?.slice(-1)[0]?.trim();
5463
+ const lastChunkIsHeader = lastLine ? this.headersToSplitOn.some(([sep]) => lastLine.startsWith(sep)) : false;
5462
5464
  if (aggregatedChunks.length > 0 && JSON.stringify(aggregatedChunks?.[aggregatedChunks.length - 1].metadata) === JSON.stringify(line.metadata)) {
5463
5465
  const aggChunk = aggregatedChunks[aggregatedChunks.length - 1];
5464
5466
  aggChunk.content += " \n" + line.content;
5465
- } else if (aggregatedChunks.length > 0 && JSON.stringify(aggregatedChunks?.[aggregatedChunks.length - 1].metadata) !== JSON.stringify(line.metadata) && Object.keys(aggregatedChunks?.[aggregatedChunks.length - 1].metadata).length < Object.keys(line.metadata).length && aggregatedChunks?.[aggregatedChunks.length - 1]?.content?.split("\n")?.slice(-1)[0][0] === "#" && !this.stripHeaders) {
5467
+ } else if (aggregatedChunks.length > 0 && JSON.stringify(aggregatedChunks?.[aggregatedChunks.length - 1].metadata) !== JSON.stringify(line.metadata) && Object.keys(aggregatedChunks?.[aggregatedChunks.length - 1].metadata).length < Object.keys(line.metadata).length && lastChunkIsHeader) {
5466
5468
  if (aggregatedChunks && aggregatedChunks?.[aggregatedChunks.length - 1]) {
5467
5469
  const aggChunk = aggregatedChunks[aggregatedChunks.length - 1];
5468
5470
  if (aggChunk) {
@@ -5535,10 +5537,12 @@ var MarkdownHeaderTransformer = class {
5535
5537
  headerStack.push(header);
5536
5538
  initialMetadata[name14] = header.data;
5537
5539
  }
5538
- linesWithMetadata.push({
5539
- content: line,
5540
- metadata: { ...currentMetadata, ...initialMetadata }
5541
- });
5540
+ if (!this.stripHeaders) {
5541
+ linesWithMetadata.push({
5542
+ content: line,
5543
+ metadata: { ...currentMetadata, ...initialMetadata }
5544
+ });
5545
+ }
5542
5546
  break;
5543
5547
  }
5544
5548
  }
package/dist/index.js CHANGED
@@ -5457,10 +5457,12 @@ var MarkdownHeaderTransformer = class {
5457
5457
  }
5458
5458
  const aggregatedChunks = [];
5459
5459
  for (const line of lines) {
5460
+ const lastLine = aggregatedChunks[aggregatedChunks.length - 1]?.content?.split("\n")?.slice(-1)[0]?.trim();
5461
+ const lastChunkIsHeader = lastLine ? this.headersToSplitOn.some(([sep]) => lastLine.startsWith(sep)) : false;
5460
5462
  if (aggregatedChunks.length > 0 && JSON.stringify(aggregatedChunks?.[aggregatedChunks.length - 1].metadata) === JSON.stringify(line.metadata)) {
5461
5463
  const aggChunk = aggregatedChunks[aggregatedChunks.length - 1];
5462
5464
  aggChunk.content += " \n" + line.content;
5463
- } else if (aggregatedChunks.length > 0 && JSON.stringify(aggregatedChunks?.[aggregatedChunks.length - 1].metadata) !== JSON.stringify(line.metadata) && Object.keys(aggregatedChunks?.[aggregatedChunks.length - 1].metadata).length < Object.keys(line.metadata).length && aggregatedChunks?.[aggregatedChunks.length - 1]?.content?.split("\n")?.slice(-1)[0][0] === "#" && !this.stripHeaders) {
5465
+ } else if (aggregatedChunks.length > 0 && JSON.stringify(aggregatedChunks?.[aggregatedChunks.length - 1].metadata) !== JSON.stringify(line.metadata) && Object.keys(aggregatedChunks?.[aggregatedChunks.length - 1].metadata).length < Object.keys(line.metadata).length && lastChunkIsHeader) {
5464
5466
  if (aggregatedChunks && aggregatedChunks?.[aggregatedChunks.length - 1]) {
5465
5467
  const aggChunk = aggregatedChunks[aggregatedChunks.length - 1];
5466
5468
  if (aggChunk) {
@@ -5533,10 +5535,12 @@ var MarkdownHeaderTransformer = class {
5533
5535
  headerStack.push(header);
5534
5536
  initialMetadata[name14] = header.data;
5535
5537
  }
5536
- linesWithMetadata.push({
5537
- content: line,
5538
- metadata: { ...currentMetadata, ...initialMetadata }
5539
- });
5538
+ if (!this.stripHeaders) {
5539
+ linesWithMetadata.push({
5540
+ content: line,
5541
+ metadata: { ...currentMetadata, ...initialMetadata }
5542
+ });
5543
+ }
5540
5544
  break;
5541
5545
  }
5542
5546
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@mastra/rag",
3
- "version": "0.2.0-alpha.1",
3
+ "version": "0.10.1-alpha.0",
4
4
  "description": "",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -30,7 +30,7 @@
30
30
  },
31
31
  "peerDependencies": {
32
32
  "ai": "^4.0.0",
33
- "@mastra/core": "^0.9.4"
33
+ "@mastra/core": "^0.10.0"
34
34
  },
35
35
  "devDependencies": {
36
36
  "@ai-sdk/cohere": "latest",
@@ -44,8 +44,8 @@
44
44
  "tsup": "^8.4.0",
45
45
  "typescript": "^5.8.2",
46
46
  "vitest": "^3.1.2",
47
- "@internal/lint": "0.0.5",
48
- "@mastra/core": "0.10.0-alpha.1"
47
+ "@internal/lint": "0.0.6",
48
+ "@mastra/core": "0.10.1-alpha.0"
49
49
  },
50
50
  "keywords": [
51
51
  "rag",
@@ -1611,6 +1611,7 @@ describe('MDocument', () => {
1611
1611
  strategy: 'markdown',
1612
1612
  headers: [['#', 'Header 1']],
1613
1613
  returnEachLine: true,
1614
+ stripHeaders: false,
1614
1615
  });
1615
1616
 
1616
1617
  expect(doc.getDocs().length).toBe(4); // Title + 3 lines
@@ -1639,6 +1640,125 @@ describe('MDocument', () => {
1639
1640
  const docs = doc.getDocs();
1640
1641
  expect(docs?.[0]?.text).toContain('# Title');
1641
1642
  });
1643
+
1644
+ it('should remove headers when stripHeaders: true is set in markdown chunker', async () => {
1645
+ const markdown = [
1646
+ '# H1 Title',
1647
+ 'Some intro text.',
1648
+ '## H2 Subtitle',
1649
+ 'More details.',
1650
+ '### H3 Section',
1651
+ 'Final content.',
1652
+ ].join('\n');
1653
+
1654
+ const doc = MDocument.fromMarkdown(markdown);
1655
+ const chunks = await doc.chunk({
1656
+ strategy: 'markdown',
1657
+ size: 500,
1658
+ overlap: 0,
1659
+ headers: [
1660
+ ['#', 'h1'],
1661
+ ['##', 'h2'],
1662
+ ['###', 'h3'],
1663
+ ],
1664
+ stripHeaders: true,
1665
+ });
1666
+ // None of the chunk texts should start with the header patterns
1667
+ const headerPatterns = [/^#\s/, /^##\s/, /^###\s/];
1668
+ for (const chunk of chunks) {
1669
+ for (const pattern of headerPatterns) {
1670
+ expect(pattern.test(chunk.text)).toBe(false);
1671
+ }
1672
+ }
1673
+ });
1674
+
1675
+ it('should support custom header prefixes', async () => {
1676
+ const text = `!!! Important\nThis is important.\n--- Section\nSection content.`;
1677
+ const doc = MDocument.fromMarkdown(text);
1678
+ await doc.chunk({
1679
+ strategy: 'markdown',
1680
+ headers: [
1681
+ ['!!!', 'important'],
1682
+ ['---', 'section'],
1683
+ ],
1684
+ stripHeaders: true,
1685
+ });
1686
+ const texts = doc.getText();
1687
+ expect(texts.some(t => t.startsWith('!!!'))).toBe(false);
1688
+ expect(texts.some(t => t.startsWith('---'))).toBe(false);
1689
+ });
1690
+
1691
+ it('should attach correct metadata for nested headers', async () => {
1692
+ const text = `# H1\n## H2\n### H3\nContent`;
1693
+ const doc = MDocument.fromMarkdown(text);
1694
+ await doc.chunk({
1695
+ strategy: 'markdown',
1696
+ headers: [
1697
+ ['#', 'h1'],
1698
+ ['##', 'h2'],
1699
+ ['###', 'h3'],
1700
+ ],
1701
+ stripHeaders: true,
1702
+ });
1703
+ const chunk = doc.getDocs().find(c => c.text.includes('Content'));
1704
+ expect(chunk?.metadata?.h1).toBe('H1');
1705
+ expect(chunk?.metadata?.h2).toBe('H2');
1706
+ expect(chunk?.metadata?.h3).toBe('H3');
1707
+ });
1708
+
1709
+ it('should include header lines as chunks if stripHeaders is false', async () => {
1710
+ const text = `# H1\nContent`;
1711
+ const doc = MDocument.fromMarkdown(text);
1712
+ await doc.chunk({
1713
+ strategy: 'markdown',
1714
+ headers: [['#', 'h1']],
1715
+ stripHeaders: false,
1716
+ });
1717
+ const texts = doc.getText();
1718
+ expect(texts.some(t => t.startsWith('# H1'))).toBe(true);
1719
+ });
1720
+
1721
+ it('should handle multiple adjacent headers correctly', async () => {
1722
+ const text = `# H1\n## H2\n### H3\nContent`;
1723
+ const doc = MDocument.fromMarkdown(text);
1724
+ await doc.chunk({
1725
+ strategy: 'markdown',
1726
+ headers: [
1727
+ ['#', 'h1'],
1728
+ ['##', 'h2'],
1729
+ ['###', 'h3'],
1730
+ ],
1731
+ stripHeaders: true,
1732
+ });
1733
+ const texts = doc.getText();
1734
+ expect(texts.some(t => t === 'Content')).toBe(true);
1735
+ expect(texts.some(t => t === '')).toBe(false);
1736
+ });
1737
+
1738
+ it('should handle content before any header', async () => {
1739
+ const text = `Intro before header\n# H1\nContent`;
1740
+ const doc = MDocument.fromMarkdown(text);
1741
+ await doc.chunk({
1742
+ strategy: 'markdown',
1743
+ headers: [['#', 'h1']],
1744
+ stripHeaders: true,
1745
+ });
1746
+ const preHeaderChunk = doc.getDocs().find(c => c.text.includes('Intro before header'));
1747
+ expect(preHeaderChunk?.metadata?.h1).toBeUndefined();
1748
+ });
1749
+
1750
+ it('should not treat headers inside code blocks as headers', async () => {
1751
+ const text = ['# Real Header', '```', '# Not a header', '```', 'Content'].join('\n');
1752
+ const doc = MDocument.fromMarkdown(text);
1753
+ await doc.chunk({
1754
+ strategy: 'markdown',
1755
+ headers: [['#', 'h1']],
1756
+ stripHeaders: true,
1757
+ });
1758
+ const texts = doc.getText();
1759
+ expect(texts.some(t => t.includes('# Not a header'))).toBe(true);
1760
+ expect(texts.some(t => t.startsWith('# Real Header'))).toBe(false);
1761
+ });
1642
1762
  });
1643
1763
 
1644
1764
  describe('metadata extraction', () => {
@@ -61,6 +61,8 @@ export class MarkdownHeaderTransformer {
61
61
  const aggregatedChunks: LineType[] = [];
62
62
 
63
63
  for (const line of lines) {
64
+ const lastLine = aggregatedChunks[aggregatedChunks.length - 1]?.content?.split('\n')?.slice(-1)[0]?.trim();
65
+ const lastChunkIsHeader = lastLine ? this.headersToSplitOn.some(([sep]) => lastLine.startsWith(sep)) : false;
64
66
  if (
65
67
  aggregatedChunks.length > 0 &&
66
68
  JSON.stringify(aggregatedChunks?.[aggregatedChunks.length - 1]!.metadata) === JSON.stringify(line.metadata)
@@ -72,8 +74,7 @@ export class MarkdownHeaderTransformer {
72
74
  JSON.stringify(aggregatedChunks?.[aggregatedChunks.length - 1]!.metadata) !== JSON.stringify(line.metadata) &&
73
75
  Object.keys(aggregatedChunks?.[aggregatedChunks.length - 1]!.metadata).length <
74
76
  Object.keys(line.metadata).length &&
75
- aggregatedChunks?.[aggregatedChunks.length - 1]?.content?.split('\n')?.slice(-1)[0]![0] === '#' &&
76
- !this.stripHeaders
77
+ lastChunkIsHeader
77
78
  ) {
78
79
  if (aggregatedChunks && aggregatedChunks?.[aggregatedChunks.length - 1]) {
79
80
  const aggChunk = aggregatedChunks[aggregatedChunks.length - 1];
@@ -166,12 +167,13 @@ export class MarkdownHeaderTransformer {
166
167
  initialMetadata[name] = header.data;
167
168
  }
168
169
 
169
- // Always create a separate chunk for the header
170
- linesWithMetadata.push({
171
- content: line,
172
- metadata: { ...currentMetadata, ...initialMetadata },
173
- });
174
-
170
+ // Only add header to linesWithMetadata if stripHeaders is false
171
+ if (!this.stripHeaders) {
172
+ linesWithMetadata.push({
173
+ content: line,
174
+ metadata: { ...currentMetadata, ...initialMetadata },
175
+ });
176
+ }
175
177
  break;
176
178
  }
177
179
  }
@@ -197,6 +199,7 @@ export class MarkdownHeaderTransformer {
197
199
  }
198
200
  }
199
201
 
202
+ // Reset metadata for next line
200
203
  currentMetadata = { ...initialMetadata };
201
204
  }
202
205