@mastra/rag 0.2.0-alpha.1 → 0.10.1-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +7 -7
- package/CHANGELOG.md +39 -0
- package/dist/index.cjs +9 -5
- package/dist/index.js +9 -5
- package/package.json +4 -4
- package/src/document/document.test.ts +120 -0
- package/src/document/transformers/markdown.ts +11 -8
package/.turbo/turbo-build.log
CHANGED
|
@@ -1,23 +1,23 @@
|
|
|
1
1
|
|
|
2
|
-
> @mastra/rag@0.
|
|
2
|
+
> @mastra/rag@0.10.1-alpha.0 build /home/runner/work/mastra/mastra/packages/rag
|
|
3
3
|
> tsup src/index.ts --format esm,cjs --experimental-dts --clean --treeshake=smallest --splitting
|
|
4
4
|
|
|
5
5
|
[34mCLI[39m Building entry: src/index.ts
|
|
6
6
|
[34mCLI[39m Using tsconfig: tsconfig.json
|
|
7
7
|
[34mCLI[39m tsup v8.4.0
|
|
8
8
|
[34mTSC[39m Build start
|
|
9
|
-
[32mTSC[39m ⚡️ Build success in
|
|
9
|
+
[32mTSC[39m ⚡️ Build success in 17241ms
|
|
10
10
|
[34mDTS[39m Build start
|
|
11
11
|
[34mCLI[39m Target: es2022
|
|
12
12
|
Analysis will use the bundled TypeScript version 5.8.3
|
|
13
13
|
[36mWriting package typings: /home/runner/work/mastra/mastra/packages/rag/dist/_tsup-dts-rollup.d.ts[39m
|
|
14
14
|
Analysis will use the bundled TypeScript version 5.8.3
|
|
15
15
|
[36mWriting package typings: /home/runner/work/mastra/mastra/packages/rag/dist/_tsup-dts-rollup.d.cts[39m
|
|
16
|
-
[32mDTS[39m ⚡️ Build success in
|
|
16
|
+
[32mDTS[39m ⚡️ Build success in 16153ms
|
|
17
17
|
[34mCLI[39m Cleaning output folder
|
|
18
18
|
[34mESM[39m Build start
|
|
19
19
|
[34mCJS[39m Build start
|
|
20
|
-
[
|
|
21
|
-
[
|
|
22
|
-
[
|
|
23
|
-
[
|
|
20
|
+
[32mCJS[39m [1mdist/index.cjs [22m[32m240.87 KB[39m
|
|
21
|
+
[32mCJS[39m ⚡️ Build success in 4013ms
|
|
22
|
+
[32mESM[39m [1mdist/index.js [22m[32m239.11 KB[39m
|
|
23
|
+
[32mESM[39m ⚡️ Build success in 4026ms
|
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,44 @@
|
|
|
1
1
|
# @mastra/rag
|
|
2
2
|
|
|
3
|
+
## 0.10.1-alpha.0
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- 8784cef: Changed stripHeaders for markdown chunking to strip headers correctly from output when true
|
|
8
|
+
- Updated dependencies [6d16390]
|
|
9
|
+
- Updated dependencies [1e4a421]
|
|
10
|
+
- @mastra/core@0.10.1-alpha.0
|
|
11
|
+
|
|
12
|
+
## 0.10.0
|
|
13
|
+
|
|
14
|
+
### Minor Changes
|
|
15
|
+
|
|
16
|
+
- 83da932: Move @mastra/core to peerdeps
|
|
17
|
+
|
|
18
|
+
### Patch Changes
|
|
19
|
+
|
|
20
|
+
- 4424836: [MASTRA-2591] Rag Tool Return Types
|
|
21
|
+
- 8cdd799: [MASTRA-3078] added sources to return for vector query tool
|
|
22
|
+
- 4f62987: update rerank weight sum to use big.js
|
|
23
|
+
- Updated dependencies [b3a3d63]
|
|
24
|
+
- Updated dependencies [344f453]
|
|
25
|
+
- Updated dependencies [0a3ae6d]
|
|
26
|
+
- Updated dependencies [95911be]
|
|
27
|
+
- Updated dependencies [f53a6ac]
|
|
28
|
+
- Updated dependencies [5eb5a99]
|
|
29
|
+
- Updated dependencies [7e632c5]
|
|
30
|
+
- Updated dependencies [1e9fbfa]
|
|
31
|
+
- Updated dependencies [eabdcd9]
|
|
32
|
+
- Updated dependencies [90be034]
|
|
33
|
+
- Updated dependencies [99f050a]
|
|
34
|
+
- Updated dependencies [d0ee3c6]
|
|
35
|
+
- Updated dependencies [b2ae5aa]
|
|
36
|
+
- Updated dependencies [23f258c]
|
|
37
|
+
- Updated dependencies [a7292b0]
|
|
38
|
+
- Updated dependencies [0dcb9f0]
|
|
39
|
+
- Updated dependencies [2672a05]
|
|
40
|
+
- @mastra/core@0.10.0
|
|
41
|
+
|
|
3
42
|
## 0.2.0-alpha.1
|
|
4
43
|
|
|
5
44
|
### Minor Changes
|
package/dist/index.cjs
CHANGED
|
@@ -5459,10 +5459,12 @@ var MarkdownHeaderTransformer = class {
|
|
|
5459
5459
|
}
|
|
5460
5460
|
const aggregatedChunks = [];
|
|
5461
5461
|
for (const line of lines) {
|
|
5462
|
+
const lastLine = aggregatedChunks[aggregatedChunks.length - 1]?.content?.split("\n")?.slice(-1)[0]?.trim();
|
|
5463
|
+
const lastChunkIsHeader = lastLine ? this.headersToSplitOn.some(([sep]) => lastLine.startsWith(sep)) : false;
|
|
5462
5464
|
if (aggregatedChunks.length > 0 && JSON.stringify(aggregatedChunks?.[aggregatedChunks.length - 1].metadata) === JSON.stringify(line.metadata)) {
|
|
5463
5465
|
const aggChunk = aggregatedChunks[aggregatedChunks.length - 1];
|
|
5464
5466
|
aggChunk.content += " \n" + line.content;
|
|
5465
|
-
} else if (aggregatedChunks.length > 0 && JSON.stringify(aggregatedChunks?.[aggregatedChunks.length - 1].metadata) !== JSON.stringify(line.metadata) && Object.keys(aggregatedChunks?.[aggregatedChunks.length - 1].metadata).length < Object.keys(line.metadata).length &&
|
|
5467
|
+
} else if (aggregatedChunks.length > 0 && JSON.stringify(aggregatedChunks?.[aggregatedChunks.length - 1].metadata) !== JSON.stringify(line.metadata) && Object.keys(aggregatedChunks?.[aggregatedChunks.length - 1].metadata).length < Object.keys(line.metadata).length && lastChunkIsHeader) {
|
|
5466
5468
|
if (aggregatedChunks && aggregatedChunks?.[aggregatedChunks.length - 1]) {
|
|
5467
5469
|
const aggChunk = aggregatedChunks[aggregatedChunks.length - 1];
|
|
5468
5470
|
if (aggChunk) {
|
|
@@ -5535,10 +5537,12 @@ var MarkdownHeaderTransformer = class {
|
|
|
5535
5537
|
headerStack.push(header);
|
|
5536
5538
|
initialMetadata[name14] = header.data;
|
|
5537
5539
|
}
|
|
5538
|
-
|
|
5539
|
-
|
|
5540
|
-
|
|
5541
|
-
|
|
5540
|
+
if (!this.stripHeaders) {
|
|
5541
|
+
linesWithMetadata.push({
|
|
5542
|
+
content: line,
|
|
5543
|
+
metadata: { ...currentMetadata, ...initialMetadata }
|
|
5544
|
+
});
|
|
5545
|
+
}
|
|
5542
5546
|
break;
|
|
5543
5547
|
}
|
|
5544
5548
|
}
|
package/dist/index.js
CHANGED
|
@@ -5457,10 +5457,12 @@ var MarkdownHeaderTransformer = class {
|
|
|
5457
5457
|
}
|
|
5458
5458
|
const aggregatedChunks = [];
|
|
5459
5459
|
for (const line of lines) {
|
|
5460
|
+
const lastLine = aggregatedChunks[aggregatedChunks.length - 1]?.content?.split("\n")?.slice(-1)[0]?.trim();
|
|
5461
|
+
const lastChunkIsHeader = lastLine ? this.headersToSplitOn.some(([sep]) => lastLine.startsWith(sep)) : false;
|
|
5460
5462
|
if (aggregatedChunks.length > 0 && JSON.stringify(aggregatedChunks?.[aggregatedChunks.length - 1].metadata) === JSON.stringify(line.metadata)) {
|
|
5461
5463
|
const aggChunk = aggregatedChunks[aggregatedChunks.length - 1];
|
|
5462
5464
|
aggChunk.content += " \n" + line.content;
|
|
5463
|
-
} else if (aggregatedChunks.length > 0 && JSON.stringify(aggregatedChunks?.[aggregatedChunks.length - 1].metadata) !== JSON.stringify(line.metadata) && Object.keys(aggregatedChunks?.[aggregatedChunks.length - 1].metadata).length < Object.keys(line.metadata).length &&
|
|
5465
|
+
} else if (aggregatedChunks.length > 0 && JSON.stringify(aggregatedChunks?.[aggregatedChunks.length - 1].metadata) !== JSON.stringify(line.metadata) && Object.keys(aggregatedChunks?.[aggregatedChunks.length - 1].metadata).length < Object.keys(line.metadata).length && lastChunkIsHeader) {
|
|
5464
5466
|
if (aggregatedChunks && aggregatedChunks?.[aggregatedChunks.length - 1]) {
|
|
5465
5467
|
const aggChunk = aggregatedChunks[aggregatedChunks.length - 1];
|
|
5466
5468
|
if (aggChunk) {
|
|
@@ -5533,10 +5535,12 @@ var MarkdownHeaderTransformer = class {
|
|
|
5533
5535
|
headerStack.push(header);
|
|
5534
5536
|
initialMetadata[name14] = header.data;
|
|
5535
5537
|
}
|
|
5536
|
-
|
|
5537
|
-
|
|
5538
|
-
|
|
5539
|
-
|
|
5538
|
+
if (!this.stripHeaders) {
|
|
5539
|
+
linesWithMetadata.push({
|
|
5540
|
+
content: line,
|
|
5541
|
+
metadata: { ...currentMetadata, ...initialMetadata }
|
|
5542
|
+
});
|
|
5543
|
+
}
|
|
5540
5544
|
break;
|
|
5541
5545
|
}
|
|
5542
5546
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mastra/rag",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.10.1-alpha.0",
|
|
4
4
|
"description": "",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -30,7 +30,7 @@
|
|
|
30
30
|
},
|
|
31
31
|
"peerDependencies": {
|
|
32
32
|
"ai": "^4.0.0",
|
|
33
|
-
"@mastra/core": "^0.
|
|
33
|
+
"@mastra/core": "^0.10.0"
|
|
34
34
|
},
|
|
35
35
|
"devDependencies": {
|
|
36
36
|
"@ai-sdk/cohere": "latest",
|
|
@@ -44,8 +44,8 @@
|
|
|
44
44
|
"tsup": "^8.4.0",
|
|
45
45
|
"typescript": "^5.8.2",
|
|
46
46
|
"vitest": "^3.1.2",
|
|
47
|
-
"@internal/lint": "0.0.
|
|
48
|
-
"@mastra/core": "0.10.
|
|
47
|
+
"@internal/lint": "0.0.6",
|
|
48
|
+
"@mastra/core": "0.10.1-alpha.0"
|
|
49
49
|
},
|
|
50
50
|
"keywords": [
|
|
51
51
|
"rag",
|
|
@@ -1611,6 +1611,7 @@ describe('MDocument', () => {
|
|
|
1611
1611
|
strategy: 'markdown',
|
|
1612
1612
|
headers: [['#', 'Header 1']],
|
|
1613
1613
|
returnEachLine: true,
|
|
1614
|
+
stripHeaders: false,
|
|
1614
1615
|
});
|
|
1615
1616
|
|
|
1616
1617
|
expect(doc.getDocs().length).toBe(4); // Title + 3 lines
|
|
@@ -1639,6 +1640,125 @@ describe('MDocument', () => {
|
|
|
1639
1640
|
const docs = doc.getDocs();
|
|
1640
1641
|
expect(docs?.[0]?.text).toContain('# Title');
|
|
1641
1642
|
});
|
|
1643
|
+
|
|
1644
|
+
it('should remove headers when stripHeaders: true is set in markdown chunker', async () => {
|
|
1645
|
+
const markdown = [
|
|
1646
|
+
'# H1 Title',
|
|
1647
|
+
'Some intro text.',
|
|
1648
|
+
'## H2 Subtitle',
|
|
1649
|
+
'More details.',
|
|
1650
|
+
'### H3 Section',
|
|
1651
|
+
'Final content.',
|
|
1652
|
+
].join('\n');
|
|
1653
|
+
|
|
1654
|
+
const doc = MDocument.fromMarkdown(markdown);
|
|
1655
|
+
const chunks = await doc.chunk({
|
|
1656
|
+
strategy: 'markdown',
|
|
1657
|
+
size: 500,
|
|
1658
|
+
overlap: 0,
|
|
1659
|
+
headers: [
|
|
1660
|
+
['#', 'h1'],
|
|
1661
|
+
['##', 'h2'],
|
|
1662
|
+
['###', 'h3'],
|
|
1663
|
+
],
|
|
1664
|
+
stripHeaders: true,
|
|
1665
|
+
});
|
|
1666
|
+
// None of the chunk texts should start with the header patterns
|
|
1667
|
+
const headerPatterns = [/^#\s/, /^##\s/, /^###\s/];
|
|
1668
|
+
for (const chunk of chunks) {
|
|
1669
|
+
for (const pattern of headerPatterns) {
|
|
1670
|
+
expect(pattern.test(chunk.text)).toBe(false);
|
|
1671
|
+
}
|
|
1672
|
+
}
|
|
1673
|
+
});
|
|
1674
|
+
|
|
1675
|
+
it('should support custom header prefixes', async () => {
|
|
1676
|
+
const text = `!!! Important\nThis is important.\n--- Section\nSection content.`;
|
|
1677
|
+
const doc = MDocument.fromMarkdown(text);
|
|
1678
|
+
await doc.chunk({
|
|
1679
|
+
strategy: 'markdown',
|
|
1680
|
+
headers: [
|
|
1681
|
+
['!!!', 'important'],
|
|
1682
|
+
['---', 'section'],
|
|
1683
|
+
],
|
|
1684
|
+
stripHeaders: true,
|
|
1685
|
+
});
|
|
1686
|
+
const texts = doc.getText();
|
|
1687
|
+
expect(texts.some(t => t.startsWith('!!!'))).toBe(false);
|
|
1688
|
+
expect(texts.some(t => t.startsWith('---'))).toBe(false);
|
|
1689
|
+
});
|
|
1690
|
+
|
|
1691
|
+
it('should attach correct metadata for nested headers', async () => {
|
|
1692
|
+
const text = `# H1\n## H2\n### H3\nContent`;
|
|
1693
|
+
const doc = MDocument.fromMarkdown(text);
|
|
1694
|
+
await doc.chunk({
|
|
1695
|
+
strategy: 'markdown',
|
|
1696
|
+
headers: [
|
|
1697
|
+
['#', 'h1'],
|
|
1698
|
+
['##', 'h2'],
|
|
1699
|
+
['###', 'h3'],
|
|
1700
|
+
],
|
|
1701
|
+
stripHeaders: true,
|
|
1702
|
+
});
|
|
1703
|
+
const chunk = doc.getDocs().find(c => c.text.includes('Content'));
|
|
1704
|
+
expect(chunk?.metadata?.h1).toBe('H1');
|
|
1705
|
+
expect(chunk?.metadata?.h2).toBe('H2');
|
|
1706
|
+
expect(chunk?.metadata?.h3).toBe('H3');
|
|
1707
|
+
});
|
|
1708
|
+
|
|
1709
|
+
it('should include header lines as chunks if stripHeaders is false', async () => {
|
|
1710
|
+
const text = `# H1\nContent`;
|
|
1711
|
+
const doc = MDocument.fromMarkdown(text);
|
|
1712
|
+
await doc.chunk({
|
|
1713
|
+
strategy: 'markdown',
|
|
1714
|
+
headers: [['#', 'h1']],
|
|
1715
|
+
stripHeaders: false,
|
|
1716
|
+
});
|
|
1717
|
+
const texts = doc.getText();
|
|
1718
|
+
expect(texts.some(t => t.startsWith('# H1'))).toBe(true);
|
|
1719
|
+
});
|
|
1720
|
+
|
|
1721
|
+
it('should handle multiple adjacent headers correctly', async () => {
|
|
1722
|
+
const text = `# H1\n## H2\n### H3\nContent`;
|
|
1723
|
+
const doc = MDocument.fromMarkdown(text);
|
|
1724
|
+
await doc.chunk({
|
|
1725
|
+
strategy: 'markdown',
|
|
1726
|
+
headers: [
|
|
1727
|
+
['#', 'h1'],
|
|
1728
|
+
['##', 'h2'],
|
|
1729
|
+
['###', 'h3'],
|
|
1730
|
+
],
|
|
1731
|
+
stripHeaders: true,
|
|
1732
|
+
});
|
|
1733
|
+
const texts = doc.getText();
|
|
1734
|
+
expect(texts.some(t => t === 'Content')).toBe(true);
|
|
1735
|
+
expect(texts.some(t => t === '')).toBe(false);
|
|
1736
|
+
});
|
|
1737
|
+
|
|
1738
|
+
it('should handle content before any header', async () => {
|
|
1739
|
+
const text = `Intro before header\n# H1\nContent`;
|
|
1740
|
+
const doc = MDocument.fromMarkdown(text);
|
|
1741
|
+
await doc.chunk({
|
|
1742
|
+
strategy: 'markdown',
|
|
1743
|
+
headers: [['#', 'h1']],
|
|
1744
|
+
stripHeaders: true,
|
|
1745
|
+
});
|
|
1746
|
+
const preHeaderChunk = doc.getDocs().find(c => c.text.includes('Intro before header'));
|
|
1747
|
+
expect(preHeaderChunk?.metadata?.h1).toBeUndefined();
|
|
1748
|
+
});
|
|
1749
|
+
|
|
1750
|
+
it('should not treat headers inside code blocks as headers', async () => {
|
|
1751
|
+
const text = ['# Real Header', '```', '# Not a header', '```', 'Content'].join('\n');
|
|
1752
|
+
const doc = MDocument.fromMarkdown(text);
|
|
1753
|
+
await doc.chunk({
|
|
1754
|
+
strategy: 'markdown',
|
|
1755
|
+
headers: [['#', 'h1']],
|
|
1756
|
+
stripHeaders: true,
|
|
1757
|
+
});
|
|
1758
|
+
const texts = doc.getText();
|
|
1759
|
+
expect(texts.some(t => t.includes('# Not a header'))).toBe(true);
|
|
1760
|
+
expect(texts.some(t => t.startsWith('# Real Header'))).toBe(false);
|
|
1761
|
+
});
|
|
1642
1762
|
});
|
|
1643
1763
|
|
|
1644
1764
|
describe('metadata extraction', () => {
|
|
@@ -61,6 +61,8 @@ export class MarkdownHeaderTransformer {
|
|
|
61
61
|
const aggregatedChunks: LineType[] = [];
|
|
62
62
|
|
|
63
63
|
for (const line of lines) {
|
|
64
|
+
const lastLine = aggregatedChunks[aggregatedChunks.length - 1]?.content?.split('\n')?.slice(-1)[0]?.trim();
|
|
65
|
+
const lastChunkIsHeader = lastLine ? this.headersToSplitOn.some(([sep]) => lastLine.startsWith(sep)) : false;
|
|
64
66
|
if (
|
|
65
67
|
aggregatedChunks.length > 0 &&
|
|
66
68
|
JSON.stringify(aggregatedChunks?.[aggregatedChunks.length - 1]!.metadata) === JSON.stringify(line.metadata)
|
|
@@ -72,8 +74,7 @@ export class MarkdownHeaderTransformer {
|
|
|
72
74
|
JSON.stringify(aggregatedChunks?.[aggregatedChunks.length - 1]!.metadata) !== JSON.stringify(line.metadata) &&
|
|
73
75
|
Object.keys(aggregatedChunks?.[aggregatedChunks.length - 1]!.metadata).length <
|
|
74
76
|
Object.keys(line.metadata).length &&
|
|
75
|
-
|
|
76
|
-
!this.stripHeaders
|
|
77
|
+
lastChunkIsHeader
|
|
77
78
|
) {
|
|
78
79
|
if (aggregatedChunks && aggregatedChunks?.[aggregatedChunks.length - 1]) {
|
|
79
80
|
const aggChunk = aggregatedChunks[aggregatedChunks.length - 1];
|
|
@@ -166,12 +167,13 @@ export class MarkdownHeaderTransformer {
|
|
|
166
167
|
initialMetadata[name] = header.data;
|
|
167
168
|
}
|
|
168
169
|
|
|
169
|
-
//
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
170
|
+
// Only add header to linesWithMetadata if stripHeaders is false
|
|
171
|
+
if (!this.stripHeaders) {
|
|
172
|
+
linesWithMetadata.push({
|
|
173
|
+
content: line,
|
|
174
|
+
metadata: { ...currentMetadata, ...initialMetadata },
|
|
175
|
+
});
|
|
176
|
+
}
|
|
175
177
|
break;
|
|
176
178
|
}
|
|
177
179
|
}
|
|
@@ -197,6 +199,7 @@ export class MarkdownHeaderTransformer {
|
|
|
197
199
|
}
|
|
198
200
|
}
|
|
199
201
|
|
|
202
|
+
// Reset metadata for next line
|
|
200
203
|
currentMetadata = { ...initialMetadata };
|
|
201
204
|
}
|
|
202
205
|
|