@mastra/rag 0.10.3 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +7 -7
- package/CHANGELOG.md +43 -0
- package/dist/index.cjs +16 -1
- package/dist/index.js +16 -1
- package/package.json +9 -9
- package/src/document/document.test.ts +105 -0
- package/src/document/transformers/character.ts +15 -0
package/.turbo/turbo-build.log
CHANGED
|
@@ -1,23 +1,23 @@
|
|
|
1
1
|
|
|
2
|
-
> @mastra/rag@0.
|
|
2
|
+
> @mastra/rag@1.0.0-alpha.0 build /home/runner/work/mastra/mastra/packages/rag
|
|
3
3
|
> tsup src/index.ts --format esm,cjs --experimental-dts --clean --treeshake=smallest --splitting
|
|
4
4
|
|
|
5
5
|
[34mCLI[39m Building entry: src/index.ts
|
|
6
6
|
[34mCLI[39m Using tsconfig: tsconfig.json
|
|
7
7
|
[34mCLI[39m tsup v8.5.0
|
|
8
8
|
[34mTSC[39m Build start
|
|
9
|
-
[32mTSC[39m ⚡️ Build success in
|
|
9
|
+
[32mTSC[39m ⚡️ Build success in 13246ms
|
|
10
10
|
[34mDTS[39m Build start
|
|
11
11
|
[34mCLI[39m Target: es2022
|
|
12
12
|
Analysis will use the bundled TypeScript version 5.8.3
|
|
13
13
|
[36mWriting package typings: /home/runner/work/mastra/mastra/packages/rag/dist/_tsup-dts-rollup.d.ts[39m
|
|
14
14
|
Analysis will use the bundled TypeScript version 5.8.3
|
|
15
15
|
[36mWriting package typings: /home/runner/work/mastra/mastra/packages/rag/dist/_tsup-dts-rollup.d.cts[39m
|
|
16
|
-
[32mDTS[39m ⚡️ Build success in
|
|
16
|
+
[32mDTS[39m ⚡️ Build success in 12527ms
|
|
17
17
|
[34mCLI[39m Cleaning output folder
|
|
18
18
|
[34mESM[39m Build start
|
|
19
19
|
[34mCJS[39m Build start
|
|
20
|
-
[
|
|
21
|
-
[
|
|
22
|
-
[
|
|
23
|
-
[
|
|
20
|
+
[32mCJS[39m [1mdist/index.cjs [22m[32m243.73 KB[39m
|
|
21
|
+
[32mCJS[39m ⚡️ Build success in 4842ms
|
|
22
|
+
[32mESM[39m [1mdist/index.js [22m[32m242.02 KB[39m
|
|
23
|
+
[32mESM[39m ⚡️ Build success in 4846ms
|
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,48 @@
|
|
|
1
1
|
# @mastra/rag
|
|
2
2
|
|
|
3
|
+
## 1.0.0
|
|
4
|
+
|
|
5
|
+
### Major Changes
|
|
6
|
+
|
|
7
|
+
- 75136cd: Add LaTeX chunking support by adding a case for Language.LATEX in getSeparatorsForLanguage.
|
|
8
|
+
|
|
9
|
+
### Patch Changes
|
|
10
|
+
|
|
11
|
+
- 63f6b7d: dependencies updates:
|
|
12
|
+
- Updated dependency [`zod@^3.25.57` ↗︎](https://www.npmjs.com/package/zod/v/3.25.57) (from `^3.25.56`, in `dependencies`)
|
|
13
|
+
- Updated dependencies [63f6b7d]
|
|
14
|
+
- Updated dependencies [12a95fc]
|
|
15
|
+
- Updated dependencies [4b0f8a6]
|
|
16
|
+
- Updated dependencies [51264a5]
|
|
17
|
+
- Updated dependencies [8e6f677]
|
|
18
|
+
- Updated dependencies [d70c420]
|
|
19
|
+
- Updated dependencies [ee9af57]
|
|
20
|
+
- Updated dependencies [36f1c36]
|
|
21
|
+
- Updated dependencies [2a16996]
|
|
22
|
+
- Updated dependencies [10d352e]
|
|
23
|
+
- Updated dependencies [9589624]
|
|
24
|
+
- Updated dependencies [53d3c37]
|
|
25
|
+
- Updated dependencies [751c894]
|
|
26
|
+
- Updated dependencies [577ce3a]
|
|
27
|
+
- Updated dependencies [9260b3a]
|
|
28
|
+
- @mastra/core@0.10.6
|
|
29
|
+
|
|
30
|
+
## 1.0.0-alpha.0
|
|
31
|
+
|
|
32
|
+
### Major Changes
|
|
33
|
+
|
|
34
|
+
- 75136cd: Add LaTeX chunking support by adding a case for Language.LATEX in getSeparatorsForLanguage.
|
|
35
|
+
|
|
36
|
+
### Patch Changes
|
|
37
|
+
|
|
38
|
+
- 63f6b7d: dependencies updates:
|
|
39
|
+
- Updated dependency [`zod@^3.25.57` ↗︎](https://www.npmjs.com/package/zod/v/3.25.57) (from `^3.25.56`, in `dependencies`)
|
|
40
|
+
- Updated dependencies [63f6b7d]
|
|
41
|
+
- Updated dependencies [36f1c36]
|
|
42
|
+
- Updated dependencies [10d352e]
|
|
43
|
+
- Updated dependencies [53d3c37]
|
|
44
|
+
- @mastra/core@0.10.6-alpha.0
|
|
45
|
+
|
|
3
46
|
## 0.10.3
|
|
4
47
|
|
|
5
48
|
### Patch Changes
|
package/dist/index.cjs
CHANGED
|
@@ -729,7 +729,7 @@ var customAlphabet = (alphabet, defaultSize = 21) => {
|
|
|
729
729
|
};
|
|
730
730
|
};
|
|
731
731
|
|
|
732
|
-
// ../../node_modules/.pnpm/@ai-sdk+provider-utils@2.2.8_zod@3.25.
|
|
732
|
+
// ../../node_modules/.pnpm/@ai-sdk+provider-utils@2.2.8_zod@3.25.57/node_modules/@ai-sdk/provider-utils/dist/index.mjs
|
|
733
733
|
var import_secure_json_parse = __toESM(require_secure_json_parse());
|
|
734
734
|
function combineHeaders(...headers) {
|
|
735
735
|
return headers.reduce(
|
|
@@ -4767,6 +4767,21 @@ var RecursiveCharacterTransformer = class _RecursiveCharacterTransformer extends
|
|
|
4767
4767
|
" ",
|
|
4768
4768
|
""
|
|
4769
4769
|
];
|
|
4770
|
+
case "latex" /* LATEX */:
|
|
4771
|
+
return [
|
|
4772
|
+
"\\\\part\\*?\\{",
|
|
4773
|
+
"\\\\chapter\\*?\\{",
|
|
4774
|
+
"\\\\section\\*?\\{",
|
|
4775
|
+
"\\\\subsection\\*?\\{",
|
|
4776
|
+
"\\\\subsubsection\\*?\\{",
|
|
4777
|
+
"\\\\begin\\{.*?\\}",
|
|
4778
|
+
"\\\\end\\{.*?\\}",
|
|
4779
|
+
"\\\\[a-zA-Z]+\\{.*?\\}",
|
|
4780
|
+
"\n\n",
|
|
4781
|
+
"\n",
|
|
4782
|
+
" ",
|
|
4783
|
+
""
|
|
4784
|
+
];
|
|
4770
4785
|
// ... (add other language cases following the same pattern)
|
|
4771
4786
|
default:
|
|
4772
4787
|
throw new Error(`Language ${language} is not supported! Please choose from ${Object.values(Language)}`);
|
package/dist/index.js
CHANGED
|
@@ -727,7 +727,7 @@ var customAlphabet = (alphabet, defaultSize = 21) => {
|
|
|
727
727
|
};
|
|
728
728
|
};
|
|
729
729
|
|
|
730
|
-
// ../../node_modules/.pnpm/@ai-sdk+provider-utils@2.2.8_zod@3.25.
|
|
730
|
+
// ../../node_modules/.pnpm/@ai-sdk+provider-utils@2.2.8_zod@3.25.57/node_modules/@ai-sdk/provider-utils/dist/index.mjs
|
|
731
731
|
var import_secure_json_parse = __toESM(require_secure_json_parse());
|
|
732
732
|
function combineHeaders(...headers) {
|
|
733
733
|
return headers.reduce(
|
|
@@ -4765,6 +4765,21 @@ var RecursiveCharacterTransformer = class _RecursiveCharacterTransformer extends
|
|
|
4765
4765
|
" ",
|
|
4766
4766
|
""
|
|
4767
4767
|
];
|
|
4768
|
+
case "latex" /* LATEX */:
|
|
4769
|
+
return [
|
|
4770
|
+
"\\\\part\\*?\\{",
|
|
4771
|
+
"\\\\chapter\\*?\\{",
|
|
4772
|
+
"\\\\section\\*?\\{",
|
|
4773
|
+
"\\\\subsection\\*?\\{",
|
|
4774
|
+
"\\\\subsubsection\\*?\\{",
|
|
4775
|
+
"\\\\begin\\{.*?\\}",
|
|
4776
|
+
"\\\\end\\{.*?\\}",
|
|
4777
|
+
"\\\\[a-zA-Z]+\\{.*?\\}",
|
|
4778
|
+
"\n\n",
|
|
4779
|
+
"\n",
|
|
4780
|
+
" ",
|
|
4781
|
+
""
|
|
4782
|
+
];
|
|
4768
4783
|
// ... (add other language cases following the same pattern)
|
|
4769
4784
|
default:
|
|
4770
4785
|
throw new Error(`Language ${language} is not supported! Please choose from ${Object.values(Language)}`);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mastra/rag",
|
|
3
|
-
"version": "0.10.3",
|
|
3
|
+
"version": "1.0.0",
|
|
4
4
|
"description": "",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -26,26 +26,26 @@
|
|
|
26
26
|
"js-tiktoken": "^1.0.20",
|
|
27
27
|
"node-html-better-parser": "^1.4.11",
|
|
28
28
|
"pathe": "^2.0.3",
|
|
29
|
-
"zod": "^3.25.56"
|
|
29
|
+
"zod": "^3.25.57"
|
|
30
30
|
},
|
|
31
31
|
"peerDependencies": {
|
|
32
|
-
"
|
|
33
|
-
"
|
|
32
|
+
"@mastra/core": "^0.10.0-alpha.0",
|
|
33
|
+
"ai": "^4.0.0"
|
|
34
34
|
},
|
|
35
35
|
"devDependencies": {
|
|
36
36
|
"@ai-sdk/cohere": "latest",
|
|
37
37
|
"@ai-sdk/openai": "latest",
|
|
38
38
|
"@microsoft/api-extractor": "^7.52.8",
|
|
39
39
|
"@types/big.js": "^6.2.2",
|
|
40
|
-
"@types/node": "^20.
|
|
40
|
+
"@types/node": "^20.19.0",
|
|
41
41
|
"ai": "^4.3.16",
|
|
42
42
|
"dotenv": "^16.5.0",
|
|
43
43
|
"eslint": "^9.28.0",
|
|
44
44
|
"tsup": "^8.5.0",
|
|
45
|
-
"typescript": "^5.8.
|
|
46
|
-
"vitest": "^3.2.
|
|
47
|
-
"@internal/lint": "0.0.
|
|
48
|
-
"@mastra/core": "0.10.
|
|
45
|
+
"typescript": "^5.8.3",
|
|
46
|
+
"vitest": "^3.2.3",
|
|
47
|
+
"@internal/lint": "0.0.13",
|
|
48
|
+
"@mastra/core": "0.10.6"
|
|
49
49
|
},
|
|
50
50
|
"keywords": [
|
|
51
51
|
"rag",
|
|
package/src/document/document.test.ts
CHANGED
|
@@ -1505,6 +1505,111 @@ describe('MDocument', () => {
|
|
|
1505
1505
|
});
|
|
1506
1506
|
});
|
|
1507
1507
|
|
|
1508
|
+
describe('chunkLaTeX', () => {
|
|
1509
|
+
it('should split LaTeX text correctly based on sections', async () => {
|
|
1510
|
+
const text = `\\section{Introduction}
|
|
1511
|
+
|
|
1512
|
+
This is the introduction section.
|
|
1513
|
+
|
|
1514
|
+
\\subsection{Background}
|
|
1515
|
+
|
|
1516
|
+
Some background information.
|
|
1517
|
+
|
|
1518
|
+
\\subsubsection{Details}
|
|
1519
|
+
|
|
1520
|
+
Even more detailed explanation.
|
|
1521
|
+
|
|
1522
|
+
\\section{Conclusion}
|
|
1523
|
+
|
|
1524
|
+
Final thoughts here.`;
|
|
1525
|
+
|
|
1526
|
+
const doc = MDocument.fromText(text, { meta: 'data' });
|
|
1527
|
+
|
|
1528
|
+
await doc.chunk({
|
|
1529
|
+
strategy: 'latex',
|
|
1530
|
+
size: 100,
|
|
1531
|
+
overlap: 10,
|
|
1532
|
+
keepSeparator: 'start',
|
|
1533
|
+
});
|
|
1534
|
+
|
|
1535
|
+
const chunks = doc.getText();
|
|
1536
|
+
expect(chunks.length).toBeGreaterThan(1);
|
|
1537
|
+
expect(chunks[0]).toContain('\\section{Introduction}');
|
|
1538
|
+
});
|
|
1539
|
+
|
|
1540
|
+
it('should handle environments like equations or itemize', async () => {
|
|
1541
|
+
const text = `\\section{Math Section}
|
|
1542
|
+
|
|
1543
|
+
Here is an equation:
|
|
1544
|
+
|
|
1545
|
+
\\[
|
|
1546
|
+
E = mc^2
|
|
1547
|
+
\\]
|
|
1548
|
+
|
|
1549
|
+
\\begin{itemize}
|
|
1550
|
+
\\item First item
|
|
1551
|
+
\\item Second item
|
|
1552
|
+
\\end{itemize}
|
|
1553
|
+
|
|
1554
|
+
End of the section.`;
|
|
1555
|
+
|
|
1556
|
+
const doc = MDocument.fromText(text, { meta: 'data' });
|
|
1557
|
+
|
|
1558
|
+
await doc.chunk({
|
|
1559
|
+
strategy: 'latex',
|
|
1560
|
+
size: 100,
|
|
1561
|
+
overlap: 10,
|
|
1562
|
+
keepSeparator: 'start',
|
|
1563
|
+
});
|
|
1564
|
+
|
|
1565
|
+
const chunks = doc.getText();
|
|
1566
|
+
expect(chunks.some(chunk => chunk.includes('\\begin{itemize}'))).toBe(true);
|
|
1567
|
+
expect(chunks.some(chunk => chunk.includes('E = mc^2'))).toBe(true);
|
|
1568
|
+
});
|
|
1569
|
+
|
|
1570
|
+
it('should split with keepSeparator at end', async () => {
|
|
1571
|
+
const text = `Intro text here.
|
|
1572
|
+
\\section{First}
|
|
1573
|
+
Content A.
|
|
1574
|
+
|
|
1575
|
+
\\section{Second}
|
|
1576
|
+
Content B.`;
|
|
1577
|
+
|
|
1578
|
+
const doc = MDocument.fromText(text, { meta: 'data' });
|
|
1579
|
+
|
|
1580
|
+
await doc.chunk({
|
|
1581
|
+
strategy: 'latex',
|
|
1582
|
+
size: 50,
|
|
1583
|
+
overlap: 0,
|
|
1584
|
+
keepSeparator: 'end',
|
|
1585
|
+
});
|
|
1586
|
+
|
|
1587
|
+
const chunks = doc.getText();
|
|
1588
|
+
expect(chunks.length).toBe(3);
|
|
1589
|
+
expect(chunks[0].trimEnd().includes('\\section{')).toBe(true);
|
|
1590
|
+
expect(chunks[1].trimEnd().includes('\\section{')).toBe(true);
|
|
1591
|
+
});
|
|
1592
|
+
|
|
1593
|
+
it('should strip whitespace correctly', async () => {
|
|
1594
|
+
const text = `\\section{Whitespace}
|
|
1595
|
+
|
|
1596
|
+
Content with leading and trailing whitespace.
|
|
1597
|
+
`;
|
|
1598
|
+
|
|
1599
|
+
const doc = MDocument.fromText(text, { meta: 'data' });
|
|
1600
|
+
|
|
1601
|
+
await doc.chunk({
|
|
1602
|
+
strategy: 'latex',
|
|
1603
|
+
size: 100,
|
|
1604
|
+
overlap: 0,
|
|
1605
|
+
stripWhitespace: true,
|
|
1606
|
+
});
|
|
1607
|
+
|
|
1608
|
+
const chunks = doc.getText();
|
|
1609
|
+
expect(chunks.every(chunk => chunk === chunk.trim())).toBe(true);
|
|
1610
|
+
});
|
|
1611
|
+
});
|
|
1612
|
+
|
|
1508
1613
|
describe('MarkdownHeader', () => {
|
|
1509
1614
|
it('should split on headers and preserve metadata', async () => {
|
|
1510
1615
|
const text = `# Main Title
|
|
package/src/document/transformers/character.ts
CHANGED
|
@@ -272,6 +272,21 @@ export class RecursiveCharacterTransformer extends TextTransformer {
|
|
|
272
272
|
' ',
|
|
273
273
|
'',
|
|
274
274
|
];
|
|
275
|
+
case Language.LATEX:
|
|
276
|
+
return [
|
|
277
|
+
'\\\\part\\*?\\{',
|
|
278
|
+
'\\\\chapter\\*?\\{',
|
|
279
|
+
'\\\\section\\*?\\{',
|
|
280
|
+
'\\\\subsection\\*?\\{',
|
|
281
|
+
'\\\\subsubsection\\*?\\{',
|
|
282
|
+
'\\\\begin\\{.*?\\}',
|
|
283
|
+
'\\\\end\\{.*?\\}',
|
|
284
|
+
'\\\\[a-zA-Z]+\\{.*?\\}',
|
|
285
|
+
'\n\n',
|
|
286
|
+
'\n',
|
|
287
|
+
' ',
|
|
288
|
+
'',
|
|
289
|
+
];
|
|
275
290
|
// ... (add other language cases following the same pattern)
|
|
276
291
|
default:
|
|
277
292
|
throw new Error(`Language ${language} is not supported! Please choose from ${Object.values(Language)}`);
|