@mastra/rag 1.2.2 → 1.2.3-alpha.1

This diff shows the contents of publicly released package versions as they appear in their respective public registries and is provided for informational purposes only.
Files changed (77)
  1. package/CHANGELOG.md +22 -0
  2. package/dist/index.cjs +25 -9
  3. package/dist/index.cjs.map +1 -1
  4. package/dist/index.js +25 -9
  5. package/dist/index.js.map +1 -1
  6. package/dist/tools/graph-rag.d.ts.map +1 -1
  7. package/dist/tools/types.d.ts +18 -5
  8. package/dist/tools/types.d.ts.map +1 -1
  9. package/dist/tools/vector-query.d.ts.map +1 -1
  10. package/dist/utils/vector-search.d.ts +6 -7
  11. package/dist/utils/vector-search.d.ts.map +1 -1
  12. package/package.json +19 -6
  13. package/.turbo/turbo-build.log +0 -4
  14. package/docker-compose.yaml +0 -22
  15. package/eslint.config.js +0 -6
  16. package/src/document/document.test.ts +0 -2975
  17. package/src/document/document.ts +0 -335
  18. package/src/document/extractors/base.ts +0 -30
  19. package/src/document/extractors/index.ts +0 -5
  20. package/src/document/extractors/keywords.test.ts +0 -125
  21. package/src/document/extractors/keywords.ts +0 -126
  22. package/src/document/extractors/questions.test.ts +0 -120
  23. package/src/document/extractors/questions.ts +0 -111
  24. package/src/document/extractors/summary.test.ts +0 -107
  25. package/src/document/extractors/summary.ts +0 -122
  26. package/src/document/extractors/title.test.ts +0 -121
  27. package/src/document/extractors/title.ts +0 -185
  28. package/src/document/extractors/types.ts +0 -40
  29. package/src/document/index.ts +0 -2
  30. package/src/document/prompts/base.ts +0 -77
  31. package/src/document/prompts/format.ts +0 -9
  32. package/src/document/prompts/index.ts +0 -15
  33. package/src/document/prompts/prompt.ts +0 -60
  34. package/src/document/prompts/types.ts +0 -29
  35. package/src/document/schema/index.ts +0 -3
  36. package/src/document/schema/node.ts +0 -187
  37. package/src/document/schema/types.ts +0 -40
  38. package/src/document/transformers/character.ts +0 -267
  39. package/src/document/transformers/html.ts +0 -346
  40. package/src/document/transformers/json.ts +0 -536
  41. package/src/document/transformers/latex.ts +0 -11
  42. package/src/document/transformers/markdown.ts +0 -239
  43. package/src/document/transformers/semantic-markdown.ts +0 -227
  44. package/src/document/transformers/sentence.ts +0 -314
  45. package/src/document/transformers/text.ts +0 -158
  46. package/src/document/transformers/token.ts +0 -137
  47. package/src/document/transformers/transformer.ts +0 -5
  48. package/src/document/types.ts +0 -145
  49. package/src/document/validation.ts +0 -158
  50. package/src/graph-rag/index.test.ts +0 -235
  51. package/src/graph-rag/index.ts +0 -306
  52. package/src/index.ts +0 -8
  53. package/src/rerank/index.test.ts +0 -150
  54. package/src/rerank/index.ts +0 -198
  55. package/src/rerank/relevance/cohere/index.ts +0 -56
  56. package/src/rerank/relevance/index.ts +0 -3
  57. package/src/rerank/relevance/mastra-agent/index.ts +0 -32
  58. package/src/rerank/relevance/zeroentropy/index.ts +0 -26
  59. package/src/tools/README.md +0 -153
  60. package/src/tools/document-chunker.ts +0 -34
  61. package/src/tools/graph-rag.test.ts +0 -115
  62. package/src/tools/graph-rag.ts +0 -154
  63. package/src/tools/index.ts +0 -3
  64. package/src/tools/types.ts +0 -110
  65. package/src/tools/vector-query-database-config.test.ts +0 -190
  66. package/src/tools/vector-query.test.ts +0 -418
  67. package/src/tools/vector-query.ts +0 -169
  68. package/src/utils/convert-sources.ts +0 -43
  69. package/src/utils/default-settings.ts +0 -38
  70. package/src/utils/index.ts +0 -3
  71. package/src/utils/tool-schemas.ts +0 -38
  72. package/src/utils/vector-prompts.ts +0 -832
  73. package/src/utils/vector-search.ts +0 -117
  74. package/tsconfig.build.json +0 -9
  75. package/tsconfig.json +0 -5
  76. package/tsup.config.ts +0 -17
  77. package/vitest.config.ts +0 -8
package/src/document/transformers/markdown.ts
@@ -1,239 +0,0 @@
- import { Document } from '../schema';
-
- import { Language } from '../types';
- import type { BaseChunkOptions } from '../types';
-
- import { RecursiveCharacterTransformer } from './character';
-
- interface LineType {
-   metadata: Record<string, string>;
-   content: string;
- }
-
- interface HeaderType {
-   level: number;
-   name: string;
-   data: string;
- }
-
- export class MarkdownTransformer extends RecursiveCharacterTransformer {
-   constructor(options: BaseChunkOptions = {}) {
-     const separators = RecursiveCharacterTransformer.getSeparatorsForLanguage(Language.MARKDOWN);
-     super({ ...options, separators, isSeparatorRegex: true });
-   }
- }
-
- export class MarkdownHeaderTransformer {
-   private headersToSplitOn: [string, string][];
-   private returnEachLine: boolean;
-   private stripHeaders: boolean;
-
-   constructor(headersToSplitOn: [string, string][], returnEachLine: boolean = false, stripHeaders: boolean = true) {
-     this.headersToSplitOn = [...headersToSplitOn].sort((a, b) => b[0].length - a[0].length);
-     this.returnEachLine = returnEachLine;
-     this.stripHeaders = stripHeaders;
-   }
-
-   private aggregateLinesToChunks(lines: LineType[]): Document[] {
-     if (this.returnEachLine) {
-       return lines.flatMap(line => {
-         const contentLines = line.content.split('\n');
-         return contentLines
-           .filter(l => l.trim() !== '' || this.headersToSplitOn.some(([sep]) => l.trim().startsWith(sep)))
-           .map(
-             l =>
-               new Document({
-                 text: l.trim(),
-                 metadata: line.metadata,
-               }),
-           );
-       });
-     }
-
-     const aggregatedChunks: LineType[] = [];
-
-     for (const line of lines) {
-       const lastLine = aggregatedChunks[aggregatedChunks.length - 1]?.content?.split('\n')?.slice(-1)[0]?.trim();
-       const lastChunkIsHeader = lastLine ? this.headersToSplitOn.some(([sep]) => lastLine.startsWith(sep)) : false;
-       if (
-         aggregatedChunks.length > 0 &&
-         JSON.stringify(aggregatedChunks?.[aggregatedChunks.length - 1]!.metadata) === JSON.stringify(line.metadata)
-       ) {
-         const aggChunk = aggregatedChunks[aggregatedChunks.length - 1];
-         aggChunk!.content += ' \n' + line.content;
-       } else if (
-         aggregatedChunks.length > 0 &&
-         JSON.stringify(aggregatedChunks?.[aggregatedChunks.length - 1]!.metadata) !== JSON.stringify(line.metadata) &&
-         Object.keys(aggregatedChunks?.[aggregatedChunks.length - 1]!.metadata).length <
-           Object.keys(line.metadata).length &&
-         lastChunkIsHeader
-       ) {
-         if (aggregatedChunks && aggregatedChunks?.[aggregatedChunks.length - 1]) {
-           const aggChunk = aggregatedChunks[aggregatedChunks.length - 1];
-           if (aggChunk) {
-             aggChunk.content += ' \n' + line.content;
-             aggChunk.metadata = line.metadata;
-           }
-         }
-       } else {
-         aggregatedChunks.push(line);
-       }
-     }
-
-     return aggregatedChunks.map(
-       chunk =>
-         new Document({
-           text: chunk.content,
-           metadata: chunk.metadata,
-         }),
-     );
-   }
-
-   splitText({ text }: { text: string }): Document[] {
-     const lines = text.split('\n');
-     const linesWithMetadata: LineType[] = [];
-     let currentContent: string[] = [];
-     let currentMetadata: Record<string, string> = {};
-     const headerStack: HeaderType[] = [];
-     const initialMetadata: Record<string, string> = {};
-
-     let inCodeBlock = false;
-     let openingFence = '';
-
-     for (let i = 0; i < lines.length; i++) {
-       const line = lines[i]!;
-       const strippedLine = line.trim();
-
-       if (!inCodeBlock) {
-         if (
-           (strippedLine.startsWith('```') && strippedLine.split('```').length === 2) ||
-           strippedLine.startsWith('~~~')
-         ) {
-           inCodeBlock = true;
-           openingFence = strippedLine.startsWith('```') ? '```' : '~~~';
-         }
-       } else {
-         if (strippedLine.startsWith(openingFence)) {
-           inCodeBlock = false;
-           openingFence = '';
-         }
-       }
-
-       if (inCodeBlock) {
-         currentContent.push(line);
-         continue;
-       }
-
-       let headerMatched = false;
-       for (const [sep, name] of this.headersToSplitOn) {
-         if (strippedLine.startsWith(sep) && (strippedLine.length === sep.length || strippedLine[sep.length] === ' ')) {
-           headerMatched = true;
-
-           // If we have existing content, save it before processing the header
-           if (currentContent.length > 0) {
-             linesWithMetadata.push({
-               content: currentContent.join('\n'),
-               metadata: { ...currentMetadata },
-             });
-             currentContent = [];
-           }
-
-           if (name !== null) {
-             const currentHeaderLevel = (sep.match(/#/g) || []).length;
-
-             // Pop headers of lower or same level
-             while (headerStack.length > 0 && headerStack?.[headerStack.length - 1]!.level >= currentHeaderLevel) {
-               const poppedHeader = headerStack.pop()!;
-               if (poppedHeader.name in initialMetadata) {
-                 delete initialMetadata[poppedHeader.name];
-               }
-             }
-
-             // Push current header
-             const header: HeaderType = {
-               level: currentHeaderLevel,
-               name,
-               data: strippedLine.slice(sep.length).trim(),
-             };
-             headerStack.push(header);
-             initialMetadata[name] = header.data;
-           }
-
-           // Only add header to linesWithMetadata if stripHeaders is false
-           if (!this.stripHeaders) {
-             linesWithMetadata.push({
-               content: line,
-               metadata: { ...currentMetadata, ...initialMetadata },
-             });
-           }
-           break;
-         }
-       }
-
-       if (!headerMatched) {
-         if (strippedLine || this.returnEachLine) {
-           currentContent.push(line);
-
-           if (this.returnEachLine) {
-             // In returnEachLine mode, flush each non-header line immediately
-             linesWithMetadata.push({
-               content: line,
-               metadata: { ...currentMetadata },
-             });
-             currentContent = [];
-           }
-         } else if (currentContent.length > 0) {
-           linesWithMetadata.push({
-             content: currentContent.join('\n'),
-             metadata: { ...currentMetadata },
-           });
-           currentContent = [];
-         }
-       }
-
-       // Reset metadata for next line
-       currentMetadata = { ...initialMetadata };
-     }
-
-     // Handle any remaining content
-     if (currentContent.length > 0) {
-       linesWithMetadata.push({
-         content: currentContent.join('\n'),
-         metadata: currentMetadata,
-       });
-     }
-
-     return this.aggregateLinesToChunks(linesWithMetadata);
-   }
-
-   createDocuments(texts: string[], metadatas?: Record<string, any>[]): Document[] {
-     const _metadatas = metadatas || Array(texts.length).fill({});
-     const documents: Document[] = [];
-
-     texts.forEach((text, i) => {
-       this.splitText({ text }).forEach(chunk => {
-         const metadata = { ..._metadatas[i], ...chunk.metadata };
-         documents.push(
-           new Document({
-             text: chunk.text,
-             metadata,
-           }),
-         );
-       });
-     });
-
-     return documents;
-   }
-
-   transformDocuments(documents: Document[]): Document[] {
-     const texts: string[] = [];
-     const metadatas: Record<string, any>[] = [];
-
-     for (const doc of documents) {
-       texts.push(doc.text);
-       metadatas.push(doc.metadata);
-     }
-
-     return this.createDocuments(texts, metadatas);
-   }
- }
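The removed MarkdownHeaderTransformer split markdown on configured header prefixes and carried the active header hierarchy into each chunk's metadata. A minimal usage sketch against the 1.2.2 source shown in the hunk above; the relative import path and the sample text are illustrative assumptions, not part of this diff:

```ts
// Sketch based on the removed 1.2.2 source above; the relative import path is an assumption.
import { MarkdownHeaderTransformer } from './markdown';

const splitter = new MarkdownHeaderTransformer(
  [
    ['#', 'Header 1'],
    ['##', 'Header 2'],
  ],
  /* returnEachLine */ false,
  /* stripHeaders */ true,
);

const docs = splitter.splitText({ text: '# Intro\nWelcome.\n\n## Details\nMore text.' });
// docs[0]: text 'Welcome.',   metadata { 'Header 1': 'Intro' }
// docs[1]: text 'More text.', metadata { 'Header 1': 'Intro', 'Header 2': 'Details' }
```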
package/src/document/transformers/semantic-markdown.ts
@@ -1,227 +0,0 @@
- import type { TiktokenModel, TiktokenEncoding, Tiktoken } from 'js-tiktoken';
- import { encodingForModel, getEncoding } from 'js-tiktoken';
- import { Document } from '../schema';
- import type { SemanticMarkdownChunkOptions } from '../types';
-
- import { TextTransformer } from './text';
-
- interface MarkdownNode {
-   title: string;
-   depth: number;
-   content: string;
-   length: number;
- }
-
- export class SemanticMarkdownTransformer extends TextTransformer {
-   private tokenizer: Tiktoken;
-   private joinThreshold: number;
-   private allowedSpecial: Set<string> | 'all';
-   private disallowedSpecial: Set<string> | 'all';
-
-   constructor({
-     joinThreshold = 500,
-     encodingName = 'cl100k_base',
-     modelName,
-     allowedSpecial = new Set(),
-     disallowedSpecial = 'all',
-     ...baseOptions
-   }: SemanticMarkdownChunkOptions = {}) {
-     super(baseOptions);
-
-     this.joinThreshold = joinThreshold;
-     this.allowedSpecial = allowedSpecial;
-     this.disallowedSpecial = disallowedSpecial;
-
-     try {
-       this.tokenizer = modelName ? encodingForModel(modelName) : getEncoding(encodingName);
-     } catch {
-       throw new Error('Could not load tiktoken encoding. Please install it with `npm install js-tiktoken`.');
-     }
-   }
-
-   private countTokens(text: string): number {
-     const allowed = this.allowedSpecial === 'all' ? 'all' : Array.from(this.allowedSpecial);
-     const disallowed = this.disallowedSpecial === 'all' ? 'all' : Array.from(this.disallowedSpecial);
-
-     const processedText = this.stripWhitespace ? text.trim() : text;
-     return this.tokenizer.encode(processedText, allowed, disallowed).length;
-   }
-
-   private splitMarkdownByHeaders(markdown: string): MarkdownNode[] {
-     const sections: MarkdownNode[] = [];
-     const lines = markdown.split('\n');
-     let currentContent = '';
-     let currentTitle = '';
-     let currentDepth = 0;
-     let inCodeBlock = false;
-
-     const headerRegex = /^(#+)\s+(.+)$/;
-
-     for (let i = 0; i < lines.length; i++) {
-       const line = lines[i]!;
-       const headerMatch = line.match(headerRegex);
-
-       // Track code blocks to avoid parsing headers inside them
-       if (line.startsWith('```') || line.startsWith('~~~')) {
-         inCodeBlock = !inCodeBlock;
-       }
-
-       if (headerMatch && !inCodeBlock) {
-         // Save previous section
-         // Push the previous section if it has content or if it's a header.
-         // This ensures headers that only act as parents are not lost.
-         if (currentContent.trim() !== '' || (currentTitle && currentDepth > 0)) {
-           sections.push({
-             title: currentTitle,
-             content: currentContent.trim(),
-             depth: currentDepth,
-             length: this.countTokens(currentContent.trim()),
-           });
-         }
-         currentContent = ''; // Always reset for the new section
-
-         // Start new section
-         currentDepth = headerMatch[1]!.length;
-         currentTitle = headerMatch[2]!;
-       } else {
-         currentContent += line + '\n';
-       }
-     }
-
-     // Add the last section
-     if (currentContent.trim() !== '') {
-       sections.push({
-         title: currentTitle,
-         content: currentContent.trim(),
-         depth: currentDepth,
-         length: this.countTokens(currentContent.trim()),
-       });
-     }
-
-     // Remove initial empty preamble if present, but keep non-empty preambles
-     if (sections.length > 1 && sections[0]!.title === '' && sections[0]!.content.trim() === '') {
-       sections.shift();
-     }
-
-     return sections;
-   }
-
-   private mergeSemanticSections(sections: MarkdownNode[]): MarkdownNode[] {
-     if (sections.length === 0) return sections;
-
-     const workingSections = [...sections];
-     const deepest = Math.max(...workingSections.map(s => s.depth));
-
-     for (let depth = deepest; depth > 0; depth--) {
-       for (let j = 1; j < workingSections.length; j++) {
-         const current = workingSections[j]!;
-
-         if (current.depth === depth) {
-           const prev = workingSections[j - 1]!;
-
-           if (prev.length + current.length < this.joinThreshold && prev.depth <= current.depth) {
-             const title = `${'#'.repeat(current.depth)} ${current.title}`;
-             const formattedTitle = `\n\n${title}`;
-
-             prev.content += `${formattedTitle}\n${current.content}`;
-
-             prev.length = this.countTokens(prev.content);
-
-             workingSections.splice(j, 1);
-             j--;
-           }
-         }
-       }
-     }
-
-     return workingSections;
-   }
-
-   splitText({ text }: { text: string }): string[] {
-     if (!text.trim()) return [];
-
-     const initialSections = this.splitMarkdownByHeaders(text);
-
-     const mergedSections = this.mergeSemanticSections(initialSections);
-
-     return mergedSections.map(section => {
-       if (section.title) {
-         const header = `${'#'.repeat(section.depth)} ${section.title}`;
-         return `${header}\n${section.content}`;
-       }
-       return section.content;
-     });
-   }
-
-   createDocuments(texts: string[], metadatas?: Record<string, any>[]): Document[] {
-     const _metadatas = metadatas || Array(texts.length).fill({});
-     const documents: Document[] = [];
-
-     texts.forEach((text, i) => {
-       this.splitText({ text }).forEach(chunk => {
-         const metadata = {
-           ..._metadatas[i],
-           tokenCount: this.countTokens(chunk),
-         };
-
-         documents.push(
-           new Document({
-             text: chunk,
-             metadata,
-           }),
-         );
-       });
-     });
-
-     return documents;
-   }
-
-   transformDocuments(documents: Document[]): Document[] {
-     const texts: string[] = [];
-     const metadatas: Record<string, any>[] = [];
-
-     for (const doc of documents) {
-       texts.push(doc.text);
-       metadatas.push(doc.metadata);
-     }
-
-     return this.createDocuments(texts, metadatas);
-   }
-
-   static fromTikToken({
-     encodingName = 'cl100k_base',
-     modelName,
-     options = {},
-   }: {
-     encodingName?: TiktokenEncoding;
-     modelName?: TiktokenModel;
-     options?: SemanticMarkdownChunkOptions;
-   }): SemanticMarkdownTransformer {
-     let tokenizer: Tiktoken;
-
-     try {
-       tokenizer = modelName ? encodingForModel(modelName) : getEncoding(encodingName);
-     } catch {
-       throw new Error('Could not load tiktoken encoding. Please install it with `npm install js-tiktoken`.');
-     }
-
-     const tikTokenCounter = (text: string): number => {
-       const allowed =
-         options.allowedSpecial === 'all' ? 'all' : options.allowedSpecial ? Array.from(options.allowedSpecial) : [];
-       const disallowed =
-         options.disallowedSpecial === 'all'
-           ? 'all'
-           : options.disallowedSpecial
-             ? Array.from(options.disallowedSpecial)
-             : [];
-       return tokenizer.encode(text, allowed, disallowed).length;
-     };
-
-     return new SemanticMarkdownTransformer({
-       ...options,
-       encodingName,
-       modelName,
-       lengthFunction: tikTokenCounter,
-     });
-   }
- }
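The removed SemanticMarkdownTransformer split markdown by headers and then merged adjacent small sections back under their parent header until the configured token threshold was reached. A minimal usage sketch against the 1.2.2 source above; the relative import path and the sample values are illustrative assumptions, not part of this diff:

```ts
// Sketch based on the removed 1.2.2 source above; the relative import path is an assumption.
import { SemanticMarkdownTransformer } from './semantic-markdown';

const transformer = new SemanticMarkdownTransformer({
  joinThreshold: 500,          // merge neighbouring sections while their combined token count stays below this
  encodingName: 'cl100k_base', // js-tiktoken encoding used for token counting
});

const markdown = '# Guide\n\n## Setup\nInstall deps.\n\n## Usage\nRun the CLI.';

// Small sibling sections are folded back under their parent header, so this yields one chunk.
const chunks = transformer.splitText({ text: markdown });

// Each Document carries the merged section text plus a tokenCount in its metadata.
const docs = transformer.createDocuments([markdown], [{ source: 'guide.md' }]);
```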