@mastra/rag 1.0.7 → 1.0.8-alpha.0
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- package/.turbo/turbo-build.log +1 -1
- package/CHANGELOG.md +11 -0
- package/dist/document/document.d.ts +4 -1
- package/dist/document/document.d.ts.map +1 -1
- package/dist/document/transformers/semantic-markdown.d.ts +25 -0
- package/dist/document/transformers/semantic-markdown.d.ts.map +1 -0
- package/dist/document/types.d.ts +13 -1
- package/dist/document/types.d.ts.map +1 -1
- package/dist/document/validation.d.ts.map +1 -1
- package/dist/index.cjs +197 -11
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +197 -11
- package/dist/index.js.map +1 -1
- package/package.json +4 -4
- package/src/document/document.test.ts +644 -1
- package/src/document/document.ts +32 -12
- package/src/document/transformers/semantic-markdown.ts +227 -0
- package/src/document/types.ts +21 -2
- package/src/document/validation.ts +11 -0
package/src/document/document.ts
CHANGED

@@ -7,6 +7,7 @@ import { HTMLHeaderTransformer, HTMLSectionTransformer } from './transformers/ht
 import { RecursiveJsonTransformer } from './transformers/json';
 import { LatexTransformer } from './transformers/latex';
 import { MarkdownHeaderTransformer, MarkdownTransformer } from './transformers/markdown';
+import { SemanticMarkdownTransformer } from './transformers/semantic-markdown';
 import { SentenceTransformer } from './transformers/sentence';
 import { TokenTransformer } from './transformers/token';
 import type {
@@ -18,6 +19,7 @@ import type {
   CharacterChunkOptions,
   TokenChunkOptions,
   MarkdownChunkOptions,
+  SemanticMarkdownChunkOptions,
   JsonChunkOptions,
   LatexChunkOptions,
   SentenceChunkOptions,
@@ -150,19 +152,27 @@ export class MDocument {
     }
   }
 
+  private _strategyMap?: { [S in ChunkStrategy]: (options?: StrategyOptions[S]) => Promise<void> };
+
+  private get strategyMap() {
+    if (!this._strategyMap) {
+      this._strategyMap = {
+        recursive: options => this.chunkRecursive(options),
+        character: options => this.chunkCharacter(options),
+        token: options => this.chunkToken(options),
+        markdown: options => this.chunkMarkdown(options),
+        html: options => this.chunkHTML(options),
+        json: options => this.chunkJSON(options),
+        latex: options => this.chunkLatex(options),
+        sentence: options => this.chunkSentence(options),
+        'semantic-markdown': options => this.chunkSemanticMarkdown(options),
+      };
+    }
+    return this._strategyMap;
+  }
+
   private async chunkBy<K extends ChunkStrategy>(strategy: K, options?: StrategyOptions[K]): Promise<void> {
-    const strategyMap: { [S in ChunkStrategy]: (options?: StrategyOptions[S]) => Promise<void> } = {
-      recursive: options => this.chunkRecursive(options),
-      character: options => this.chunkCharacter(options),
-      token: options => this.chunkToken(options),
-      markdown: options => this.chunkMarkdown(options),
-      html: options => this.chunkHTML(options),
-      json: options => this.chunkJSON(options),
-      latex: options => this.chunkLatex(options),
-      sentence: options => this.chunkSentence(options),
-    };
-
-    const chunkingFunc = strategyMap[strategy];
+    const chunkingFunc = this.strategyMap[strategy];
     if (chunkingFunc) {
       await chunkingFunc(options);
     } else {
@@ -284,6 +294,16 @@ export class MDocument {
     this.chunks = textSplit;
   }
 
+  async chunkSemanticMarkdown(options?: SemanticMarkdownChunkOptions): Promise<void> {
+    const rt = SemanticMarkdownTransformer.fromTikToken({
+      options,
+      encodingName: options?.encodingName,
+      modelName: options?.modelName,
+    });
+    const textSplit = rt.transformDocuments(this.chunks);
+    this.chunks = textSplit;
+  }
+
   async chunk(params?: ChunkParams): Promise<Chunk[]> {
     const { strategy: passedStrategy, extract, ...chunkOptions } = params || {};
     // Determine the default strategy based on type if not specified
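For orientation: the refactor above memoizes the strategy table in a lazy `strategyMap` getter, and `chunk()` dispatches through it, so callers opt into the new behavior by passing `strategy: 'semantic-markdown'`. A minimal usage sketch follows; it assumes the package's existing `MDocument.fromMarkdown` constructor, and the option values are illustrative:

```ts
import { MDocument } from '@mastra/rag';

const doc = MDocument.fromMarkdown(
  ['# Guide', 'Intro text.', '## Install', 'Run npm install.', '## Usage', 'Call chunk().'].join('\n'),
);

// Sections are split on markdown headers, then small neighbors are merged
// while their combined token count stays under joinThreshold (default 500).
const chunks = await doc.chunk({
  strategy: 'semantic-markdown',
  joinThreshold: 500, // token budget for merging adjacent sections
  encodingName: 'cl100k_base', // tiktoken encoding used for counting
});
```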
package/src/document/transformers/semantic-markdown.ts
ADDED

@@ -0,0 +1,227 @@
+import type { TiktokenModel, TiktokenEncoding, Tiktoken } from 'js-tiktoken';
+import { encodingForModel, getEncoding } from 'js-tiktoken';
+import { Document } from '../schema';
+import type { SemanticMarkdownChunkOptions } from '../types';
+
+import { TextTransformer } from './text';
+
+interface MarkdownNode {
+  title: string;
+  depth: number;
+  content: string;
+  length: number;
+}
+
+export class SemanticMarkdownTransformer extends TextTransformer {
+  private tokenizer: Tiktoken;
+  private joinThreshold: number;
+  private allowedSpecial: Set<string> | 'all';
+  private disallowedSpecial: Set<string> | 'all';
+
+  constructor({
+    joinThreshold = 500,
+    encodingName = 'cl100k_base',
+    modelName,
+    allowedSpecial = new Set(),
+    disallowedSpecial = 'all',
+    ...baseOptions
+  }: SemanticMarkdownChunkOptions = {}) {
+    super(baseOptions);
+
+    this.joinThreshold = joinThreshold;
+    this.allowedSpecial = allowedSpecial;
+    this.disallowedSpecial = disallowedSpecial;
+
+    try {
+      this.tokenizer = modelName ? encodingForModel(modelName) : getEncoding(encodingName);
+    } catch {
+      throw new Error('Could not load tiktoken encoding. Please install it with `npm install js-tiktoken`.');
+    }
+  }
+
+  private countTokens(text: string): number {
+    const allowed = this.allowedSpecial === 'all' ? 'all' : Array.from(this.allowedSpecial);
+    const disallowed = this.disallowedSpecial === 'all' ? 'all' : Array.from(this.disallowedSpecial);
+
+    const processedText = this.stripWhitespace ? text.trim() : text;
+    return this.tokenizer.encode(processedText, allowed, disallowed).length;
+  }
+
+  private splitMarkdownByHeaders(markdown: string): MarkdownNode[] {
+    const sections: MarkdownNode[] = [];
+    const lines = markdown.split('\n');
+    let currentContent = '';
+    let currentTitle = '';
+    let currentDepth = 0;
+    let inCodeBlock = false;
+
+    const headerRegex = /^(#+)\s+(.+)$/;
+
+    for (let i = 0; i < lines.length; i++) {
+      const line = lines[i]!;
+      const headerMatch = line.match(headerRegex);
+
+      // Track code blocks to avoid parsing headers inside them
+      if (line.startsWith('```') || line.startsWith('~~~')) {
+        inCodeBlock = !inCodeBlock;
+      }
+
+      if (headerMatch && !inCodeBlock) {
+        // Save previous section
+        // Push the previous section if it has content or if it's a header.
+        // This ensures headers that only act as parents are not lost.
+        if (currentContent.trim() !== '' || (currentTitle && currentDepth > 0)) {
+          sections.push({
+            title: currentTitle,
+            content: currentContent.trim(),
+            depth: currentDepth,
+            length: this.countTokens(currentContent.trim()),
+          });
+        }
+        currentContent = ''; // Always reset for the new section
+
+        // Start new section
+        currentDepth = headerMatch[1]!.length;
+        currentTitle = headerMatch[2]!;
+      } else {
+        currentContent += line + '\n';
+      }
+    }
+
+    // Add the last section
+    if (currentContent.trim() !== '') {
+      sections.push({
+        title: currentTitle,
+        content: currentContent.trim(),
+        depth: currentDepth,
+        length: this.countTokens(currentContent.trim()),
+      });
+    }
+
+    // Remove initial empty preamble if present, but keep non-empty preambles
+    if (sections.length > 1 && sections[0]!.title === '' && sections[0]!.content.trim() === '') {
+      sections.shift();
+    }
+
+    return sections;
+  }
+
+  private mergeSemanticSections(sections: MarkdownNode[]): MarkdownNode[] {
+    if (sections.length === 0) return sections;
+
+    const workingSections = [...sections];
+    const deepest = Math.max(...workingSections.map(s => s.depth));
+
+    for (let depth = deepest; depth > 0; depth--) {
+      for (let j = 1; j < workingSections.length; j++) {
+        const current = workingSections[j]!;
+
+        if (current.depth === depth) {
+          const prev = workingSections[j - 1]!;
+
+          if (prev.length + current.length < this.joinThreshold && prev.depth <= current.depth) {
+            const title = `${'#'.repeat(current.depth)} ${current.title}`;
+            const formattedTitle = `\n\n${title}`;
+
+            prev.content += `${formattedTitle}\n${current.content}`;
+
+            prev.length = this.countTokens(prev.content);
+
+            workingSections.splice(j, 1);
+            j--;
+          }
+        }
+      }
+    }
+
+    return workingSections;
+  }
+
+  splitText({ text }: { text: string }): string[] {
+    if (!text.trim()) return [];
+
+    const initialSections = this.splitMarkdownByHeaders(text);
+
+    const mergedSections = this.mergeSemanticSections(initialSections);
+
+    return mergedSections.map(section => {
+      if (section.title) {
+        const header = `${'#'.repeat(section.depth)} ${section.title}`;
+        return `${header}\n${section.content}`;
+      }
+      return section.content;
+    });
+  }
+
+  createDocuments(texts: string[], metadatas?: Record<string, any>[]): Document[] {
+    const _metadatas = metadatas || Array(texts.length).fill({});
+    const documents: Document[] = [];
+
+    texts.forEach((text, i) => {
+      this.splitText({ text }).forEach(chunk => {
+        const metadata = {
+          ..._metadatas[i],
+          tokenCount: this.countTokens(chunk),
+        };
+
+        documents.push(
+          new Document({
+            text: chunk,
+            metadata,
+          }),
+        );
+      });
+    });
+
+    return documents;
+  }
+
+  transformDocuments(documents: Document[]): Document[] {
+    const texts: string[] = [];
+    const metadatas: Record<string, any>[] = [];
+
+    for (const doc of documents) {
+      texts.push(doc.text);
+      metadatas.push(doc.metadata);
+    }
+
+    return this.createDocuments(texts, metadatas);
+  }
+
+  static fromTikToken({
+    encodingName = 'cl100k_base',
+    modelName,
+    options = {},
+  }: {
+    encodingName?: TiktokenEncoding;
+    modelName?: TiktokenModel;
+    options?: SemanticMarkdownChunkOptions;
+  }): SemanticMarkdownTransformer {
+    let tokenizer: Tiktoken;
+
+    try {
+      tokenizer = modelName ? encodingForModel(modelName) : getEncoding(encodingName);
+    } catch {
+      throw new Error('Could not load tiktoken encoding. Please install it with `npm install js-tiktoken`.');
+    }
+
+    const tikTokenCounter = (text: string): number => {
+      const allowed =
+        options.allowedSpecial === 'all' ? 'all' : options.allowedSpecial ? Array.from(options.allowedSpecial) : [];
+      const disallowed =
+        options.disallowedSpecial === 'all'
+          ? 'all'
+          : options.disallowedSpecial
+            ? Array.from(options.disallowedSpecial)
+            : [];
+      return tokenizer.encode(text, allowed, disallowed).length;
+    };
+
+    return new SemanticMarkdownTransformer({
+      ...options,
+      encodingName,
+      modelName,
+      lengthFunction: tikTokenCounter,
+    });
+  }
+}
package/src/document/types.ts
CHANGED

@@ -79,6 +79,14 @@ export type MarkdownChunkOptions = BaseChunkOptions & {
   stripHeaders?: boolean;
 };
 
+export type SemanticMarkdownChunkOptions = BaseChunkOptions & {
+  joinThreshold?: number;
+  encodingName?: TiktokenEncoding;
+  modelName?: TiktokenModel;
+  allowedSpecial?: Set<string> | 'all';
+  disallowedSpecial?: Set<string> | 'all';
+};
+
 export type HTMLChunkOptions = BaseChunkOptions &
   (
     | { headers: [string, string][]; sections?: never; returnEachLine?: boolean }
@@ -111,9 +119,19 @@ export type StrategyOptions = {
   json: JsonChunkOptions;
   latex: LatexChunkOptions;
   sentence: SentenceChunkOptions;
+  'semantic-markdown': SemanticMarkdownChunkOptions;
 };
 
-export type ChunkStrategy = 'recursive' | 'character' | 'token' | 'markdown' | 'html' | 'json' | 'latex' | 'sentence';
+export type ChunkStrategy =
+  | 'recursive'
+  | 'character'
+  | 'token'
+  | 'markdown'
+  | 'html'
+  | 'json'
+  | 'latex'
+  | 'sentence'
+  | 'semantic-markdown';
 
 export type ChunkParams =
   | ({ strategy?: 'character' } & CharacterChunkOptions & { extract?: ExtractParams })
@@ -123,4 +141,5 @@ export type ChunkParams =
   | ({ strategy: 'html' } & HTMLChunkOptions & { extract?: ExtractParams })
   | ({ strategy: 'json' } & JsonChunkOptions & { extract?: ExtractParams })
   | ({ strategy: 'latex' } & LatexChunkOptions & { extract?: ExtractParams })
-  | ({ strategy: 'sentence' } & SentenceChunkOptions & { extract?: ExtractParams });
+  | ({ strategy: 'sentence' } & SentenceChunkOptions & { extract?: ExtractParams })
+  | ({ strategy: 'semantic-markdown' } & SemanticMarkdownChunkOptions & { extract?: ExtractParams });
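Because `ChunkParams` is a union discriminated on `strategy`, the compiler only admits the option set that matches the chosen strategy. A compile-time sketch, assuming the types are imported from this module:

```ts
import type { ChunkParams } from './types';

// Accepted: joinThreshold belongs to SemanticMarkdownChunkOptions.
const params: ChunkParams = { strategy: 'semantic-markdown', joinThreshold: 300 };

// Rejected at compile time: headers belongs to the 'html' member of the union.
// const bad: ChunkParams = { strategy: 'semantic-markdown', headers: [['h1', 'Header 1']] };
```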
package/src/document/validation.ts
CHANGED

@@ -108,6 +108,16 @@ const markdownChunkOptionsSchema = baseChunkOptionsSchema
   })
   .strict();
 
+const semanticMarkdownChunkOptionsSchema = baseChunkOptionsSchema
+  .extend({
+    joinThreshold: z.number().positive().optional(),
+    encodingName: z.string().optional(),
+    modelName: z.string().optional(),
+    allowedSpecial: setOrAllSchema,
+    disallowedSpecial: setOrAllSchema,
+  })
+  .strict();
+
 const latexChunkOptionsSchema = baseChunkOptionsSchema.strict();
 
 // Strategy-specific validation schemas
@@ -119,6 +129,7 @@ const validationSchemas = {
   json: jsonChunkOptionsSchema.transform(handleDeprecatedSize),
   html: htmlChunkOptionsSchema.transform(handleDeprecatedSize),
   markdown: markdownChunkOptionsSchema.transform(handleDeprecatedSize),
+  'semantic-markdown': semanticMarkdownChunkOptionsSchema.transform(handleDeprecatedSize),
   latex: latexChunkOptionsSchema.transform(handleDeprecatedSize),
 } as const;
 
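The new schema follows the file's existing pattern: `.strict()` rejects unknown keys, and the entry in `validationSchemas` is wrapped in the shared `handleDeprecatedSize` transform like the other strategies. A self-contained zod sketch of the same shape; here `setOrAllSchema` is re-created inline as an assumption (in validation.ts it is a shared helper):

```ts
import { z } from 'zod';

// Assumed recreation of the shared helper from validation.ts.
const setOrAllSchema = z.union([z.set(z.string()), z.literal('all')]).optional();

const semanticMarkdownChunkOptionsSchema = z
  .object({
    joinThreshold: z.number().positive().optional(),
    encodingName: z.string().optional(),
    modelName: z.string().optional(),
    allowedSpecial: setOrAllSchema,
    disallowedSpecial: setOrAllSchema,
  })
  .strict();

semanticMarkdownChunkOptionsSchema.parse({ joinThreshold: 300 }); // ok
// Throws ZodError because .strict() rejects unknown keys:
// semanticMarkdownChunkOptionsSchema.parse({ joinThreshhold: 300 }); // note the typo
```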