@mastra/rag 1.0.6 → 1.0.7-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +1 -1
- package/CHANGELOG.md +12 -0
- package/dist/document/document.d.ts +9 -8
- package/dist/document/document.d.ts.map +1 -1
- package/dist/document/transformers/character.d.ts +4 -26
- package/dist/document/transformers/character.d.ts.map +1 -1
- package/dist/document/transformers/html.d.ts +8 -3
- package/dist/document/transformers/html.d.ts.map +1 -1
- package/dist/document/transformers/json.d.ts +4 -4
- package/dist/document/transformers/json.d.ts.map +1 -1
- package/dist/document/transformers/latex.d.ts +2 -8
- package/dist/document/transformers/latex.d.ts.map +1 -1
- package/dist/document/transformers/markdown.d.ts +2 -8
- package/dist/document/transformers/markdown.d.ts.map +1 -1
- package/dist/document/transformers/sentence.d.ts +31 -0
- package/dist/document/transformers/sentence.d.ts.map +1 -0
- package/dist/document/transformers/text.d.ts +3 -3
- package/dist/document/transformers/text.d.ts.map +1 -1
- package/dist/document/transformers/token.d.ts +4 -15
- package/dist/document/transformers/token.d.ts.map +1 -1
- package/dist/document/types.d.ts +85 -14
- package/dist/document/types.d.ts.map +1 -1
- package/dist/document/validation.d.ts +3 -0
- package/dist/document/validation.d.ts.map +1 -0
- package/dist/index.cjs +414 -80
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +414 -80
- package/dist/index.js.map +1 -1
- package/dist/tools/document-chunker.d.ts.map +1 -1
- package/package.json +5 -5
- package/src/document/document.test.ts +294 -39
- package/src/document/document.ts +69 -41
- package/src/document/transformers/character.ts +15 -43
- package/src/document/transformers/html.ts +9 -9
- package/src/document/transformers/json.ts +8 -3
- package/src/document/transformers/latex.ts +3 -11
- package/src/document/transformers/markdown.ts +3 -11
- package/src/document/transformers/sentence.ts +314 -0
- package/src/document/transformers/text.ts +10 -10
- package/src/document/transformers/token.ts +6 -17
- package/src/document/types.ts +66 -15
- package/src/document/validation.ts +147 -0
- package/src/tools/document-chunker.ts +12 -8
package/src/document/document.ts
CHANGED
|
@@ -7,8 +7,23 @@ import { HTMLHeaderTransformer, HTMLSectionTransformer } from './transformers/ht
|
|
|
7
7
|
import { RecursiveJsonTransformer } from './transformers/json';
|
|
8
8
|
import { LatexTransformer } from './transformers/latex';
|
|
9
9
|
import { MarkdownHeaderTransformer, MarkdownTransformer } from './transformers/markdown';
|
|
10
|
+
import { SentenceTransformer } from './transformers/sentence';
|
|
10
11
|
import { TokenTransformer } from './transformers/token';
|
|
11
|
-
import type {
|
|
12
|
+
import type {
|
|
13
|
+
ChunkParams,
|
|
14
|
+
ChunkStrategy,
|
|
15
|
+
ExtractParams,
|
|
16
|
+
HTMLChunkOptions,
|
|
17
|
+
RecursiveChunkOptions,
|
|
18
|
+
CharacterChunkOptions,
|
|
19
|
+
TokenChunkOptions,
|
|
20
|
+
MarkdownChunkOptions,
|
|
21
|
+
JsonChunkOptions,
|
|
22
|
+
LatexChunkOptions,
|
|
23
|
+
SentenceChunkOptions,
|
|
24
|
+
StrategyOptions,
|
|
25
|
+
} from './types';
|
|
26
|
+
import { validateChunkParams } from './validation';
|
|
12
27
|
|
|
13
28
|
export class MDocument {
|
|
14
29
|
private chunks: Chunk[];
|
|
@@ -135,35 +150,27 @@ export class MDocument {
|
|
|
135
150
|
}
|
|
136
151
|
}
|
|
137
152
|
|
|
138
|
-
private async chunkBy(strategy:
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
case 'json':
|
|
156
|
-
await this.chunkJSON(options);
|
|
157
|
-
break;
|
|
158
|
-
case 'latex':
|
|
159
|
-
await this.chunkLatex(options);
|
|
160
|
-
break;
|
|
161
|
-
default:
|
|
162
|
-
throw new Error(`Unknown strategy: ${strategy}`);
|
|
153
|
+
private async chunkBy<K extends ChunkStrategy>(strategy: K, options?: StrategyOptions[K]): Promise<void> {
|
|
154
|
+
const strategyMap: { [S in ChunkStrategy]: (options?: StrategyOptions[S]) => Promise<void> } = {
|
|
155
|
+
recursive: options => this.chunkRecursive(options),
|
|
156
|
+
character: options => this.chunkCharacter(options),
|
|
157
|
+
token: options => this.chunkToken(options),
|
|
158
|
+
markdown: options => this.chunkMarkdown(options),
|
|
159
|
+
html: options => this.chunkHTML(options),
|
|
160
|
+
json: options => this.chunkJSON(options),
|
|
161
|
+
latex: options => this.chunkLatex(options),
|
|
162
|
+
sentence: options => this.chunkSentence(options),
|
|
163
|
+
};
|
|
164
|
+
|
|
165
|
+
const chunkingFunc = strategyMap[strategy];
|
|
166
|
+
if (chunkingFunc) {
|
|
167
|
+
await chunkingFunc(options);
|
|
168
|
+
} else {
|
|
169
|
+
throw new Error(`Unknown strategy: ${strategy}`);
|
|
163
170
|
}
|
|
164
171
|
}
|
|
165
172
|
|
|
166
|
-
async chunkRecursive(options?:
|
|
173
|
+
async chunkRecursive(options?: RecursiveChunkOptions): Promise<void> {
|
|
167
174
|
if (options?.language) {
|
|
168
175
|
const rt = RecursiveCharacterTransformer.fromLanguage(options.language, options);
|
|
169
176
|
const textSplit = rt.transformDocuments(this.chunks);
|
|
@@ -171,28 +178,24 @@ export class MDocument {
|
|
|
171
178
|
return;
|
|
172
179
|
}
|
|
173
180
|
|
|
174
|
-
const rt = new RecursiveCharacterTransformer(
|
|
175
|
-
separators: options?.separators,
|
|
176
|
-
isSeparatorRegex: options?.isSeparatorRegex,
|
|
177
|
-
options,
|
|
178
|
-
});
|
|
181
|
+
const rt = new RecursiveCharacterTransformer(options);
|
|
179
182
|
const textSplit = rt.transformDocuments(this.chunks);
|
|
180
183
|
this.chunks = textSplit;
|
|
181
184
|
}
|
|
182
185
|
|
|
183
|
-
async chunkCharacter(options?:
|
|
186
|
+
async chunkCharacter(options?: CharacterChunkOptions): Promise<void> {
|
|
184
187
|
const rt = new CharacterTransformer({
|
|
188
|
+
...options,
|
|
185
189
|
separator: options?.separator,
|
|
186
190
|
isSeparatorRegex: options?.isSeparatorRegex,
|
|
187
|
-
options,
|
|
188
191
|
});
|
|
189
192
|
const textSplit = rt.transformDocuments(this.chunks);
|
|
190
193
|
this.chunks = textSplit;
|
|
191
194
|
}
|
|
192
195
|
|
|
193
|
-
async chunkHTML(options?:
|
|
196
|
+
async chunkHTML(options?: HTMLChunkOptions): Promise<void> {
|
|
194
197
|
if (options?.headers?.length) {
|
|
195
|
-
const rt = new HTMLHeaderTransformer(options
|
|
198
|
+
const rt = new HTMLHeaderTransformer(options as HTMLChunkOptions & { headers: [string, string][] });
|
|
196
199
|
|
|
197
200
|
const textSplit = rt.transformDocuments(this.chunks);
|
|
198
201
|
this.chunks = textSplit;
|
|
@@ -200,7 +203,7 @@ export class MDocument {
|
|
|
200
203
|
}
|
|
201
204
|
|
|
202
205
|
if (options?.sections?.length) {
|
|
203
|
-
const rt = new HTMLSectionTransformer(options
|
|
206
|
+
const rt = new HTMLSectionTransformer(options as HTMLChunkOptions & { sections: [string, string][] });
|
|
204
207
|
|
|
205
208
|
const textSplit = rt.transformDocuments(this.chunks);
|
|
206
209
|
this.chunks = textSplit;
|
|
@@ -210,7 +213,7 @@ export class MDocument {
|
|
|
210
213
|
throw new Error('HTML chunking requires either headers or sections to be specified');
|
|
211
214
|
}
|
|
212
215
|
|
|
213
|
-
async chunkJSON(options?:
|
|
216
|
+
async chunkJSON(options?: JsonChunkOptions): Promise<void> {
|
|
214
217
|
if (!options?.maxSize) {
|
|
215
218
|
throw new Error('JSON chunking requires maxSize to be specified');
|
|
216
219
|
}
|
|
@@ -229,13 +232,13 @@ export class MDocument {
|
|
|
229
232
|
this.chunks = textSplit;
|
|
230
233
|
}
|
|
231
234
|
|
|
232
|
-
async chunkLatex(options?:
|
|
235
|
+
async chunkLatex(options?: LatexChunkOptions): Promise<void> {
|
|
233
236
|
const rt = new LatexTransformer(options);
|
|
234
237
|
const textSplit = rt.transformDocuments(this.chunks);
|
|
235
238
|
this.chunks = textSplit;
|
|
236
239
|
}
|
|
237
240
|
|
|
238
|
-
async chunkToken(options?:
|
|
241
|
+
async chunkToken(options?: TokenChunkOptions): Promise<void> {
|
|
239
242
|
const rt = TokenTransformer.fromTikToken({
|
|
240
243
|
options,
|
|
241
244
|
encodingName: options?.encodingName,
|
|
@@ -245,7 +248,7 @@ export class MDocument {
|
|
|
245
248
|
this.chunks = textSplit;
|
|
246
249
|
}
|
|
247
250
|
|
|
248
|
-
async chunkMarkdown(options?:
|
|
251
|
+
async chunkMarkdown(options?: MarkdownChunkOptions): Promise<void> {
|
|
249
252
|
if (options?.headers) {
|
|
250
253
|
const rt = new MarkdownHeaderTransformer(options.headers, options?.returnEachLine, options?.stripHeaders);
|
|
251
254
|
const textSplit = rt.transformDocuments(this.chunks);
|
|
@@ -258,11 +261,36 @@ export class MDocument {
|
|
|
258
261
|
this.chunks = textSplit;
|
|
259
262
|
}
|
|
260
263
|
|
|
264
|
+
async chunkSentence(options?: SentenceChunkOptions): Promise<void> {
|
|
265
|
+
if (!options?.maxSize) {
|
|
266
|
+
throw new Error('Sentence chunking requires maxSize to be specified');
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
const rt = new SentenceTransformer({
|
|
270
|
+
minSize: options?.minSize,
|
|
271
|
+
maxSize: options?.maxSize,
|
|
272
|
+
targetSize: options?.targetSize,
|
|
273
|
+
overlap: options?.overlap,
|
|
274
|
+
sentenceEnders: options?.sentenceEnders,
|
|
275
|
+
fallbackToWords: options?.fallbackToWords,
|
|
276
|
+
fallbackToCharacters: options?.fallbackToCharacters,
|
|
277
|
+
keepSeparator: options?.keepSeparator,
|
|
278
|
+
lengthFunction: options?.lengthFunction,
|
|
279
|
+
addStartIndex: options?.addStartIndex,
|
|
280
|
+
stripWhitespace: options?.stripWhitespace,
|
|
281
|
+
});
|
|
282
|
+
|
|
283
|
+
const textSplit = rt.transformDocuments(this.chunks);
|
|
284
|
+
this.chunks = textSplit;
|
|
285
|
+
}
|
|
286
|
+
|
|
261
287
|
async chunk(params?: ChunkParams): Promise<Chunk[]> {
|
|
262
288
|
const { strategy: passedStrategy, extract, ...chunkOptions } = params || {};
|
|
263
289
|
// Determine the default strategy based on type if not specified
|
|
264
290
|
const strategy = passedStrategy || this.defaultStrategy();
|
|
265
291
|
|
|
292
|
+
validateChunkParams(strategy, chunkOptions);
|
|
293
|
+
|
|
266
294
|
// Apply the appropriate chunking strategy
|
|
267
295
|
await this.chunkBy(strategy, chunkOptions);
|
|
268
296
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { Language } from '../types';
|
|
2
|
-
import type {
|
|
2
|
+
import type { BaseChunkOptions, CharacterChunkOptions, RecursiveChunkOptions } from '../types';
|
|
3
3
|
|
|
4
4
|
import { TextTransformer } from './text';
|
|
5
5
|
|
|
@@ -52,23 +52,8 @@ export class CharacterTransformer extends TextTransformer {
|
|
|
52
52
|
protected separator: string;
|
|
53
53
|
protected isSeparatorRegex: boolean;
|
|
54
54
|
|
|
55
|
-
constructor({
|
|
56
|
-
|
|
57
|
-
isSeparatorRegex = false,
|
|
58
|
-
options = {},
|
|
59
|
-
}: {
|
|
60
|
-
separator?: string;
|
|
61
|
-
isSeparatorRegex?: boolean;
|
|
62
|
-
options?: {
|
|
63
|
-
size?: number;
|
|
64
|
-
overlap?: number;
|
|
65
|
-
lengthFunction?: (text: string) => number;
|
|
66
|
-
keepSeparator?: boolean | 'start' | 'end';
|
|
67
|
-
addStartIndex?: boolean;
|
|
68
|
-
stripWhitespace?: boolean;
|
|
69
|
-
};
|
|
70
|
-
}) {
|
|
71
|
-
super(options);
|
|
55
|
+
constructor({ separator = '\n\n', isSeparatorRegex = false, ...baseOptions }: CharacterChunkOptions = {}) {
|
|
56
|
+
super(baseOptions);
|
|
72
57
|
this.separator = separator;
|
|
73
58
|
this.isSeparatorRegex = isSeparatorRegex;
|
|
74
59
|
}
|
|
@@ -82,7 +67,7 @@ export class CharacterTransformer extends TextTransformer {
|
|
|
82
67
|
// If length of any split is greater than chunk size, perform additional splitting
|
|
83
68
|
const chunks: string[] = [];
|
|
84
69
|
for (const split of initialSplits) {
|
|
85
|
-
if (this.lengthFunction(split) <= this.
|
|
70
|
+
if (this.lengthFunction(split) <= this.maxSize) {
|
|
86
71
|
chunks.push(split);
|
|
87
72
|
} else {
|
|
88
73
|
// If a single split is too large, split it further with overlap
|
|
@@ -102,7 +87,7 @@ export class CharacterTransformer extends TextTransformer {
|
|
|
102
87
|
let chunkEnd = currentPosition;
|
|
103
88
|
|
|
104
89
|
// Build chunk up to max size
|
|
105
|
-
while (chunkEnd < text.length && this.lengthFunction(text.slice(currentPosition, chunkEnd + 1)) <= this.
|
|
90
|
+
while (chunkEnd < text.length && this.lengthFunction(text.slice(currentPosition, chunkEnd + 1)) <= this.maxSize) {
|
|
106
91
|
chunkEnd++;
|
|
107
92
|
}
|
|
108
93
|
|
|
@@ -125,16 +110,8 @@ export class RecursiveCharacterTransformer extends TextTransformer {
|
|
|
125
110
|
protected separators: string[];
|
|
126
111
|
protected isSeparatorRegex: boolean;
|
|
127
112
|
|
|
128
|
-
constructor({
|
|
129
|
-
|
|
130
|
-
isSeparatorRegex = false,
|
|
131
|
-
options = {},
|
|
132
|
-
}: {
|
|
133
|
-
separators?: string[];
|
|
134
|
-
isSeparatorRegex?: boolean;
|
|
135
|
-
options?: ChunkOptions;
|
|
136
|
-
}) {
|
|
137
|
-
super(options);
|
|
113
|
+
constructor({ separators, isSeparatorRegex = false, language, ...baseOptions }: RecursiveChunkOptions = {}) {
|
|
114
|
+
super(baseOptions);
|
|
138
115
|
this.separators = separators || ['\n\n', '\n', ' ', ''];
|
|
139
116
|
this.isSeparatorRegex = isSeparatorRegex;
|
|
140
117
|
}
|
|
@@ -169,7 +146,7 @@ export class RecursiveCharacterTransformer extends TextTransformer {
|
|
|
169
146
|
const mergeSeparator = this.keepSeparator ? '' : separator;
|
|
170
147
|
|
|
171
148
|
for (const s of splits) {
|
|
172
|
-
if (this.lengthFunction(s) < this.
|
|
149
|
+
if (this.lengthFunction(s) < this.maxSize) {
|
|
173
150
|
goodSplits.push(s);
|
|
174
151
|
} else {
|
|
175
152
|
if (goodSplits.length > 0) {
|
|
@@ -198,19 +175,14 @@ export class RecursiveCharacterTransformer extends TextTransformer {
|
|
|
198
175
|
return this._splitText(text, this.separators);
|
|
199
176
|
}
|
|
200
177
|
|
|
201
|
-
static fromLanguage(
|
|
202
|
-
language: Language,
|
|
203
|
-
options: {
|
|
204
|
-
size?: number;
|
|
205
|
-
chunkOverlap?: number;
|
|
206
|
-
lengthFunction?: (text: string) => number;
|
|
207
|
-
keepSeparator?: boolean | 'start' | 'end';
|
|
208
|
-
addStartIndex?: boolean;
|
|
209
|
-
stripWhitespace?: boolean;
|
|
210
|
-
} = {},
|
|
211
|
-
): RecursiveCharacterTransformer {
|
|
178
|
+
static fromLanguage(language: Language, options: BaseChunkOptions = {}): RecursiveCharacterTransformer {
|
|
212
179
|
const separators = RecursiveCharacterTransformer.getSeparatorsForLanguage(language);
|
|
213
|
-
return new RecursiveCharacterTransformer({
|
|
180
|
+
return new RecursiveCharacterTransformer({
|
|
181
|
+
...options,
|
|
182
|
+
separators,
|
|
183
|
+
isSeparatorRegex: true,
|
|
184
|
+
language,
|
|
185
|
+
});
|
|
214
186
|
}
|
|
215
187
|
|
|
216
188
|
static getSeparatorsForLanguage(language: Language): string[] {
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { parse } from 'node-html-better-parser';
|
|
2
2
|
import { Document } from '../schema';
|
|
3
|
+
import type { HTMLChunkOptions } from '../types';
|
|
3
4
|
|
|
4
5
|
import { RecursiveCharacterTransformer } from './character';
|
|
5
6
|
|
|
@@ -14,9 +15,9 @@ export class HTMLHeaderTransformer {
|
|
|
14
15
|
private headersToSplitOn: [string, string][];
|
|
15
16
|
private returnEachElement: boolean;
|
|
16
17
|
|
|
17
|
-
constructor(
|
|
18
|
-
this.returnEachElement =
|
|
19
|
-
this.headersToSplitOn = [...
|
|
18
|
+
constructor(options: HTMLChunkOptions & { headers: [string, string][] }) {
|
|
19
|
+
this.returnEachElement = options.returnEachLine ?? false;
|
|
20
|
+
this.headersToSplitOn = [...options.headers].sort();
|
|
20
21
|
}
|
|
21
22
|
|
|
22
23
|
splitText({ text }: { text: string }): Document[] {
|
|
@@ -195,11 +196,11 @@ export class HTMLHeaderTransformer {
|
|
|
195
196
|
|
|
196
197
|
export class HTMLSectionTransformer {
|
|
197
198
|
private headersToSplitOn: Record<string, string>;
|
|
198
|
-
private
|
|
199
|
+
private textSplitter: RecursiveCharacterTransformer;
|
|
199
200
|
|
|
200
|
-
constructor(
|
|
201
|
-
this.headersToSplitOn = Object.fromEntries(
|
|
202
|
-
this.
|
|
201
|
+
constructor(options: HTMLChunkOptions & { sections: [string, string][] }) {
|
|
202
|
+
this.headersToSplitOn = Object.fromEntries(options.sections.map(([tag, name]) => [tag.toLowerCase(), name]));
|
|
203
|
+
this.textSplitter = new RecursiveCharacterTransformer(options);
|
|
203
204
|
}
|
|
204
205
|
|
|
205
206
|
splitText(text: string): Document[] {
|
|
@@ -296,9 +297,8 @@ export class HTMLSectionTransformer {
|
|
|
296
297
|
metadatas.push(doc.metadata);
|
|
297
298
|
}
|
|
298
299
|
const results = await this.createDocuments(texts, metadatas);
|
|
299
|
-
const textSplitter = new RecursiveCharacterTransformer({ options: this.options });
|
|
300
300
|
|
|
301
|
-
return textSplitter.splitDocuments(results);
|
|
301
|
+
return this.textSplitter.splitDocuments(results);
|
|
302
302
|
}
|
|
303
303
|
|
|
304
304
|
createDocuments(texts: string[], metadatas?: Record<string, any>[]): Document[] {
|
|
@@ -1,12 +1,17 @@
|
|
|
1
1
|
import { Document } from '../schema';
|
|
2
|
+
import type { JsonChunkOptions } from '../types';
|
|
2
3
|
|
|
3
4
|
export class RecursiveJsonTransformer {
|
|
4
5
|
private maxSize: number;
|
|
5
6
|
private minSize: number;
|
|
7
|
+
private ensureAscii: boolean;
|
|
8
|
+
private convertLists: boolean;
|
|
6
9
|
|
|
7
|
-
constructor({ maxSize = 2000, minSize
|
|
10
|
+
constructor({ maxSize = 2000, minSize, ensureAscii = false, convertLists = true }: JsonChunkOptions) {
|
|
8
11
|
this.maxSize = maxSize;
|
|
9
12
|
this.minSize = minSize ?? Math.max(maxSize - 200, 50);
|
|
13
|
+
this.ensureAscii = ensureAscii;
|
|
14
|
+
this.convertLists = convertLists;
|
|
10
15
|
}
|
|
11
16
|
|
|
12
17
|
private static jsonSize(data: Record<string, any>): number {
|
|
@@ -170,8 +175,8 @@ export class RecursiveJsonTransformer {
|
|
|
170
175
|
private isWithinSizeLimit(value: any, currentSize: number = 0): boolean {
|
|
171
176
|
const size = RecursiveJsonTransformer.jsonSize(value);
|
|
172
177
|
// If this is a new chunk (currentSize = 0), allow items smaller than maxSize
|
|
173
|
-
// If adding to existing chunk, ensure
|
|
174
|
-
return currentSize === 0 ? size <= this.maxSize : size + currentSize <= this.maxSize
|
|
178
|
+
// If adding to existing chunk, ensure total size doesn't exceed maxSize
|
|
179
|
+
return currentSize === 0 ? size <= this.maxSize : size + currentSize <= this.maxSize;
|
|
175
180
|
}
|
|
176
181
|
|
|
177
182
|
/**
|
|
@@ -1,19 +1,11 @@
|
|
|
1
1
|
import { Language } from '../types';
|
|
2
|
+
import type { BaseChunkOptions } from '../types';
|
|
2
3
|
|
|
3
4
|
import { RecursiveCharacterTransformer } from './character';
|
|
4
5
|
|
|
5
6
|
export class LatexTransformer extends RecursiveCharacterTransformer {
|
|
6
|
-
constructor(
|
|
7
|
-
options: {
|
|
8
|
-
size?: number;
|
|
9
|
-
overlap?: number;
|
|
10
|
-
lengthFunction?: (text: string) => number;
|
|
11
|
-
keepSeparator?: boolean | 'start' | 'end';
|
|
12
|
-
addStartIndex?: boolean;
|
|
13
|
-
stripWhitespace?: boolean;
|
|
14
|
-
} = {},
|
|
15
|
-
) {
|
|
7
|
+
constructor(options: BaseChunkOptions = {}) {
|
|
16
8
|
const separators = RecursiveCharacterTransformer.getSeparatorsForLanguage(Language.LATEX);
|
|
17
|
-
super({ separators, isSeparatorRegex: true
|
|
9
|
+
super({ ...options, separators, isSeparatorRegex: true });
|
|
18
10
|
}
|
|
19
11
|
}
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { Document } from '../schema';
|
|
2
2
|
|
|
3
3
|
import { Language } from '../types';
|
|
4
|
+
import type { BaseChunkOptions } from '../types';
|
|
4
5
|
|
|
5
6
|
import { RecursiveCharacterTransformer } from './character';
|
|
6
7
|
|
|
@@ -16,18 +17,9 @@ interface HeaderType {
|
|
|
16
17
|
}
|
|
17
18
|
|
|
18
19
|
export class MarkdownTransformer extends RecursiveCharacterTransformer {
|
|
19
|
-
constructor(
|
|
20
|
-
options: {
|
|
21
|
-
chunkSize?: number;
|
|
22
|
-
chunkOverlap?: number;
|
|
23
|
-
lengthFunction?: (text: string) => number;
|
|
24
|
-
keepSeparator?: boolean | 'start' | 'end';
|
|
25
|
-
addStartIndex?: boolean;
|
|
26
|
-
stripWhitespace?: boolean;
|
|
27
|
-
} = {},
|
|
28
|
-
) {
|
|
20
|
+
constructor(options: BaseChunkOptions = {}) {
|
|
29
21
|
const separators = RecursiveCharacterTransformer.getSeparatorsForLanguage(Language.MARKDOWN);
|
|
30
|
-
super({ separators, isSeparatorRegex: true
|
|
22
|
+
super({ ...options, separators, isSeparatorRegex: true });
|
|
31
23
|
}
|
|
32
24
|
}
|
|
33
25
|
|