@dakshp1234/langchain-textsplitters 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +11 -0
- package/LICENSE +21 -0
- package/README.md +53 -0
- package/dist/index.cjs +12 -0
- package/dist/index.d.cts +3 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.js +3 -0
- package/dist/semantic_text_splitter.cjs +179 -0
- package/dist/semantic_text_splitter.cjs.map +1 -0
- package/dist/semantic_text_splitter.d.cts +44 -0
- package/dist/semantic_text_splitter.d.cts.map +1 -0
- package/dist/semantic_text_splitter.d.ts +44 -0
- package/dist/semantic_text_splitter.d.ts.map +1 -0
- package/dist/semantic_text_splitter.js +178 -0
- package/dist/semantic_text_splitter.js.map +1 -0
- package/dist/text_splitter.cjs +536 -0
- package/dist/text_splitter.cjs.map +1 -0
- package/dist/text_splitter.d.cts +82 -0
- package/dist/text_splitter.d.cts.map +1 -0
- package/dist/text_splitter.d.ts +82 -0
- package/dist/text_splitter.d.ts.map +1 -0
- package/dist/text_splitter.js +530 -0
- package/dist/text_splitter.js.map +1 -0
- package/package.json +65 -0
|
@@ -0,0 +1,536 @@
|
|
|
1
|
+
let _langchain_core_documents = require("@langchain/core/documents");
|
|
2
|
+
let _langchain_core_utils_tiktoken = require("@langchain/core/utils/tiktoken");
|
|
3
|
+
//#region src/text_splitter.ts
|
|
4
|
+
/**
 * Base class for text splitters: a document transformer that breaks text
 * into chunks of at most `chunkSize` length (as measured by `lengthFunction`),
 * with up to `chunkOverlap` of trailing content carried between chunks.
 * Subclasses provide `splitText` (not defined here).
 */
var TextSplitter = class extends _langchain_core_documents.BaseDocumentTransformer {
	// LangChain serialization namespace.
	lc_namespace = [
		"langchain",
		"document_transformers",
		"text_splitters"
	];
	// Maximum chunk size, in lengthFunction units (default 1000).
	chunkSize = 1e3;
	// Desired overlap between consecutive chunks; must be < chunkSize.
	chunkOverlap = 200;
	// When true, splitting keeps the separator attached to the pieces.
	keepSeparator = false;
	// Measures text length; may return a number or a Promise of a number.
	lengthFunction;
	constructor(fields) {
		super(fields);
		this.chunkSize = fields?.chunkSize ?? this.chunkSize;
		this.chunkOverlap = fields?.chunkOverlap ?? this.chunkOverlap;
		this.keepSeparator = fields?.keepSeparator ?? this.keepSeparator;
		// Default length measure is plain character count.
		this.lengthFunction = fields?.lengthFunction ?? ((text) => text.length);
		if (this.chunkOverlap >= this.chunkSize) throw new Error("Cannot have chunkOverlap >= chunkSize");
	}
	/** BaseDocumentTransformer hook: transforming documents IS splitting them. */
	async transformDocuments(documents, chunkHeaderOptions = {}) {
		return this.splitDocuments(documents, chunkHeaderOptions);
	}
	// Split `text` on `separator`; an empty/falsy separator splits into
	// single characters. With keepSeparator, a regex lookahead is used so the
	// separator stays attached to the start of each following piece.
	// Empty-string pieces are dropped.
	splitOnSeparator(text, separator) {
		let splits;
		if (separator) if (this.keepSeparator) {
			// Escape regex metacharacters so the separator is matched literally.
			const regexEscapedSeparator = separator.replace(/[/\-\\^$*+?.()|[\]{}]/g, "\\$&");
			splits = text.split(new RegExp(`(?=${regexEscapedSeparator})`));
		} else splits = text.split(separator);
		else splits = text.split("");
		return splits.filter((s) => s !== "");
	}
	/**
	 * Split each text into chunks and wrap every chunk in a Document,
	 * carrying over per-text metadata and recording a 1-based line range in
	 * metadata.loc.lines. An optional header can be prepended to each chunk,
	 * with a continuation marker on chunks after the first.
	 */
	async createDocuments(texts, metadatas = [], chunkHeaderOptions = {}) {
		// If no metadata is provided, create one empty object per text.
		const _metadatas = metadatas.length > 0 ? metadatas : [...Array(texts.length)].map(() => ({}));
		const { chunkHeader = "", chunkOverlapHeader = "(cont'd) ", appendChunkOverlapHeader = false } = chunkHeaderOptions;
		const documents = new Array();
		for (let i = 0; i < texts.length; i += 1) {
			const text = texts[i];
			let lineCounterIndex = 1;
			let prevChunk = null;
			let indexPrevChunk = -1;
			for (const chunk of await this.splitText(text)) {
				let pageContent = chunkHeader;
				// Locate this chunk in the original text (searching past the
				// previous chunk's start) so newlines removed by splitting can
				// still be counted.
				const indexChunk = text.indexOf(chunk, indexPrevChunk + 1);
				if (prevChunk === null) {
					const newLinesBeforeFirstChunk = this.numberOfNewLines(text, 0, indexChunk);
					lineCounterIndex += newLinesBeforeFirstChunk;
				} else {
					const indexEndPrevChunk = indexPrevChunk + await this.lengthFunction(prevChunk);
					// Advance or rewind the line counter depending on whether this
					// chunk starts after (gap) or before (overlap) the end of the
					// previous chunk.
					if (indexEndPrevChunk < indexChunk) {
						const numberOfIntermediateNewLines = this.numberOfNewLines(text, indexEndPrevChunk, indexChunk);
						lineCounterIndex += numberOfIntermediateNewLines;
					} else if (indexEndPrevChunk > indexChunk) {
						const numberOfIntermediateNewLines = this.numberOfNewLines(text, indexChunk, indexEndPrevChunk);
						lineCounterIndex -= numberOfIntermediateNewLines;
					}
					if (appendChunkOverlapHeader) pageContent += chunkOverlapHeader;
				}
				const newLinesCount = this.numberOfNewLines(chunk);
				// Shallow-copy any pre-existing loc object so caller metadata
				// is not mutated.
				const loc = _metadatas[i].loc && typeof _metadatas[i].loc === "object" ? { ..._metadatas[i].loc } : {};
				loc.lines = {
					from: lineCounterIndex,
					to: lineCounterIndex + newLinesCount
				};
				const metadataWithLinesNumber = {
					..._metadatas[i],
					loc
				};
				pageContent += chunk;
				documents.push(new _langchain_core_documents.Document({
					pageContent,
					metadata: metadataWithLinesNumber
				}));
				lineCounterIndex += newLinesCount;
				prevChunk = chunk;
				indexPrevChunk = indexChunk;
			}
		}
		return documents;
	}
	// Count "\n" occurrences in text.slice(start, end).
	numberOfNewLines(text, start, end) {
		return (text.slice(start, end).match(/\n/g) || []).length;
	}
	/**
	 * Split documents' pageContent, pairing each text with its own metadata.
	 * Documents with undefined pageContent are skipped.
	 */
	async splitDocuments(documents, chunkHeaderOptions = {}) {
		const selectedDocuments = documents.filter((doc) => doc.pageContent !== void 0);
		const texts = selectedDocuments.map((doc) => doc.pageContent);
		const metadatas = selectedDocuments.map((doc) => doc.metadata);
		return this.createDocuments(texts, metadatas, chunkHeaderOptions);
	}
	// Join pieces with the separator and trim; null signals an
	// all-whitespace/empty result that should be discarded.
	joinDocs(docs, separator) {
		const text = docs.join(separator).trim();
		return text === "" ? null : text;
	}
	/**
	 * Greedily pack split pieces into chunks no longer than chunkSize
	 * (counting the separators between pieces), keeping up to chunkOverlap
	 * of trailing content between consecutive chunks.
	 */
	async mergeSplits(splits, separator) {
		const docs = [];
		const currentDoc = [];
		let total = 0;
		for (const d of splits) {
			const _len = await this.lengthFunction(d);
			if (total + _len + currentDoc.length * separator.length > this.chunkSize) {
				// A single piece can exceed chunkSize; warn but still emit it.
				// (The ", +" in the message is upstream's literal string.)
				if (total > this.chunkSize) console.warn(`Created a chunk of size ${total}, +
which is longer than the specified ${this.chunkSize}`);
				if (currentDoc.length > 0) {
					const doc = this.joinDocs(currentDoc, separator);
					if (doc !== null) docs.push(doc);
					// Keep popping from the front while:
					// - we hold more than the allowed overlap, or
					// - adding the next piece would still overflow chunkSize
					//   (note && binds tighter than || here).
					while (total > this.chunkOverlap || total + _len + currentDoc.length * separator.length > this.chunkSize && total > 0) {
						total -= await this.lengthFunction(currentDoc[0]);
						currentDoc.shift();
					}
				}
			}
			currentDoc.push(d);
			total += _len;
		}
		// Flush whatever remains as the final chunk.
		const doc = this.joinDocs(currentDoc, separator);
		if (doc !== null) docs.push(doc);
		return docs;
	}
};
|
|
121
|
+
/**
 * Splitter that breaks text on a single configurable separator
 * (default: blank line, "\n\n") and re-merges the pieces up to chunkSize.
 */
var CharacterTextSplitter = class extends TextSplitter {
	static lc_name() {
		return "CharacterTextSplitter";
	}
	// Separator used by splitText; overridable via constructor fields.
	separator = "\n\n";
	constructor(fields) {
		super(fields);
		const configured = fields?.separator;
		if (configured !== void 0 && configured !== null) this.separator = configured;
	}
	/** Naively split on the separator, then merge pieces into sized chunks. */
	async splitText(text) {
		const pieces = this.splitOnSeparator(text, this.separator);
		// When the separator was kept on the pieces, join with nothing so it
		// is not duplicated; otherwise re-insert it between pieces.
		let joiner;
		if (this.keepSeparator) {
			joiner = "";
		} else {
			joiner = this.separator;
		}
		return this.mergeSplits(pieces, joiner);
	}
};
|
|
135
|
+
/** Language keys accepted by RecursiveCharacterTextSplitter.fromLanguage(). */
const SupportedTextSplitterLanguages = [
	"cpp", "go", "java", "js", "php", "proto", "python", "rst",
	"ruby", "rust", "scala", "swift", "markdown", "latex", "html", "sol"
];
|
|
153
|
+
/**
 * Splitter that tries a ranked list of separators, recursively falling back
 * to finer-grained ones for pieces that are still larger than chunkSize.
 * Unlike the base class, it keeps separators attached by default.
 */
var RecursiveCharacterTextSplitter = class RecursiveCharacterTextSplitter extends TextSplitter {
	static lc_name() {
		return "RecursiveCharacterTextSplitter";
	}
	// Default fallback order: paragraph, line, word, character.
	separators = ["\n\n", "\n", " ", ""];
	constructor(fields) {
		super(fields);
		this.separators = fields?.separators ?? this.separators;
		// Note: default flips to true for this subclass.
		this.keepSeparator = fields?.keepSeparator ?? true;
	}
	/**
	 * Core recursion: pick the coarsest separator present in `text`, split on
	 * it, merge small pieces up to chunkSize, and recurse into oversized
	 * pieces with the remaining (finer) separators.
	 */
	async _splitText(text, separators) {
		const finalChunks = [];
		// Choose the first separator that occurs in the text; remember the
		// finer ones for recursion. "" always matches and ends the search.
		let separator = separators[separators.length - 1];
		let newSeparators;
		for (let i = 0; i < separators.length; i += 1) {
			const candidate = separators[i];
			if (candidate === "") {
				separator = candidate;
				break;
			}
			if (text.includes(candidate)) {
				separator = candidate;
				newSeparators = separators.slice(i + 1);
				break;
			}
		}
		const splits = this.splitOnSeparator(text, separator);
		// When separators were kept on the pieces, merge with an empty joiner.
		const joiner = this.keepSeparator ? "" : separator;
		let pending = [];
		// Merge the buffered small pieces into chunks and reset the buffer.
		const flushPending = async () => {
			if (pending.length) {
				finalChunks.push(...await this.mergeSplits(pending, joiner));
				pending = [];
			}
		};
		for (const piece of splits) {
			if (await this.lengthFunction(piece) < this.chunkSize) {
				pending.push(piece);
				continue;
			}
			// Oversized piece: emit what we have so far, then either recurse
			// with finer separators or pass the piece through unchanged.
			await flushPending();
			if (newSeparators) {
				finalChunks.push(...await this._splitText(piece, newSeparators));
			} else {
				finalChunks.push(piece);
			}
		}
		await flushPending();
		return finalChunks;
	}
	async splitText(text) {
		return this._splitText(text, this.separators);
	}
	/** Build a splitter preconfigured with separators for `language`. */
	static fromLanguage(language, options) {
		const separators = RecursiveCharacterTextSplitter.getSeparatorsForLanguage(language);
		return new RecursiveCharacterTextSplitter({
			...options,
			separators
		});
	}
	/**
	 * Return the separator list for a supported language, ordered from
	 * coarsest (declarations) to finest (single characters).
	 * Throws for unsupported languages.
	 */
	static getSeparatorsForLanguage(language) {
		// Tail shared by most languages: paragraph, line, word, character.
		const TAIL = ["\n\n", "\n", " ", ""];
		// Fresh arrays are built per call, matching the original's behavior
		// of returning a new array each time.
		const table = {
			cpp: ["\nclass ", "\nvoid ", "\nint ", "\nfloat ", "\ndouble ", "\nif ", "\nfor ", "\nwhile ", "\nswitch ", "\ncase ", ...TAIL],
			go: ["\nfunc ", "\nvar ", "\nconst ", "\ntype ", "\nif ", "\nfor ", "\nswitch ", "\ncase ", ...TAIL],
			java: ["\nclass ", "\npublic ", "\nprotected ", "\nprivate ", "\nstatic ", "\nif ", "\nfor ", "\nwhile ", "\nswitch ", "\ncase ", ...TAIL],
			js: ["\nfunction ", "\nconst ", "\nlet ", "\nvar ", "\nclass ", "\nif ", "\nfor ", "\nwhile ", "\nswitch ", "\ncase ", "\ndefault ", ...TAIL],
			php: ["\nfunction ", "\nclass ", "\nif ", "\nforeach ", "\nwhile ", "\ndo ", "\nswitch ", "\ncase ", ...TAIL],
			proto: ["\nmessage ", "\nservice ", "\nenum ", "\noption ", "\nimport ", "\nsyntax ", ...TAIL],
			python: ["\nclass ", "\ndef ", "\n\tdef ", ...TAIL],
			rst: ["\n===\n", "\n---\n", "\n***\n", "\n.. ", ...TAIL],
			ruby: ["\ndef ", "\nclass ", "\nif ", "\nunless ", "\nwhile ", "\nfor ", "\ndo ", "\nbegin ", "\nrescue ", ...TAIL],
			// NOTE: the duplicate "\nconst " is preserved from the original list.
			rust: ["\nfn ", "\nconst ", "\nlet ", "\nif ", "\nwhile ", "\nfor ", "\nloop ", "\nmatch ", "\nconst ", ...TAIL],
			scala: ["\nclass ", "\nobject ", "\ndef ", "\nval ", "\nvar ", "\nif ", "\nfor ", "\nwhile ", "\nmatch ", "\ncase ", ...TAIL],
			swift: ["\nfunc ", "\nclass ", "\nstruct ", "\nenum ", "\nif ", "\nfor ", "\nwhile ", "\ndo ", "\nswitch ", "\ncase ", ...TAIL],
			markdown: ["\n## ", "\n### ", "\n#### ", "\n##### ", "\n###### ", "```\n\n", "\n\n***\n\n", "\n\n---\n\n", "\n\n___\n\n", ...TAIL],
			latex: ["\n\\chapter{", "\n\\section{", "\n\\subsection{", "\n\\subsubsection{", "\n\\begin{enumerate}", "\n\\begin{itemize}", "\n\\begin{description}", "\n\\begin{list}", "\n\\begin{quote}", "\n\\begin{quotation}", "\n\\begin{verse}", "\n\\begin{verbatim}", "\n\\begin{align}", "$$", "$", ...TAIL],
			// html intentionally has no paragraph/line tail, only word/character.
			html: ["<body>", "<div>", "<p>", "<br>", "<li>", "<h1>", "<h2>", "<h3>", "<h4>", "<h5>", "<h6>", "<span>", "<table>", "<tr>", "<td>", "<th>", "<ul>", "<ol>", "<header>", "<footer>", "<nav>", "<head>", "<style>", "<script>", "<meta>", "<title>", " ", ""],
			sol: ["\npragma ", "\nusing ", "\ncontract ", "\ninterface ", "\nlibrary ", "\nconstructor ", "\ntype ", "\nfunction ", "\nevent ", "\nmodifier ", "\nerror ", "\nstruct ", "\nenum ", "\nif ", "\nfor ", "\nwhile ", "\ndo while ", "\nassembly ", ...TAIL]
		};
		// hasOwnProperty guard so prototype keys (e.g. "constructor") never match.
		if (!Object.prototype.hasOwnProperty.call(table, language)) throw new Error(`Language ${language} is not supported.`);
		return table[language];
	}
};
|
|
479
|
+
/**
 * Implementation of splitter which looks at tokens: text is encoded with a
 * js-tiktoken encoding, windows of chunkSize token ids (overlapping by
 * chunkOverlap) are taken, and each window is decoded back to text.
 */
var TokenTextSplitter = class extends TextSplitter {
	static lc_name() {
		return "TokenTextSplitter";
	}
	// Tiktoken encoding name (default "gpt2").
	encodingName;
	// Special tokens permitted in the input (default: none).
	allowedSpecial;
	// Special tokens to reject (default: "all").
	disallowedSpecial;
	// Lazily-created tokenizer instance, built on first splitText call.
	tokenizer;
	constructor(fields) {
		super(fields);
		this.encodingName = fields?.encodingName ?? "gpt2";
		this.allowedSpecial = fields?.allowedSpecial ?? [];
		this.disallowedSpecial = fields?.disallowedSpecial ?? "all";
	}
	/** Split `text` into chunks of at most chunkSize tokens each. */
	async splitText(text) {
		// Build the tokenizer once and cache it on the instance.
		if (!this.tokenizer) this.tokenizer = await (0, _langchain_core_utils_tiktoken.getEncoding)(this.encodingName);
		const tokenIds = this.tokenizer.encode(text, this.allowedSpecial, this.disallowedSpecial);
		const chunks = [];
		let cursor = 0;
		while (cursor < tokenIds.length) {
			// Every window after the first steps back by the overlap.
			if (cursor > 0) cursor -= this.chunkOverlap;
			const stop = Math.min(cursor + this.chunkSize, tokenIds.length);
			const windowIds = tokenIds.slice(cursor, stop);
			chunks.push(this.tokenizer.decode(windowIds));
			cursor = stop;
		}
		return chunks;
	}
};
|
|
511
|
+
/** RecursiveCharacterTextSplitter preconfigured with Markdown separators. */
var MarkdownTextSplitter = class extends RecursiveCharacterTextSplitter {
	constructor(fields) {
		// Caller-provided separators are ignored; markdown's list always wins.
		const separators = RecursiveCharacterTextSplitter.getSeparatorsForLanguage("markdown");
		super({
			...fields,
			separators
		});
	}
};
|
|
519
|
+
/** RecursiveCharacterTextSplitter preconfigured with LaTeX separators. */
var LatexTextSplitter = class extends RecursiveCharacterTextSplitter {
	constructor(fields) {
		// Caller-provided separators are ignored; latex's list always wins.
		const separators = RecursiveCharacterTextSplitter.getSeparatorsForLanguage("latex");
		super({
			...fields,
			separators
		});
	}
};
|
|
527
|
+
//#endregion
|
|
528
|
+
// Public CommonJS surface of this module.
Object.assign(exports, {
	CharacterTextSplitter,
	LatexTextSplitter,
	MarkdownTextSplitter,
	RecursiveCharacterTextSplitter,
	SupportedTextSplitterLanguages,
	TextSplitter,
	TokenTextSplitter
});
|
|
535
|
+
|
|
536
|
+
//# sourceMappingURL=text_splitter.cjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"text_splitter.cjs","names":["BaseDocumentTransformer","Document"],"sources":["../src/text_splitter.ts"],"sourcesContent":["import type * as tiktoken from \"js-tiktoken\";\nimport { Document, BaseDocumentTransformer } from \"@langchain/core/documents\";\nimport { getEncoding } from \"@langchain/core/utils/tiktoken\";\n\nexport interface TextSplitterParams {\n chunkSize: number;\n chunkOverlap: number;\n keepSeparator: boolean;\n lengthFunction?:\n | ((text: string) => number)\n | ((text: string) => Promise<number>);\n}\n\nexport type TextSplitterChunkHeaderOptions = {\n chunkHeader?: string;\n chunkOverlapHeader?: string;\n appendChunkOverlapHeader?: boolean;\n};\n\nexport abstract class TextSplitter\n extends BaseDocumentTransformer\n implements TextSplitterParams\n{\n lc_namespace = [\"langchain\", \"document_transformers\", \"text_splitters\"];\n\n chunkSize = 1000;\n\n chunkOverlap = 200;\n\n keepSeparator = false;\n\n lengthFunction:\n | ((text: string) => number)\n | ((text: string) => Promise<number>);\n\n constructor(fields?: Partial<TextSplitterParams>) {\n super(fields);\n this.chunkSize = fields?.chunkSize ?? this.chunkSize;\n this.chunkOverlap = fields?.chunkOverlap ?? this.chunkOverlap;\n this.keepSeparator = fields?.keepSeparator ?? this.keepSeparator;\n this.lengthFunction =\n fields?.lengthFunction ?? 
((text: string) => text.length);\n if (this.chunkOverlap >= this.chunkSize) {\n throw new Error(\"Cannot have chunkOverlap >= chunkSize\");\n }\n }\n\n async transformDocuments(\n documents: Document[],\n chunkHeaderOptions: TextSplitterChunkHeaderOptions = {}\n ): Promise<Document[]> {\n return this.splitDocuments(documents, chunkHeaderOptions);\n }\n\n abstract splitText(text: string): Promise<string[]>;\n\n protected splitOnSeparator(text: string, separator: string): string[] {\n let splits;\n if (separator) {\n if (this.keepSeparator) {\n const regexEscapedSeparator = separator.replace(\n /[/\\-\\\\^$*+?.()|[\\]{}]/g,\n \"\\\\$&\"\n );\n splits = text.split(new RegExp(`(?=${regexEscapedSeparator})`));\n } else {\n splits = text.split(separator);\n }\n } else {\n splits = text.split(\"\");\n }\n return splits.filter((s) => s !== \"\");\n }\n\n async createDocuments(\n texts: string[],\n // oxlint-disable-next-line @typescript-eslint/no-explicit-any\n metadatas: Record<string, any>[] = [],\n chunkHeaderOptions: TextSplitterChunkHeaderOptions = {}\n ): Promise<Document[]> {\n // if no metadata is provided, we create an empty one for each text\n // oxlint-disable-next-line @typescript-eslint/no-explicit-any\n const _metadatas: Record<string, any>[] =\n metadatas.length > 0\n ? 
metadatas\n : [...Array(texts.length)].map(() => ({}));\n const {\n chunkHeader = \"\",\n chunkOverlapHeader = \"(cont'd) \",\n appendChunkOverlapHeader = false,\n } = chunkHeaderOptions;\n const documents = new Array<Document>();\n for (let i = 0; i < texts.length; i += 1) {\n const text = texts[i];\n let lineCounterIndex = 1;\n let prevChunk = null;\n let indexPrevChunk = -1;\n for (const chunk of await this.splitText(text)) {\n let pageContent = chunkHeader;\n\n // we need to count the \\n that are in the text before getting removed by the splitting\n const indexChunk = text.indexOf(chunk, indexPrevChunk + 1);\n if (prevChunk === null) {\n const newLinesBeforeFirstChunk = this.numberOfNewLines(\n text,\n 0,\n indexChunk\n );\n lineCounterIndex += newLinesBeforeFirstChunk;\n } else {\n const indexEndPrevChunk =\n indexPrevChunk + (await this.lengthFunction(prevChunk));\n if (indexEndPrevChunk < indexChunk) {\n const numberOfIntermediateNewLines = this.numberOfNewLines(\n text,\n indexEndPrevChunk,\n indexChunk\n );\n lineCounterIndex += numberOfIntermediateNewLines;\n } else if (indexEndPrevChunk > indexChunk) {\n const numberOfIntermediateNewLines = this.numberOfNewLines(\n text,\n indexChunk,\n indexEndPrevChunk\n );\n lineCounterIndex -= numberOfIntermediateNewLines;\n }\n if (appendChunkOverlapHeader) {\n pageContent += chunkOverlapHeader;\n }\n }\n const newLinesCount = this.numberOfNewLines(chunk);\n\n const loc =\n _metadatas[i].loc && typeof _metadatas[i].loc === \"object\"\n ? 
{ ..._metadatas[i].loc }\n : {};\n loc.lines = {\n from: lineCounterIndex,\n to: lineCounterIndex + newLinesCount,\n };\n const metadataWithLinesNumber = {\n ..._metadatas[i],\n loc,\n };\n\n pageContent += chunk;\n documents.push(\n new Document({\n pageContent,\n metadata: metadataWithLinesNumber,\n })\n );\n lineCounterIndex += newLinesCount;\n prevChunk = chunk;\n indexPrevChunk = indexChunk;\n }\n }\n return documents;\n }\n\n private numberOfNewLines(text: string, start?: number, end?: number) {\n const textSection = text.slice(start, end);\n return (textSection.match(/\\n/g) || []).length;\n }\n\n async splitDocuments(\n documents: Document[],\n chunkHeaderOptions: TextSplitterChunkHeaderOptions = {}\n ): Promise<Document[]> {\n const selectedDocuments = documents.filter(\n (doc) => doc.pageContent !== undefined\n );\n const texts = selectedDocuments.map((doc) => doc.pageContent);\n const metadatas = selectedDocuments.map((doc) => doc.metadata);\n return this.createDocuments(texts, metadatas, chunkHeaderOptions);\n }\n\n private joinDocs(docs: string[], separator: string): string | null {\n const text = docs.join(separator).trim();\n return text === \"\" ? 
null : text;\n }\n\n async mergeSplits(splits: string[], separator: string): Promise<string[]> {\n const docs: string[] = [];\n const currentDoc: string[] = [];\n let total = 0;\n for (const d of splits) {\n const _len = await this.lengthFunction(d);\n if (\n total + _len + currentDoc.length * separator.length >\n this.chunkSize\n ) {\n if (total > this.chunkSize) {\n console.warn(\n `Created a chunk of size ${total}, +\nwhich is longer than the specified ${this.chunkSize}`\n );\n }\n if (currentDoc.length > 0) {\n const doc = this.joinDocs(currentDoc, separator);\n if (doc !== null) {\n docs.push(doc);\n }\n // Keep on popping if:\n // - we have a larger chunk than in the chunk overlap\n // - or if we still have any chunks and the length is long\n while (\n total > this.chunkOverlap ||\n (total + _len + currentDoc.length * separator.length >\n this.chunkSize &&\n total > 0)\n ) {\n total -= await this.lengthFunction(currentDoc[0]);\n currentDoc.shift();\n }\n }\n }\n currentDoc.push(d);\n total += _len;\n }\n const doc = this.joinDocs(currentDoc, separator);\n if (doc !== null) {\n docs.push(doc);\n }\n return docs;\n }\n}\n\nexport interface CharacterTextSplitterParams extends TextSplitterParams {\n separator: string;\n}\n\nexport class CharacterTextSplitter\n extends TextSplitter\n implements CharacterTextSplitterParams\n{\n static lc_name() {\n return \"CharacterTextSplitter\";\n }\n\n separator = \"\\n\\n\";\n\n constructor(fields?: Partial<CharacterTextSplitterParams>) {\n super(fields);\n this.separator = fields?.separator ?? this.separator;\n }\n\n async splitText(text: string): Promise<string[]> {\n // First we naively split the large input into a bunch of smaller ones.\n const splits = this.splitOnSeparator(text, this.separator);\n return this.mergeSplits(splits, this.keepSeparator ? 
\"\" : this.separator);\n }\n}\n\nexport interface RecursiveCharacterTextSplitterParams extends TextSplitterParams {\n separators: string[];\n}\n\nexport const SupportedTextSplitterLanguages = [\n \"cpp\",\n \"go\",\n \"java\",\n \"js\",\n \"php\",\n \"proto\",\n \"python\",\n \"rst\",\n \"ruby\",\n \"rust\",\n \"scala\",\n \"swift\",\n \"markdown\",\n \"latex\",\n \"html\",\n \"sol\",\n] as const;\n\nexport type SupportedTextSplitterLanguage =\n (typeof SupportedTextSplitterLanguages)[number];\n\nexport class RecursiveCharacterTextSplitter\n extends TextSplitter\n implements RecursiveCharacterTextSplitterParams\n{\n static lc_name() {\n return \"RecursiveCharacterTextSplitter\";\n }\n\n separators: string[] = [\"\\n\\n\", \"\\n\", \" \", \"\"];\n\n constructor(fields?: Partial<RecursiveCharacterTextSplitterParams>) {\n super(fields);\n this.separators = fields?.separators ?? this.separators;\n this.keepSeparator = fields?.keepSeparator ?? true;\n }\n\n private async _splitText(text: string, separators: string[]) {\n const finalChunks: string[] = [];\n\n // Get appropriate separator to use\n let separator: string = separators[separators.length - 1];\n let newSeparators;\n for (let i = 0; i < separators.length; i += 1) {\n const s = separators[i];\n if (s === \"\") {\n separator = s;\n break;\n }\n if (text.includes(s)) {\n separator = s;\n newSeparators = separators.slice(i + 1);\n break;\n }\n }\n\n // Now that we have the separator, split the text\n const splits = this.splitOnSeparator(text, separator);\n\n // Now go merging things, recursively splitting longer texts.\n let goodSplits: string[] = [];\n const _separator = this.keepSeparator ? 
\"\" : separator;\n for (const s of splits) {\n if ((await this.lengthFunction(s)) < this.chunkSize) {\n goodSplits.push(s);\n } else {\n if (goodSplits.length) {\n const mergedText = await this.mergeSplits(goodSplits, _separator);\n finalChunks.push(...mergedText);\n goodSplits = [];\n }\n if (!newSeparators) {\n finalChunks.push(s);\n } else {\n const otherInfo = await this._splitText(s, newSeparators);\n finalChunks.push(...otherInfo);\n }\n }\n }\n if (goodSplits.length) {\n const mergedText = await this.mergeSplits(goodSplits, _separator);\n finalChunks.push(...mergedText);\n }\n return finalChunks;\n }\n\n async splitText(text: string): Promise<string[]> {\n return this._splitText(text, this.separators);\n }\n\n static fromLanguage(\n language: SupportedTextSplitterLanguage,\n options?: Partial<RecursiveCharacterTextSplitterParams>\n ) {\n return new RecursiveCharacterTextSplitter({\n ...options,\n separators:\n RecursiveCharacterTextSplitter.getSeparatorsForLanguage(language),\n });\n }\n\n static getSeparatorsForLanguage(language: SupportedTextSplitterLanguage) {\n if (language === \"cpp\") {\n return [\n // Split along class definitions\n \"\\nclass \",\n // Split along function definitions\n \"\\nvoid \",\n \"\\nint \",\n \"\\nfloat \",\n \"\\ndouble \",\n // Split along control flow statements\n \"\\nif \",\n \"\\nfor \",\n \"\\nwhile \",\n \"\\nswitch \",\n \"\\ncase \",\n // Split by the normal type of lines\n \"\\n\\n\",\n \"\\n\",\n \" \",\n \"\",\n ];\n } else if (language === \"go\") {\n return [\n // Split along function definitions\n \"\\nfunc \",\n \"\\nvar \",\n \"\\nconst \",\n \"\\ntype \",\n // Split along control flow statements\n \"\\nif \",\n \"\\nfor \",\n \"\\nswitch \",\n \"\\ncase \",\n // Split by the normal type of lines\n \"\\n\\n\",\n \"\\n\",\n \" \",\n \"\",\n ];\n } else if (language === \"java\") {\n return [\n // Split along class definitions\n \"\\nclass \",\n // Split along method definitions\n \"\\npublic \",\n 
\"\\nprotected \",\n \"\\nprivate \",\n \"\\nstatic \",\n // Split along control flow statements\n \"\\nif \",\n \"\\nfor \",\n \"\\nwhile \",\n \"\\nswitch \",\n \"\\ncase \",\n // Split by the normal type of lines\n \"\\n\\n\",\n \"\\n\",\n \" \",\n \"\",\n ];\n } else if (language === \"js\") {\n return [\n // Split along function definitions\n \"\\nfunction \",\n \"\\nconst \",\n \"\\nlet \",\n \"\\nvar \",\n \"\\nclass \",\n // Split along control flow statements\n \"\\nif \",\n \"\\nfor \",\n \"\\nwhile \",\n \"\\nswitch \",\n \"\\ncase \",\n \"\\ndefault \",\n // Split by the normal type of lines\n \"\\n\\n\",\n \"\\n\",\n \" \",\n \"\",\n ];\n } else if (language === \"php\") {\n return [\n // Split along function definitions\n \"\\nfunction \",\n // Split along class definitions\n \"\\nclass \",\n // Split along control flow statements\n \"\\nif \",\n \"\\nforeach \",\n \"\\nwhile \",\n \"\\ndo \",\n \"\\nswitch \",\n \"\\ncase \",\n // Split by the normal type of lines\n \"\\n\\n\",\n \"\\n\",\n \" \",\n \"\",\n ];\n } else if (language === \"proto\") {\n return [\n // Split along message definitions\n \"\\nmessage \",\n // Split along service definitions\n \"\\nservice \",\n // Split along enum definitions\n \"\\nenum \",\n // Split along option definitions\n \"\\noption \",\n // Split along import statements\n \"\\nimport \",\n // Split along syntax declarations\n \"\\nsyntax \",\n // Split by the normal type of lines\n \"\\n\\n\",\n \"\\n\",\n \" \",\n \"\",\n ];\n } else if (language === \"python\") {\n return [\n // First, try to split along class definitions\n \"\\nclass \",\n \"\\ndef \",\n \"\\n\\tdef \",\n // Now split by the normal type of lines\n \"\\n\\n\",\n \"\\n\",\n \" \",\n \"\",\n ];\n } else if (language === \"rst\") {\n return [\n // Split along section titles\n \"\\n===\\n\",\n \"\\n---\\n\",\n \"\\n***\\n\",\n // Split along directive markers\n \"\\n.. 
\",\n // Split by the normal type of lines\n \"\\n\\n\",\n \"\\n\",\n \" \",\n \"\",\n ];\n } else if (language === \"ruby\") {\n return [\n // Split along method definitions\n \"\\ndef \",\n \"\\nclass \",\n // Split along control flow statements\n \"\\nif \",\n \"\\nunless \",\n \"\\nwhile \",\n \"\\nfor \",\n \"\\ndo \",\n \"\\nbegin \",\n \"\\nrescue \",\n // Split by the normal type of lines\n \"\\n\\n\",\n \"\\n\",\n \" \",\n \"\",\n ];\n } else if (language === \"rust\") {\n return [\n // Split along function definitions\n \"\\nfn \",\n \"\\nconst \",\n \"\\nlet \",\n // Split along control flow statements\n \"\\nif \",\n \"\\nwhile \",\n \"\\nfor \",\n \"\\nloop \",\n \"\\nmatch \",\n \"\\nconst \",\n // Split by the normal type of lines\n \"\\n\\n\",\n \"\\n\",\n \" \",\n \"\",\n ];\n } else if (language === \"scala\") {\n return [\n // Split along class definitions\n \"\\nclass \",\n \"\\nobject \",\n // Split along method definitions\n \"\\ndef \",\n \"\\nval \",\n \"\\nvar \",\n // Split along control flow statements\n \"\\nif \",\n \"\\nfor \",\n \"\\nwhile \",\n \"\\nmatch \",\n \"\\ncase \",\n // Split by the normal type of lines\n \"\\n\\n\",\n \"\\n\",\n \" \",\n \"\",\n ];\n } else if (language === \"swift\") {\n return [\n // Split along function definitions\n \"\\nfunc \",\n // Split along class definitions\n \"\\nclass \",\n \"\\nstruct \",\n \"\\nenum \",\n // Split along control flow statements\n \"\\nif \",\n \"\\nfor \",\n \"\\nwhile \",\n \"\\ndo \",\n \"\\nswitch \",\n \"\\ncase \",\n // Split by the normal type of lines\n \"\\n\\n\",\n \"\\n\",\n \" \",\n \"\",\n ];\n } else if (language === \"markdown\") {\n return [\n // First, try to split along Markdown headings (starting with level 2)\n \"\\n## \",\n \"\\n### \",\n \"\\n#### \",\n \"\\n##### \",\n \"\\n###### \",\n // Note the alternative syntax for headings (below) is not handled here\n // Heading level 2\n // ---------------\n // End of code block\n \"```\\n\\n\",\n // Horizontal 
lines\n \"\\n\\n***\\n\\n\",\n \"\\n\\n---\\n\\n\",\n \"\\n\\n___\\n\\n\",\n // Note that this splitter doesn't handle horizontal lines defined\n // by *three or more* of ***, ---, or ___, but this is not handled\n \"\\n\\n\",\n \"\\n\",\n \" \",\n \"\",\n ];\n } else if (language === \"latex\") {\n return [\n // First, try to split along Latex sections\n \"\\n\\\\chapter{\",\n \"\\n\\\\section{\",\n \"\\n\\\\subsection{\",\n \"\\n\\\\subsubsection{\",\n\n // Now split by environments\n \"\\n\\\\begin{enumerate}\",\n \"\\n\\\\begin{itemize}\",\n \"\\n\\\\begin{description}\",\n \"\\n\\\\begin{list}\",\n \"\\n\\\\begin{quote}\",\n \"\\n\\\\begin{quotation}\",\n \"\\n\\\\begin{verse}\",\n \"\\n\\\\begin{verbatim}\",\n\n // Now split by math environments\n \"\\n\\\\begin{align}\",\n \"$$\",\n \"$\",\n\n // Now split by the normal type of lines\n \"\\n\\n\",\n \"\\n\",\n \" \",\n \"\",\n ];\n } else if (language === \"html\") {\n return [\n // First, try to split along HTML tags\n \"<body>\",\n \"<div>\",\n \"<p>\",\n \"<br>\",\n \"<li>\",\n \"<h1>\",\n \"<h2>\",\n \"<h3>\",\n \"<h4>\",\n \"<h5>\",\n \"<h6>\",\n \"<span>\",\n \"<table>\",\n \"<tr>\",\n \"<td>\",\n \"<th>\",\n \"<ul>\",\n \"<ol>\",\n \"<header>\",\n \"<footer>\",\n \"<nav>\",\n // Head\n \"<head>\",\n \"<style>\",\n \"<script>\",\n \"<meta>\",\n \"<title>\",\n // Normal type of lines\n \" \",\n \"\",\n ];\n } else if (language === \"sol\") {\n return [\n // Split along compiler informations definitions\n \"\\npragma \",\n \"\\nusing \",\n // Split along contract definitions\n \"\\ncontract \",\n \"\\ninterface \",\n \"\\nlibrary \",\n // Split along method definitions\n \"\\nconstructor \",\n \"\\ntype \",\n \"\\nfunction \",\n \"\\nevent \",\n \"\\nmodifier \",\n \"\\nerror \",\n \"\\nstruct \",\n \"\\nenum \",\n // Split along control flow statements\n \"\\nif \",\n \"\\nfor \",\n \"\\nwhile \",\n \"\\ndo while \",\n \"\\nassembly \",\n // Split by the normal type of lines\n \"\\n\\n\",\n \"\\n\",\n 
\" \",\n \"\",\n ];\n } else {\n throw new Error(`Language ${language} is not supported.`);\n }\n }\n}\n\nexport interface TokenTextSplitterParams extends TextSplitterParams {\n encodingName: tiktoken.TiktokenEncoding;\n allowedSpecial: \"all\" | Array<string>;\n disallowedSpecial: \"all\" | Array<string>;\n}\n\n/**\n * Implementation of splitter which looks at tokens.\n */\nexport class TokenTextSplitter\n extends TextSplitter\n implements TokenTextSplitterParams\n{\n static lc_name() {\n return \"TokenTextSplitter\";\n }\n\n encodingName: tiktoken.TiktokenEncoding;\n\n allowedSpecial: \"all\" | Array<string>;\n\n disallowedSpecial: \"all\" | Array<string>;\n\n private tokenizer: tiktoken.Tiktoken;\n\n constructor(fields?: Partial<TokenTextSplitterParams>) {\n super(fields);\n\n this.encodingName = fields?.encodingName ?? \"gpt2\";\n this.allowedSpecial = fields?.allowedSpecial ?? [];\n this.disallowedSpecial = fields?.disallowedSpecial ?? \"all\";\n }\n\n async splitText(text: string): Promise<string[]> {\n if (!this.tokenizer) {\n this.tokenizer = await getEncoding(this.encodingName);\n }\n\n const splits: string[] = [];\n\n const input_ids = this.tokenizer.encode(\n text,\n this.allowedSpecial,\n this.disallowedSpecial\n );\n\n let start_idx = 0;\n\n while (start_idx < input_ids.length) {\n if (start_idx > 0) {\n start_idx -= this.chunkOverlap;\n }\n const end_idx = Math.min(start_idx + this.chunkSize, input_ids.length);\n const chunk_ids = input_ids.slice(start_idx, end_idx);\n splits.push(this.tokenizer.decode(chunk_ids));\n start_idx = end_idx;\n }\n\n return splits;\n }\n}\n\nexport type MarkdownTextSplitterParams = TextSplitterParams;\n\nexport class MarkdownTextSplitter\n extends RecursiveCharacterTextSplitter\n implements MarkdownTextSplitterParams\n{\n constructor(fields?: Partial<MarkdownTextSplitterParams>) {\n super({\n ...fields,\n separators:\n RecursiveCharacterTextSplitter.getSeparatorsForLanguage(\"markdown\"),\n });\n }\n}\n\nexport type 
LatexTextSplitterParams = TextSplitterParams;\n\nexport class LatexTextSplitter\n extends RecursiveCharacterTextSplitter\n implements LatexTextSplitterParams\n{\n constructor(fields?: Partial<LatexTextSplitterParams>) {\n super({\n ...fields,\n separators:\n RecursiveCharacterTextSplitter.getSeparatorsForLanguage(\"latex\"),\n });\n }\n}\n"],"mappings":";;;AAmBA,IAAsB,eAAtB,cACUA,0BAAAA,wBAEV;CACE,eAAe;EAAC;EAAa;EAAyB;EAAiB;CAEvE,YAAY;CAEZ,eAAe;CAEf,gBAAgB;CAEhB;CAIA,YAAY,QAAsC;AAChD,QAAM,OAAO;AACb,OAAK,YAAY,QAAQ,aAAa,KAAK;AAC3C,OAAK,eAAe,QAAQ,gBAAgB,KAAK;AACjD,OAAK,gBAAgB,QAAQ,iBAAiB,KAAK;AACnD,OAAK,iBACH,QAAQ,oBAAoB,SAAiB,KAAK;AACpD,MAAI,KAAK,gBAAgB,KAAK,UAC5B,OAAM,IAAI,MAAM,wCAAwC;;CAI5D,MAAM,mBACJ,WACA,qBAAqD,EAAE,EAClC;AACrB,SAAO,KAAK,eAAe,WAAW,mBAAmB;;CAK3D,iBAA2B,MAAc,WAA6B;EACpE,IAAI;AACJ,MAAI,UACF,KAAI,KAAK,eAAe;GACtB,MAAM,wBAAwB,UAAU,QACtC,0BACA,OACD;AACD,YAAS,KAAK,MAAM,IAAI,OAAO,MAAM,sBAAsB,GAAG,CAAC;QAE/D,UAAS,KAAK,MAAM,UAAU;MAGhC,UAAS,KAAK,MAAM,GAAG;AAEzB,SAAO,OAAO,QAAQ,MAAM,MAAM,GAAG;;CAGvC,MAAM,gBACJ,OAEA,YAAmC,EAAE,EACrC,qBAAqD,EAAE,EAClC;EAGrB,MAAM,aACJ,UAAU,SAAS,IACf,YACA,CAAC,GAAG,MAAM,MAAM,OAAO,CAAC,CAAC,WAAW,EAAE,EAAE;EAC9C,MAAM,EACJ,cAAc,IACd,qBAAqB,aACrB,2BAA2B,UACzB;EACJ,MAAM,YAAY,IAAI,OAAiB;AACvC,OAAK,IAAI,IAAI,GAAG,IAAI,MAAM,QAAQ,KAAK,GAAG;GACxC,MAAM,OAAO,MAAM;GACnB,IAAI,mBAAmB;GACvB,IAAI,YAAY;GAChB,IAAI,iBAAiB;AACrB,QAAK,MAAM,SAAS,MAAM,KAAK,UAAU,KAAK,EAAE;IAC9C,IAAI,cAAc;IAGlB,MAAM,aAAa,KAAK,QAAQ,OAAO,iBAAiB,EAAE;AAC1D,QAAI,cAAc,MAAM;KACtB,MAAM,2BAA2B,KAAK,iBACpC,MACA,GACA,WACD;AACD,yBAAoB;WACf;KACL,MAAM,oBACJ,iBAAkB,MAAM,KAAK,eAAe,UAAU;AACxD,SAAI,oBAAoB,YAAY;MAClC,MAAM,+BAA+B,KAAK,iBACxC,MACA,mBACA,WACD;AACD,0BAAoB;gBACX,oBAAoB,YAAY;MACzC,MAAM,+BAA+B,KAAK,iBACxC,MACA,YACA,kBACD;AACD,0BAAoB;;AAEtB,SAAI,yBACF,gBAAe;;IAGnB,MAAM,gBAAgB,KAAK,iBAAiB,MAAM;IAElD,MAAM,MACJ,WAAW,GAAG,OAAO,OAAO,WAAW,GAAG,QAAQ,WAC9C,EAAE,GAAG,WAAW,GAAG,KAAK,GACxB,EAAE;AACR,QAAI,QAAQ;KACV,MAAM;KACN,IAAI,mBAAmB;KACxB;IACD,MAAM,0BAA0B;KAC9B,GAAG,WAAW;KACd;KACD;AAED,mBAA
e;AACf,cAAU,KACR,IAAIC,0BAAAA,SAAS;KACX;KACA,UAAU;KACX,CAAC,CACH;AACD,wBAAoB;AACpB,gBAAY;AACZ,qBAAiB;;;AAGrB,SAAO;;CAGT,iBAAyB,MAAc,OAAgB,KAAc;AAEnE,UADoB,KAAK,MAAM,OAAO,IAAI,CACtB,MAAM,MAAM,IAAI,EAAE,EAAE;;CAG1C,MAAM,eACJ,WACA,qBAAqD,EAAE,EAClC;EACrB,MAAM,oBAAoB,UAAU,QACjC,QAAQ,IAAI,gBAAgB,KAAA,EAC9B;EACD,MAAM,QAAQ,kBAAkB,KAAK,QAAQ,IAAI,YAAY;EAC7D,MAAM,YAAY,kBAAkB,KAAK,QAAQ,IAAI,SAAS;AAC9D,SAAO,KAAK,gBAAgB,OAAO,WAAW,mBAAmB;;CAGnE,SAAiB,MAAgB,WAAkC;EACjE,MAAM,OAAO,KAAK,KAAK,UAAU,CAAC,MAAM;AACxC,SAAO,SAAS,KAAK,OAAO;;CAG9B,MAAM,YAAY,QAAkB,WAAsC;EACxE,MAAM,OAAiB,EAAE;EACzB,MAAM,aAAuB,EAAE;EAC/B,IAAI,QAAQ;AACZ,OAAK,MAAM,KAAK,QAAQ;GACtB,MAAM,OAAO,MAAM,KAAK,eAAe,EAAE;AACzC,OACE,QAAQ,OAAO,WAAW,SAAS,UAAU,SAC7C,KAAK,WACL;AACA,QAAI,QAAQ,KAAK,UACf,SAAQ,KACN,2BAA2B,MAAM;qCACR,KAAK,YAC/B;AAEH,QAAI,WAAW,SAAS,GAAG;KACzB,MAAM,MAAM,KAAK,SAAS,YAAY,UAAU;AAChD,SAAI,QAAQ,KACV,MAAK,KAAK,IAAI;AAKhB,YACE,QAAQ,KAAK,gBACZ,QAAQ,OAAO,WAAW,SAAS,UAAU,SAC5C,KAAK,aACL,QAAQ,GACV;AACA,eAAS,MAAM,KAAK,eAAe,WAAW,GAAG;AACjD,iBAAW,OAAO;;;;AAIxB,cAAW,KAAK,EAAE;AAClB,YAAS;;EAEX,MAAM,MAAM,KAAK,SAAS,YAAY,UAAU;AAChD,MAAI,QAAQ,KACV,MAAK,KAAK,IAAI;AAEhB,SAAO;;;AAQX,IAAa,wBAAb,cACU,aAEV;CACE,OAAO,UAAU;AACf,SAAO;;CAGT,YAAY;CAEZ,YAAY,QAA+C;AACzD,QAAM,OAAO;AACb,OAAK,YAAY,QAAQ,aAAa,KAAK;;CAG7C,MAAM,UAAU,MAAiC;EAE/C,MAAM,SAAS,KAAK,iBAAiB,MAAM,KAAK,UAAU;AAC1D,SAAO,KAAK,YAAY,QAAQ,KAAK,gBAAgB,KAAK,KAAK,UAAU;;;AAQ7E,MAAa,iCAAiC;CAC5C;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACD;AAKD,IAAa,iCAAb,MAAa,uCACH,aAEV;CACE,OAAO,UAAU;AACf,SAAO;;CAGT,aAAuB;EAAC;EAAQ;EAAM;EAAK;EAAG;CAE9C,YAAY,QAAwD;AAClE,QAAM,OAAO;AACb,OAAK,aAAa,QAAQ,cAAc,KAAK;AAC7C,OAAK,gBAAgB,QAAQ,iBAAiB;;CAGhD,MAAc,WAAW,MAAc,YAAsB;EAC3D,MAAM,cAAwB,EAAE;EAGhC,IAAI,YAAoB,WAAW,WAAW,SAAS;EACvD,IAAI;AACJ,OAAK,IAAI,IAAI,GAAG,IAAI,WAAW,QAAQ,KAAK,GAAG;GAC7C,MAAM,IAAI,WAAW;AACrB,OAAI,MAAM,IAAI;AACZ,gBAAY;AACZ;;AAEF,OAAI,KAAK,SAAS,EAAE,EAAE;AACpB,gBAAY;AACZ,oBAAgB,WAAW,MAAM,IAAI,EAAE;AACvC;;;EAKJ,MAAM,SAAS,KAAK,iBAAiB,MAAM,UAAU;EAGrD,IAAI
,aAAuB,EAAE;EAC7B,MAAM,aAAa,KAAK,gBAAgB,KAAK;AAC7C,OAAK,MAAM,KAAK,OACd,KAAK,MAAM,KAAK,eAAe,EAAE,GAAI,KAAK,UACxC,YAAW,KAAK,EAAE;OACb;AACL,OAAI,WAAW,QAAQ;IACrB,MAAM,aAAa,MAAM,KAAK,YAAY,YAAY,WAAW;AACjE,gBAAY,KAAK,GAAG,WAAW;AAC/B,iBAAa,EAAE;;AAEjB,OAAI,CAAC,cACH,aAAY,KAAK,EAAE;QACd;IACL,MAAM,YAAY,MAAM,KAAK,WAAW,GAAG,cAAc;AACzD,gBAAY,KAAK,GAAG,UAAU;;;AAIpC,MAAI,WAAW,QAAQ;GACrB,MAAM,aAAa,MAAM,KAAK,YAAY,YAAY,WAAW;AACjE,eAAY,KAAK,GAAG,WAAW;;AAEjC,SAAO;;CAGT,MAAM,UAAU,MAAiC;AAC/C,SAAO,KAAK,WAAW,MAAM,KAAK,WAAW;;CAG/C,OAAO,aACL,UACA,SACA;AACA,SAAO,IAAI,+BAA+B;GACxC,GAAG;GACH,YACE,+BAA+B,yBAAyB,SAAS;GACpE,CAAC;;CAGJ,OAAO,yBAAyB,UAAyC;AACvE,MAAI,aAAa,MACf,QAAO;GAEL;GAEA;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GACD;WACQ,aAAa,KACtB,QAAO;GAEL;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GACD;WACQ,aAAa,OACtB,QAAO;GAEL;GAEA;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GACD;WACQ,aAAa,KACtB,QAAO;GAEL;GACA;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GACD;WACQ,aAAa,MACtB,QAAO;GAEL;GAEA;GAEA;GACA;GACA;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GACD;WACQ,aAAa,QACtB,QAAO;GAEL;GAEA;GAEA;GAEA;GAEA;GAEA;GAEA;GACA;GACA;GACA;GACD;WACQ,aAAa,SACtB,QAAO;GAEL;GACA;GACA;GAEA;GACA;GACA;GACA;GACD;WACQ,aAAa,MACtB,QAAO;GAEL;GACA;GACA;GAEA;GAEA;GACA;GACA;GACA;GACD;WACQ,aAAa,OACtB,QAAO;GAEL;GACA;GAEA;GACA;GACA;GACA;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GACD;WACQ,aAAa,OACtB,QAAO;GAEL;GACA;GACA;GAEA;GACA;GACA;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GACD;WACQ,aAAa,QACtB,QAAO;GAEL;GACA;GAEA;GACA;GACA;GAEA;GACA;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GACD;WACQ,aAAa,QACtB,QAAO;GAEL;GAEA;GACA;GACA;GAEA;GACA;GACA;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GACD;WACQ,aAAa,WACtB,QAAO;GAEL;GACA;GACA;GACA;GACA;GAKA;GAEA;GACA;GACA;GAGA;GACA;GACA;GACA;GACD;WACQ,aAAa,QACtB,QAAO;GAEL;GACA;GACA;GACA;GAGA;GACA;GACA;GACA;GACA;GACA;GACA;GACA;GAGA;GACA;GACA;GAGA;GACA;GACA;GACA;GACD;WACQ,aAAa,OACtB,QAAO;GAEL;GACA;GACA;GACA;GACA;GACA;GACA;GACA;GACA;GACA;GACA;GACA;GACA;GACA;GACA;GACA;GACA;GACA;
GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GACA;GAEA;GACA;GACD;WACQ,aAAa,MACtB,QAAO;GAEL;GACA;GAEA;GACA;GACA;GAEA;GACA;GACA;GACA;GACA;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GACA;GAEA;GACA;GACA;GACA;GACD;MAED,OAAM,IAAI,MAAM,YAAY,SAAS,oBAAoB;;;;;;AAc/D,IAAa,oBAAb,cACU,aAEV;CACE,OAAO,UAAU;AACf,SAAO;;CAGT;CAEA;CAEA;CAEA;CAEA,YAAY,QAA2C;AACrD,QAAM,OAAO;AAEb,OAAK,eAAe,QAAQ,gBAAgB;AAC5C,OAAK,iBAAiB,QAAQ,kBAAkB,EAAE;AAClD,OAAK,oBAAoB,QAAQ,qBAAqB;;CAGxD,MAAM,UAAU,MAAiC;AAC/C,MAAI,CAAC,KAAK,UACR,MAAK,YAAY,OAAA,GAAA,+BAAA,aAAkB,KAAK,aAAa;EAGvD,MAAM,SAAmB,EAAE;EAE3B,MAAM,YAAY,KAAK,UAAU,OAC/B,MACA,KAAK,gBACL,KAAK,kBACN;EAED,IAAI,YAAY;AAEhB,SAAO,YAAY,UAAU,QAAQ;AACnC,OAAI,YAAY,EACd,cAAa,KAAK;GAEpB,MAAM,UAAU,KAAK,IAAI,YAAY,KAAK,WAAW,UAAU,OAAO;GACtE,MAAM,YAAY,UAAU,MAAM,WAAW,QAAQ;AACrD,UAAO,KAAK,KAAK,UAAU,OAAO,UAAU,CAAC;AAC7C,eAAY;;AAGd,SAAO;;;AAMX,IAAa,uBAAb,cACU,+BAEV;CACE,YAAY,QAA8C;AACxD,QAAM;GACJ,GAAG;GACH,YACE,+BAA+B,yBAAyB,WAAW;GACtE,CAAC;;;AAMN,IAAa,oBAAb,cACU,+BAEV;CACE,YAAY,QAA2C;AACrD,QAAM;GACJ,GAAG;GACH,YACE,+BAA+B,yBAAyB,QAAQ;GACnE,CAAC"}
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import * as tiktoken from "js-tiktoken";
|
|
2
|
+
import { BaseDocumentTransformer, Document } from "@langchain/core/documents";
|
|
3
|
+
|
|
4
|
+
//#region src/text_splitter.d.ts
|
|
5
|
+
interface TextSplitterParams {
|
|
6
|
+
chunkSize: number;
|
|
7
|
+
chunkOverlap: number;
|
|
8
|
+
keepSeparator: boolean;
|
|
9
|
+
lengthFunction?: ((text: string) => number) | ((text: string) => Promise<number>);
|
|
10
|
+
}
|
|
11
|
+
type TextSplitterChunkHeaderOptions = {
|
|
12
|
+
chunkHeader?: string;
|
|
13
|
+
chunkOverlapHeader?: string;
|
|
14
|
+
appendChunkOverlapHeader?: boolean;
|
|
15
|
+
};
|
|
16
|
+
declare abstract class TextSplitter extends BaseDocumentTransformer implements TextSplitterParams {
|
|
17
|
+
lc_namespace: string[];
|
|
18
|
+
chunkSize: number;
|
|
19
|
+
chunkOverlap: number;
|
|
20
|
+
keepSeparator: boolean;
|
|
21
|
+
lengthFunction: ((text: string) => number) | ((text: string) => Promise<number>);
|
|
22
|
+
constructor(fields?: Partial<TextSplitterParams>);
|
|
23
|
+
transformDocuments(documents: Document[], chunkHeaderOptions?: TextSplitterChunkHeaderOptions): Promise<Document[]>;
|
|
24
|
+
abstract splitText(text: string): Promise<string[]>;
|
|
25
|
+
protected splitOnSeparator(text: string, separator: string): string[];
|
|
26
|
+
createDocuments(texts: string[], metadatas?: Record<string, any>[], chunkHeaderOptions?: TextSplitterChunkHeaderOptions): Promise<Document[]>;
|
|
27
|
+
private numberOfNewLines;
|
|
28
|
+
splitDocuments(documents: Document[], chunkHeaderOptions?: TextSplitterChunkHeaderOptions): Promise<Document[]>;
|
|
29
|
+
private joinDocs;
|
|
30
|
+
mergeSplits(splits: string[], separator: string): Promise<string[]>;
|
|
31
|
+
}
|
|
32
|
+
interface CharacterTextSplitterParams extends TextSplitterParams {
|
|
33
|
+
separator: string;
|
|
34
|
+
}
|
|
35
|
+
declare class CharacterTextSplitter extends TextSplitter implements CharacterTextSplitterParams {
|
|
36
|
+
static lc_name(): string;
|
|
37
|
+
separator: string;
|
|
38
|
+
constructor(fields?: Partial<CharacterTextSplitterParams>);
|
|
39
|
+
splitText(text: string): Promise<string[]>;
|
|
40
|
+
}
|
|
41
|
+
interface RecursiveCharacterTextSplitterParams extends TextSplitterParams {
|
|
42
|
+
separators: string[];
|
|
43
|
+
}
|
|
44
|
+
declare const SupportedTextSplitterLanguages: readonly ["cpp", "go", "java", "js", "php", "proto", "python", "rst", "ruby", "rust", "scala", "swift", "markdown", "latex", "html", "sol"];
|
|
45
|
+
type SupportedTextSplitterLanguage = (typeof SupportedTextSplitterLanguages)[number];
|
|
46
|
+
declare class RecursiveCharacterTextSplitter extends TextSplitter implements RecursiveCharacterTextSplitterParams {
|
|
47
|
+
static lc_name(): string;
|
|
48
|
+
separators: string[];
|
|
49
|
+
constructor(fields?: Partial<RecursiveCharacterTextSplitterParams>);
|
|
50
|
+
private _splitText;
|
|
51
|
+
splitText(text: string): Promise<string[]>;
|
|
52
|
+
static fromLanguage(language: SupportedTextSplitterLanguage, options?: Partial<RecursiveCharacterTextSplitterParams>): RecursiveCharacterTextSplitter;
|
|
53
|
+
static getSeparatorsForLanguage(language: SupportedTextSplitterLanguage): string[];
|
|
54
|
+
}
|
|
55
|
+
interface TokenTextSplitterParams extends TextSplitterParams {
|
|
56
|
+
encodingName: tiktoken.TiktokenEncoding;
|
|
57
|
+
allowedSpecial: "all" | Array<string>;
|
|
58
|
+
disallowedSpecial: "all" | Array<string>;
|
|
59
|
+
}
|
|
60
|
+
/**
|
|
61
|
+
* Implementation of splitter which looks at tokens.
|
|
62
|
+
*/
|
|
63
|
+
declare class TokenTextSplitter extends TextSplitter implements TokenTextSplitterParams {
|
|
64
|
+
static lc_name(): string;
|
|
65
|
+
encodingName: tiktoken.TiktokenEncoding;
|
|
66
|
+
allowedSpecial: "all" | Array<string>;
|
|
67
|
+
disallowedSpecial: "all" | Array<string>;
|
|
68
|
+
private tokenizer;
|
|
69
|
+
constructor(fields?: Partial<TokenTextSplitterParams>);
|
|
70
|
+
splitText(text: string): Promise<string[]>;
|
|
71
|
+
}
|
|
72
|
+
type MarkdownTextSplitterParams = TextSplitterParams;
|
|
73
|
+
declare class MarkdownTextSplitter extends RecursiveCharacterTextSplitter implements MarkdownTextSplitterParams {
|
|
74
|
+
constructor(fields?: Partial<MarkdownTextSplitterParams>);
|
|
75
|
+
}
|
|
76
|
+
type LatexTextSplitterParams = TextSplitterParams;
|
|
77
|
+
declare class LatexTextSplitter extends RecursiveCharacterTextSplitter implements LatexTextSplitterParams {
|
|
78
|
+
constructor(fields?: Partial<LatexTextSplitterParams>);
|
|
79
|
+
}
|
|
80
|
+
//#endregion
|
|
81
|
+
export { CharacterTextSplitter, CharacterTextSplitterParams, LatexTextSplitter, LatexTextSplitterParams, MarkdownTextSplitter, MarkdownTextSplitterParams, RecursiveCharacterTextSplitter, RecursiveCharacterTextSplitterParams, SupportedTextSplitterLanguage, SupportedTextSplitterLanguages, TextSplitter, TextSplitterChunkHeaderOptions, TextSplitterParams, TokenTextSplitter, TokenTextSplitterParams };
|
|
82
|
+
//# sourceMappingURL=text_splitter.d.cts.map
|