langchain 0.1.35 → 0.1.37
This diff shows the published contents of two package versions as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between those versions.
- package/dist/chains/conversational_retrieval_chain.cjs +61 -19
- package/dist/chains/conversational_retrieval_chain.d.ts +61 -19
- package/dist/chains/conversational_retrieval_chain.js +61 -19
- package/dist/chains/llm_chain.cjs +10 -5
- package/dist/chains/llm_chain.d.ts +10 -5
- package/dist/chains/llm_chain.js +10 -5
- package/dist/chains/openai_functions/base.cjs +2 -0
- package/dist/chains/openai_functions/base.d.ts +2 -0
- package/dist/chains/openai_functions/base.js +2 -0
- package/dist/chains/query_constructor/index.cjs +5 -8
- package/dist/chains/query_constructor/index.d.ts +5 -4
- package/dist/chains/query_constructor/index.js +3 -6
- package/dist/chains/query_constructor/ir.cjs +15 -139
- package/dist/chains/query_constructor/ir.d.ts +1 -138
- package/dist/chains/query_constructor/ir.js +1 -132
- package/dist/chains/query_constructor/prompt.cjs +2 -2
- package/dist/chains/query_constructor/prompt.d.ts +1 -1
- package/dist/chains/query_constructor/prompt.js +1 -1
- package/dist/chains/retrieval_qa.cjs +23 -14
- package/dist/chains/retrieval_qa.d.ts +23 -14
- package/dist/chains/retrieval_qa.js +23 -14
- package/dist/document_loaders/fs/unstructured.cjs +1 -1
- package/dist/document_loaders/fs/unstructured.js +1 -1
- package/dist/document_loaders/web/browserbase.cjs +87 -0
- package/dist/document_loaders/web/browserbase.d.ts +49 -0
- package/dist/document_loaders/web/browserbase.js +80 -0
- package/dist/document_loaders/web/firecrawl.cjs +88 -0
- package/dist/document_loaders/web/firecrawl.d.ts +48 -0
- package/dist/document_loaders/web/firecrawl.js +81 -0
- package/dist/document_loaders/web/s3.cjs +2 -2
- package/dist/document_loaders/web/s3.js +2 -2
- package/dist/load/import_constants.cjs +2 -0
- package/dist/load/import_constants.js +2 -0
- package/dist/output_parsers/expression.cjs +1 -1
- package/dist/output_parsers/expression.d.ts +1 -1
- package/dist/output_parsers/expression.js +1 -1
- package/dist/retrievers/self_query/base.cjs +3 -136
- package/dist/retrievers/self_query/base.d.ts +1 -69
- package/dist/retrievers/self_query/base.js +1 -134
- package/dist/retrievers/self_query/chroma.cjs +9 -10
- package/dist/retrievers/self_query/chroma.d.ts +1 -1
- package/dist/retrievers/self_query/chroma.js +1 -2
- package/dist/retrievers/self_query/functional.cjs +2 -195
- package/dist/retrievers/self_query/functional.d.ts +1 -87
- package/dist/retrievers/self_query/functional.js +1 -194
- package/dist/retrievers/self_query/index.cjs +9 -13
- package/dist/retrievers/self_query/index.d.ts +11 -8
- package/dist/retrievers/self_query/index.js +7 -11
- package/dist/retrievers/self_query/pinecone.cjs +9 -10
- package/dist/retrievers/self_query/pinecone.d.ts +1 -1
- package/dist/retrievers/self_query/pinecone.js +1 -2
- package/dist/retrievers/self_query/supabase.cjs +28 -30
- package/dist/retrievers/self_query/supabase.d.ts +1 -2
- package/dist/retrievers/self_query/supabase.js +1 -3
- package/dist/retrievers/self_query/supabase_utils.cjs +2 -2
- package/dist/retrievers/self_query/supabase_utils.d.ts +1 -1
- package/dist/retrievers/self_query/supabase_utils.js +1 -1
- package/dist/retrievers/self_query/vectara.cjs +15 -17
- package/dist/retrievers/self_query/vectara.d.ts +1 -2
- package/dist/retrievers/self_query/vectara.js +1 -3
- package/dist/retrievers/self_query/weaviate.cjs +19 -21
- package/dist/retrievers/self_query/weaviate.d.ts +1 -2
- package/dist/retrievers/self_query/weaviate.js +1 -3
- package/dist/smith/config.d.ts +4 -4
- package/dist/storage/in_memory.cjs +2 -81
- package/dist/storage/in_memory.d.ts +1 -49
- package/dist/storage/in_memory.js +1 -80
- package/dist/text_splitter.cjs +15 -727
- package/dist/text_splitter.d.ts +1 -77
- package/dist/text_splitter.js +1 -720
- package/dist/vectorstores/qdrant.cjs +2 -0
- package/dist/vectorstores/qdrant.js +2 -0
- package/document_loaders/web/browserbase.cjs +1 -0
- package/document_loaders/web/browserbase.d.cts +1 -0
- package/document_loaders/web/browserbase.d.ts +1 -0
- package/document_loaders/web/browserbase.js +1 -0
- package/document_loaders/web/firecrawl.cjs +1 -0
- package/document_loaders/web/firecrawl.d.cts +1 -0
- package/document_loaders/web/firecrawl.d.ts +1 -0
- package/document_loaders/web/firecrawl.js +1 -0
- package/package.json +40 -3
- package/dist/retrievers/self_query/utils.cjs +0 -94
- package/dist/retrievers/self_query/utils.d.ts +0 -29
- package/dist/retrievers/self_query/utils.js +0 -85
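The largest entries in this list are package/dist/text_splitter.* (+15 -727 each), which shrink to a thin re-export: the splitter implementations now come from the @langchain/textsplitters package (see the diff below). A minimal TypeScript sketch of what this means for consumers, assuming the extracted package exports the same class names the removed code defined (RecursiveCharacterTextSplitter, CharacterTextSplitter, TokenTextSplitter, ...):

  // Existing imports keep resolving, because the old entrypoint re-exports the new package.
  import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
  // The same class should also be importable directly (assumed):
  // import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";

  const splitter = new RecursiveCharacterTextSplitter({
    chunkSize: 1000, // default in the removed implementation
    chunkOverlap: 200, // default in the removed implementation
  });
  const docs = await splitter.createDocuments(["Some long document text ..."]);

Either import path should yield equivalent chunking behavior, since the old module now simply forwards every named export of @langchain/textsplitters.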
package/dist/text_splitter.cjs
CHANGED
@@ -1,729 +1,17 @@
 "use strict";
+var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    var desc = Object.getOwnPropertyDescriptor(m, k);
+    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+      desc = { enumerable: true, get: function() { return m[k]; } };
+    }
+    Object.defineProperty(o, k2, desc);
+}) : (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    o[k2] = m[k];
+}));
+var __exportStar = (this && this.__exportStar) || function(m, exports) {
+    for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
+};
 Object.defineProperty(exports, "__esModule", { value: true });
-
-const documents_1 = require("@langchain/core/documents");
-const tiktoken_1 = require("@langchain/core/utils/tiktoken");
-class TextSplitter extends documents_1.BaseDocumentTransformer {
-    constructor(fields) {
-        super(fields);
-        Object.defineProperty(this, "lc_namespace", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: ["langchain", "document_transformers", "text_splitters"]
-        });
-        Object.defineProperty(this, "chunkSize", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: 1000
-        });
-        Object.defineProperty(this, "chunkOverlap", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: 200
-        });
-        Object.defineProperty(this, "keepSeparator", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: false
-        });
-        Object.defineProperty(this, "lengthFunction", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        this.chunkSize = fields?.chunkSize ?? this.chunkSize;
-        this.chunkOverlap = fields?.chunkOverlap ?? this.chunkOverlap;
-        this.keepSeparator = fields?.keepSeparator ?? this.keepSeparator;
-        this.lengthFunction =
-            fields?.lengthFunction ?? ((text) => text.length);
-        if (this.chunkOverlap >= this.chunkSize) {
-            throw new Error("Cannot have chunkOverlap >= chunkSize");
-        }
-    }
-    async transformDocuments(documents, chunkHeaderOptions = {}) {
-        return this.splitDocuments(documents, chunkHeaderOptions);
-    }
-    splitOnSeparator(text, separator) {
-        let splits;
-        if (separator) {
-            if (this.keepSeparator) {
-                const regexEscapedSeparator = separator.replace(/[/\-\\^$*+?.()|[\]{}]/g, "\\$&");
-                splits = text.split(new RegExp(`(?=${regexEscapedSeparator})`));
-            }
-            else {
-                splits = text.split(separator);
-            }
-        }
-        else {
-            splits = text.split("");
-        }
-        return splits.filter((s) => s !== "");
-    }
-    async createDocuments(texts, 
-    // eslint-disable-next-line @typescript-eslint/no-explicit-any
-    metadatas = [], chunkHeaderOptions = {}) {
-        // if no metadata is provided, we create an empty one for each text
-        // eslint-disable-next-line @typescript-eslint/no-explicit-any
-        const _metadatas = metadatas.length > 0
-            ? metadatas
-            : [...Array(texts.length)].map(() => ({}));
-        const { chunkHeader = "", chunkOverlapHeader = "(cont'd) ", appendChunkOverlapHeader = false, } = chunkHeaderOptions;
-        const documents = new Array();
-        for (let i = 0; i < texts.length; i += 1) {
-            const text = texts[i];
-            let lineCounterIndex = 1;
-            let prevChunk = null;
-            let indexPrevChunk = -1;
-            for (const chunk of await this.splitText(text)) {
-                let pageContent = chunkHeader;
-                // we need to count the \n that are in the text before getting removed by the splitting
-                const indexChunk = text.indexOf(chunk, indexPrevChunk + 1);
-                if (prevChunk === null) {
-                    const newLinesBeforeFirstChunk = this.numberOfNewLines(text, 0, indexChunk);
-                    lineCounterIndex += newLinesBeforeFirstChunk;
-                }
-                else {
-                    const indexEndPrevChunk = indexPrevChunk + (await this.lengthFunction(prevChunk));
-                    if (indexEndPrevChunk < indexChunk) {
-                        const numberOfIntermediateNewLines = this.numberOfNewLines(text, indexEndPrevChunk, indexChunk);
-                        lineCounterIndex += numberOfIntermediateNewLines;
-                    }
-                    else if (indexEndPrevChunk > indexChunk) {
-                        const numberOfIntermediateNewLines = this.numberOfNewLines(text, indexChunk, indexEndPrevChunk);
-                        lineCounterIndex -= numberOfIntermediateNewLines;
-                    }
-                    if (appendChunkOverlapHeader) {
-                        pageContent += chunkOverlapHeader;
-                    }
-                }
-                const newLinesCount = this.numberOfNewLines(chunk);
-                const loc = _metadatas[i].loc && typeof _metadatas[i].loc === "object"
-                    ? { ..._metadatas[i].loc }
-                    : {};
-                loc.lines = {
-                    from: lineCounterIndex,
-                    to: lineCounterIndex + newLinesCount,
-                };
-                const metadataWithLinesNumber = {
-                    ..._metadatas[i],
-                    loc,
-                };
-                pageContent += chunk;
-                documents.push(new documents_1.Document({
-                    pageContent,
-                    metadata: metadataWithLinesNumber,
-                }));
-                lineCounterIndex += newLinesCount;
-                prevChunk = chunk;
-                indexPrevChunk = indexChunk;
-            }
-        }
-        return documents;
-    }
-    numberOfNewLines(text, start, end) {
-        const textSection = text.slice(start, end);
-        return (textSection.match(/\n/g) || []).length;
-    }
-    async splitDocuments(documents, chunkHeaderOptions = {}) {
-        const selectedDocuments = documents.filter((doc) => doc.pageContent !== undefined);
-        const texts = selectedDocuments.map((doc) => doc.pageContent);
-        const metadatas = selectedDocuments.map((doc) => doc.metadata);
-        return this.createDocuments(texts, metadatas, chunkHeaderOptions);
-    }
-    joinDocs(docs, separator) {
-        const text = docs.join(separator).trim();
-        return text === "" ? null : text;
-    }
-    async mergeSplits(splits, separator) {
-        const docs = [];
-        const currentDoc = [];
-        let total = 0;
-        for (const d of splits) {
-            const _len = await this.lengthFunction(d);
-            if (total + _len + currentDoc.length * separator.length >
-                this.chunkSize) {
-                if (total > this.chunkSize) {
-                    console.warn(`Created a chunk of size ${total}, +
-which is longer than the specified ${this.chunkSize}`);
-                }
-                if (currentDoc.length > 0) {
-                    const doc = this.joinDocs(currentDoc, separator);
-                    if (doc !== null) {
-                        docs.push(doc);
-                    }
-                    // Keep on popping if:
-                    // - we have a larger chunk than in the chunk overlap
-                    // - or if we still have any chunks and the length is long
-                    while (total > this.chunkOverlap ||
-                        (total + _len + currentDoc.length * separator.length >
-                            this.chunkSize &&
-                            total > 0)) {
-                        total -= await this.lengthFunction(currentDoc[0]);
-                        currentDoc.shift();
-                    }
-                }
-            }
-            currentDoc.push(d);
-            total += _len;
-        }
-        const doc = this.joinDocs(currentDoc, separator);
-        if (doc !== null) {
-            docs.push(doc);
-        }
-        return docs;
-    }
-}
-exports.TextSplitter = TextSplitter;
-class CharacterTextSplitter extends TextSplitter {
-    static lc_name() {
-        return "CharacterTextSplitter";
-    }
-    constructor(fields) {
-        super(fields);
-        Object.defineProperty(this, "separator", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: "\n\n"
-        });
-        this.separator = fields?.separator ?? this.separator;
-    }
-    async splitText(text) {
-        // First we naively split the large input into a bunch of smaller ones.
-        const splits = this.splitOnSeparator(text, this.separator);
-        return this.mergeSplits(splits, this.keepSeparator ? "" : this.separator);
-    }
-}
-exports.CharacterTextSplitter = CharacterTextSplitter;
-exports.SupportedTextSplitterLanguages = [
-    "cpp",
-    "go",
-    "java",
-    "js",
-    "php",
-    "proto",
-    "python",
-    "rst",
-    "ruby",
-    "rust",
-    "scala",
-    "swift",
-    "markdown",
-    "latex",
-    "html",
-    "sol",
-];
-class RecursiveCharacterTextSplitter extends TextSplitter {
-    static lc_name() {
-        return "RecursiveCharacterTextSplitter";
-    }
-    constructor(fields) {
-        super(fields);
-        Object.defineProperty(this, "separators", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: ["\n\n", "\n", " ", ""]
-        });
-        this.separators = fields?.separators ?? this.separators;
-        this.keepSeparator = fields?.keepSeparator ?? true;
-    }
-    async _splitText(text, separators) {
-        const finalChunks = [];
-        // Get appropriate separator to use
-        let separator = separators[separators.length - 1];
-        let newSeparators;
-        for (let i = 0; i < separators.length; i += 1) {
-            const s = separators[i];
-            if (s === "") {
-                separator = s;
-                break;
-            }
-            if (text.includes(s)) {
-                separator = s;
-                newSeparators = separators.slice(i + 1);
-                break;
-            }
-        }
-        // Now that we have the separator, split the text
-        const splits = this.splitOnSeparator(text, separator);
-        // Now go merging things, recursively splitting longer texts.
-        let goodSplits = [];
-        const _separator = this.keepSeparator ? "" : separator;
-        for (const s of splits) {
-            if ((await this.lengthFunction(s)) < this.chunkSize) {
-                goodSplits.push(s);
-            }
-            else {
-                if (goodSplits.length) {
-                    const mergedText = await this.mergeSplits(goodSplits, _separator);
-                    finalChunks.push(...mergedText);
-                    goodSplits = [];
-                }
-                if (!newSeparators) {
-                    finalChunks.push(s);
-                }
-                else {
-                    const otherInfo = await this._splitText(s, newSeparators);
-                    finalChunks.push(...otherInfo);
-                }
-            }
-        }
-        if (goodSplits.length) {
-            const mergedText = await this.mergeSplits(goodSplits, _separator);
-            finalChunks.push(...mergedText);
-        }
-        return finalChunks;
-    }
-    async splitText(text) {
-        return this._splitText(text, this.separators);
-    }
-    static fromLanguage(language, options) {
-        return new RecursiveCharacterTextSplitter({
-            ...options,
-            separators: RecursiveCharacterTextSplitter.getSeparatorsForLanguage(language),
-        });
-    }
-    static getSeparatorsForLanguage(language) {
-        if (language === "cpp") {
-            return [
-                // Split along class definitions
-                "\nclass ",
-                // Split along function definitions
-                "\nvoid ",
-                "\nint ",
-                "\nfloat ",
-                "\ndouble ",
-                // Split along control flow statements
-                "\nif ",
-                "\nfor ",
-                "\nwhile ",
-                "\nswitch ",
-                "\ncase ",
-                // Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ];
-        }
-        else if (language === "go") {
-            return [
-                // Split along function definitions
-                "\nfunc ",
-                "\nvar ",
-                "\nconst ",
-                "\ntype ",
-                // Split along control flow statements
-                "\nif ",
-                "\nfor ",
-                "\nswitch ",
-                "\ncase ",
-                // Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ];
-        }
-        else if (language === "java") {
-            return [
-                // Split along class definitions
-                "\nclass ",
-                // Split along method definitions
-                "\npublic ",
-                "\nprotected ",
-                "\nprivate ",
-                "\nstatic ",
-                // Split along control flow statements
-                "\nif ",
-                "\nfor ",
-                "\nwhile ",
-                "\nswitch ",
-                "\ncase ",
-                // Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ];
-        }
-        else if (language === "js") {
-            return [
-                // Split along function definitions
-                "\nfunction ",
-                "\nconst ",
-                "\nlet ",
-                "\nvar ",
-                "\nclass ",
-                // Split along control flow statements
-                "\nif ",
-                "\nfor ",
-                "\nwhile ",
-                "\nswitch ",
-                "\ncase ",
-                "\ndefault ",
-                // Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ];
-        }
-        else if (language === "php") {
-            return [
-                // Split along function definitions
-                "\nfunction ",
-                // Split along class definitions
-                "\nclass ",
-                // Split along control flow statements
-                "\nif ",
-                "\nforeach ",
-                "\nwhile ",
-                "\ndo ",
-                "\nswitch ",
-                "\ncase ",
-                // Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ];
-        }
-        else if (language === "proto") {
-            return [
-                // Split along message definitions
-                "\nmessage ",
-                // Split along service definitions
-                "\nservice ",
-                // Split along enum definitions
-                "\nenum ",
-                // Split along option definitions
-                "\noption ",
-                // Split along import statements
-                "\nimport ",
-                // Split along syntax declarations
-                "\nsyntax ",
-                // Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ];
-        }
-        else if (language === "python") {
-            return [
-                // First, try to split along class definitions
-                "\nclass ",
-                "\ndef ",
-                "\n\tdef ",
-                // Now split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ];
-        }
-        else if (language === "rst") {
-            return [
-                // Split along section titles
-                "\n===\n",
-                "\n---\n",
-                "\n***\n",
-                // Split along directive markers
-                "\n.. ",
-                // Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ];
-        }
-        else if (language === "ruby") {
-            return [
-                // Split along method definitions
-                "\ndef ",
-                "\nclass ",
-                // Split along control flow statements
-                "\nif ",
-                "\nunless ",
-                "\nwhile ",
-                "\nfor ",
-                "\ndo ",
-                "\nbegin ",
-                "\nrescue ",
-                // Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ];
-        }
-        else if (language === "rust") {
-            return [
-                // Split along function definitions
-                "\nfn ",
-                "\nconst ",
-                "\nlet ",
-                // Split along control flow statements
-                "\nif ",
-                "\nwhile ",
-                "\nfor ",
-                "\nloop ",
-                "\nmatch ",
-                "\nconst ",
-                // Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ];
-        }
-        else if (language === "scala") {
-            return [
-                // Split along class definitions
-                "\nclass ",
-                "\nobject ",
-                // Split along method definitions
-                "\ndef ",
-                "\nval ",
-                "\nvar ",
-                // Split along control flow statements
-                "\nif ",
-                "\nfor ",
-                "\nwhile ",
-                "\nmatch ",
-                "\ncase ",
-                // Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ];
-        }
-        else if (language === "swift") {
-            return [
-                // Split along function definitions
-                "\nfunc ",
-                // Split along class definitions
-                "\nclass ",
-                "\nstruct ",
-                "\nenum ",
-                // Split along control flow statements
-                "\nif ",
-                "\nfor ",
-                "\nwhile ",
-                "\ndo ",
-                "\nswitch ",
-                "\ncase ",
-                // Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ];
-        }
-        else if (language === "markdown") {
-            return [
-                // First, try to split along Markdown headings (starting with level 2)
-                "\n## ",
-                "\n### ",
-                "\n#### ",
-                "\n##### ",
-                "\n###### ",
-                // Note the alternative syntax for headings (below) is not handled here
-                // Heading level 2
-                // ---------------
-                // End of code block
-                "```\n\n",
-                // Horizontal lines
-                "\n\n***\n\n",
-                "\n\n---\n\n",
-                "\n\n___\n\n",
-                // Note that this splitter doesn't handle horizontal lines defined
-                // by *three or more* of ***, ---, or ___, but this is not handled
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ];
-        }
-        else if (language === "latex") {
-            return [
-                // First, try to split along Latex sections
-                "\n\\chapter{",
-                "\n\\section{",
-                "\n\\subsection{",
-                "\n\\subsubsection{",
-                // Now split by environments
-                "\n\\begin{enumerate}",
-                "\n\\begin{itemize}",
-                "\n\\begin{description}",
-                "\n\\begin{list}",
-                "\n\\begin{quote}",
-                "\n\\begin{quotation}",
-                "\n\\begin{verse}",
-                "\n\\begin{verbatim}",
-                // Now split by math environments
-                "\n\\begin{align}",
-                "$$",
-                "$",
-                // Now split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ];
-        }
-        else if (language === "html") {
-            return [
-                // First, try to split along HTML tags
-                "<body>",
-                "<div>",
-                "<p>",
-                "<br>",
-                "<li>",
-                "<h1>",
-                "<h2>",
-                "<h3>",
-                "<h4>",
-                "<h5>",
-                "<h6>",
-                "<span>",
-                "<table>",
-                "<tr>",
-                "<td>",
-                "<th>",
-                "<ul>",
-                "<ol>",
-                "<header>",
-                "<footer>",
-                "<nav>",
-                // Head
-                "<head>",
-                "<style>",
-                "<script>",
-                "<meta>",
-                "<title>",
-                // Normal type of lines
-                " ",
-                "",
-            ];
-        }
-        else if (language === "sol") {
-            return [
-                // Split along compiler informations definitions
-                "\npragma ",
-                "\nusing ",
-                // Split along contract definitions
-                "\ncontract ",
-                "\ninterface ",
-                "\nlibrary ",
-                // Split along method definitions
-                "\nconstructor ",
-                "\ntype ",
-                "\nfunction ",
-                "\nevent ",
-                "\nmodifier ",
-                "\nerror ",
-                "\nstruct ",
-                "\nenum ",
-                // Split along control flow statements
-                "\nif ",
-                "\nfor ",
-                "\nwhile ",
-                "\ndo while ",
-                "\nassembly ",
-                // Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ];
-        }
-        else {
-            throw new Error(`Language ${language} is not supported.`);
-        }
-    }
-}
-exports.RecursiveCharacterTextSplitter = RecursiveCharacterTextSplitter;
-/**
- * Implementation of splitter which looks at tokens.
- */
-class TokenTextSplitter extends TextSplitter {
-    static lc_name() {
-        return "TokenTextSplitter";
-    }
-    constructor(fields) {
-        super(fields);
-        Object.defineProperty(this, "encodingName", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "allowedSpecial", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "disallowedSpecial", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        Object.defineProperty(this, "tokenizer", {
-            enumerable: true,
-            configurable: true,
-            writable: true,
-            value: void 0
-        });
-        this.encodingName = fields?.encodingName ?? "gpt2";
-        this.allowedSpecial = fields?.allowedSpecial ?? [];
-        this.disallowedSpecial = fields?.disallowedSpecial ?? "all";
-    }
-    async splitText(text) {
-        if (!this.tokenizer) {
-            this.tokenizer = await (0, tiktoken_1.getEncoding)(this.encodingName);
-        }
-        const splits = [];
-        const input_ids = this.tokenizer.encode(text, this.allowedSpecial, this.disallowedSpecial);
-        let start_idx = 0;
-        while (start_idx < input_ids.length) {
-            if (start_idx > 0) {
-                start_idx -= this.chunkOverlap;
-            }
-            const end_idx = Math.min(start_idx + this.chunkSize, input_ids.length);
-            const chunk_ids = input_ids.slice(start_idx, end_idx);
-            splits.push(this.tokenizer.decode(chunk_ids));
-            start_idx = end_idx;
-        }
-        return splits;
-    }
-}
-exports.TokenTextSplitter = TokenTextSplitter;
-class MarkdownTextSplitter extends RecursiveCharacterTextSplitter {
-    constructor(fields) {
-        super({
-            ...fields,
-            separators: RecursiveCharacterTextSplitter.getSeparatorsForLanguage("markdown"),
-        });
-    }
-}
-exports.MarkdownTextSplitter = MarkdownTextSplitter;
-class LatexTextSplitter extends RecursiveCharacterTextSplitter {
-    constructor(fields) {
-        super({
-            ...fields,
-            separators: RecursiveCharacterTextSplitter.getSeparatorsForLanguage("latex"),
-        });
-    }
-}
-exports.LatexTextSplitter = LatexTextSplitter;
+__exportStar(require("@langchain/textsplitters"), exports);