langchain 0.1.35 → 0.1.37

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. package/dist/chains/conversational_retrieval_chain.cjs +61 -19
  2. package/dist/chains/conversational_retrieval_chain.d.ts +61 -19
  3. package/dist/chains/conversational_retrieval_chain.js +61 -19
  4. package/dist/chains/llm_chain.cjs +10 -5
  5. package/dist/chains/llm_chain.d.ts +10 -5
  6. package/dist/chains/llm_chain.js +10 -5
  7. package/dist/chains/openai_functions/base.cjs +2 -0
  8. package/dist/chains/openai_functions/base.d.ts +2 -0
  9. package/dist/chains/openai_functions/base.js +2 -0
  10. package/dist/chains/query_constructor/index.cjs +5 -8
  11. package/dist/chains/query_constructor/index.d.ts +5 -4
  12. package/dist/chains/query_constructor/index.js +3 -6
  13. package/dist/chains/query_constructor/ir.cjs +15 -139
  14. package/dist/chains/query_constructor/ir.d.ts +1 -138
  15. package/dist/chains/query_constructor/ir.js +1 -132
  16. package/dist/chains/query_constructor/prompt.cjs +2 -2
  17. package/dist/chains/query_constructor/prompt.d.ts +1 -1
  18. package/dist/chains/query_constructor/prompt.js +1 -1
  19. package/dist/chains/retrieval_qa.cjs +23 -14
  20. package/dist/chains/retrieval_qa.d.ts +23 -14
  21. package/dist/chains/retrieval_qa.js +23 -14
  22. package/dist/document_loaders/fs/unstructured.cjs +1 -1
  23. package/dist/document_loaders/fs/unstructured.js +1 -1
  24. package/dist/document_loaders/web/browserbase.cjs +87 -0
  25. package/dist/document_loaders/web/browserbase.d.ts +49 -0
  26. package/dist/document_loaders/web/browserbase.js +80 -0
  27. package/dist/document_loaders/web/firecrawl.cjs +88 -0
  28. package/dist/document_loaders/web/firecrawl.d.ts +48 -0
  29. package/dist/document_loaders/web/firecrawl.js +81 -0
  30. package/dist/document_loaders/web/s3.cjs +2 -2
  31. package/dist/document_loaders/web/s3.js +2 -2
  32. package/dist/load/import_constants.cjs +2 -0
  33. package/dist/load/import_constants.js +2 -0
  34. package/dist/output_parsers/expression.cjs +1 -1
  35. package/dist/output_parsers/expression.d.ts +1 -1
  36. package/dist/output_parsers/expression.js +1 -1
  37. package/dist/retrievers/self_query/base.cjs +3 -136
  38. package/dist/retrievers/self_query/base.d.ts +1 -69
  39. package/dist/retrievers/self_query/base.js +1 -134
  40. package/dist/retrievers/self_query/chroma.cjs +9 -10
  41. package/dist/retrievers/self_query/chroma.d.ts +1 -1
  42. package/dist/retrievers/self_query/chroma.js +1 -2
  43. package/dist/retrievers/self_query/functional.cjs +2 -195
  44. package/dist/retrievers/self_query/functional.d.ts +1 -87
  45. package/dist/retrievers/self_query/functional.js +1 -194
  46. package/dist/retrievers/self_query/index.cjs +9 -13
  47. package/dist/retrievers/self_query/index.d.ts +11 -8
  48. package/dist/retrievers/self_query/index.js +7 -11
  49. package/dist/retrievers/self_query/pinecone.cjs +9 -10
  50. package/dist/retrievers/self_query/pinecone.d.ts +1 -1
  51. package/dist/retrievers/self_query/pinecone.js +1 -2
  52. package/dist/retrievers/self_query/supabase.cjs +28 -30
  53. package/dist/retrievers/self_query/supabase.d.ts +1 -2
  54. package/dist/retrievers/self_query/supabase.js +1 -3
  55. package/dist/retrievers/self_query/supabase_utils.cjs +2 -2
  56. package/dist/retrievers/self_query/supabase_utils.d.ts +1 -1
  57. package/dist/retrievers/self_query/supabase_utils.js +1 -1
  58. package/dist/retrievers/self_query/vectara.cjs +15 -17
  59. package/dist/retrievers/self_query/vectara.d.ts +1 -2
  60. package/dist/retrievers/self_query/vectara.js +1 -3
  61. package/dist/retrievers/self_query/weaviate.cjs +19 -21
  62. package/dist/retrievers/self_query/weaviate.d.ts +1 -2
  63. package/dist/retrievers/self_query/weaviate.js +1 -3
  64. package/dist/smith/config.d.ts +4 -4
  65. package/dist/storage/in_memory.cjs +2 -81
  66. package/dist/storage/in_memory.d.ts +1 -49
  67. package/dist/storage/in_memory.js +1 -80
  68. package/dist/text_splitter.cjs +15 -727
  69. package/dist/text_splitter.d.ts +1 -77
  70. package/dist/text_splitter.js +1 -720
  71. package/dist/vectorstores/qdrant.cjs +2 -0
  72. package/dist/vectorstores/qdrant.js +2 -0
  73. package/document_loaders/web/browserbase.cjs +1 -0
  74. package/document_loaders/web/browserbase.d.cts +1 -0
  75. package/document_loaders/web/browserbase.d.ts +1 -0
  76. package/document_loaders/web/browserbase.js +1 -0
  77. package/document_loaders/web/firecrawl.cjs +1 -0
  78. package/document_loaders/web/firecrawl.d.cts +1 -0
  79. package/document_loaders/web/firecrawl.d.ts +1 -0
  80. package/document_loaders/web/firecrawl.js +1 -0
  81. package/package.json +40 -3
  82. package/dist/retrievers/self_query/utils.cjs +0 -94
  83. package/dist/retrievers/self_query/utils.d.ts +0 -29
  84. package/dist/retrievers/self_query/utils.js +0 -85
@@ -1,720 +1 @@
1
- import { Document, BaseDocumentTransformer } from "@langchain/core/documents";
2
- import { getEncoding } from "@langchain/core/utils/tiktoken";
3
- export class TextSplitter extends BaseDocumentTransformer {
4
- constructor(fields) {
5
- super(fields);
6
- Object.defineProperty(this, "lc_namespace", {
7
- enumerable: true,
8
- configurable: true,
9
- writable: true,
10
- value: ["langchain", "document_transformers", "text_splitters"]
11
- });
12
- Object.defineProperty(this, "chunkSize", {
13
- enumerable: true,
14
- configurable: true,
15
- writable: true,
16
- value: 1000
17
- });
18
- Object.defineProperty(this, "chunkOverlap", {
19
- enumerable: true,
20
- configurable: true,
21
- writable: true,
22
- value: 200
23
- });
24
- Object.defineProperty(this, "keepSeparator", {
25
- enumerable: true,
26
- configurable: true,
27
- writable: true,
28
- value: false
29
- });
30
- Object.defineProperty(this, "lengthFunction", {
31
- enumerable: true,
32
- configurable: true,
33
- writable: true,
34
- value: void 0
35
- });
36
- this.chunkSize = fields?.chunkSize ?? this.chunkSize;
37
- this.chunkOverlap = fields?.chunkOverlap ?? this.chunkOverlap;
38
- this.keepSeparator = fields?.keepSeparator ?? this.keepSeparator;
39
- this.lengthFunction =
40
- fields?.lengthFunction ?? ((text) => text.length);
41
- if (this.chunkOverlap >= this.chunkSize) {
42
- throw new Error("Cannot have chunkOverlap >= chunkSize");
43
- }
44
- }
45
- async transformDocuments(documents, chunkHeaderOptions = {}) {
46
- return this.splitDocuments(documents, chunkHeaderOptions);
47
- }
48
- splitOnSeparator(text, separator) {
49
- let splits;
50
- if (separator) {
51
- if (this.keepSeparator) {
52
- const regexEscapedSeparator = separator.replace(/[/\-\\^$*+?.()|[\]{}]/g, "\\$&");
53
- splits = text.split(new RegExp(`(?=${regexEscapedSeparator})`));
54
- }
55
- else {
56
- splits = text.split(separator);
57
- }
58
- }
59
- else {
60
- splits = text.split("");
61
- }
62
- return splits.filter((s) => s !== "");
63
- }
64
- async createDocuments(texts,
65
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
66
- metadatas = [], chunkHeaderOptions = {}) {
67
- // if no metadata is provided, we create an empty one for each text
68
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
69
- const _metadatas = metadatas.length > 0
70
- ? metadatas
71
- : [...Array(texts.length)].map(() => ({}));
72
- const { chunkHeader = "", chunkOverlapHeader = "(cont'd) ", appendChunkOverlapHeader = false, } = chunkHeaderOptions;
73
- const documents = new Array();
74
- for (let i = 0; i < texts.length; i += 1) {
75
- const text = texts[i];
76
- let lineCounterIndex = 1;
77
- let prevChunk = null;
78
- let indexPrevChunk = -1;
79
- for (const chunk of await this.splitText(text)) {
80
- let pageContent = chunkHeader;
81
- // we need to count the \n that are in the text before getting removed by the splitting
82
- const indexChunk = text.indexOf(chunk, indexPrevChunk + 1);
83
- if (prevChunk === null) {
84
- const newLinesBeforeFirstChunk = this.numberOfNewLines(text, 0, indexChunk);
85
- lineCounterIndex += newLinesBeforeFirstChunk;
86
- }
87
- else {
88
- const indexEndPrevChunk = indexPrevChunk + (await this.lengthFunction(prevChunk));
89
- if (indexEndPrevChunk < indexChunk) {
90
- const numberOfIntermediateNewLines = this.numberOfNewLines(text, indexEndPrevChunk, indexChunk);
91
- lineCounterIndex += numberOfIntermediateNewLines;
92
- }
93
- else if (indexEndPrevChunk > indexChunk) {
94
- const numberOfIntermediateNewLines = this.numberOfNewLines(text, indexChunk, indexEndPrevChunk);
95
- lineCounterIndex -= numberOfIntermediateNewLines;
96
- }
97
- if (appendChunkOverlapHeader) {
98
- pageContent += chunkOverlapHeader;
99
- }
100
- }
101
- const newLinesCount = this.numberOfNewLines(chunk);
102
- const loc = _metadatas[i].loc && typeof _metadatas[i].loc === "object"
103
- ? { ..._metadatas[i].loc }
104
- : {};
105
- loc.lines = {
106
- from: lineCounterIndex,
107
- to: lineCounterIndex + newLinesCount,
108
- };
109
- const metadataWithLinesNumber = {
110
- ..._metadatas[i],
111
- loc,
112
- };
113
- pageContent += chunk;
114
- documents.push(new Document({
115
- pageContent,
116
- metadata: metadataWithLinesNumber,
117
- }));
118
- lineCounterIndex += newLinesCount;
119
- prevChunk = chunk;
120
- indexPrevChunk = indexChunk;
121
- }
122
- }
123
- return documents;
124
- }
125
- numberOfNewLines(text, start, end) {
126
- const textSection = text.slice(start, end);
127
- return (textSection.match(/\n/g) || []).length;
128
- }
129
- async splitDocuments(documents, chunkHeaderOptions = {}) {
130
- const selectedDocuments = documents.filter((doc) => doc.pageContent !== undefined);
131
- const texts = selectedDocuments.map((doc) => doc.pageContent);
132
- const metadatas = selectedDocuments.map((doc) => doc.metadata);
133
- return this.createDocuments(texts, metadatas, chunkHeaderOptions);
134
- }
135
- joinDocs(docs, separator) {
136
- const text = docs.join(separator).trim();
137
- return text === "" ? null : text;
138
- }
139
- async mergeSplits(splits, separator) {
140
- const docs = [];
141
- const currentDoc = [];
142
- let total = 0;
143
- for (const d of splits) {
144
- const _len = await this.lengthFunction(d);
145
- if (total + _len + currentDoc.length * separator.length >
146
- this.chunkSize) {
147
- if (total > this.chunkSize) {
148
- console.warn(`Created a chunk of size ${total}, +
149
- which is longer than the specified ${this.chunkSize}`);
150
- }
151
- if (currentDoc.length > 0) {
152
- const doc = this.joinDocs(currentDoc, separator);
153
- if (doc !== null) {
154
- docs.push(doc);
155
- }
156
- // Keep on popping if:
157
- // - we have a larger chunk than in the chunk overlap
158
- // - or if we still have any chunks and the length is long
159
- while (total > this.chunkOverlap ||
160
- (total + _len + currentDoc.length * separator.length >
161
- this.chunkSize &&
162
- total > 0)) {
163
- total -= await this.lengthFunction(currentDoc[0]);
164
- currentDoc.shift();
165
- }
166
- }
167
- }
168
- currentDoc.push(d);
169
- total += _len;
170
- }
171
- const doc = this.joinDocs(currentDoc, separator);
172
- if (doc !== null) {
173
- docs.push(doc);
174
- }
175
- return docs;
176
- }
177
- }
178
- export class CharacterTextSplitter extends TextSplitter {
179
- static lc_name() {
180
- return "CharacterTextSplitter";
181
- }
182
- constructor(fields) {
183
- super(fields);
184
- Object.defineProperty(this, "separator", {
185
- enumerable: true,
186
- configurable: true,
187
- writable: true,
188
- value: "\n\n"
189
- });
190
- this.separator = fields?.separator ?? this.separator;
191
- }
192
- async splitText(text) {
193
- // First we naively split the large input into a bunch of smaller ones.
194
- const splits = this.splitOnSeparator(text, this.separator);
195
- return this.mergeSplits(splits, this.keepSeparator ? "" : this.separator);
196
- }
197
- }
198
- export const SupportedTextSplitterLanguages = [
199
- "cpp",
200
- "go",
201
- "java",
202
- "js",
203
- "php",
204
- "proto",
205
- "python",
206
- "rst",
207
- "ruby",
208
- "rust",
209
- "scala",
210
- "swift",
211
- "markdown",
212
- "latex",
213
- "html",
214
- "sol",
215
- ];
216
- export class RecursiveCharacterTextSplitter extends TextSplitter {
217
- static lc_name() {
218
- return "RecursiveCharacterTextSplitter";
219
- }
220
- constructor(fields) {
221
- super(fields);
222
- Object.defineProperty(this, "separators", {
223
- enumerable: true,
224
- configurable: true,
225
- writable: true,
226
- value: ["\n\n", "\n", " ", ""]
227
- });
228
- this.separators = fields?.separators ?? this.separators;
229
- this.keepSeparator = fields?.keepSeparator ?? true;
230
- }
231
- async _splitText(text, separators) {
232
- const finalChunks = [];
233
- // Get appropriate separator to use
234
- let separator = separators[separators.length - 1];
235
- let newSeparators;
236
- for (let i = 0; i < separators.length; i += 1) {
237
- const s = separators[i];
238
- if (s === "") {
239
- separator = s;
240
- break;
241
- }
242
- if (text.includes(s)) {
243
- separator = s;
244
- newSeparators = separators.slice(i + 1);
245
- break;
246
- }
247
- }
248
- // Now that we have the separator, split the text
249
- const splits = this.splitOnSeparator(text, separator);
250
- // Now go merging things, recursively splitting longer texts.
251
- let goodSplits = [];
252
- const _separator = this.keepSeparator ? "" : separator;
253
- for (const s of splits) {
254
- if ((await this.lengthFunction(s)) < this.chunkSize) {
255
- goodSplits.push(s);
256
- }
257
- else {
258
- if (goodSplits.length) {
259
- const mergedText = await this.mergeSplits(goodSplits, _separator);
260
- finalChunks.push(...mergedText);
261
- goodSplits = [];
262
- }
263
- if (!newSeparators) {
264
- finalChunks.push(s);
265
- }
266
- else {
267
- const otherInfo = await this._splitText(s, newSeparators);
268
- finalChunks.push(...otherInfo);
269
- }
270
- }
271
- }
272
- if (goodSplits.length) {
273
- const mergedText = await this.mergeSplits(goodSplits, _separator);
274
- finalChunks.push(...mergedText);
275
- }
276
- return finalChunks;
277
- }
278
- async splitText(text) {
279
- return this._splitText(text, this.separators);
280
- }
281
- static fromLanguage(language, options) {
282
- return new RecursiveCharacterTextSplitter({
283
- ...options,
284
- separators: RecursiveCharacterTextSplitter.getSeparatorsForLanguage(language),
285
- });
286
- }
287
- static getSeparatorsForLanguage(language) {
288
- if (language === "cpp") {
289
- return [
290
- // Split along class definitions
291
- "\nclass ",
292
- // Split along function definitions
293
- "\nvoid ",
294
- "\nint ",
295
- "\nfloat ",
296
- "\ndouble ",
297
- // Split along control flow statements
298
- "\nif ",
299
- "\nfor ",
300
- "\nwhile ",
301
- "\nswitch ",
302
- "\ncase ",
303
- // Split by the normal type of lines
304
- "\n\n",
305
- "\n",
306
- " ",
307
- "",
308
- ];
309
- }
310
- else if (language === "go") {
311
- return [
312
- // Split along function definitions
313
- "\nfunc ",
314
- "\nvar ",
315
- "\nconst ",
316
- "\ntype ",
317
- // Split along control flow statements
318
- "\nif ",
319
- "\nfor ",
320
- "\nswitch ",
321
- "\ncase ",
322
- // Split by the normal type of lines
323
- "\n\n",
324
- "\n",
325
- " ",
326
- "",
327
- ];
328
- }
329
- else if (language === "java") {
330
- return [
331
- // Split along class definitions
332
- "\nclass ",
333
- // Split along method definitions
334
- "\npublic ",
335
- "\nprotected ",
336
- "\nprivate ",
337
- "\nstatic ",
338
- // Split along control flow statements
339
- "\nif ",
340
- "\nfor ",
341
- "\nwhile ",
342
- "\nswitch ",
343
- "\ncase ",
344
- // Split by the normal type of lines
345
- "\n\n",
346
- "\n",
347
- " ",
348
- "",
349
- ];
350
- }
351
- else if (language === "js") {
352
- return [
353
- // Split along function definitions
354
- "\nfunction ",
355
- "\nconst ",
356
- "\nlet ",
357
- "\nvar ",
358
- "\nclass ",
359
- // Split along control flow statements
360
- "\nif ",
361
- "\nfor ",
362
- "\nwhile ",
363
- "\nswitch ",
364
- "\ncase ",
365
- "\ndefault ",
366
- // Split by the normal type of lines
367
- "\n\n",
368
- "\n",
369
- " ",
370
- "",
371
- ];
372
- }
373
- else if (language === "php") {
374
- return [
375
- // Split along function definitions
376
- "\nfunction ",
377
- // Split along class definitions
378
- "\nclass ",
379
- // Split along control flow statements
380
- "\nif ",
381
- "\nforeach ",
382
- "\nwhile ",
383
- "\ndo ",
384
- "\nswitch ",
385
- "\ncase ",
386
- // Split by the normal type of lines
387
- "\n\n",
388
- "\n",
389
- " ",
390
- "",
391
- ];
392
- }
393
- else if (language === "proto") {
394
- return [
395
- // Split along message definitions
396
- "\nmessage ",
397
- // Split along service definitions
398
- "\nservice ",
399
- // Split along enum definitions
400
- "\nenum ",
401
- // Split along option definitions
402
- "\noption ",
403
- // Split along import statements
404
- "\nimport ",
405
- // Split along syntax declarations
406
- "\nsyntax ",
407
- // Split by the normal type of lines
408
- "\n\n",
409
- "\n",
410
- " ",
411
- "",
412
- ];
413
- }
414
- else if (language === "python") {
415
- return [
416
- // First, try to split along class definitions
417
- "\nclass ",
418
- "\ndef ",
419
- "\n\tdef ",
420
- // Now split by the normal type of lines
421
- "\n\n",
422
- "\n",
423
- " ",
424
- "",
425
- ];
426
- }
427
- else if (language === "rst") {
428
- return [
429
- // Split along section titles
430
- "\n===\n",
431
- "\n---\n",
432
- "\n***\n",
433
- // Split along directive markers
434
- "\n.. ",
435
- // Split by the normal type of lines
436
- "\n\n",
437
- "\n",
438
- " ",
439
- "",
440
- ];
441
- }
442
- else if (language === "ruby") {
443
- return [
444
- // Split along method definitions
445
- "\ndef ",
446
- "\nclass ",
447
- // Split along control flow statements
448
- "\nif ",
449
- "\nunless ",
450
- "\nwhile ",
451
- "\nfor ",
452
- "\ndo ",
453
- "\nbegin ",
454
- "\nrescue ",
455
- // Split by the normal type of lines
456
- "\n\n",
457
- "\n",
458
- " ",
459
- "",
460
- ];
461
- }
462
- else if (language === "rust") {
463
- return [
464
- // Split along function definitions
465
- "\nfn ",
466
- "\nconst ",
467
- "\nlet ",
468
- // Split along control flow statements
469
- "\nif ",
470
- "\nwhile ",
471
- "\nfor ",
472
- "\nloop ",
473
- "\nmatch ",
474
- "\nconst ",
475
- // Split by the normal type of lines
476
- "\n\n",
477
- "\n",
478
- " ",
479
- "",
480
- ];
481
- }
482
- else if (language === "scala") {
483
- return [
484
- // Split along class definitions
485
- "\nclass ",
486
- "\nobject ",
487
- // Split along method definitions
488
- "\ndef ",
489
- "\nval ",
490
- "\nvar ",
491
- // Split along control flow statements
492
- "\nif ",
493
- "\nfor ",
494
- "\nwhile ",
495
- "\nmatch ",
496
- "\ncase ",
497
- // Split by the normal type of lines
498
- "\n\n",
499
- "\n",
500
- " ",
501
- "",
502
- ];
503
- }
504
- else if (language === "swift") {
505
- return [
506
- // Split along function definitions
507
- "\nfunc ",
508
- // Split along class definitions
509
- "\nclass ",
510
- "\nstruct ",
511
- "\nenum ",
512
- // Split along control flow statements
513
- "\nif ",
514
- "\nfor ",
515
- "\nwhile ",
516
- "\ndo ",
517
- "\nswitch ",
518
- "\ncase ",
519
- // Split by the normal type of lines
520
- "\n\n",
521
- "\n",
522
- " ",
523
- "",
524
- ];
525
- }
526
- else if (language === "markdown") {
527
- return [
528
- // First, try to split along Markdown headings (starting with level 2)
529
- "\n## ",
530
- "\n### ",
531
- "\n#### ",
532
- "\n##### ",
533
- "\n###### ",
534
- // Note the alternative syntax for headings (below) is not handled here
535
- // Heading level 2
536
- // ---------------
537
- // End of code block
538
- "```\n\n",
539
- // Horizontal lines
540
- "\n\n***\n\n",
541
- "\n\n---\n\n",
542
- "\n\n___\n\n",
543
- // Note that this splitter doesn't handle horizontal lines defined
544
- // by *three or more* of ***, ---, or ___, but this is not handled
545
- "\n\n",
546
- "\n",
547
- " ",
548
- "",
549
- ];
550
- }
551
- else if (language === "latex") {
552
- return [
553
- // First, try to split along Latex sections
554
- "\n\\chapter{",
555
- "\n\\section{",
556
- "\n\\subsection{",
557
- "\n\\subsubsection{",
558
- // Now split by environments
559
- "\n\\begin{enumerate}",
560
- "\n\\begin{itemize}",
561
- "\n\\begin{description}",
562
- "\n\\begin{list}",
563
- "\n\\begin{quote}",
564
- "\n\\begin{quotation}",
565
- "\n\\begin{verse}",
566
- "\n\\begin{verbatim}",
567
- // Now split by math environments
568
- "\n\\begin{align}",
569
- "$$",
570
- "$",
571
- // Now split by the normal type of lines
572
- "\n\n",
573
- "\n",
574
- " ",
575
- "",
576
- ];
577
- }
578
- else if (language === "html") {
579
- return [
580
- // First, try to split along HTML tags
581
- "<body>",
582
- "<div>",
583
- "<p>",
584
- "<br>",
585
- "<li>",
586
- "<h1>",
587
- "<h2>",
588
- "<h3>",
589
- "<h4>",
590
- "<h5>",
591
- "<h6>",
592
- "<span>",
593
- "<table>",
594
- "<tr>",
595
- "<td>",
596
- "<th>",
597
- "<ul>",
598
- "<ol>",
599
- "<header>",
600
- "<footer>",
601
- "<nav>",
602
- // Head
603
- "<head>",
604
- "<style>",
605
- "<script>",
606
- "<meta>",
607
- "<title>",
608
- // Normal type of lines
609
- " ",
610
- "",
611
- ];
612
- }
613
- else if (language === "sol") {
614
- return [
615
- // Split along compiler informations definitions
616
- "\npragma ",
617
- "\nusing ",
618
- // Split along contract definitions
619
- "\ncontract ",
620
- "\ninterface ",
621
- "\nlibrary ",
622
- // Split along method definitions
623
- "\nconstructor ",
624
- "\ntype ",
625
- "\nfunction ",
626
- "\nevent ",
627
- "\nmodifier ",
628
- "\nerror ",
629
- "\nstruct ",
630
- "\nenum ",
631
- // Split along control flow statements
632
- "\nif ",
633
- "\nfor ",
634
- "\nwhile ",
635
- "\ndo while ",
636
- "\nassembly ",
637
- // Split by the normal type of lines
638
- "\n\n",
639
- "\n",
640
- " ",
641
- "",
642
- ];
643
- }
644
- else {
645
- throw new Error(`Language ${language} is not supported.`);
646
- }
647
- }
648
- }
649
- /**
650
- * Implementation of splitter which looks at tokens.
651
- */
652
- export class TokenTextSplitter extends TextSplitter {
653
- static lc_name() {
654
- return "TokenTextSplitter";
655
- }
656
- constructor(fields) {
657
- super(fields);
658
- Object.defineProperty(this, "encodingName", {
659
- enumerable: true,
660
- configurable: true,
661
- writable: true,
662
- value: void 0
663
- });
664
- Object.defineProperty(this, "allowedSpecial", {
665
- enumerable: true,
666
- configurable: true,
667
- writable: true,
668
- value: void 0
669
- });
670
- Object.defineProperty(this, "disallowedSpecial", {
671
- enumerable: true,
672
- configurable: true,
673
- writable: true,
674
- value: void 0
675
- });
676
- Object.defineProperty(this, "tokenizer", {
677
- enumerable: true,
678
- configurable: true,
679
- writable: true,
680
- value: void 0
681
- });
682
- this.encodingName = fields?.encodingName ?? "gpt2";
683
- this.allowedSpecial = fields?.allowedSpecial ?? [];
684
- this.disallowedSpecial = fields?.disallowedSpecial ?? "all";
685
- }
686
- async splitText(text) {
687
- if (!this.tokenizer) {
688
- this.tokenizer = await getEncoding(this.encodingName);
689
- }
690
- const splits = [];
691
- const input_ids = this.tokenizer.encode(text, this.allowedSpecial, this.disallowedSpecial);
692
- let start_idx = 0;
693
- while (start_idx < input_ids.length) {
694
- if (start_idx > 0) {
695
- start_idx -= this.chunkOverlap;
696
- }
697
- const end_idx = Math.min(start_idx + this.chunkSize, input_ids.length);
698
- const chunk_ids = input_ids.slice(start_idx, end_idx);
699
- splits.push(this.tokenizer.decode(chunk_ids));
700
- start_idx = end_idx;
701
- }
702
- return splits;
703
- }
704
- }
705
- export class MarkdownTextSplitter extends RecursiveCharacterTextSplitter {
706
- constructor(fields) {
707
- super({
708
- ...fields,
709
- separators: RecursiveCharacterTextSplitter.getSeparatorsForLanguage("markdown"),
710
- });
711
- }
712
- }
713
- export class LatexTextSplitter extends RecursiveCharacterTextSplitter {
714
- constructor(fields) {
715
- super({
716
- ...fields,
717
- separators: RecursiveCharacterTextSplitter.getSeparatorsForLanguage("latex"),
718
- });
719
- }
720
- }
1
+ export * from "@langchain/textsplitters";