langchain 0.0.85 → 0.0.87

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
@@ -14,12 +14,35 @@ export class TextSplitter {
14
14
  writable: true,
15
15
  value: 200
16
16
  });
17
+ Object.defineProperty(this, "keepSeparator", {
18
+ enumerable: true,
19
+ configurable: true,
20
+ writable: true,
21
+ value: false
22
+ });
17
23
  this.chunkSize = fields?.chunkSize ?? this.chunkSize;
18
24
  this.chunkOverlap = fields?.chunkOverlap ?? this.chunkOverlap;
25
+ this.keepSeparator = fields?.keepSeparator ?? this.keepSeparator;
19
26
  if (this.chunkOverlap >= this.chunkSize) {
20
27
  throw new Error("Cannot have chunkOverlap >= chunkSize");
21
28
  }
22
29
  }
30
+ splitOnSeparator(text, separator) {
31
+ let splits;
32
+ if (separator) {
33
+ if (this.keepSeparator) {
34
+ const regexEscapedSeparator = separator.replace(/[/\-\\^$*+?.()|[\]{}]/g, "\\$&");
35
+ splits = text.split(new RegExp(`(?=${regexEscapedSeparator})`));
36
+ }
37
+ else {
38
+ splits = text.split(separator);
39
+ }
40
+ }
41
+ else {
42
+ splits = text.split("");
43
+ }
44
+ return splits.filter((s) => s !== "");
45
+ }
23
46
  async createDocuments(texts,
24
47
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
25
48
  metadatas = [], chunkHeaderOptions = {}) {
@@ -128,16 +151,27 @@ export class CharacterTextSplitter extends TextSplitter {
128
151
  }
129
152
  async splitText(text) {
130
153
  // First we naively split the large input into a bunch of smaller ones.
131
- let splits;
132
- if (this.separator) {
133
- splits = text.split(this.separator);
134
- }
135
- else {
136
- splits = text.split("");
137
- }
138
- return this.mergeSplits(splits, this.separator);
154
+ const splits = this.splitOnSeparator(text, this.separator);
155
+ return this.mergeSplits(splits, this.keepSeparator ? "" : this.separator);
139
156
  }
140
157
  }
158
+ export const SupportedTextSplitterLanguages = [
159
+ "cpp",
160
+ "go",
161
+ "java",
162
+ "js",
163
+ "php",
164
+ "proto",
165
+ "python",
166
+ "rst",
167
+ "ruby",
168
+ "rust",
169
+ "scala",
170
+ "swift",
171
+ "markdown",
172
+ "latex",
173
+ "html",
174
+ ];
141
175
  export class RecursiveCharacterTextSplitter extends TextSplitter {
142
176
  constructor(fields) {
143
177
  super(fields);
@@ -148,51 +182,394 @@ export class RecursiveCharacterTextSplitter extends TextSplitter {
148
182
  value: ["\n\n", "\n", " ", ""]
149
183
  });
150
184
  this.separators = fields?.separators ?? this.separators;
185
+ this.keepSeparator = fields?.keepSeparator ?? true;
151
186
  }
152
- async splitText(text) {
187
+ async _splitText(text, separators) {
153
188
  const finalChunks = [];
154
189
  // Get appropriate separator to use
155
- let separator = this.separators[this.separators.length - 1];
156
- for (const s of this.separators) {
190
+ let separator = separators[separators.length - 1];
191
+ let newSeparators;
192
+ for (let i = 0; i < separators.length; i += 1) {
193
+ const s = separators[i];
157
194
  if (s === "") {
158
195
  separator = s;
159
196
  break;
160
197
  }
161
198
  if (text.includes(s)) {
162
199
  separator = s;
200
+ newSeparators = separators.slice(i + 1);
163
201
  break;
164
202
  }
165
203
  }
166
204
  // Now that we have the separator, split the text
167
- let splits;
168
- if (separator) {
169
- splits = text.split(separator);
170
- }
171
- else {
172
- splits = text.split("");
173
- }
205
+ const splits = this.splitOnSeparator(text, separator);
174
206
  // Now go merging things, recursively splitting longer texts.
175
207
  let goodSplits = [];
208
+ const _separator = this.keepSeparator ? "" : separator;
176
209
  for (const s of splits) {
177
210
  if (s.length < this.chunkSize) {
178
211
  goodSplits.push(s);
179
212
  }
180
213
  else {
181
214
  if (goodSplits.length) {
182
- const mergedText = this.mergeSplits(goodSplits, separator);
215
+ const mergedText = this.mergeSplits(goodSplits, _separator);
183
216
  finalChunks.push(...mergedText);
184
217
  goodSplits = [];
185
218
  }
186
- const otherInfo = await this.splitText(s);
187
- finalChunks.push(...otherInfo);
219
+ if (!newSeparators) {
220
+ finalChunks.push(s);
221
+ }
222
+ else {
223
+ const otherInfo = await this._splitText(s, newSeparators);
224
+ finalChunks.push(...otherInfo);
225
+ }
188
226
  }
189
227
  }
190
228
  if (goodSplits.length) {
191
- const mergedText = this.mergeSplits(goodSplits, separator);
229
+ const mergedText = this.mergeSplits(goodSplits, _separator);
192
230
  finalChunks.push(...mergedText);
193
231
  }
194
232
  return finalChunks;
195
233
  }
234
+ async splitText(text) {
235
+ return this._splitText(text, this.separators);
236
+ }
237
+ static fromLanguage(language, options) {
238
+ return new RecursiveCharacterTextSplitter({
239
+ ...options,
240
+ separators: RecursiveCharacterTextSplitter.getSeparatorsForLanguage(language),
241
+ });
242
+ }
243
+ static getSeparatorsForLanguage(language) {
244
+ if (language === "cpp") {
245
+ return [
246
+ // Split along class definitions
247
+ "\nclass ",
248
+ // Split along function definitions
249
+ "\nvoid ",
250
+ "\nint ",
251
+ "\nfloat ",
252
+ "\ndouble ",
253
+ // Split along control flow statements
254
+ "\nif ",
255
+ "\nfor ",
256
+ "\nwhile ",
257
+ "\nswitch ",
258
+ "\ncase ",
259
+ // Split by the normal type of lines
260
+ "\n\n",
261
+ "\n",
262
+ " ",
263
+ "",
264
+ ];
265
+ }
266
+ else if (language === "go") {
267
+ return [
268
+ // Split along function definitions
269
+ "\nfunc ",
270
+ "\nvar ",
271
+ "\nconst ",
272
+ "\ntype ",
273
+ // Split along control flow statements
274
+ "\nif ",
275
+ "\nfor ",
276
+ "\nswitch ",
277
+ "\ncase ",
278
+ // Split by the normal type of lines
279
+ "\n\n",
280
+ "\n",
281
+ " ",
282
+ "",
283
+ ];
284
+ }
285
+ else if (language === "java") {
286
+ return [
287
+ // Split along class definitions
288
+ "\nclass ",
289
+ // Split along method definitions
290
+ "\npublic ",
291
+ "\nprotected ",
292
+ "\nprivate ",
293
+ "\nstatic ",
294
+ // Split along control flow statements
295
+ "\nif ",
296
+ "\nfor ",
297
+ "\nwhile ",
298
+ "\nswitch ",
299
+ "\ncase ",
300
+ // Split by the normal type of lines
301
+ "\n\n",
302
+ "\n",
303
+ " ",
304
+ "",
305
+ ];
306
+ }
307
+ else if (language === "js") {
308
+ return [
309
+ // Split along function definitions
310
+ "\nfunction ",
311
+ "\nconst ",
312
+ "\nlet ",
313
+ "\nvar ",
314
+ "\nclass ",
315
+ // Split along control flow statements
316
+ "\nif ",
317
+ "\nfor ",
318
+ "\nwhile ",
319
+ "\nswitch ",
320
+ "\ncase ",
321
+ "\ndefault ",
322
+ // Split by the normal type of lines
323
+ "\n\n",
324
+ "\n",
325
+ " ",
326
+ "",
327
+ ];
328
+ }
329
+ else if (language === "php") {
330
+ return [
331
+ // Split along function definitions
332
+ "\nfunction ",
333
+ // Split along class definitions
334
+ "\nclass ",
335
+ // Split along control flow statements
336
+ "\nif ",
337
+ "\nforeach ",
338
+ "\nwhile ",
339
+ "\ndo ",
340
+ "\nswitch ",
341
+ "\ncase ",
342
+ // Split by the normal type of lines
343
+ "\n\n",
344
+ "\n",
345
+ " ",
346
+ "",
347
+ ];
348
+ }
349
+ else if (language === "proto") {
350
+ return [
351
+ // Split along message definitions
352
+ "\nmessage ",
353
+ // Split along service definitions
354
+ "\nservice ",
355
+ // Split along enum definitions
356
+ "\nenum ",
357
+ // Split along option definitions
358
+ "\noption ",
359
+ // Split along import statements
360
+ "\nimport ",
361
+ // Split along syntax declarations
362
+ "\nsyntax ",
363
+ // Split by the normal type of lines
364
+ "\n\n",
365
+ "\n",
366
+ " ",
367
+ "",
368
+ ];
369
+ }
370
+ else if (language === "python") {
371
+ return [
372
+ // First, try to split along class definitions
373
+ "\nclass ",
374
+ "\ndef ",
375
+ "\n\tdef ",
376
+ // Now split by the normal type of lines
377
+ "\n\n",
378
+ "\n",
379
+ " ",
380
+ "",
381
+ ];
382
+ }
383
+ else if (language === "rst") {
384
+ return [
385
+ // Split along section titles
386
+ "\n===\n",
387
+ "\n---\n",
388
+ "\n***\n",
389
+ // Split along directive markers
390
+ "\n.. ",
391
+ // Split by the normal type of lines
392
+ "\n\n",
393
+ "\n",
394
+ " ",
395
+ "",
396
+ ];
397
+ }
398
+ else if (language === "ruby") {
399
+ return [
400
+ // Split along method definitions
401
+ "\ndef ",
402
+ "\nclass ",
403
+ // Split along control flow statements
404
+ "\nif ",
405
+ "\nunless ",
406
+ "\nwhile ",
407
+ "\nfor ",
408
+ "\ndo ",
409
+ "\nbegin ",
410
+ "\nrescue ",
411
+ // Split by the normal type of lines
412
+ "\n\n",
413
+ "\n",
414
+ " ",
415
+ "",
416
+ ];
417
+ }
418
+ else if (language === "rust") {
419
+ return [
420
+ // Split along function definitions
421
+ "\nfn ",
422
+ "\nconst ",
423
+ "\nlet ",
424
+ // Split along control flow statements
425
+ "\nif ",
426
+ "\nwhile ",
427
+ "\nfor ",
428
+ "\nloop ",
429
+ "\nmatch ",
430
+ "\nconst ",
431
+ // Split by the normal type of lines
432
+ "\n\n",
433
+ "\n",
434
+ " ",
435
+ "",
436
+ ];
437
+ }
438
+ else if (language === "scala") {
439
+ return [
440
+ // Split along class definitions
441
+ "\nclass ",
442
+ "\nobject ",
443
+ // Split along method definitions
444
+ "\ndef ",
445
+ "\nval ",
446
+ "\nvar ",
447
+ // Split along control flow statements
448
+ "\nif ",
449
+ "\nfor ",
450
+ "\nwhile ",
451
+ "\nmatch ",
452
+ "\ncase ",
453
+ // Split by the normal type of lines
454
+ "\n\n",
455
+ "\n",
456
+ " ",
457
+ "",
458
+ ];
459
+ }
460
+ else if (language === "swift") {
461
+ return [
462
+ // Split along function definitions
463
+ "\nfunc ",
464
+ // Split along class definitions
465
+ "\nclass ",
466
+ "\nstruct ",
467
+ "\nenum ",
468
+ // Split along control flow statements
469
+ "\nif ",
470
+ "\nfor ",
471
+ "\nwhile ",
472
+ "\ndo ",
473
+ "\nswitch ",
474
+ "\ncase ",
475
+ // Split by the normal type of lines
476
+ "\n\n",
477
+ "\n",
478
+ " ",
479
+ "",
480
+ ];
481
+ }
482
+ else if (language === "markdown") {
483
+ return [
484
+ // First, try to split along Markdown headings (starting with level 2)
485
+ "\n## ",
486
+ "\n### ",
487
+ "\n#### ",
488
+ "\n##### ",
489
+ "\n###### ",
490
+ // Note the alternative syntax for headings (below) is not handled here
491
+ // Heading level 2
492
+ // ---------------
493
+ // End of code block
494
+ "```\n\n",
495
+ // Horizontal lines
496
+ "\n\n***\n\n",
497
+ "\n\n---\n\n",
498
+ "\n\n___\n\n",
499
+ // Note that this splitter doesn't handle horizontal lines defined
500
+ // by *three or more* of ***, ---, or ___, but this is not handled
501
+ "\n\n",
502
+ "\n",
503
+ " ",
504
+ "",
505
+ ];
506
+ }
507
+ else if (language === "latex") {
508
+ return [
509
+ // First, try to split along Latex sections
510
+ "\n\\chapter{",
511
+ "\n\\section{",
512
+ "\n\\subsection{",
513
+ "\n\\subsubsection{",
514
+ // Now split by environments
515
+ "\n\\begin{enumerate}",
516
+ "\n\\begin{itemize}",
517
+ "\n\\begin{description}",
518
+ "\n\\begin{list}",
519
+ "\n\\begin{quote}",
520
+ "\n\\begin{quotation}",
521
+ "\n\\begin{verse}",
522
+ "\n\\begin{verbatim}",
523
+ // Now split by math environments
524
+ "\n\\begin{align}",
525
+ "$$",
526
+ "$",
527
+ // Now split by the normal type of lines
528
+ "\n\n",
529
+ "\n",
530
+ " ",
531
+ "",
532
+ ];
533
+ }
534
+ else if (language === "html") {
535
+ return [
536
+ // First, try to split along HTML tags
537
+ "<body>",
538
+ "<div>",
539
+ "<p>",
540
+ "<br>",
541
+ "<li>",
542
+ "<h1>",
543
+ "<h2>",
544
+ "<h3>",
545
+ "<h4>",
546
+ "<h5>",
547
+ "<h6>",
548
+ "<span>",
549
+ "<table>",
550
+ "<tr>",
551
+ "<td>",
552
+ "<th>",
553
+ "<ul>",
554
+ "<ol>",
555
+ "<header>",
556
+ "<footer>",
557
+ "<nav>",
558
+ // Head
559
+ "<head>",
560
+ "<style>",
561
+ "<script>",
562
+ "<meta>",
563
+ "<title>",
564
+ // Normal type of lines
565
+ " ",
566
+ "",
567
+ ];
568
+ }
569
+ else {
570
+ throw new Error(`Language ${language} is not supported.`);
571
+ }
572
+ }
196
573
  }
197
574
  /**
198
575
  * Implementation of splitter which looks at tokens.
@@ -248,67 +625,17 @@ export class TokenTextSplitter extends TextSplitter {
248
625
  }
249
626
  export class MarkdownTextSplitter extends RecursiveCharacterTextSplitter {
250
627
  constructor(fields) {
251
- super(fields);
252
- Object.defineProperty(this, "separators", {
253
- enumerable: true,
254
- configurable: true,
255
- writable: true,
256
- value: [
257
- // First, try to split along Markdown headings (starting with level 2)
258
- "\n## ",
259
- "\n### ",
260
- "\n#### ",
261
- "\n##### ",
262
- "\n###### ",
263
- // Note the alternative syntax for headings (below) is not handled here
264
- // Heading level 2
265
- // ---------------
266
- // End of code block
267
- "```\n\n",
268
- // Horizontal lines
269
- "\n\n***\n\n",
270
- "\n\n---\n\n",
271
- "\n\n___\n\n",
272
- // Note that this splitter doesn't handle horizontal lines defined
273
- // by *three or more* of ***, ---, or ___, but this is not handled
274
- "\n\n",
275
- "\n",
276
- " ",
277
- "",
278
- ]
628
+ super({
629
+ ...fields,
630
+ separators: RecursiveCharacterTextSplitter.getSeparatorsForLanguage("markdown"),
279
631
  });
280
632
  }
281
633
  }
282
634
  export class LatexTextSplitter extends RecursiveCharacterTextSplitter {
283
635
  constructor(fields) {
284
- super(fields);
285
- Object.defineProperty(this, "separators", {
286
- enumerable: true,
287
- configurable: true,
288
- writable: true,
289
- value: [
290
- // First, try to split along Latex sections
291
- "\n\\chapter{",
292
- "\n\\section{",
293
- "\n\\subsection{",
294
- "\n\\subsubsection{",
295
- // Now split by environments
296
- "\n\\begin{enumerate}",
297
- "\n\\begin{itemize}",
298
- "\n\\begin{description}",
299
- "\n\\begin{list}",
300
- "\n\\begin{quote}",
301
- "\n\\begin{quotation}",
302
- "\n\\begin{verse}",
303
- "\n\\begin{verbatim}",
304
- // Now split by math environments
305
- "\n\\begin{align}",
306
- "$$",
307
- "$",
308
- // Now split by the normal type of lines
309
- " ",
310
- "",
311
- ]
636
+ super({
637
+ ...fields,
638
+ separators: RecursiveCharacterTextSplitter.getSeparatorsForLanguage("latex"),
312
639
  });
313
640
  }
314
641
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "langchain",
3
- "version": "0.0.85",
3
+ "version": "0.0.87",
4
4
  "description": "Typescript bindings for langchain",
5
5
  "type": "module",
6
6
  "engines": {