vectra 0.12.2 → 0.12.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. package/LICENSE +1 -1
  2. package/README.draft.md +499 -0
  3. package/README.draft.outline.md +160 -0
  4. package/README.research.md +2159 -0
  5. package/bin/vectra.js +3 -0
  6. package/lib/FileFetcher.d.ts +5 -0
  7. package/lib/FileFetcher.d.ts.map +1 -0
  8. package/lib/FileFetcher.js +79 -0
  9. package/lib/FileFetcher.js.map +1 -0
  10. package/lib/GPT3Tokenizer.d.ts +9 -0
  11. package/lib/ItemSelector.d.ts +41 -0
  12. package/lib/ItemSelector.d.ts.map +1 -0
  13. package/lib/ItemSelector.js +168 -0
  14. package/lib/ItemSelector.js.map +1 -0
  15. package/lib/LocalDocument.d.ts +54 -0
  16. package/lib/LocalDocument.js +156 -0
  17. package/lib/LocalDocument.js.map +1 -0
  18. package/lib/LocalDocumentIndex.d.ts +132 -0
  19. package/lib/LocalDocumentIndex.js +456 -0
  20. package/lib/LocalDocumentIndex.js.map +1 -0
  21. package/lib/LocalDocumentResult.d.ts +45 -0
  22. package/lib/LocalDocumentResult.js +328 -0
  23. package/lib/LocalDocumentResult.js.map +1 -0
  24. package/lib/LocalIndex.d.ts +150 -0
  25. package/lib/LocalIndex.d.ts.map +1 -1
  26. package/lib/LocalIndex.js +515 -0
  27. package/lib/LocalIndex.js.map +1 -0
  28. package/lib/LocalIndex.spec.d.ts +2 -0
  29. package/lib/LocalIndex.spec.js +218 -7
  30. package/lib/LocalIndex.spec.js.map +1 -1
  31. package/lib/OpenAIEmbeddings.d.ts +126 -0
  32. package/lib/OpenAIEmbeddings.d.ts.map +1 -0
  33. package/lib/OpenAIEmbeddings.js +174 -0
  34. package/lib/OpenAIEmbeddings.js.map +1 -0
  35. package/lib/TextSplitter.d.ts +19 -0
  36. package/lib/TextSplitter.d.ts.map +1 -1
  37. package/lib/TextSplitter.js +457 -0
  38. package/lib/TextSplitter.js.map +1 -0
  39. package/lib/TextSplitter.spec.d.ts +2 -0
  40. package/lib/TextSplitter.spec.d.ts.map +1 -0
  41. package/lib/TextSplitter.spec.js +109 -0
  42. package/lib/TextSplitter.spec.js.map +1 -0
  43. package/lib/WebFetcher.d.ts +15 -0
  44. package/lib/WebFetcher.d.ts.map +1 -0
  45. package/lib/WebFetcher.js +234 -0
  46. package/lib/WebFetcher.js.map +1 -0
  47. package/lib/index.d.ts +12 -0
  48. package/lib/index.js +28 -0
  49. package/lib/index.js.map +1 -0
  50. package/lib/internals/Colorize.d.ts +14 -0
  51. package/lib/internals/Colorize.d.ts.map +1 -0
  52. package/lib/internals/Colorize.js +64 -0
  53. package/lib/internals/Colorize.js.map +1 -0
  54. package/lib/internals/index.d.ts +3 -0
  55. package/lib/internals/index.d.ts.map +1 -0
  56. package/lib/internals/index.js +19 -0
  57. package/lib/internals/index.js.map +1 -0
  58. package/lib/internals/types.d.ts +43 -0
  59. package/lib/internals/types.d.ts.map +1 -0
  60. package/lib/internals/types.js +3 -0
  61. package/lib/internals/types.js.map +1 -0
  62. package/lib/types.d.ts +146 -0
  63. package/lib/types.d.ts.map +1 -0
  64. package/lib/types.js +3 -0
  65. package/lib/types.js.map +1 -0
  66. package/lib/vectra-cli.d.ts +2 -0
  67. package/lib/vectra-cli.js +323 -0
  68. package/lib/vectra-cli.js.map +1 -0
  69. package/package.json +3 -1
  70. package/src/LocalIndex.spec.ts +265 -8
  71. package/src/LocalIndex.ts +1 -0
  72. package/src/TextSplitter.spec.ts +87 -0
  73. package/src/TextSplitter.ts +459 -531
@@ -0,0 +1,457 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.TextSplitter = void 0;
4
+ const GPT3Tokenizer_1 = require("./GPT3Tokenizer");
5
+ const ALPHANUMERIC_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789';
6
/**
 * Splits text into chunks that fit a token budget.
 *
 * The text is split recursively: first by the most significant separator for the
 * configured document type, then by progressively finer separators, and finally
 * by cutting the text in half when no separators remain. Adjacent pieces are
 * merged back together while they fit within `chunkSize` tokens.
 *
 * Each returned chunk has the shape
 * `{ text, tokens, startPos, endPos, startOverlap, endOverlap }`.
 */
class TextSplitter {
    /**
     * @param config Optional settings. Recognized keys (as read by this class):
     *   `separators`, `keepSeparators` (default `false`), `chunkSize` (default `400`),
     *   `chunkOverlap` (default `40`), `tokenizer` (object with `encode`/`decode`;
     *   a `GPT3Tokenizer` is created when omitted) and `docType` (selects the
     *   default separator list when `separators` is missing or empty).
     * @throws {Error} When `chunkSize < 1`, `chunkOverlap < 0`, or
     *   `chunkOverlap > chunkSize`.
     */
    constructor(config) {
        this._config = Object.assign({
            keepSeparators: false,
            chunkSize: 400,
            chunkOverlap: 40,
        }, config);
        // Create a default tokenizer if none is provided
        if (!this._config.tokenizer) {
            this._config.tokenizer = new GPT3Tokenizer_1.GPT3Tokenizer();
        }
        // Use default separators if none are provided
        if (!this._config.separators || this._config.separators.length === 0) {
            this._config.separators = this.getSeparators(this._config.docType);
        }
        // Validate the config settings
        if (this._config.chunkSize < 1) {
            throw new Error("chunkSize must be >= 1");
        }
        else if (this._config.chunkOverlap < 0) {
            throw new Error("chunkOverlap must be >= 0");
        }
        else if (this._config.chunkOverlap > this._config.chunkSize) {
            throw new Error("chunkOverlap must be <= chunkSize");
        }
    }

    /**
     * Splits `text` into chunks and, when `chunkOverlap > 0`, attaches overlap tokens.
     *
     * `startOverlap` holds the trailing tokens of the previous chunk and `endOverlap`
     * the leading tokens of the next chunk (at most `chunkOverlap` tokens each).
     * The first chunk has no `startOverlap` and the last chunk has no `endOverlap`.
     *
     * @param text Text to split.
     * @returns Array of chunk objects in document order.
     */
    split(text) {
        const chunks = this.recursiveSplit(text, this._config.separators, 0);
        const overlap = this._config.chunkOverlap;
        if (overlap > 0) {
            // Number of tokens to borrow from a neighbour, capped by what it has.
            const countFor = (tokens) => Math.min(Math.trunc(overlap), tokens.length);
            // Start at 0 so the FIRST chunk also receives its endOverlap.
            for (let i = 0; i < chunks.length; i++) {
                const chunk = chunks[i];
                if (i > 0) {
                    const previousTokens = chunks[i - 1].tokens;
                    // slice() copies, so the previous chunk's tokens are left untouched.
                    chunk.startOverlap = previousTokens.slice(previousTokens.length - countFor(previousTokens));
                }
                if (i < chunks.length - 1) {
                    const nextTokens = chunks[i + 1].tokens;
                    chunk.endOverlap = nextTokens.slice(0, countFor(nextTokens));
                }
            }
        }
        return chunks;
    }

    /**
     * Splits `text` by the first separator and recurses with the remaining
     * separators on any piece that is still over the token budget.
     *
     * @param text Text to split.
     * @param separators Separators still available, most significant first.
     * @param startPos Position assigned to the first character of `text`.
     * @returns Combined chunks for `text`.
     */
    recursiveSplit(text, separators, startPos) {
        const chunks = [];
        if (text.length > 0) {
            const chunkSize = this._config.chunkSize;
            const nextSeparators = separators.length > 1 ? separators.slice(1) : [];
            let separator = '';
            let parts;
            if (separators.length > 0) {
                // Split by separator (a single space means "split by token count")
                separator = separators[0];
                parts = separator === ' ' ? this.splitBySpaces(text) : text.split(separator);
            }
            else {
                // No separators left: cut the text in half
                const half = Math.floor(text.length / 2);
                parts = [text.substring(0, half), text.substring(half)];
            }
            for (let i = 0; i < parts.length; i++) {
                const lastChunk = (i === parts.length - 1);
                let chunk = parts[i];
                const endPos = (startPos + (chunk.length - 1)) + (lastChunk ? 0 : separator.length);
                if (this._config.keepSeparators && !lastChunk) {
                    chunk += separator;
                }
                // Whitespace-only pieces are dropped; punctuation-only pieces are kept.
                if (/\S/.test(chunk)) {
                    // Optimization: text this long is certainly over budget, skip encoding it.
                    const skipEncoding = chunk.length / 6 > chunkSize;
                    const tokens = skipEncoding ? undefined : this._config.tokenizer.encode(chunk);
                    // A one-character piece cannot be split any further. Recursing on it would
                    // never terminate, so it is emitted even if it exceeds the token budget.
                    const needsSplit = skipEncoding || (tokens.length > chunkSize && chunk.length > 1);
                    if (needsSplit) {
                        // Appended one by one to stay clear of engine argument-count limits.
                        for (const subChunk of this.recursiveSplit(chunk, nextSeparators, startPos)) {
                            chunks.push(subChunk);
                        }
                    }
                    else {
                        chunks.push({
                            text: chunk,
                            tokens: tokens,
                            startPos: startPos,
                            endPos: endPos,
                            startOverlap: [],
                            endOverlap: [],
                        });
                    }
                }
                // Advance past this piece (and its separator) whether or not it was kept
                startPos = endPos + 1;
            }
        }
        return this.combineChunks(chunks);
    }

    /**
     * Greedily merges adjacent chunks while the merged token count stays within
     * `chunkSize`. Merging mutates the first chunk of each merged run.
     *
     * Punctuation-only chunks (non-whitespace text without any ASCII letter or
     * digit, e.g. `---`, `***`, `====`) are never merged with a neighbour.
     * When `keepSeparators` is off, merged texts are joined with a single space;
     * the token arrays are concatenated as-is.
     *
     * @param chunks Chunks in document order.
     * @returns New array holding the merged chunks.
     */
    combineChunks(chunks) {
        const combinedChunks = [];
        const isWhitespaceOnly = (t) => !/\S/.test(t);
        const isPunctuationOnly = (t) => /\S/.test(t) && !/[a-zA-Z0-9]/.test(t);
        let currentChunk;
        for (const chunk of chunks) {
            if (!currentChunk) {
                currentChunk = chunk;
                continue;
            }
            const keepApart = isPunctuationOnly(currentChunk.text) || isPunctuationOnly(chunk.text);
            const overBudget = currentChunk.tokens.length + chunk.tokens.length > this._config.chunkSize;
            if (keepApart || overBudget) {
                combinedChunks.push(currentChunk);
                currentChunk = chunk;
                continue;
            }
            // Only insert a space when separators were stripped and both sides have content
            const joiner = (!this._config.keepSeparators && !isWhitespaceOnly(currentChunk.text) && !isWhitespaceOnly(chunk.text)) ? ' ' : '';
            currentChunk.text += joiner + chunk.text;
            currentChunk.endPos = chunk.endPos;
            currentChunk.tokens.push(...chunk.tokens);
        }
        if (currentChunk) {
            combinedChunks.push(currentChunk);
        }
        return combinedChunks;
    }

    /**
     * Splits `text` into parts of at most `chunkSize` tokens by encoding it,
     * slicing the token array and decoding each slice.
     *
     * @param text Text to split.
     * @returns Decoded parts in order; always contains at least one entry.
     */
    splitBySpaces(text) {
        const tokenizer = this._config.tokenizer;
        const size = this._config.chunkSize;
        const tokens = tokenizer.encode(text);
        const parts = [];
        let offset = 0;
        // Walk the token array with an offset instead of repeatedly splicing its head.
        while (tokens.length - offset > size) {
            parts.push(tokenizer.decode(tokens.slice(offset, offset + size)));
            offset += size;
        }
        parts.push(tokenizer.decode(tokens.slice(offset)));
        return parts;
    }

    /**
     * Returns the default separator list for a document type, ordered from the
     * most significant separator to the least significant one.
     *
     * @param docType Document type key (e.g. `"md"`, `"ts"`, `"python"`); unknown
     *   or missing values fall back to blank-line and newline separators.
     * @returns A fresh array of separator strings.
     */
    getSeparators(docType) {
        switch (docType !== null && docType !== void 0 ? docType : '') {
            case "cpp":
                return [
                    "\nclass ", "\nvoid ", "\nint ", "\nfloat ", "\ndouble ",
                    "\nif ", "\nfor ", "\nwhile ", "\nswitch ", "\ncase ",
                    "\n\n", "\n",
                ];
            case "go":
                return [
                    "\nfunc ", "\nvar ", "\nconst ", "\ntype ",
                    "\nif ", "\nfor ", "\nswitch ", "\ncase ",
                    "\n\n", "\n",
                ];
            case "java":
            case "c#":
            case "csharp":
            case "cs":
            case "ts":
            case "tsx":
            case "typescript":
                return [
                    "// LLM-REGION", "/* LLM-REGION", "/** LLM-REGION",
                    "\nclass ", "\npublic ", "\nprotected ", "\nprivate ", "\nstatic ",
                    "\nif ", "\nfor ", "\nwhile ", "\nswitch ", "\ncase ",
                    "\n\n", "\n", " "
                ];
            case "js":
            case "jsx":
            case "javascript":
                // "\nclass " used to be listed a second time after "\nvar "; the repeat
                // could never match anything new and has been dropped.
                return [
                    "// LLM-REGION", "/* LLM-REGION", "/** LLM-REGION",
                    "\nclass ", "\nfunction ", "\nconst ", "\nlet ", "\nvar ",
                    "\nif ", "\nfor ", "\nwhile ", "\nswitch ", "\ncase ", "\ndefault ",
                    "\n\n", "\n",
                ];
            case "php":
                return [
                    "\nfunction ", "\nclass ",
                    "\nif ", "\nforeach ", "\nwhile ", "\ndo ", "\nswitch ", "\ncase ",
                    "\n\n", "\n",
                ];
            case "proto":
                return [
                    "\nmessage ", "\nservice ", "\nenum ", "\noption ", "\nimport ", "\nsyntax ",
                    "\n\n", "\n",
                ];
            case "python":
            case "py":
                return [
                    "\nclass ", "\ndef ", "\n\tdef ",
                    "\n\n", "\n",
                ];
            case "rst":
                return [
                    "\n===\n", "\n---\n", "\n***\n", "\n.. ",
                    "\n\n", "\n",
                ];
            case "ruby":
                return [
                    "\ndef ", "\nclass ",
                    "\nif ", "\nunless ", "\nwhile ", "\nfor ", "\ndo ", "\nbegin ", "\nrescue ",
                    "\n\n", "\n",
                ];
            case "rust":
                // "\nconst " used to be listed a second time after "\nmatch "; the repeat
                // could never match anything new and has been dropped.
                return [
                    "\nfn ", "\nconst ", "\nlet ",
                    "\nif ", "\nwhile ", "\nfor ", "\nloop ", "\nmatch ",
                    "\n\n", "\n",
                ];
            case "scala":
                return [
                    "\nclass ", "\nobject ", "\ndef ", "\nval ", "\nvar ",
                    "\nif ", "\nfor ", "\nwhile ", "\nmatch ", "\ncase ",
                    "\n\n", "\n",
                ];
            case "swift":
                return [
                    "\nfunc ", "\nclass ", "\nstruct ", "\nenum ",
                    "\nif ", "\nfor ", "\nwhile ", "\ndo ", "\nswitch ", "\ncase ",
                    "\n\n", "\n",
                ];
            case "md":
            case "markdown":
                return [
                    "\n## ", "\n### ", "\n#### ", "\n##### ", "\n###### ",
                    "```\n\n",
                    "\n\n***\n\n", "\n\n---\n\n", "\n\n___\n\n",
                    "<table>",
                    "\n\n", "\n",
                ];
            case "latex":
                return [
                    "\n\\chapter{", "\n\\section{", "\n\\subsection{", "\n\\subsubsection{",
                    "\n\\begin{enumerate}", "\n\\begin{itemize}", "\n\\begin{description}",
                    "\n\\begin{list}", "\n\\begin{quote}", "\n\\begin{quotation}",
                    "\n\\begin{verse}", "\n\\begin{verbatim}", "\n\\begin{align}",
                    "\n\n", "\n",
                ];
            case "html":
                return [
                    "<body>", "<div>", "<p>", "<br>", "<li>",
                    "<h1>", "<h2>", "<h3>", "<h4>", "<h5>", "<h6>",
                    "<span>", "<table>", "<tr>", "<td>", "<th>", "<ul>", "<ol>",
                    "<header>", "<footer>", "<nav>",
                    "<head>", "<style>", "<script>", "<meta>", "<title>",
                ];
            case "sol":
                return [
                    "\npragma ", "\nusing ", "\ncontract ", "\ninterface ", "\nlibrary ",
                    "\nconstructor ", "\ntype ", "\nfunction ", "\nevent ", "\nmodifier ",
                    "\nerror ", "\nstruct ", "\nenum ",
                    "\nif ", "\nfor ", "\nwhile ", "\ndo while ", "\nassembly ",
                    "\n\n", "\n",
                ];
            default:
                return [
                    "\n\n", "\n",
                ];
        }
    }
}
456
+ exports.TextSplitter = TextSplitter;
457
+ //# sourceMappingURL=TextSplitter.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"TextSplitter.js","sourceRoot":"","sources":["../src/TextSplitter.ts"],"names":[],"mappings":";;;AAAA,mDAAgD;AAGhD,MAAM,kBAAkB,GAAG,gEAAgE,CAAC;AAW5F,MAAa,YAAY;IAGvB,YAAmB,MAAoC;QACrD,IAAI,CAAC,OAAO,GAAG,MAAM,CAAC,MAAM,CAAC;YAC3B,cAAc,EAAE,KAAK;YACrB,SAAS,EAAE,GAAG;YACd,YAAY,EAAE,EAAE;SACK,EAAE,MAAM,CAAC,CAAC;QAEjC,iDAAiD;QACjD,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,CAAC;YAC5B,IAAI,CAAC,OAAO,CAAC,SAAS,GAAG,IAAI,6BAAa,EAAE,CAAC;QAC/C,CAAC;QAED,8CAA8C;QAC9C,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,UAAU,IAAI,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACrE,IAAI,CAAC,OAAO,CAAC,UAAU,GAAG,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QACrE,CAAC;QAED,+BAA+B;QAC/B,IAAI,IAAI,CAAC,OAAO,CAAC,SAAS,GAAG,CAAC,EAAE,CAAC;YAC/B,MAAM,IAAI,KAAK,CAAC,wBAAwB,CAAC,CAAC;QAC5C,CAAC;aAAM,IAAI,IAAI,CAAC,OAAO,CAAC,YAAY,GAAG,CAAC,EAAE,CAAC;YACzC,MAAM,IAAI,KAAK,CAAC,2BAA2B,CAAC,CAAC;QAC/C,CAAC;aAAM,IAAI,IAAI,CAAC,OAAO,CAAC,YAAY,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,CAAC;YAC9D,MAAM,IAAI,KAAK,CAAC,mCAAmC,CAAC,CAAC;QACvD,CAAC;IACH,CAAC;IAEM,KAAK,CAAC,IAAY;QACvB,mBAAmB;QACnB,MAAM,MAAM,GAAG,IAAI,CAAC,cAAc,CAAC,IAAI,EAAE,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,CAAC,CAAC,CAAC;QAErE,MAAM,IAAI,GAAG,IAAI,CAAC;QAClB,SAAS,gBAAgB,CAAC,MAAiB;YACzC,IAAI,MAAM,IAAI,SAAS,EAAE,CAAC;gBACxB,MAAM,GAAG,GAAG,MAAM,CAAC,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC;gBAClG,OAAO,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;YAC9B,CAAC;iBAAM,CAAC;gBACN,OAAO,EAAE,CAAC;YACZ,CAAC;QACH,CAAC;QAED,iEAAiE;QACjE,IAAI,IAAI,CAAC,OAAO,CAAC,YAAY,GAAG,CAAC,EAAE,CAAC;YAClC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACvC,MAAM,aAAa,GAAG,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;gBACpC,MAAM,KAAK,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;gBACxB,MAAM,SAAS,GAAG,CAAC,GAAG,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;gBAEpE,mFAAmF;gBACnF,MAAM,cAAc,GAAG,aAAa,CAAC,MAAM,CAAC,KAAK,EAAE,CAAC;gBACpD,
KAAK,CAAC,YAAY,GAAG,gBAAgB,CAAC,cAAc,CAAC,OAAO,EAAE,CAAC,CAAC,OAAO,EAAE,CAAC;gBAC1E,KAAK,CAAC,UAAU,GAAG,gBAAgB,CAAC,SAAS,aAAT,SAAS,uBAAT,SAAS,CAAE,MAAM,CAAC,CAAC;YACzD,CAAC;QACH,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAEO,cAAc,CAAC,IAAY,EAAE,UAAoB,EAAE,QAAgB;QACzE,MAAM,MAAM,GAAgB,EAAE,CAAC;QAE/B,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACpB,wBAAwB;YACxB,IAAI,KAAe,CAAC;YACpB,IAAI,SAAS,GAAG,EAAE,CAAC;YACnB,MAAM,cAAc,GAAG,UAAU,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAExE,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC1B,qBAAqB;gBACrB,SAAS,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;gBAC1B,KAAK,GAAG,SAAS,IAAI,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;YAC9E,CAAC;iBAAM,CAAC;gBACN,mBAAmB;gBACnB,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;gBACzC,KAAK,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC;YAC1D,CAAC;YAED,qBAAqB;YACrB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACtC,MAAM,SAAS,GAAG,CAAC,CAAC,KAAK,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;gBAE3C,4BAA4B;gBAC5B,IAAI,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;gBACrB,MAAM,MAAM,GAAG,CAAC,QAAQ,GAAG,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;gBAEpF,IAAI,IAAI,CAAC,OAAO,CAAC,cAAc,IAAI,CAAC,SAAS,EAAE,CAAC;oBAC9C,KAAK,IAAI,SAAS,CAAC;gBACrB,CAAC;gBAED,oEAAoE;gBACpE,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;oBACtB,8BAA8B;oBAC9B,QAAQ,GAAG,MAAM,GAAG,CAAC,CAAC;oBACtB,SAAS;gBACX,CAAC;gBAED,qDAAqD;gBACrD,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,CAAC;oBAC9C,qCAAqC;oBACrC,MAAM,SAAS,GAAG,IAAI,CAAC,cAAc,CAAC,KAAK,EAAE,cAAc,EAAE,QAAQ,CAAC,CAAC;oBACvE,MAAM,CAAC,IAAI,CAAC,GAAG,SAAS,CAAC,CAAC;gBAC5B,CAAC;qBAAM,CAAC;oBACN,oBAAoB;oBACpB,MAAM,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;oBACpD,IAAI,MAAM,CAAC,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,
EAAE,CAAC;wBAC3C,qCAAqC;wBACrC,MAAM,SAAS,GAAG,IAAI,CAAC,cAAc,CAAC,KAAK,EAAE,cAAc,EAAE,QAAQ,CAAC,CAAC;wBACvE,MAAM,CAAC,IAAI,CAAC,GAAG,SAAS,CAAC,CAAC;oBAC5B,CAAC;yBAAM,CAAC;wBACN,yBAAyB;wBACzB,MAAM,CAAC,IAAI,CAAC;4BACV,IAAI,EAAE,KAAK;4BACX,MAAM,EAAE,MAAM;4BACd,QAAQ,EAAE,QAAQ;4BAClB,MAAM,EAAE,MAAM;4BACd,YAAY,EAAE,EAAE;4BAChB,UAAU,EAAE,EAAE;yBACf,CAAC,CAAC;oBACL,CAAC;gBACH,CAAC;gBAED,kBAAkB;gBAClB,QAAQ,GAAG,MAAM,GAAG,CAAC,CAAC;YACxB,CAAC;QACH,CAAC;QAED,OAAO,IAAI,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;IACpC,CAAC;IAEO,aAAa,CAAC,MAAmB;QACvC,MAAM,cAAc,GAAgB,EAAE,CAAC;QACvC,IAAI,YAAmC,CAAC;QACxC,IAAI,aAAa,GAAG,CAAC,CAAC;QAEtB,qFAAqF;QACrF,4FAA4F;QAC5F,gFAAgF;QAChF,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,cAAc,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC;QAEzD,MAAM,gBAAgB,GAAG,CAAC,CAAS,EAAE,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACtD,MAAM,iBAAiB,GAAG,CAAC,CAAS,EAAE,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAEhF,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACvC,MAAM,KAAK,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;YAExB,IAAI,CAAC,YAAY,EAAE,CAAC;gBAClB,YAAY,GAAG,KAAK,CAAC;gBACrB,aAAa,GAAG,KAAK,CAAC,MAAM,CAAC,MAAM,CAAC;gBACpC,SAAS;YACX,CAAC;YAED,6FAA6F;YAC7F,gFAAgF;YAChF,IAAI,iBAAiB,CAAC,YAAY,CAAC,IAAI,CAAC,IAAI,iBAAiB,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC1E,cAAc,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;gBAClC,YAAY,GAAG,KAAK,CAAC;gBACrB,aAAa,GAAG,KAAK,CAAC,MAAM,CAAC,MAAM,CAAC;gBACpC,SAAS;YACX,CAAC;YAED,gDAAgD;YAChD,MAAM,MAAM,GAAG,YAAY,CAAC,MAAM,CAAC,MAAM,GAAG,KAAK,CAAC,MAAM,CAAC,MAAM,CAAC;YAChE,IAAI,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,CAAC;gBACpC,cAAc,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;gBAClC,YAAY,GAAG,KAAK,CAAC;gBACrB,aAAa,GAAG,KAAK,CAAC,MAAM,CAAC,MAAM,CAAC;YACtC,CAAC;iBAAM,CAAC;gBACN,wEAAwE;gBACxE,MAAM,MAAM,GAAG,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,cAAc,IAAI,CAAC,gBAAgB,CAAC,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,gBAAgB,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,CAAC;gBACxI,YAAY,CAAC,IAAI,IAAI,MAAM,GAAG,KAAK,CAAC
,IAAI,CAAC;gBACzC,YAAY,CAAC,MAAM,GAAG,KAAK,CAAC,MAAM,CAAC;gBACnC,YAAY,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC;gBAC1C,aAAa,IAAI,KAAK,CAAC,MAAM,CAAC,MAAM,CAAC;YACvC,CAAC;QACH,CAAC;QAED,IAAI,YAAY,EAAE,CAAC;YACjB,cAAc,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;QACpC,CAAC;QAED,OAAO,cAAc,CAAC;IACxB,CAAC;IAEO,aAAa,CAAC,IAAY;QAChC,wCAAwC;QACxC,MAAM,KAAK,GAAa,EAAE,CAAC;QAC3B,IAAI,MAAM,GAAG,IAAI,CAAC,OAAO,CAAC,SAAS,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;QAEjD,GAAG,CAAC;YACF,IAAI,MAAM,CAAC,MAAM,IAAI,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,CAAC;gBAC5C,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,SAAS,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC;gBAClD,MAAM;YACR,CAAC;iBAAM,CAAC;gBACN,MAAM,IAAI,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,EAAE,IAAI,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;gBACtD,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,SAAS,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC;YAClD,CAAC;QACH,CAAC,QAAQ,IAAI,EAAE;QAEf,OAAO,KAAK,CAAC;IACf,CAAC;IAEO,aAAa,CAAC,OAAgB;QACpC,QAAQ,OAAO,aAAP,OAAO,cAAP,OAAO,GAAI,EAAE,EAAE,CAAC;YACtB,KAAK,KAAK;gBACR,OAAO;oBACL,UAAU;oBACV,SAAS;oBACT,QAAQ;oBACR,UAAU;oBACV,WAAW;oBACX,OAAO;oBACP,QAAQ;oBACR,UAAU;oBACV,WAAW;oBACX,SAAS;oBACT,MAAM;oBACN,IAAI;iBACL,CAAC;YACJ,KAAK,IAAI;gBACP,OAAO;oBACL,SAAS;oBACT,QAAQ;oBACR,UAAU;oBACV,SAAS;oBACT,OAAO;oBACP,QAAQ;oBACR,WAAW;oBACX,SAAS;oBACT,MAAM;oBACN,IAAI;iBACL,CAAC;YACJ,KAAK,MAAM,CAAC;YACZ,KAAK,IAAI,CAAC;YACV,KAAK,QAAQ,CAAC;YACd,KAAK,IAAI,CAAC;YACV,KAAK,IAAI,CAAC;YACV,KAAK,KAAK,CAAC;YACX,KAAK,YAAY;gBACf,OAAO;oBACL,eAAe;oBACf,eAAe;oBACf,gBAAgB;oBAChB,UAAU;oBACV,WAAW;oBACX,cAAc;oBACd,YAAY;oBACZ,WAAW;oBACX,OAAO;oBACP,QAAQ;oBACR,UAAU;oBACV,WAAW;oBACX,SAAS;oBACT,MAAM;oBACN,IAAI;oBACJ,GAAG;iBACJ,CAAC;YACJ,KAAK,IAAI,CAAC;YACV,KAAK,KAAK,CAAC;YACX,KAAK,YAAY;gBACf,OAAO;oBACL,eAAe;oBACf,eAAe;oBACf,gBAAgB;oBAChB,UAAU;oBACV,aAAa;oBACb,UAAU;oBACV,QAAQ;oBACR,QAAQ;oBACR,UAAU;oBACV,OAAO;oBACP,QAAQ;oBACR,UAAU;oBACV,WAAW;oBACX,SAAS;oBACT,YAAY;oBACZ,MAAM;oBACN,IAAI;iBACL,CAAC;YACJ,KAAK,KAAK;gBACR,OAAO;oBACL,aAAa;oBACb,UAAU;oBACV,OAAO;oBACP,YAAY;oBACZ,UAAU;oBACV,OAAO;oBACP,WAAW;oBACX,SAAS;oBACT,MA
AM;oBACN,IAAI;iBACL,CAAC;YACJ,KAAK,OAAO;gBACV,OAAO;oBACL,YAAY;oBACZ,YAAY;oBACZ,SAAS;oBACT,WAAW;oBACX,WAAW;oBACX,WAAW;oBACX,MAAM;oBACN,IAAI;iBACL,CAAC;YACJ,KAAK,QAAQ,CAAC;YACd,KAAK,IAAI;gBACP,OAAO;oBACL,UAAU;oBACV,QAAQ;oBACR,UAAU;oBACV,MAAM;oBACN,IAAI;iBACL,CAAC;YACJ,KAAK,KAAK;gBACR,OAAO;oBACL,SAAS;oBACT,SAAS;oBACT,SAAS;oBACT,OAAO;oBACP,MAAM;oBACN,IAAI;iBACL,CAAC;YACJ,KAAK,MAAM;gBACT,OAAO;oBACL,QAAQ;oBACR,UAAU;oBACV,OAAO;oBACP,WAAW;oBACX,UAAU;oBACV,QAAQ;oBACR,OAAO;oBACP,UAAU;oBACV,WAAW;oBACX,MAAM;oBACN,IAAI;iBACL,CAAC;YACJ,KAAK,MAAM;gBACT,OAAO;oBACL,OAAO;oBACP,UAAU;oBACV,QAAQ;oBACR,OAAO;oBACP,UAAU;oBACV,QAAQ;oBACR,SAAS;oBACT,UAAU;oBACV,UAAU;oBACV,MAAM;oBACN,IAAI;iBACL,CAAC;YACJ,KAAK,OAAO;gBACV,OAAO;oBACL,UAAU;oBACV,WAAW;oBACX,QAAQ;oBACR,QAAQ;oBACR,QAAQ;oBACR,OAAO;oBACP,QAAQ;oBACR,UAAU;oBACV,UAAU;oBACV,SAAS;oBACT,MAAM;oBACN,IAAI;iBACL,CAAC;YACJ,KAAK,OAAO;gBACV,OAAO;oBACL,SAAS;oBACT,UAAU;oBACV,WAAW;oBACX,SAAS;oBACT,OAAO;oBACP,QAAQ;oBACR,UAAU;oBACV,OAAO;oBACP,WAAW;oBACX,SAAS;oBACT,MAAM;oBACN,IAAI;iBACL,CAAC;YACJ,KAAK,IAAI,CAAC;YACV,KAAK,UAAU;gBACb,OAAO;oBACL,OAAO;oBACP,QAAQ;oBACR,SAAS;oBACT,UAAU;oBACV,WAAW;oBACX,SAAS;oBACT,aAAa;oBACb,aAAa;oBACb,aAAa;oBACb,SAAS;oBACT,MAAM;oBACN,IAAI;iBACL,CAAC;YACJ,KAAK,OAAO;gBACV,OAAO;oBACL,cAAc;oBACd,cAAc;oBACd,iBAAiB;oBACjB,oBAAoB;oBACpB,sBAAsB;oBACtB,oBAAoB;oBACpB,wBAAwB;oBACxB,iBAAiB;oBACjB,kBAAkB;oBAClB,sBAAsB;oBACtB,kBAAkB;oBAClB,qBAAqB;oBACrB,kBAAkB;oBAClB,MAAM;oBACN,IAAI;iBACL,CAAC;YACJ,KAAK,MAAM;gBACT,OAAO;oBACL,QAAQ;oBACR,OAAO;oBACP,KAAK;oBACL,MAAM;oBACN,MAAM;oBACN,MAAM;oBACN,MAAM;oBACN,MAAM;oBACN,MAAM;oBACN,MAAM;oBACN,MAAM;oBACN,QAAQ;oBACR,SAAS;oBACT,MAAM;oBACN,MAAM;oBACN,MAAM;oBACN,MAAM;oBACN,MAAM;oBACN,UAAU;oBACV,UAAU;oBACV,OAAO;oBACP,QAAQ;oBACR,SAAS;oBACT,UAAU;oBACV,QAAQ;oBACR,SAAS;iBACV,CAAC;YACJ,KAAK,KAAK;gBACR,OAAO;oBACL,WAAW;oBACX,UAAU;oBACV,aAAa;oBACb,cAAc;oBACd,YAAY;oBACZ,gBAAgB;oBAChB,SAAS;oBACT,aAAa;oBACb,UAAU;oBACV,aAAa;oBACb,UAAU;oBACV,WAAW;oBACX,SAAS;oBACT,OAAO;oBACP,QAAQ;oBACR,UAAU;oBACV,aAAa;oBACb,aAAa;oBACb,M
AAM;oBACN,IAAI;iBACL,CAAC;YACJ;gBACE,OAAO;oBACL,MAAM;oBACN,IAAI;iBACL,CAAC;QACN,CAAC;IACH,CAAC;CACF;AA1dD,oCA0dC"}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=TextSplitter.spec.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"TextSplitter.spec.d.ts","sourceRoot":"","sources":["../src/TextSplitter.spec.ts"],"names":[],"mappings":""}
@@ -0,0 +1,109 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ const mocha_1 = require("mocha");
37
+ const assert = __importStar(require("node:assert"));
38
+ const TextSplitter_1 = require("./TextSplitter");
39
// Mocha suite covering how TextSplitter treats punctuation-only and whitespace-only text.
(0, mocha_1.describe)('TextSplitter', () => {
    // Builds a splitter with small defaults that individual tests override.
    const makeSplitter = (opts) => {
        const settings = Object.assign({ chunkSize: 16, chunkOverlap: 0 }, opts);
        return new TextSplitter_1.TextSplitter(settings);
    };
    // Returns the text of every chunk, in order.
    const textsOf = (chunks) => chunks.map((c) => c.text);
    // Passes when at least one chunk's text contains `needle`.
    const assertSomeChunkIncludes = (chunks, needle) => {
        assert.ok(chunks.some((c) => c.text.includes(needle)));
    };

    (0, mocha_1.it)('keeps a leading punctuation-only chunk ("---")', () => {
        const chunks = makeSplitter({ chunkSize: 3, chunkOverlap: 0 }).split('---');
        assert.deepStrictEqual(textsOf(chunks), ['---']);
    });

    (0, mocha_1.it)('keeps punctuation-only separators (---, ***, ====) at start, middle, and end', () => {
        const lines = ['---', 'Hello world', '***', 'Middle', '===='];
        const chunks = makeSplitter({ chunkSize: 4, chunkOverlap: 0 }).split(lines.join('\n'));
        assertSomeChunkIncludes(chunks, '---');
        assertSomeChunkIncludes(chunks, '***');
        assertSomeChunkIncludes(chunks, '====');
    });

    (0, mocha_1.it)('preserves frontmatter delimiters when chunk size is small and overlap is zero', () => {
        const md = [
            '---',
            'title: Test',
            'tags: [a, b]',
            '---',
            '# Heading',
            'Body text goes here.'
        ].join('\n');
        const chunks = makeSplitter({ chunkSize: 12, chunkOverlap: 0 }).split(md);
        const joined = textsOf(chunks).join('\n');
        // match() yields null when nothing matches; count that as zero delimiters.
        const delimiters = joined.match(/^---$/gm) || [];
        assert.strictEqual(delimiters.length, 2);
    });

    (0, mocha_1.it)('keeps trailing punctuation-only chunk', () => {
        const chunks = makeSplitter({ chunkSize: 4, chunkOverlap: 0 }).split('Content\n---');
        assertSomeChunkIncludes(chunks, '---');
    });

    (0, mocha_1.it)('drops pure whitespace-only chunks', () => {
        const splitter = makeSplitter({ chunkSize: 10, chunkOverlap: 0 });
        const counts = [' \t ', '\n\n', ' \n \n '].map((input) => splitter.split(input).length);
        for (const count of counts) {
            assert.strictEqual(count, 0);
        }
    });

    (0, mocha_1.it)('still returns alphanumeric chunks normally', () => {
        const chunks = makeSplitter({ chunkSize: 5, chunkOverlap: 0 }).split('abcde fghij');
        assert.ok(chunks.length > 0);
        assert.ok(textsOf(chunks).join(' ').includes('abcde'));
        assert.ok(textsOf(chunks).join(' ').includes('fghij'));
    });

    (0, mocha_1.it)('does not regress with non-zero overlap', () => {
        const chunks = makeSplitter({ chunkSize: 5, chunkOverlap: 2 }).split('---\nabcdef');
        assertSomeChunkIncludes(chunks, '---');
    });

    (0, mocha_1.it)('handles multiple punctuation-only separators interleaved with content', () => {
        const lines = ['***', 'A', '---', 'B', '====', 'C'];
        const chunks = makeSplitter({ chunkSize: 8, chunkOverlap: 0 }).split(lines.join('\n'));
        assertSomeChunkIncludes(chunks, '***');
        assertSomeChunkIncludes(chunks, '---');
        assertSomeChunkIncludes(chunks, '====');
        const joined = textsOf(chunks).join('\n');
        assert.ok(joined.includes('\nA\n'));
        assert.ok(joined.includes('\nB\n'));
        assert.ok(joined.includes('\nC'));
    });
});
109
+ //# sourceMappingURL=TextSplitter.spec.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"TextSplitter.spec.js","sourceRoot":"","sources":["../src/TextSplitter.spec.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,iCAAqC;AACrC,oDAAsC;AACtC,iDAA8C;AAE9C,IAAA,gBAAQ,EAAC,cAAc,EAAE,GAAG,EAAE;IAC5B,MAAM,YAAY,GAAG,CAAC,IAA6D,EAAE,EAAE,CACrF,IAAI,2BAAY,iBAAG,SAAS,EAAE,EAAE,EAAE,YAAY,EAAE,CAAC,IAAK,IAAI,EAAG,CAAC;IAEhE,IAAA,UAAE,EAAC,gDAAgD,EAAE,GAAG,EAAE;QACxD,MAAM,QAAQ,GAAG,YAAY,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,CAAC,CAAC;QACjE,MAAM,MAAM,GAAG,QAAQ,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;QACrC,MAAM,CAAC,eAAe,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,KAAK,CAAC,CAAC,CAAC;IAC3D,CAAC,CAAC,CAAC;IAEH,IAAA,UAAE,EAAC,8EAA8E,EAAE,GAAG,EAAE;QACtF,MAAM,QAAQ,GAAG,YAAY,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,CAAC,CAAC;QACjE,MAAM,IAAI,GAAG,CAAC,KAAK,EAAE,aAAa,EAAE,KAAK,EAAE,QAAQ,EAAE,MAAM,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACxE,MAAM,MAAM,GAAG,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAEpC,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QACpD,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QACpD,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;IACvD,CAAC,CAAC,CAAC;IAEH,IAAA,UAAE,EAAC,+EAA+E,EAAE,GAAG,EAAE;;QACvF,MAAM,QAAQ,GAAG,YAAY,CAAC,EAAE,SAAS,EAAE,EAAE,EAAE,YAAY,EAAE,CAAC,EAAE,CAAC,CAAC;QAClE,MAAM,EAAE,GAAG;YACT,KAAK;YACL,aAAa;YACb,cAAc;YACd,KAAK;YACL,WAAW;YACX,sBAAsB;SACvB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAEb,MAAM,MAAM,GAAG,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;QAClC,MAAM,MAAM,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAElD,MAAM,cAAc,GAAG,CAAC,MAAA,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,mCAAI,EAAE,CAAC,CAAC,MAAM,CAAC;QAC9D,MAAM,CAAC,WAAW,CAAC,cAAc,EAAE,CAAC,CAAC,CAAC;IACxC,CAAC,CAAC,CAAC;IAEH,IAAA,UAAE,EAAC,uCAAuC,EAAE,GAAG,EAAE;QAC/C,MAAM,QAAQ,GAAG,YA
AY,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,CAAC,CAAC;QACjE,MAAM,MAAM,GAAG,QAAQ,CAAC,KAAK,CAAC,cAAc,CAAC,CAAC;QAC9C,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;IACtD,CAAC,CAAC,CAAC;IAEH,IAAA,UAAE,EAAC,mCAAmC,EAAE,GAAG,EAAE;QAC3C,MAAM,QAAQ,GAAG,YAAY,CAAC,EAAE,SAAS,EAAE,EAAE,EAAE,YAAY,EAAE,CAAC,EAAE,CAAC,CAAC;QAClE,MAAM,OAAO,GAAG,QAAQ,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;QAC1C,MAAM,OAAO,GAAG,QAAQ,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;QACvC,MAAM,OAAO,GAAG,QAAQ,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC;QAC3C,MAAM,CAAC,WAAW,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;QACtC,MAAM,CAAC,WAAW,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;QACtC,MAAM,CAAC,WAAW,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IACxC,CAAC,CAAC,CAAC;IAEH,IAAA,UAAE,EAAC,4CAA4C,EAAE,GAAG,EAAE;QACpD,MAAM,QAAQ,GAAG,YAAY,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,CAAC,CAAC;QACjE,MAAM,MAAM,GAAG,QAAQ,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;QAC7C,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QAC7B,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC;QAC/D,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC;IACjE,CAAC,CAAC,CAAC;IAEH,IAAA,UAAE,EAAC,wCAAwC,EAAE,GAAG,EAAE;QAChD,MAAM,QAAQ,GAAG,YAAY,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,CAAC,CAAC;QACjE,MAAM,MAAM,GAAG,QAAQ,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;QAC7C,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;IACtD,CAAC,CAAC,CAAC;IAEH,IAAA,UAAE,EAAC,uEAAuE,EAAE,GAAG,EAAE;QAC/E,MAAM,QAAQ,GAAG,YAAY,CAAC,EAAE,SAAS,EAAE,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,CAAC,CAAC;QACjE,MAAM,IAAI,GAAG,CAAC,KAAK,EAAE,GAAG,EAAE,KAAK,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC9D,MAAM,MAAM,GAAG,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAEpC,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC
,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QACpD,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QACpD,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;QAErD,MAAM,MAAM,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAClD,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC;QACpC,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC;QACpC,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC;IACpC,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC"}
@@ -0,0 +1,15 @@
1
+ import { AxiosRequestConfig } from "axios";
2
+ import { TextFetcher } from './types';
3
/**
 * Configuration shape for WebFetcher; the WebFetcher constructor accepts a
 * Partial of this interface.
 */
+ export interface WebFetcherConfig {
4
/** Optional string-to-string map. NOTE(review): presumably extra HTTP request headers — confirm in src/WebFetcher.ts. */
+ headers?: Record<string, string>;
5
/** Optional axios request configuration. NOTE(review): presumably merged into the outgoing request — confirm. */
+ requestConfig?: AxiosRequestConfig;
6
/** NOTE(review): presumably enables converting fetched HTML to markdown — confirm against the implementation. */
+ htmlToMarkdown: boolean;
7
/** NOTE(review): meaning is not visible in this declaration file — confirm against the implementation. */
+ summarizeHtml: boolean;
8
+ }
9
/**
 * TextFetcher implementation declared for web resources.
 * Only the type surface is visible in this declaration file.
 */
+ export declare class WebFetcher implements TextFetcher {
10
+ private readonly _config;
11
/** @param config Optional partial WebFetcherConfig. NOTE(review): defaults for omitted fields are not visible here. */
+ constructor(config?: Partial<WebFetcherConfig>);
12
/**
 * @param uri Location of the resource to fetch.
 * @param onDocument Callback receiving a uri, the text and an optional docType; it resolves to a boolean.
 * @returns Promise resolving to a boolean. NOTE(review): presumably the value returned by onDocument — confirm.
 */
+ fetch(uri: string, onDocument: (uri: string, text: string, docType?: string) => Promise<boolean>): Promise<boolean>;
13
+ private htmlToMarkdown;
14
+ }
15
+ //# sourceMappingURL=WebFetcher.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"WebFetcher.d.ts","sourceRoot":"","sources":["../src/WebFetcher.ts"],"names":[],"mappings":"AAAA,OAAc,EAAE,kBAAkB,EAAE,MAAM,OAAO,CAAC;AAClD,OAAO,EAAE,WAAW,EAAE,MAAM,SAAS,CAAC;AA2BtC,MAAM,WAAW,gBAAgB;IAC7B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAC,MAAM,CAAC,CAAC;IAChC,aAAa,CAAC,EAAE,kBAAkB,CAAC;IACnC,cAAc,EAAE,OAAO,CAAC;IACxB,aAAa,EAAE,OAAO,CAAC;CAC1B;AAED,qBAAa,UAAW,YAAW,WAAW;IAC1C,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAmB;gBAExB,MAAM,CAAC,EAAE,OAAO,CAAC,gBAAgB,CAAC;IAOxC,KAAK,CAAC,GAAG,EAAE,MAAM,EAAE,UAAU,EAAE,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,MAAM,KAAK,OAAO,CAAC,OAAO,CAAC,GAAG,OAAO,CAAC,OAAO,CAAC;IAyChI,OAAO,CAAC,cAAc;CAmCzB"}