vectra 0.12.2 → 0.12.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.draft.md +499 -0
- package/README.draft.outline.md +160 -0
- package/README.research.md +2159 -0
- package/bin/vectra.js +3 -0
- package/lib/FileFetcher.d.ts +5 -0
- package/lib/FileFetcher.d.ts.map +1 -0
- package/lib/FileFetcher.js +79 -0
- package/lib/FileFetcher.js.map +1 -0
- package/lib/GPT3Tokenizer.d.ts +9 -0
- package/lib/ItemSelector.d.ts +41 -0
- package/lib/ItemSelector.d.ts.map +1 -0
- package/lib/ItemSelector.js +168 -0
- package/lib/ItemSelector.js.map +1 -0
- package/lib/LocalDocument.d.ts +54 -0
- package/lib/LocalDocument.js +156 -0
- package/lib/LocalDocument.js.map +1 -0
- package/lib/LocalDocumentIndex.d.ts +132 -0
- package/lib/LocalDocumentIndex.js +456 -0
- package/lib/LocalDocumentIndex.js.map +1 -0
- package/lib/LocalDocumentResult.d.ts +45 -0
- package/lib/LocalDocumentResult.js +328 -0
- package/lib/LocalDocumentResult.js.map +1 -0
- package/lib/LocalIndex.d.ts +150 -0
- package/lib/LocalIndex.d.ts.map +1 -1
- package/lib/LocalIndex.js +515 -0
- package/lib/LocalIndex.js.map +1 -0
- package/lib/LocalIndex.spec.d.ts +2 -0
- package/lib/LocalIndex.spec.js +218 -7
- package/lib/LocalIndex.spec.js.map +1 -1
- package/lib/OpenAIEmbeddings.d.ts +126 -0
- package/lib/OpenAIEmbeddings.d.ts.map +1 -0
- package/lib/OpenAIEmbeddings.js +174 -0
- package/lib/OpenAIEmbeddings.js.map +1 -0
- package/lib/TextSplitter.d.ts +19 -0
- package/lib/TextSplitter.d.ts.map +1 -1
- package/lib/TextSplitter.js +457 -0
- package/lib/TextSplitter.js.map +1 -0
- package/lib/TextSplitter.spec.d.ts +2 -0
- package/lib/TextSplitter.spec.d.ts.map +1 -0
- package/lib/TextSplitter.spec.js +109 -0
- package/lib/TextSplitter.spec.js.map +1 -0
- package/lib/WebFetcher.d.ts +15 -0
- package/lib/WebFetcher.d.ts.map +1 -0
- package/lib/WebFetcher.js +234 -0
- package/lib/WebFetcher.js.map +1 -0
- package/lib/index.d.ts +12 -0
- package/lib/index.js +28 -0
- package/lib/index.js.map +1 -0
- package/lib/internals/Colorize.d.ts +14 -0
- package/lib/internals/Colorize.d.ts.map +1 -0
- package/lib/internals/Colorize.js +64 -0
- package/lib/internals/Colorize.js.map +1 -0
- package/lib/internals/index.d.ts +3 -0
- package/lib/internals/index.d.ts.map +1 -0
- package/lib/internals/index.js +19 -0
- package/lib/internals/index.js.map +1 -0
- package/lib/internals/types.d.ts +43 -0
- package/lib/internals/types.d.ts.map +1 -0
- package/lib/internals/types.js +3 -0
- package/lib/internals/types.js.map +1 -0
- package/lib/types.d.ts +146 -0
- package/lib/types.d.ts.map +1 -0
- package/lib/types.js +3 -0
- package/lib/types.js.map +1 -0
- package/lib/vectra-cli.d.ts +2 -0
- package/lib/vectra-cli.js +323 -0
- package/lib/vectra-cli.js.map +1 -0
- package/package.json +3 -1
- package/src/LocalIndex.spec.ts +265 -8
- package/src/LocalIndex.ts +1 -0
- package/src/TextSplitter.spec.ts +87 -0
- package/src/TextSplitter.ts +459 -531
package/src/LocalIndex.spec.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import assert from 'node:assert'
|
|
2
|
+
import sinon from 'sinon'
|
|
2
3
|
import { LocalIndex } from './LocalIndex'
|
|
3
4
|
import { IndexItem } from './types'
|
|
4
5
|
import fs from 'fs/promises'
|
|
@@ -7,12 +8,20 @@ import path from 'path'
|
|
|
7
8
|
describe('LocalIndex', () => {
|
|
8
9
|
const testIndexDir = path.join(__dirname, 'test_index');
|
|
9
10
|
|
|
11
|
+
const basicIndexItems: Partial<IndexItem>[] = [
|
|
12
|
+
{ id: '1', vector: [1, 2, 3] },
|
|
13
|
+
{ id: '2', vector: [2, 3, 4] },
|
|
14
|
+
{ id: '3', vector: [3, 4, 5] }
|
|
15
|
+
];
|
|
16
|
+
|
|
17
|
+
|
|
10
18
|
beforeEach(async () => {
|
|
11
19
|
await fs.rm(testIndexDir, { recursive: true, force: true });
|
|
12
20
|
});
|
|
13
21
|
|
|
14
22
|
afterEach(async () => {
|
|
15
23
|
await fs.rm(testIndexDir, { recursive: true, force: true });
|
|
24
|
+
sinon.restore();
|
|
16
25
|
});
|
|
17
26
|
|
|
18
27
|
it('should create a new index', async () => {
|
|
@@ -20,20 +29,171 @@ describe('LocalIndex', () => {
|
|
|
20
29
|
await index.createIndex();
|
|
21
30
|
const created = await index.isIndexCreated();
|
|
22
31
|
assert.equal(created, true);
|
|
32
|
+
assert.equal(index.folderPath, testIndexDir);
|
|
23
33
|
});
|
|
24
34
|
|
|
25
|
-
|
|
26
|
-
const
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
35
|
+
it('blocks concurrent operations when lock is held', async () => {
|
|
36
|
+
const index = new LocalIndex(testIndexDir);
|
|
37
|
+
await index.createIndex();
|
|
38
|
+
await index.beginUpdate(); // grab lock for a big update!
|
|
39
|
+
await assert.rejects(async () => {
|
|
40
|
+
await index.beginUpdate(); // try to grab lock again. should fail!
|
|
41
|
+
}, new Error('Update already in progress'))
|
|
42
|
+
})
|
|
43
|
+
|
|
44
|
+
describe('createIndex', () => {
|
|
45
|
+
it('checks for existing index on creation', async () => {
|
|
46
|
+
const index = new LocalIndex(testIndexDir);
|
|
47
|
+
await index.createIndex(); // create first index.json
|
|
48
|
+
|
|
49
|
+
// create without deleteIfExists. Will reject
|
|
50
|
+
await assert.rejects(async () => {
|
|
51
|
+
await index.createIndex()
|
|
52
|
+
}, new Error('Index already exists'))
|
|
53
|
+
|
|
54
|
+
// create with deleteIfExists. Should remove old data
|
|
55
|
+
await index.insertItem({id:'1', vector: [1,2,3]})
|
|
56
|
+
const lengthBefore = (await index.listItems()).length
|
|
57
|
+
assert.equal(lengthBefore, 1)
|
|
58
|
+
await index.createIndex({deleteIfExists: true, version: 2, metadata_config: {}})
|
|
59
|
+
const lengthAfter = (await index.listItems()).length
|
|
60
|
+
assert.equal(lengthAfter, 0)
|
|
61
|
+
})
|
|
62
|
+
|
|
63
|
+
it('delete index if file creation fails', async () => {
|
|
64
|
+
const index = new LocalIndex(testIndexDir);
|
|
65
|
+
sinon.stub(fs, 'writeFile').rejects(new Error('fs error'))
|
|
66
|
+
|
|
67
|
+
await assert.rejects(async () => {
|
|
68
|
+
await index.createIndex();
|
|
69
|
+
}, new Error('Error creating index'))
|
|
70
|
+
|
|
71
|
+
await assert.rejects(async () => {
|
|
72
|
+
await index.listItems();
|
|
73
|
+
})
|
|
74
|
+
})
|
|
75
|
+
})
|
|
76
|
+
|
|
77
|
+
describe('deleteItem', () => {
|
|
78
|
+
it('does nothing when id not found', async () => {
|
|
79
|
+
const index = new LocalIndex(testIndexDir);
|
|
80
|
+
await index.createIndex();
|
|
81
|
+
await index.beginUpdate();
|
|
82
|
+
await index.insertItem(basicIndexItems[0])
|
|
83
|
+
await index.insertItem(basicIndexItems[1])
|
|
84
|
+
await index.insertItem(basicIndexItems[2])
|
|
85
|
+
await index.endUpdate();
|
|
86
|
+
|
|
87
|
+
await assert.doesNotReject(async () => {
|
|
88
|
+
await index.deleteItem('dne');
|
|
89
|
+
})
|
|
90
|
+
assert.equal((await index.listItems()).length, 3)
|
|
91
|
+
})
|
|
92
|
+
|
|
93
|
+
it('leaves existing empty index when last el deleted', async () => {
|
|
94
|
+
const index = new LocalIndex(testIndexDir);
|
|
95
|
+
await index.createIndex();
|
|
96
|
+
await index.insertItem(basicIndexItems[0]);
|
|
97
|
+
|
|
98
|
+
await index.deleteItem(basicIndexItems[0].id ?? '');
|
|
99
|
+
assert.equal(await index.isIndexCreated(), true);
|
|
100
|
+
assert.equal((await index.listItems()).length, 0);
|
|
101
|
+
})
|
|
102
|
+
|
|
103
|
+
it('removes elements from any position', async () => {
|
|
104
|
+
const index = new LocalIndex(testIndexDir);
|
|
105
|
+
await index.createIndex();
|
|
106
|
+
await index.batchInsertItems([
|
|
107
|
+
{id: '1', vector: []},
|
|
108
|
+
{id: '2', vector: []},
|
|
109
|
+
{id: '3', vector: []},
|
|
110
|
+
{id: '4', vector: []},
|
|
111
|
+
{id: '5', vector: []},
|
|
112
|
+
]);
|
|
113
|
+
|
|
114
|
+
await index.beginUpdate();
|
|
115
|
+
await index.deleteItem('1');
|
|
116
|
+
await index.deleteItem('3');
|
|
117
|
+
await index.deleteItem('5');
|
|
118
|
+
await index.endUpdate();
|
|
119
|
+
|
|
120
|
+
assert.deepStrictEqual(await index.listItems(), [{id: '2', vector: [], metadata: {}, norm: 0}, {id: '4', vector: [], metadata: {}, norm: 0}])
|
|
121
|
+
})
|
|
122
|
+
})
|
|
123
|
+
|
|
124
|
+
describe('endUpdate', () => {
|
|
125
|
+
it('throws an error if no update has begun', async () => {
|
|
126
|
+
const index = new LocalIndex(testIndexDir);
|
|
127
|
+
|
|
128
|
+
await assert.rejects(async () => {
|
|
129
|
+
await index.endUpdate();
|
|
130
|
+
}, new Error('No update in progress'));
|
|
131
|
+
})
|
|
132
|
+
|
|
133
|
+
it('throws an error if the index could not be saved', async () => {
|
|
134
|
+
const index = new LocalIndex(testIndexDir, 'index.json');
|
|
135
|
+
await index.createIndex();
|
|
136
|
+
await index.beginUpdate();
|
|
137
|
+
|
|
138
|
+
sinon.stub(fs, 'writeFile').rejects(new Error('fs error'))
|
|
139
|
+
|
|
140
|
+
await assert.rejects(async () => {
|
|
141
|
+
await index.endUpdate();
|
|
142
|
+
}, new Error('Error saving index: Error: fs error'))
|
|
143
|
+
})
|
|
144
|
+
})
|
|
145
|
+
|
|
146
|
+
describe('getIndexStats', () => {
|
|
147
|
+
it('reports empty index correctly', async () => {
|
|
148
|
+
const index = new LocalIndex(testIndexDir);
|
|
149
|
+
await index.createIndex();
|
|
150
|
+
|
|
151
|
+
assert.deepStrictEqual(await index.getIndexStats(), {
|
|
152
|
+
version: 1,
|
|
153
|
+
metadata_config: {},
|
|
154
|
+
items: 0
|
|
155
|
+
})
|
|
156
|
+
})
|
|
157
|
+
|
|
158
|
+
it('correctly reports non-empty index stats', async () => {
|
|
159
|
+
const index = new LocalIndex(testIndexDir)
|
|
160
|
+
await index.createIndex({version: 1, metadata_config: {indexed: []}})
|
|
161
|
+
await index.batchInsertItems(basicIndexItems);
|
|
162
|
+
|
|
163
|
+
assert.deepStrictEqual(await index.getIndexStats(), {
|
|
164
|
+
version: 1,
|
|
165
|
+
metadata_config: {indexed: []},
|
|
166
|
+
items: 3
|
|
167
|
+
})
|
|
168
|
+
})
|
|
169
|
+
})
|
|
170
|
+
|
|
171
|
+
describe('getItem', () => {
|
|
172
|
+
it('returns undefined when item not found', async () => {
|
|
173
|
+
const index = new LocalIndex(testIndexDir);
|
|
174
|
+
await index.createIndex();
|
|
175
|
+
|
|
176
|
+
assert.equal(await index.getItem('1'), undefined)
|
|
177
|
+
})
|
|
31
178
|
|
|
179
|
+
it('returns requested item', async () => {
|
|
180
|
+
const index = new LocalIndex(testIndexDir);
|
|
181
|
+
await index.createIndex();
|
|
182
|
+
await index.batchInsertItems(basicIndexItems);
|
|
183
|
+
|
|
184
|
+
const item2 = await index.getItem('2');
|
|
185
|
+
assert.equal(item2?.id, basicIndexItems[1].id)
|
|
186
|
+
assert.equal(item2?.vector, basicIndexItems[1].vector)
|
|
187
|
+
assert.equal((await index.listItems()).length, 3)
|
|
188
|
+
})
|
|
189
|
+
})
|
|
190
|
+
|
|
191
|
+
describe('batchInsertItems', () => {
|
|
32
192
|
it('should insert provided items', async () => {
|
|
33
193
|
const index = new LocalIndex(testIndexDir);
|
|
34
194
|
await index.createIndex();
|
|
35
195
|
|
|
36
|
-
const newItems = await index.batchInsertItems(
|
|
196
|
+
const newItems = await index.batchInsertItems(basicIndexItems);
|
|
37
197
|
|
|
38
198
|
assert.equal(newItems.length, 3);
|
|
39
199
|
|
|
@@ -50,7 +210,7 @@ describe('LocalIndex', () => {
|
|
|
50
210
|
// ensures insert error is bubbled up to the batchInsertItems caller
|
|
51
211
|
await assert.rejects(
|
|
52
212
|
async () => {
|
|
53
|
-
await index.batchInsertItems(
|
|
213
|
+
await index.batchInsertItems(basicIndexItems);
|
|
54
214
|
},
|
|
55
215
|
{
|
|
56
216
|
name: 'Error',
|
|
@@ -63,4 +223,101 @@ describe('LocalIndex', () => {
|
|
|
63
223
|
assert.equal(storedItems.length, 1);
|
|
64
224
|
});
|
|
65
225
|
});
|
|
226
|
+
|
|
227
|
+
describe('listItemsByMetadata', () => {
|
|
228
|
+
it('returns items matching metadata filter', async () => {
|
|
229
|
+
const index = new LocalIndex(testIndexDir);
|
|
230
|
+
await index.createIndex();
|
|
231
|
+
await index.batchInsertItems([
|
|
232
|
+
{id: '1', vector: [], metadata: {category: 'food'}},
|
|
233
|
+
{id: '2', vector: [], metadata: {category: 'food'}},
|
|
234
|
+
{id: '3', vector: [], metadata: {category: 'electronics'}},
|
|
235
|
+
{id: '4', vector: [], metadata: {category: 'drink'}},
|
|
236
|
+
{id: '5', vector: [], metadata: {category: 'food'}},
|
|
237
|
+
]);
|
|
238
|
+
|
|
239
|
+
const foodItems = await index.listItemsByMetadata({category: {'$eq': 'food'}})
|
|
240
|
+
assert.deepStrictEqual(foodItems.map((item) => item.id), ["1", "2", "5"])
|
|
241
|
+
const drinkItems = await index.listItemsByMetadata({category: {'$eq': 'drink'}})
|
|
242
|
+
assert.deepStrictEqual(drinkItems.map((item) => item.id), ["4"])
|
|
243
|
+
const clothingItems = await index.listItemsByMetadata({category: {'$eq': 'clothes'}})
|
|
244
|
+
assert.deepStrictEqual(clothingItems, [])
|
|
245
|
+
})
|
|
246
|
+
|
|
247
|
+
it('returns nothing when no items in index', async () => {
|
|
248
|
+
const index = new LocalIndex(testIndexDir);
|
|
249
|
+
await index.createIndex();
|
|
250
|
+
|
|
251
|
+
const items = await index.listItemsByMetadata({});
|
|
252
|
+
assert.deepStrictEqual(items, []);
|
|
253
|
+
})
|
|
254
|
+
});
|
|
255
|
+
|
|
256
|
+
describe("queryItems", () => {
|
|
257
|
+
it("returns empty array on empty index search", async () => {
|
|
258
|
+
const index = new LocalIndex(testIndexDir);
|
|
259
|
+
await index.createIndex();
|
|
260
|
+
|
|
261
|
+
const result = await index.queryItems([1, 2, 3], "", 10);
|
|
262
|
+
assert.deepStrictEqual(result, []);
|
|
263
|
+
});
|
|
264
|
+
|
|
265
|
+
it("returns bad match when no better match exists", async () => {
|
|
266
|
+
const index = new LocalIndex(testIndexDir);
|
|
267
|
+
await index.createIndex();
|
|
268
|
+
await index.insertItem({ id: "1", vector: [0.9, 0, 0, 0, 0] });
|
|
269
|
+
|
|
270
|
+
const result = await index.queryItems([0, 0, 0, 0, 0.1], "", 1);
|
|
271
|
+
assert.equal(result[0]?.score, 0);
|
|
272
|
+
assert.equal(result[0]?.item.id, "1");
|
|
273
|
+
});
|
|
274
|
+
|
|
275
|
+
it("returns all vectors when fewer than topK exist", async () => {
|
|
276
|
+
const index = new LocalIndex(testIndexDir);
|
|
277
|
+
await index.createIndex();
|
|
278
|
+
await index.batchInsertItems(basicIndexItems);
|
|
279
|
+
|
|
280
|
+
const result = await index.queryItems([0, 0, 1], "", 10);
|
|
281
|
+
assert.equal(result.length, 3);
|
|
282
|
+
assert.deepStrictEqual(
|
|
283
|
+
result.map(({ item }) => item.id),
|
|
284
|
+
basicIndexItems.map((item) => item.id),
|
|
285
|
+
);
|
|
286
|
+
});
|
|
287
|
+
|
|
288
|
+
it("filters by metadata when filter provided", async () => {
|
|
289
|
+
const index = new LocalIndex(testIndexDir);
|
|
290
|
+
await index.createIndex();
|
|
291
|
+
await index.batchInsertItems([
|
|
292
|
+
{ id: "1", vector: [1, 0, 0], metadata: { category: "food" } },
|
|
293
|
+
{ id: "2", vector: [0, 0, 1], metadata: { category: "drink" } },
|
|
294
|
+
]);
|
|
295
|
+
|
|
296
|
+
const bestGeneralMatch = await index.queryItems([1, 0, 0], "", 1);
|
|
297
|
+
const bestDrinkMatch = await index.queryItems([1, 0, 0], "", 1, {
|
|
298
|
+
category: { $eq: "drink" },
|
|
299
|
+
});
|
|
300
|
+
|
|
301
|
+
assert.equal(bestGeneralMatch[0].item.id, "1");
|
|
302
|
+
assert.equal(bestDrinkMatch[0].item.id, "2");
|
|
303
|
+
});
|
|
304
|
+
|
|
305
|
+
it("reads item metadata file when provided", async () => {
|
|
306
|
+
const index = new LocalIndex(testIndexDir);
|
|
307
|
+
await index.createIndex({version: 1, metadata_config: {indexed: ['category']}});
|
|
308
|
+
await index.batchInsertItems([
|
|
309
|
+
{ id: "1", vector: [1, 0, 0] },
|
|
310
|
+
{ id: "2", vector: [0, 0, 1], metadata: {category: 'drink'} },
|
|
311
|
+
]);
|
|
312
|
+
|
|
313
|
+
sinon
|
|
314
|
+
.stub(fs, "readFile")
|
|
315
|
+
.resolves(JSON.stringify({ category: "drink" }));
|
|
316
|
+
|
|
317
|
+
const bestDrinkMatch = await index.queryItems([1, 0, 0], "", 2, {category: {'$eq': 'drink'}});
|
|
318
|
+
|
|
319
|
+
assert.notEqual(bestDrinkMatch[0].item.metadataFile, undefined);
|
|
320
|
+
assert.equal(bestDrinkMatch[0].item.id, "2");
|
|
321
|
+
});
|
|
322
|
+
});
|
|
66
323
|
});
|
package/src/LocalIndex.ts
CHANGED
|
@@ -275,6 +275,7 @@ export class LocalIndex<TMetadata extends Record<string,MetadataTypes> = Record<
|
|
|
275
275
|
* This method loads the index into memory and returns the top k items that are most similar.
|
|
276
276
|
* An optional filter can be applied to the metadata of the items.
|
|
277
277
|
* @param vector Vector to query against.
|
|
278
|
+
* @param query Query text used for BM25 keyword search.
|
|
278
279
|
* @param topK Number of items to return.
|
|
279
280
|
* @param filter Optional. Filter to apply.
|
|
280
281
|
* @returns Similar items to the vector that match the supplied filter.
|
package/src/TextSplitter.spec.ts
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
import { describe, it } from 'mocha';
|
|
2
|
+
import * as assert from 'node:assert';
|
|
3
|
+
import { TextSplitter } from './TextSplitter';
|
|
4
|
+
|
|
5
|
+
describe('TextSplitter', () => {
|
|
6
|
+
const makeSplitter = (opts?: Partial<ConstructorParameters<typeof TextSplitter>[0]>) =>
|
|
7
|
+
new TextSplitter({ chunkSize: 16, chunkOverlap: 0, ...opts });
|
|
8
|
+
|
|
9
|
+
it('keeps a leading punctuation-only chunk ("---")', () => {
|
|
10
|
+
const splitter = makeSplitter({ chunkSize: 3, chunkOverlap: 0 });
|
|
11
|
+
const chunks = splitter.split('---');
|
|
12
|
+
assert.deepStrictEqual(chunks.map(c => c.text), ['---']);
|
|
13
|
+
});
|
|
14
|
+
|
|
15
|
+
it('keeps punctuation-only separators (---, ***, ====) at start, middle, and end', () => {
|
|
16
|
+
const splitter = makeSplitter({ chunkSize: 4, chunkOverlap: 0 });
|
|
17
|
+
const text = ['---', 'Hello world', '***', 'Middle', '===='].join('\n');
|
|
18
|
+
const chunks = splitter.split(text);
|
|
19
|
+
|
|
20
|
+
assert.ok(chunks.some(c => c.text.includes('---')));
|
|
21
|
+
assert.ok(chunks.some(c => c.text.includes('***')));
|
|
22
|
+
assert.ok(chunks.some(c => c.text.includes('====')));
|
|
23
|
+
});
|
|
24
|
+
|
|
25
|
+
it('preserves frontmatter delimiters when chunk size is small and overlap is zero', () => {
|
|
26
|
+
const splitter = makeSplitter({ chunkSize: 12, chunkOverlap: 0 });
|
|
27
|
+
const md = [
|
|
28
|
+
'---',
|
|
29
|
+
'title: Test',
|
|
30
|
+
'tags: [a, b]',
|
|
31
|
+
'---',
|
|
32
|
+
'# Heading',
|
|
33
|
+
'Body text goes here.'
|
|
34
|
+
].join('\n');
|
|
35
|
+
|
|
36
|
+
const chunks = splitter.split(md);
|
|
37
|
+
const joined = chunks.map(c => c.text).join('\n');
|
|
38
|
+
|
|
39
|
+
const delimiterCount = (joined.match(/^---$/gm) ?? []).length;
|
|
40
|
+
assert.strictEqual(delimiterCount, 2);
|
|
41
|
+
});
|
|
42
|
+
|
|
43
|
+
it('keeps trailing punctuation-only chunk', () => {
|
|
44
|
+
const splitter = makeSplitter({ chunkSize: 4, chunkOverlap: 0 });
|
|
45
|
+
const chunks = splitter.split('Content\n---');
|
|
46
|
+
assert.ok(chunks.some(c => c.text.includes('---')));
|
|
47
|
+
});
|
|
48
|
+
|
|
49
|
+
it('drops pure whitespace-only chunks', () => {
|
|
50
|
+
const splitter = makeSplitter({ chunkSize: 10, chunkOverlap: 0 });
|
|
51
|
+
const chunks1 = splitter.split(' \t ');
|
|
52
|
+
const chunks2 = splitter.split('\n\n');
|
|
53
|
+
const chunks3 = splitter.split(' \n \n ');
|
|
54
|
+
assert.strictEqual(chunks1.length, 0);
|
|
55
|
+
assert.strictEqual(chunks2.length, 0);
|
|
56
|
+
assert.strictEqual(chunks3.length, 0);
|
|
57
|
+
});
|
|
58
|
+
|
|
59
|
+
it('still returns alphanumeric chunks normally', () => {
|
|
60
|
+
const splitter = makeSplitter({ chunkSize: 5, chunkOverlap: 0 });
|
|
61
|
+
const chunks = splitter.split('abcde fghij');
|
|
62
|
+
assert.ok(chunks.length > 0);
|
|
63
|
+
assert.ok(chunks.map(c => c.text).join(' ').includes('abcde'));
|
|
64
|
+
assert.ok(chunks.map(c => c.text).join(' ').includes('fghij'));
|
|
65
|
+
});
|
|
66
|
+
|
|
67
|
+
it('does not regress with non-zero overlap', () => {
|
|
68
|
+
const splitter = makeSplitter({ chunkSize: 5, chunkOverlap: 2 });
|
|
69
|
+
const chunks = splitter.split('---\nabcdef');
|
|
70
|
+
assert.ok(chunks.some(c => c.text.includes('---')));
|
|
71
|
+
});
|
|
72
|
+
|
|
73
|
+
it('handles multiple punctuation-only separators interleaved with content', () => {
|
|
74
|
+
const splitter = makeSplitter({ chunkSize: 8, chunkOverlap: 0 });
|
|
75
|
+
const text = ['***', 'A', '---', 'B', '====', 'C'].join('\n');
|
|
76
|
+
const chunks = splitter.split(text);
|
|
77
|
+
|
|
78
|
+
assert.ok(chunks.some(c => c.text.includes('***')));
|
|
79
|
+
assert.ok(chunks.some(c => c.text.includes('---')));
|
|
80
|
+
assert.ok(chunks.some(c => c.text.includes('====')));
|
|
81
|
+
|
|
82
|
+
const joined = chunks.map(c => c.text).join('\n');
|
|
83
|
+
assert.ok(joined.includes('\nA\n'));
|
|
84
|
+
assert.ok(joined.includes('\nB\n'));
|
|
85
|
+
assert.ok(joined.includes('\nC'));
|
|
86
|
+
});
|
|
87
|
+
});
|