@crashbytes/semantic-text-toolkit 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,582 @@
1
+ /**
2
+ * SemanticSearch Test Suite
3
+ *
4
+ * Comprehensive validation of semantic search functionality including
5
+ * indexing, searching, filtering, and index management.
6
+ */
7
+
8
+ // Replace @xenova/transformers with a stub whose only export, `pipeline`, is a bare jest.fn(); declared ahead of the imports below
9
+ jest.mock('@xenova/transformers', () => ({
10
+ pipeline: jest.fn(),
11
+ }));
12
+
13
+ import { SemanticSearch, IndexedItem } from '../SemanticSearch';
14
+ import { SemanticEngine } from '../../engine/SemanticEngine';
15
+ import { SemanticError, SemanticErrorCode } from '../../types';
16
+
17
+ // Mock the SemanticEngine module; no factory is passed, so Jest's automatic mock is presumably what applies (confirm against Jest config)
18
+ jest.mock('../../engine/SemanticEngine');
19
+
20
+ const MockedSemanticEngine = SemanticEngine as jest.MockedClass<typeof SemanticEngine>; // same class, cast to its mocked type so tests can construct mock instances
21
+
22
+ describe('SemanticSearch', () => {
23
+ let mockEngine: jest.Mocked<SemanticEngine>;
24
+ let search: SemanticSearch<string>;
25
+
26
/**
 * Builds a deterministic pseudo-embedding for mocked engine responses.
 *
 * Component `i` equals `Math.sin(seed + i) * 0.1`, so equal seeds always give
 * equal vectors and every component lies within [-0.1, 0.1].
 *
 * @param seed - Offset added to the component index before taking the sine.
 * @param dimensions - Length of the vector; defaults to 384, the size this
 *   suite uses everywhere.
 * @returns A freshly allocated array of `dimensions` numbers.
 */
const createMockEmbedding = (seed: number, dimensions = 384): number[] =>
  Array.from({ length: dimensions }, (_, i) => Math.sin(seed + i) * 0.1);
29
+
30
// Rebuild the mocked engine and a fresh SemanticSearch before every test so
// neither mock call history nor indexed items carry over between cases.
beforeEach(() => {
  jest.clearAllMocks();

  mockEngine = new MockedSemanticEngine() as jest.Mocked<SemanticEngine>;

  // Batch embedding: one result per input text, seeded by the text length
  // plus the text's position within the batch.
  mockEngine.embedBatch = jest.fn().mockImplementation((texts: string[]) => {
    const batch = texts.map((text, position) => {
      const embedding = createMockEmbedding(text.length + position);
      return {
        embedding,
        text,
        metadata: { dimensions: 384, modelName: 'test-model', processingTime: 10 },
      };
    });
    return Promise.resolve(batch);
  });

  // Single-text embedding: seeded by the text length alone.
  mockEngine.embed = jest.fn().mockImplementation((text: string) => {
    const embedding = createMockEmbedding(text.length);
    return Promise.resolve({
      embedding,
      text,
      metadata: { dimensions: 384, modelName: 'test-model', processingTime: 5 },
    });
  });

  search = new SemanticSearch(mockEngine);
});
66
+
67
// Construction smoke tests: default settings plus each optional config field.
describe('constructor', () => {
  it('creates search with default configuration', () => {
    // Nothing has been indexed into the shared instance at this point.
    expect(search.getStats().itemCount).toBe(0);
  });

  it('accepts custom configuration', () => {
    const tuned = new SemanticSearch(mockEngine, { topK: 5, threshold: 0.5 });

    expect(tuned).toBeDefined();
  });

  it('accepts custom text extractor', () => {
    interface Article {
      title: string;
      content: string;
    }

    const articleSearch = new SemanticSearch<Article>(mockEngine, {
      textExtractor: ({ title, content }) => `${title} ${content}`,
    });

    expect(articleSearch).toBeDefined();
  });

  it('accepts custom metadata extractor', () => {
    interface Row {
      id: number;
      text: string;
    }

    const rowSearch = new SemanticSearch<Row>(mockEngine, {
      textExtractor: ({ text }) => text,
      metadataExtractor: ({ id }) => ({ id }),
    });

    expect(rowSearch).toBeDefined();
  });
});
109
+
110
// Covers index(): input validation, append vs. replace semantics, the
// arguments forwarded to the engine, and the two extractor hooks.
describe('index', () => {
  it('indexes array of strings', async () => {
    await search.index(['Hello', 'World', 'Test']);

    const { itemCount, dimensions } = search.getStats();
    expect(itemCount).toBe(3);
    expect(dimensions).toBe(384);
  });

  it('throws on empty array', async () => {
    // Capture one rejection and assert both its class and its code on it.
    // Checking the code inside a try/catch would silently skip the assertion
    // whenever the call unexpectedly resolved.
    const attempt = search.index([]);

    await expect(attempt).rejects.toThrow(SemanticError);
    await expect(attempt).rejects.toHaveProperty('code', SemanticErrorCode.INVALID_INPUT);
  });

  it('throws on non-array input', async () => {
    // `as any` deliberately bypasses the compiler so the runtime guard is exercised.
    await expect(search.index(null as any)).rejects.toThrow(SemanticError);
    await expect(search.index('test' as any)).rejects.toThrow(SemanticError);
  });

  it('appends to existing index by default', async () => {
    await search.index(['A', 'B']);
    await search.index(['C', 'D']);

    expect(search.getStats().itemCount).toBe(4);
  });

  it('replaces index when replace=true', async () => {
    await search.index(['A', 'B', 'C']);
    await search.index(['X', 'Y'], true);

    expect(search.getStats().itemCount).toBe(2);
  });

  it('calls embedBatch with batch size 32', async () => {
    await search.index(['test']);

    expect(mockEngine.embedBatch).toHaveBeenCalledWith(['test'], { batchSize: 32 });
  });

  it('uses custom text extractor', async () => {
    interface Doc {
      title: string;
    }

    const docSearch = new SemanticSearch<Doc>(mockEngine, {
      textExtractor: ({ title }) => title,
    });

    await docSearch.index([{ title: 'Hello' }, { title: 'World' }]);

    // The engine must receive the extracted titles, not the raw objects.
    expect(mockEngine.embedBatch).toHaveBeenCalledWith(['Hello', 'World'], { batchSize: 32 });
  });

  it('uses custom metadata extractor', async () => {
    interface Doc {
      id: number;
      text: string;
    }

    const docSearch = new SemanticSearch<Doc>(mockEngine, {
      textExtractor: ({ text }) => text,
      metadataExtractor: ({ id }) => ({ docId: id }),
    });

    await docSearch.index([
      { id: 1, text: 'Hello' },
      { id: 2, text: 'World' },
    ]);

    const [first, second] = docSearch.exportIndex();
    expect(first.metadata).toEqual({ docId: 1 });
    expect(second.metadata).toEqual({ docId: 2 });
  });
});
197
+
198
// Covers search(): result shape, the empty-index error, topK/threshold
// handling, and rank/score ordering.
describe('search', () => {
  // Every case starts from the same five indexed strings.
  beforeEach(async () => {
    await search.index(['apple', 'banana', 'cherry', 'date', 'elderberry']);
  });

  it('returns search results', async () => {
    const results = await search.search('fruit');

    expect(results.length).toBeGreaterThan(0);

    const [top] = results;
    expect(top).toHaveProperty('item');
    expect(top).toHaveProperty('score');
    expect(top).toHaveProperty('rank');
  });

  it('throws on empty index', async () => {
    const emptySearch = new SemanticSearch(mockEngine);

    // Assert class, code and message on a single captured rejection. Doing so
    // inside a try/catch would skip the checks if the call ever resolved.
    const attempt = emptySearch.search('test');

    await expect(attempt).rejects.toThrow(SemanticError);
    await expect(attempt).rejects.toThrow('Index is empty');
    await expect(attempt).rejects.toHaveProperty('code', SemanticErrorCode.INVALID_INPUT);
  });

  it('respects topK configuration', async () => {
    const results = await search.search('fruit', { topK: 3 });

    expect(results.length).toBeLessThanOrEqual(3);
  });

  it('respects threshold configuration', async () => {
    const results = await search.search('fruit', { threshold: 0.99 });

    // A threshold this high may leave no results at all; whatever does come
    // back must meet it.
    for (const { score } of results) {
      expect(score).toBeGreaterThanOrEqual(0.99);
    }
  });

  it('assigns correct ranks', async () => {
    const results = await search.search('fruit', { topK: 5 });

    // Ranks are 1-based and follow the result order.
    results.forEach(({ rank }, position) => {
      expect(rank).toBe(position + 1);
    });
  });

  it('orders results by score descending', async () => {
    const results = await search.search('fruit', { topK: 5 });

    // Compare every result with its predecessor.
    results.slice(1).forEach((current, offset) => {
      expect(results[offset].score).toBeGreaterThanOrEqual(current.score);
    });
  });

  it('uses default topK of 10', async () => {
    // Replace the index with more than 10 items.
    const items = Array.from({ length: 15 }, (_, i) => `item${i}`);
    await search.index(items, true);

    const results = await search.search('item');

    expect(results.length).toBeLessThanOrEqual(10);
  });
});
268
+
269
// Covers searchWithFilter(): metadata predicates, index preservation,
// empty filter matches, config overrides, and items lacking metadata.
describe('searchWithFilter', () => {
  interface Document {
    text: string;
    category: string;
  }

  let docSearch: SemanticSearch<Document>;

  // Five dishes across three categories; the category is exposed as metadata.
  beforeEach(async () => {
    docSearch = new SemanticSearch<Document>(mockEngine, {
      textExtractor: (doc) => doc.text,
      metadataExtractor: (doc) => ({ category: doc.category }),
    });

    await docSearch.index([
      { text: 'apple pie', category: 'dessert' },
      { text: 'banana bread', category: 'dessert' },
      { text: 'chicken soup', category: 'main' },
      { text: 'beef stew', category: 'main' },
      { text: 'fruit salad', category: 'appetizer' },
    ]);
  });

  it('filters results by metadata', async () => {
    const results = await docSearch.searchWithFilter(
      'food',
      (metadata) => metadata.category === 'dessert'
    );

    for (const { item } of results) {
      expect(item.category).toBe('dessert');
    }
  });

  it('restores original index after filtering', async () => {
    const beforeCount = docSearch.getStats().itemCount;

    await docSearch.searchWithFilter(
      'food',
      (metadata) => metadata.category === 'dessert'
    );

    expect(docSearch.getStats().itemCount).toBe(beforeCount);
  });

  it('handles filter that matches no items', async () => {
    // Two outcomes are accepted when the filter matches nothing: an empty
    // result list, or an INVALID_INPUT SemanticError. The call is settled
    // first and asserted on afterwards, so a failing expectation is never
    // swallowed by the catch and misreported as the wrong error type.
    let results: unknown;
    let failure: unknown;
    try {
      results = await docSearch.searchWithFilter(
        'food',
        (metadata) => metadata.category === 'nonexistent'
      );
    } catch (error) {
      failure = error;
    }

    if (failure === undefined) {
      expect(results).toEqual([]);
    } else {
      expect(failure).toBeInstanceOf(SemanticError);
      expect((failure as SemanticError).code).toBe(SemanticErrorCode.INVALID_INPUT);
    }
  });

  it('handles empty metadata', async () => {
    // The shared string search is built without a metadata extractor.
    await search.index(['a', 'b', 'c']);

    const results = await search.searchWithFilter('test', () => true);

    expect(results.length).toBeGreaterThan(0);
  });

  it('respects config overrides', async () => {
    const results = await docSearch.searchWithFilter('food', () => true, { topK: 2 });

    expect(results.length).toBeLessThanOrEqual(2);
  });

  it('handles items with undefined metadata', async () => {
    // Import items that carry no metadata property at all.
    const itemsWithNoMetadata: IndexedItem<string>[] = [
      { item: 'test1', embedding: createMockEmbedding(1) },
      { item: 'test2', embedding: createMockEmbedding(2) },
    ];

    search.importIndex(itemsWithNoMetadata);

    // Filtering must still complete and return an array.
    const results = await search.searchWithFilter('test', () => true);

    expect(Array.isArray(results)).toBe(true);
  });
});
371
+
372
// Covers findSimilar(): it embeds the given item's text and honours config overrides.
describe('findSimilar', () => {
  // Three indexed strings are enough for every case below.
  beforeEach(async () => {
    await search.index(['apple', 'banana', 'cherry']);
  });

  it('finds items similar to given item', async () => {
    const matches = await search.findSimilar('apple');

    expect(matches.length).toBeGreaterThan(0);
    expect(mockEngine.embed).toHaveBeenCalledWith('apple');
  });

  it('respects config overrides', async () => {
    const matches = await search.findSimilar('apple', { topK: 1 });

    expect(matches.length).toBeLessThanOrEqual(1);
  });

  it('uses text extractor for complex types', async () => {
    interface Doc {
      title: string;
    }

    const titled = new SemanticSearch<Doc>(mockEngine, {
      textExtractor: ({ title }) => title,
    });
    const probe: Doc = { title: 'Test' };

    await titled.index([{ title: 'Hello' }, { title: 'World' }]);
    await titled.findSimilar(probe);

    // The probe is embedded through its extracted title.
    expect(mockEngine.embed).toHaveBeenCalledWith('Test');
  });
});
405
+
406
// Covers getStats(): item count, dimensions, and the memory estimate's unit suffix.
describe('getStats', () => {
  it('returns zero stats for empty index', () => {
    const { itemCount, dimensions, memoryEstimate } = search.getStats();

    expect(itemCount).toBe(0);
    expect(dimensions).toBe(0);
    expect(memoryEstimate).toBe('0.00 KB');
  });

  it('returns correct item count', async () => {
    await search.index(['a', 'b', 'c', 'd', 'e']);

    expect(search.getStats().itemCount).toBe(5);
  });

  it('returns correct dimensions', async () => {
    await search.index(['test']);

    expect(search.getStats().dimensions).toBe(384);
  });

  it('formats memory in KB for small indexes', async () => {
    await search.index(['test']);

    const { memoryEstimate } = search.getStats();
    expect(memoryEstimate).toMatch(/KB$/);
  });

  it('formats memory in MB for large indexes', async () => {
    // Assuming the estimate counts 8 bytes per dimension, one item costs
    // 384 * 8 = 3072 bytes, so 400 items (1,228,800 bytes) exceed 1 MB.
    const manyItems = Array.from({ length: 400 }, () => 'item');
    await search.index(manyItems);

    const { memoryEstimate } = search.getStats();
    expect(memoryEstimate).toMatch(/MB$/);
  });
});
447
+
448
// Covers clear(): the index must be empty afterwards.
describe('clear', () => {
  it('removes all items from index', async () => {
    const countItems = () => search.getStats().itemCount;

    await search.index(['a', 'b', 'c']);
    expect(countItems()).toBe(3);

    search.clear();

    expect(countItems()).toBe(0);
  });
});
458
+
459
// Covers exportIndex(): exported contents and the fact that each call yields a new array.
describe('exportIndex', () => {
  it('returns copy of indexed items', async () => {
    await search.index(['apple', 'banana']);

    const snapshot = search.exportIndex();

    expect(snapshot).toHaveLength(2);

    const [first, second] = snapshot;
    expect(first.item).toBe('apple');
    expect(second.item).toBe('banana');
    expect(first.embedding).toHaveLength(384);
  });

  it('returns copy, not reference', async () => {
    await search.index(['test']);

    // Two consecutive exports must not be the same array object.
    const firstExport = search.exportIndex();
    const secondExport = search.exportIndex();

    expect(firstExport).not.toBe(secondExport);
  });
});
480
+
481
// Covers importIndex(): replacement of existing items, isolation from the
// caller's array, and searchability of imported data.
describe('importIndex', () => {
  // Builds one importable entry with empty metadata and a seeded mock embedding.
  const entry = (item: string, seed: number): IndexedItem<string> => ({
    item,
    embedding: createMockEmbedding(seed),
    metadata: {},
  });

  it('replaces current index', async () => {
    await search.index(['old1', 'old2']);

    const replacement: IndexedItem<string>[] = [
      entry('new1', 1),
      entry('new2', 2),
      entry('new3', 3),
    ];

    search.importIndex(replacement);

    expect(search.getStats().itemCount).toBe(3);
    const [first] = search.exportIndex();
    expect(first.item).toBe('new1');
  });

  it('creates copy of imported data', async () => {
    const source: IndexedItem<string>[] = [entry('test', 1)];

    search.importIndex(source);
    // Growing the caller's array afterwards must not grow the index.
    source.push(entry('added', 2));

    expect(search.getStats().itemCount).toBe(1);
  });

  it('allows searching after import', async () => {
    const imported: IndexedItem<string>[] = [entry('apple', 1), entry('banana', 2)];

    search.importIndex(imported);
    const results = await search.search('fruit');

    // The result list may be empty if the threshold filters everything out,
    // but the search itself has to complete.
    expect(Array.isArray(results)).toBe(true);
    // The query text must have been sent to the engine for embedding.
    expect(mockEngine.embed).toHaveBeenCalledWith('fruit');
  });
});
528
+
529
// End-to-end walk through the public API on a small product catalogue.
describe('integration scenarios', () => {
  it('handles full workflow: index -> search -> filter -> clear', async () => {
    interface Product {
      name: string;
      category: string;
      price: number;
    }

    const catalogue: Product[] = [
      { name: 'iPhone', category: 'electronics', price: 999 },
      { name: 'Samsung Galaxy', category: 'electronics', price: 899 },
      { name: 'Nike Shoes', category: 'apparel', price: 150 },
      { name: 'Adidas Sneakers', category: 'apparel', price: 120 },
    ];

    const productSearch = new SemanticSearch<Product>(mockEngine, {
      textExtractor: ({ name, category }) => `${name} ${category}`,
      metadataExtractor: ({ category, price }) => ({ category, price }),
    });
    const countItems = () => productSearch.getStats().itemCount;

    // Step 1: index the catalogue.
    await productSearch.index(catalogue);
    expect(countItems()).toBe(4);

    // Step 2: unfiltered search.
    const allResults = await productSearch.search('phone');
    expect(allResults.length).toBeGreaterThan(0);

    // Step 3: search restricted to one category through the metadata filter.
    const electronicsResults = await productSearch.searchWithFilter(
      'phone',
      (m) => m.category === 'electronics'
    );
    for (const { item } of electronicsResults) {
      expect(item.category).toBe('electronics');
    }

    // Step 4: similarity lookup for a product that is not in the index.
    const pixel: Product = { name: 'Google Pixel', category: 'electronics', price: 799 };
    const similar = await productSearch.findSimilar(pixel);
    expect(similar.length).toBeGreaterThan(0);

    // Step 5: export, clear, then import the exported items again.
    const exported = productSearch.exportIndex();
    productSearch.clear();
    expect(countItems()).toBe(0);

    productSearch.importIndex(exported);
    expect(countItems()).toBe(4);
  });
});
582
+ });