@crashbytes/semantic-text-toolkit 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintrc.json +24 -0
- package/.github/dependabot.yml +50 -0
- package/.github/workflows/ci.yml +42 -0
- package/.github/workflows/release.yml +50 -0
- package/LICENSE +5 -0
- package/README.md +6 -1
- package/jest.config.js +66 -0
- package/package.json +2 -3
- package/src/__tests__/setup.ts +43 -0
- package/src/__tests__/types.test.ts +128 -0
- package/src/engine/__tests__/SemanticEngine.test.ts +398 -0
- package/src/search/__tests__/SemanticSearch.test.ts +582 -0
- package/src/utils/__tests__/vector.test.ts +354 -0
|
@@ -0,0 +1,398 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SemanticEngine Test Suite
|
|
3
|
+
*
|
|
4
|
+
* Comprehensive validation of embedding generation, similarity computation,
|
|
5
|
+
* and lifecycle management for the semantic engine.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { SemanticEngine } from '../SemanticEngine';
|
|
9
|
+
import { SemanticError, SemanticErrorCode } from '../../types';
|
|
10
|
+
|
|
11
|
+
// Mock the transformers library
|
|
12
|
+
jest.mock('@xenova/transformers', () => ({
|
|
13
|
+
pipeline: jest.fn(),
|
|
14
|
+
}));
|
|
15
|
+
|
|
16
|
+
import { pipeline } from '@xenova/transformers';
|
|
17
|
+
|
|
18
|
+
const mockPipeline = pipeline as jest.MockedFunction<typeof pipeline>;
|
|
19
|
+
|
|
20
|
+
describe('SemanticEngine', () => {
|
|
21
|
+
let engine: SemanticEngine;
|
|
22
|
+
let mockModel: jest.Mock;
|
|
23
|
+
|
|
24
|
+
// Per-test setup: wipe recorded mock calls, install a fresh fake model,
// and construct a new engine with default configuration.
beforeEach(() => {
  jest.clearAllMocks();

  // Fake feature-extraction model. It resolves to an object whose `data`
  // is a 384-element Float32Array computed only from the input's length,
  // so inputs of equal length always produce identical embeddings.
  const EMBEDDING_SIZE = 384;
  mockModel = jest.fn().mockImplementation((text: string) => {
    const values = Array.from(
      { length: EMBEDDING_SIZE },
      (_unused, position) => Math.sin(text.length + position) * 0.1
    );
    return Promise.resolve({ data: Float32Array.from(values) });
  });

  // Any pipeline() call made during the test resolves to the fake model.
  mockPipeline.mockResolvedValue(mockModel as any);
  engine = new SemanticEngine();
});
|
|
41
|
+
|
|
42
|
+
// Per-test teardown: dispose the shared engine created in beforeEach.
afterEach(() => {
  engine.dispose();
});
|
|
45
|
+
|
|
46
|
+
// Construction-time configuration: defaults, full overrides, partial overrides.
describe('constructor', () => {
  it('creates engine with default configuration', () => {
    const { modelName, maxLength, quantized } = engine.getConfig();

    expect(modelName).toBe('Xenova/all-MiniLM-L6-v2');
    expect(maxLength).toBe(512);
    expect(quantized).toBe(true);
  });

  it('accepts custom configuration', () => {
    const customEngine = new SemanticEngine({
      modelName: 'custom-model',
      maxLength: 256,
      quantized: false,
    });

    // Every supplied option should be reflected back unchanged.
    const { modelName, maxLength, quantized } = customEngine.getConfig();
    expect(modelName).toBe('custom-model');
    expect(maxLength).toBe(256);
    expect(quantized).toBe(false);

    customEngine.dispose();
  });

  it('merges partial configuration with defaults', () => {
    const customEngine = new SemanticEngine({ maxLength: 1024 });

    // Only maxLength was supplied; modelName should fall back to the default.
    const { modelName, maxLength } = customEngine.getConfig();
    expect(modelName).toBe('Xenova/all-MiniLM-L6-v2');
    expect(maxLength).toBe(1024);

    customEngine.dispose();
  });
});
|
|
79
|
+
|
|
80
|
+
// Model loading: happy path, progress reporting, idempotence, and failures.
describe('initialize', () => {
  it('loads the model successfully', async () => {
    await engine.initialize();

    expect(mockPipeline).toHaveBeenCalledWith(
      'feature-extraction',
      'Xenova/all-MiniLM-L6-v2',
      { quantized: true }
    );
    expect(engine.isReady()).toBe(true);
  });

  it('calls onProgress callback during initialization', async () => {
    const onProgress = jest.fn();
    const engineWithProgress = new SemanticEngine({ onProgress });

    await engineWithProgress.initialize();

    expect(onProgress).toHaveBeenCalledWith({
      status: 'downloading',
      progress: 0,
    });
    expect(onProgress).toHaveBeenCalledWith({
      status: 'ready',
      progress: 100,
    });

    engineWithProgress.dispose();
  });

  it('returns immediately if already initialized', async () => {
    await engine.initialize();
    await engine.initialize();

    // Pipeline should only be called once
    expect(mockPipeline).toHaveBeenCalledTimes(1);
  });

  it('handles concurrent initialization calls', async () => {
    // Start three initializations before any of them has settled.
    const promise1 = engine.initialize();
    const promise2 = engine.initialize();
    const promise3 = engine.initialize();

    await Promise.all([promise1, promise2, promise3]);

    // Pipeline should only be called once
    expect(mockPipeline).toHaveBeenCalledTimes(1);
  });

  it('throws SemanticError on initialization failure', async () => {
    mockPipeline.mockRejectedValue(new Error('Network error'));

    await expect(engine.initialize()).rejects.toThrow(SemanticError);

    // Second attempt: turn the rejection into a value instead of using
    // try/catch. If initialize() unexpectedly resolves, `failure` is
    // undefined and the assertions below fail rather than being skipped.
    const failure: unknown = await engine
      .initialize()
      .catch((reason: unknown) => reason);

    expect(failure).toBeInstanceOf(SemanticError);
    const semanticError = failure as SemanticError;
    expect(semanticError.code).toBe(SemanticErrorCode.MODEL_NOT_LOADED);
    expect(semanticError.message).toContain('Failed to initialize model');
  });

  it('handles non-Error rejection', async () => {
    // The pipeline rejects with a plain string rather than an Error object.
    mockPipeline.mockRejectedValue('String error');

    await expect(engine.initialize()).rejects.toThrow(SemanticError);
  });
});
|
|
150
|
+
|
|
151
|
+
// Single-text embedding: result shape, input validation, and failure wrapping.
describe('embed', () => {
  beforeEach(async () => {
    await engine.initialize();
  });

  it('generates embedding for valid text', async () => {
    const result = await engine.embed('Hello world');

    expect(result.embedding).toHaveLength(384);
    expect(result.text).toBe('Hello world');
    expect(result.metadata.dimensions).toBe(384);
    expect(result.metadata.modelName).toBe('Xenova/all-MiniLM-L6-v2');
    expect(result.metadata.processingTime).toBeGreaterThanOrEqual(0);
  });

  it('throws when model not initialized', async () => {
    const uninitializedEngine = new SemanticEngine();

    await expect(uninitializedEngine.embed('test'))
      .rejects.toThrow(SemanticError);

    // Capture the rejection as a value so the code assertion cannot be
    // skipped: a resolved call would yield a non-SemanticError value here.
    const failure: unknown = await uninitializedEngine
      .embed('test')
      .catch((reason: unknown) => reason);

    expect(failure).toBeInstanceOf(SemanticError);
    expect((failure as SemanticError).code).toBe(SemanticErrorCode.MODEL_NOT_LOADED);

    // Dispose the locally created engine, as the other tests in this file do.
    uninitializedEngine.dispose();
  });

  it('throws on empty string input', async () => {
    await expect(engine.embed('')).rejects.toThrow(SemanticError);

    const failure: unknown = await engine
      .embed('')
      .catch((reason: unknown) => reason);

    expect(failure).toBeInstanceOf(SemanticError);
    expect((failure as SemanticError).code).toBe(SemanticErrorCode.INVALID_INPUT);
  });

  it('throws on non-string input', async () => {
    // `as any` deliberately bypasses the compile-time check to exercise
    // the runtime validation path.
    await expect(engine.embed(null as any)).rejects.toThrow(SemanticError);
    await expect(engine.embed(undefined as any)).rejects.toThrow(SemanticError);
    await expect(engine.embed(123 as any)).rejects.toThrow(SemanticError);
  });

  it('handles model embedding failure', async () => {
    mockModel.mockRejectedValue(new Error('Embedding failed'));

    await expect(engine.embed('test')).rejects.toThrow(SemanticError);

    const failure: unknown = await engine
      .embed('test')
      .catch((reason: unknown) => reason);

    expect(failure).toBeInstanceOf(SemanticError);
    expect((failure as SemanticError).code).toBe(SemanticErrorCode.EMBEDDING_FAILED);
  });

  it('handles non-Error embedding failure', async () => {
    // The model rejects with a plain string rather than an Error object.
    mockModel.mockRejectedValue('String error');

    await expect(engine.embed('test')).rejects.toThrow(SemanticError);
  });

  it('truncates long text in error details', async () => {
    const longText = 'x'.repeat(200);
    mockModel.mockRejectedValue(new Error('Failed'));

    // Previously the only assertion lived inside a catch block, so the test
    // passed vacuously whenever embed() resolved. Capturing the rejection as
    // a value makes a missing error fail the test.
    const failure: unknown = await engine
      .embed(longText)
      .catch((reason: unknown) => reason);

    expect(failure).toBeInstanceOf(SemanticError);
    expect((failure as SemanticError).details?.text).toHaveLength(100);
  });
});
|
|
224
|
+
|
|
225
|
+
// Batch embedding: result ordering, input validation, batching and progress.
describe('embedBatch', () => {
  beforeEach(async () => {
    await engine.initialize();
  });

  it('generates embeddings for multiple texts', async () => {
    const texts = ['Hello', 'World', 'Test'];
    const results = await engine.embedBatch(texts);

    expect(results).toHaveLength(3);
    // Results are expected in input order, one per text.
    results.forEach((result, idx) => {
      expect(result.text).toBe(texts[idx]);
      expect(result.embedding).toHaveLength(384);
    });
  });

  it('throws when model not initialized', async () => {
    const uninitializedEngine = new SemanticEngine();

    await expect(uninitializedEngine.embedBatch(['test']))
      .rejects.toThrow(SemanticError);

    // Dispose the locally created engine, as the other tests in this file do.
    uninitializedEngine.dispose();
  });

  it('throws on empty array', async () => {
    await expect(engine.embedBatch([])).rejects.toThrow(SemanticError);

    // Capture the rejection as a value so the code assertion cannot be
    // skipped if the second call unexpectedly resolves.
    const failure: unknown = await engine
      .embedBatch([])
      .catch((reason: unknown) => reason);

    expect(failure).toBeInstanceOf(SemanticError);
    expect((failure as SemanticError).code).toBe(SemanticErrorCode.INVALID_INPUT);
  });

  it('throws on non-array input', async () => {
    // `as any` deliberately bypasses the compile-time check to exercise
    // the runtime validation path.
    await expect(engine.embedBatch(null as any)).rejects.toThrow(SemanticError);
    await expect(engine.embedBatch('test' as any)).rejects.toThrow(SemanticError);
  });

  it('respects batch size option', async () => {
    const texts = Array(10).fill('text');
    const onProgress = jest.fn();

    await engine.embedBatch(texts, { batchSize: 3, onProgress });

    // Each item calls embed once. This holds for any batch size, so on its
    // own it does not show that batchSize was honoured.
    expect(mockModel).toHaveBeenCalledTimes(10);

    // With 10 items and batchSize 3, we need 4 batches (3+3+3+1). Progress
    // is expected after each batch, following the same (completed, total)
    // reporting that the onProgress test below asserts for batchSize 2.
    expect(onProgress).toHaveBeenCalledWith(3, 10);
    expect(onProgress).toHaveBeenCalledWith(6, 10);
    expect(onProgress).toHaveBeenCalledWith(9, 10);
    expect(onProgress).toHaveBeenCalledWith(10, 10);
  });

  it('calls onProgress callback', async () => {
    const onProgress = jest.fn();
    const texts = ['a', 'b', 'c', 'd', 'e'];

    await engine.embedBatch(texts, { batchSize: 2, onProgress });

    // With 5 items and batchSize 2: batches complete at 2, 4, 5
    expect(onProgress).toHaveBeenCalledWith(2, 5);
    expect(onProgress).toHaveBeenCalledWith(4, 5);
    expect(onProgress).toHaveBeenCalledWith(5, 5);
  });

  it('uses default batch size of 32', async () => {
    const texts = Array(64).fill('text');
    const onProgress = jest.fn();

    // No batchSize supplied: 64 items should report progress at 32 and 64.
    await engine.embedBatch(texts, { onProgress });

    expect(onProgress).toHaveBeenCalledWith(32, 64);
    expect(onProgress).toHaveBeenCalledWith(64, 64);
  });
});
|
|
294
|
+
|
|
295
|
+
// Pairwise similarity: default and explicit methods, plus unknown-method errors.
describe('similarity', () => {
  beforeEach(async () => {
    await engine.initialize();
  });

  it('computes cosine similarity by default', async () => {
    // Identical inputs get identical mock embeddings, so cosine should be ~1.
    const result = await engine.similarity('Hello', 'Hello');

    expect(result.score).toBeCloseTo(1.0, 5);
    expect(result.texts).toEqual(['Hello', 'Hello']);
    expect(result.metadata.method).toBe('cosine');
    expect(result.metadata.processingTime).toBeGreaterThanOrEqual(0);
  });

  it('computes cosine similarity explicitly', async () => {
    const result = await engine.similarity('Hello', 'Hello', 'cosine');

    expect(result.score).toBeCloseTo(1.0, 5);
    expect(result.metadata.method).toBe('cosine');
  });

  it('computes euclidean distance', async () => {
    const result = await engine.similarity('Hello', 'Hello', 'euclidean');

    // Same text should have distance close to 0, so negated score close to 0
    expect(result.score).toBeCloseTo(0, 5);
    expect(result.metadata.method).toBe('euclidean');
  });

  it('computes dot product', async () => {
    const result = await engine.similarity('Hello', 'Hello', 'dot');

    expect(result.metadata.method).toBe('dot');
    expect(typeof result.score).toBe('number');
    // typeof NaN is also 'number', so additionally require a finite score.
    expect(Number.isFinite(result.score)).toBe(true);
  });

  it('throws on unknown similarity method', async () => {
    await expect(engine.similarity('a', 'b', 'unknown' as any))
      .rejects.toThrow(SemanticError);

    // Capture the rejection as a value so the code and message assertions
    // cannot be skipped if the second call unexpectedly resolves.
    const failure: unknown = await engine
      .similarity('a', 'b', 'unknown' as any)
      .catch((reason: unknown) => reason);

    expect(failure).toBeInstanceOf(SemanticError);
    expect((failure as SemanticError).code).toBe(SemanticErrorCode.INVALID_INPUT);
    expect((failure as SemanticError).message).toContain('Unknown similarity method');
  });
});
|
|
343
|
+
|
|
344
|
+
// Disposal: readiness is dropped, and the engine can be initialized again.
describe('dispose', () => {
  // Asserts the shared engine's current readiness flag.
  const expectReady = (expected: boolean): void => {
    expect(engine.isReady()).toBe(expected);
  };

  it('clears model and initialization state', async () => {
    await engine.initialize();
    expectReady(true);

    engine.dispose();

    expectReady(false);
  });

  it('allows re-initialization after dispose', async () => {
    await engine.initialize();
    engine.dispose();

    expectReady(false);

    // A disposed engine should be able to load again.
    await engine.initialize();
    expectReady(true);
  });
});
|
|
364
|
+
|
|
365
|
+
// Readiness flag across the lifecycle: new, initialized, disposed.
describe('isReady', () => {
  // Asserts the shared engine's current readiness flag.
  const expectReady = (expected: boolean): void => {
    expect(engine.isReady()).toBe(expected);
  };

  it('returns false before initialization', () => {
    expectReady(false);
  });

  it('returns true after initialization', async () => {
    await engine.initialize();
    expectReady(true);
  });

  it('returns false after dispose', async () => {
    await engine.initialize();
    engine.dispose();
    expectReady(false);
  });
});
|
|
381
|
+
|
|
382
|
+
// getConfig should return a defensive copy, not the engine's own object.
describe('getConfig', () => {
  it('returns a copy of the configuration', () => {
    const [first, second] = [engine.getConfig(), engine.getConfig()];

    // Distinct object identities, identical contents.
    expect(first).not.toBe(second);
    expect(first).toEqual(second);
  });

  it('modifications do not affect internal config', () => {
    const snapshot = engine.getConfig();
    snapshot.modelName = 'modified';

    // A fresh read must still report the default model name.
    const { modelName } = engine.getConfig();
    expect(modelName).toBe('Xenova/all-MiniLM-L6-v2');
  });
});
|
|
398
|
+
});
|