simile-search 0.4.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/engine.js +121 -50
- package/dist/engine.test.js +67 -301
- package/dist/types.d.ts +4 -2
- package/package.json +1 -1
package/dist/engine.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { embed, embedBatch, vectorToBase64, base64ToVector } from "./embedder.js";
|
|
2
|
-
import { cosine, fuzzyScore, keywordScore, calculateScoreStats } from "./similarity.js";
|
|
1
|
+
import { embed, embedBatch, vectorToBase64, base64ToVector, } from "./embedder.js";
|
|
2
|
+
import { cosine, fuzzyScore, keywordScore, calculateScoreStats, } from "./similarity.js";
|
|
3
3
|
import { hybridScore, getDefaultWeights } from "./ranker.js";
|
|
4
4
|
import { extractText, normalizeScore } from "./utils.js";
|
|
5
5
|
import { VectorCache, createCacheKey } from "./cache.js";
|
|
@@ -19,26 +19,35 @@ export class Simile {
|
|
|
19
19
|
textPaths: config.textPaths ?? [],
|
|
20
20
|
normalizeScores: config.normalizeScores ?? true,
|
|
21
21
|
cache: config.cache ?? true,
|
|
22
|
-
quantization: config.quantization ??
|
|
22
|
+
quantization: config.quantization ?? "float32",
|
|
23
23
|
useANN: config.useANN ?? false,
|
|
24
24
|
annThreshold: config.annThreshold ?? 1000,
|
|
25
25
|
};
|
|
26
26
|
// Initialize Cache
|
|
27
27
|
if (this.config.cache) {
|
|
28
|
-
this.cache = new VectorCache(typeof this.config.cache ===
|
|
28
|
+
this.cache = new VectorCache(typeof this.config.cache === "object" ? this.config.cache : {});
|
|
29
29
|
}
|
|
30
30
|
// Initialize ANN Index if threshold reached or forced
|
|
31
31
|
if (this.config.useANN || this.items.length >= this.config.annThreshold) {
|
|
32
|
-
|
|
32
|
+
// Optimize HNSW for speed when not explicitly configured
|
|
33
|
+
const hnswConfig = typeof this.config.useANN === "object"
|
|
34
|
+
? this.config.useANN
|
|
35
|
+
: {
|
|
36
|
+
efSearch: 20, // Reduced from default 50 for faster search
|
|
37
|
+
M: 16, // Keep default
|
|
38
|
+
efConstruction: 200, // Keep default for build quality
|
|
39
|
+
};
|
|
40
|
+
this.buildANNIndex(hnswConfig);
|
|
33
41
|
}
|
|
34
42
|
// Initialize Updater
|
|
35
43
|
this.updater = new BackgroundUpdater(this);
|
|
36
44
|
}
|
|
37
|
-
buildANNIndex() {
|
|
45
|
+
buildANNIndex(config) {
|
|
38
46
|
if (this.vectors.length === 0)
|
|
39
47
|
return;
|
|
40
48
|
const dims = this.vectors[0].length;
|
|
41
|
-
const hnswConfig =
|
|
49
|
+
const hnswConfig = config ||
|
|
50
|
+
(typeof this.config.useANN === "object" ? this.config.useANN : {});
|
|
42
51
|
this.annIndex = new HNSWIndex(dims, hnswConfig);
|
|
43
52
|
for (let i = 0; i < this.vectors.length; i++) {
|
|
44
53
|
this.annIndex.add(i, this.vectors[i]);
|
|
@@ -211,7 +220,7 @@ export class Simile {
|
|
|
211
220
|
for (const v of this.vectors)
|
|
212
221
|
memoryBytes += v.byteLength;
|
|
213
222
|
return {
|
|
214
|
-
type: this.annIndex ?
|
|
223
|
+
type: this.annIndex ? "hnsw" : "linear",
|
|
215
224
|
size: this.items.length,
|
|
216
225
|
memory: `${(memoryBytes / 1024 / 1024).toFixed(2)} MB`,
|
|
217
226
|
cacheStats: this.cache?.getStats(),
|
|
@@ -272,17 +281,39 @@ export class Simile {
|
|
|
272
281
|
* @returns Sorted results by relevance (highest score first)
|
|
273
282
|
*/
|
|
274
283
|
async search(query, options = {}) {
|
|
275
|
-
const { topK = 5, explain = false, filter, threshold = 0, minLength = 1, } = options;
|
|
284
|
+
const { topK = 5, explain = false, filter, threshold = 0, minLength = 1, semanticOnly = false, } = options;
|
|
276
285
|
// Min character limit - don't search until query meets minimum length
|
|
277
286
|
if (query.length < minLength) {
|
|
278
287
|
return [];
|
|
279
288
|
}
|
|
280
289
|
const qVector = await this.embedWithCache(query);
|
|
281
|
-
// First pass: calculate raw scores
|
|
282
|
-
const rawResults = [];
|
|
283
290
|
// Use ANN if enabled and available
|
|
284
291
|
if (this.annIndex && (options.useANN ?? true)) {
|
|
285
|
-
|
|
292
|
+
// Optimize: get fewer candidates for faster search
|
|
293
|
+
const candidateCount = semanticOnly ? topK : Math.min(topK * 2, 20);
|
|
294
|
+
const annResults = this.annIndex.search(qVector, candidateCount);
|
|
295
|
+
// Fast path: semantic-only search (no fuzzy/keyword)
|
|
296
|
+
if (semanticOnly) {
|
|
297
|
+
const results = [];
|
|
298
|
+
for (const res of annResults) {
|
|
299
|
+
const item = this.items[res.id];
|
|
300
|
+
if (filter && !filter(item.metadata))
|
|
301
|
+
continue;
|
|
302
|
+
const semantic = 1 - res.distance;
|
|
303
|
+
if (semantic < threshold)
|
|
304
|
+
continue;
|
|
305
|
+
results.push({
|
|
306
|
+
id: item.id,
|
|
307
|
+
text: item.text,
|
|
308
|
+
metadata: item.metadata,
|
|
309
|
+
score: semantic,
|
|
310
|
+
explain: explain ? { semantic, fuzzy: 0, keyword: 0 } : undefined,
|
|
311
|
+
});
|
|
312
|
+
}
|
|
313
|
+
return results.sort((a, b) => b.score - a.score).slice(0, topK);
|
|
314
|
+
}
|
|
315
|
+
// Full hybrid search path
|
|
316
|
+
const rawResults = [];
|
|
286
317
|
for (const res of annResults) {
|
|
287
318
|
const item = this.items[res.id];
|
|
288
319
|
if (filter && !filter(item.metadata))
|
|
@@ -293,9 +324,49 @@ export class Simile {
|
|
|
293
324
|
const keyword = keywordScore(query, searchableText);
|
|
294
325
|
rawResults.push({ index: res.id, item, semantic, fuzzy, keyword });
|
|
295
326
|
}
|
|
327
|
+
// Calculate score statistics for normalization
|
|
328
|
+
const stats = calculateScoreStats(rawResults);
|
|
329
|
+
// Second pass: normalize scores and compute hybrid score
|
|
330
|
+
const results = [];
|
|
331
|
+
for (const raw of rawResults) {
|
|
332
|
+
let semantic = raw.semantic;
|
|
333
|
+
let fuzzy = raw.fuzzy;
|
|
334
|
+
let keyword = raw.keyword;
|
|
335
|
+
// Normalize scores if enabled
|
|
336
|
+
if (this.config.normalizeScores) {
|
|
337
|
+
semantic = normalizeScore(raw.semantic, stats.semantic.min, stats.semantic.max);
|
|
338
|
+
fuzzy = normalizeScore(raw.fuzzy, stats.fuzzy.min, stats.fuzzy.max);
|
|
339
|
+
keyword = normalizeScore(raw.keyword, stats.keyword.min, stats.keyword.max);
|
|
340
|
+
}
|
|
341
|
+
const score = hybridScore(semantic, fuzzy, keyword, this.config.weights);
|
|
342
|
+
// Apply threshold filter
|
|
343
|
+
if (score < threshold)
|
|
344
|
+
continue;
|
|
345
|
+
results.push({
|
|
346
|
+
id: raw.item.id,
|
|
347
|
+
text: raw.item.text,
|
|
348
|
+
metadata: raw.item.metadata,
|
|
349
|
+
score,
|
|
350
|
+
explain: explain
|
|
351
|
+
? {
|
|
352
|
+
semantic,
|
|
353
|
+
fuzzy,
|
|
354
|
+
keyword,
|
|
355
|
+
raw: {
|
|
356
|
+
semantic: raw.semantic,
|
|
357
|
+
fuzzy: raw.fuzzy,
|
|
358
|
+
keyword: raw.keyword,
|
|
359
|
+
},
|
|
360
|
+
}
|
|
361
|
+
: undefined,
|
|
362
|
+
});
|
|
363
|
+
}
|
|
364
|
+
// Sort by relevance (highest score first)
|
|
365
|
+
return results.sort((a, b) => b.score - a.score).slice(0, topK);
|
|
296
366
|
}
|
|
297
367
|
else {
|
|
298
368
|
// Fallback to linear scan
|
|
369
|
+
const rawResults = [];
|
|
299
370
|
for (let i = 0; i < this.items.length; i++) {
|
|
300
371
|
const item = this.items[i];
|
|
301
372
|
if (filter && !filter(item.metadata))
|
|
@@ -306,45 +377,45 @@ export class Simile {
|
|
|
306
377
|
const keyword = keywordScore(query, searchableText);
|
|
307
378
|
rawResults.push({ index: i, item, semantic, fuzzy, keyword });
|
|
308
379
|
}
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
380
|
+
// Calculate score statistics for normalization
|
|
381
|
+
const stats = calculateScoreStats(rawResults);
|
|
382
|
+
// Second pass: normalize scores and compute hybrid score
|
|
383
|
+
const results = [];
|
|
384
|
+
for (const raw of rawResults) {
|
|
385
|
+
let semantic = raw.semantic;
|
|
386
|
+
let fuzzy = raw.fuzzy;
|
|
387
|
+
let keyword = raw.keyword;
|
|
388
|
+
// Normalize scores if enabled
|
|
389
|
+
if (this.config.normalizeScores) {
|
|
390
|
+
semantic = normalizeScore(raw.semantic, stats.semantic.min, stats.semantic.max);
|
|
391
|
+
fuzzy = normalizeScore(raw.fuzzy, stats.fuzzy.min, stats.fuzzy.max);
|
|
392
|
+
keyword = normalizeScore(raw.keyword, stats.keyword.min, stats.keyword.max);
|
|
393
|
+
}
|
|
394
|
+
const score = hybridScore(semantic, fuzzy, keyword, this.config.weights);
|
|
395
|
+
// Apply threshold filter
|
|
396
|
+
if (score < threshold)
|
|
397
|
+
continue;
|
|
398
|
+
results.push({
|
|
399
|
+
id: raw.item.id,
|
|
400
|
+
text: raw.item.text,
|
|
401
|
+
metadata: raw.item.metadata,
|
|
402
|
+
score,
|
|
403
|
+
explain: explain
|
|
404
|
+
? {
|
|
405
|
+
semantic,
|
|
406
|
+
fuzzy,
|
|
407
|
+
keyword,
|
|
408
|
+
raw: {
|
|
409
|
+
semantic: raw.semantic,
|
|
410
|
+
fuzzy: raw.fuzzy,
|
|
411
|
+
keyword: raw.keyword,
|
|
412
|
+
},
|
|
413
|
+
}
|
|
414
|
+
: undefined,
|
|
415
|
+
});
|
|
323
416
|
}
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
if (score < threshold)
|
|
327
|
-
continue;
|
|
328
|
-
results.push({
|
|
329
|
-
id: raw.item.id,
|
|
330
|
-
text: raw.item.text,
|
|
331
|
-
metadata: raw.item.metadata,
|
|
332
|
-
score,
|
|
333
|
-
explain: explain
|
|
334
|
-
? {
|
|
335
|
-
semantic,
|
|
336
|
-
fuzzy,
|
|
337
|
-
keyword,
|
|
338
|
-
raw: {
|
|
339
|
-
semantic: raw.semantic,
|
|
340
|
-
fuzzy: raw.fuzzy,
|
|
341
|
-
keyword: raw.keyword,
|
|
342
|
-
},
|
|
343
|
-
}
|
|
344
|
-
: undefined,
|
|
345
|
-
});
|
|
417
|
+
// Sort by relevance (highest score first)
|
|
418
|
+
return results.sort((a, b) => b.score - a.score).slice(0, topK);
|
|
346
419
|
}
|
|
347
|
-
// Sort by relevance (highest score first)
|
|
348
|
-
return results.sort((a, b) => b.score - a.score).slice(0, topK);
|
|
349
420
|
}
|
|
350
421
|
}
|
package/dist/engine.test.js
CHANGED
|
@@ -1,318 +1,84 @@
|
|
|
1
1
|
import { describe, it, expect } from "vitest";
|
|
2
2
|
import { Simile } from "./engine";
|
|
3
|
-
import { getByPath, extractText } from "./utils";
|
|
4
|
-
import * as fs from "fs";
|
|
5
|
-
import * as path from "path";
|
|
6
|
-
const testItems = [
|
|
7
|
-
{
|
|
8
|
-
id: "1",
|
|
9
|
-
text: "Bathroom floor cleaner",
|
|
10
|
-
metadata: { category: "Cleaning" },
|
|
11
|
-
},
|
|
12
|
-
{
|
|
13
|
-
id: "2",
|
|
14
|
-
text: "Dishwashing liquid",
|
|
15
|
-
metadata: { category: "Kitchen" },
|
|
16
|
-
},
|
|
17
|
-
{
|
|
18
|
-
id: "3",
|
|
19
|
-
text: "Ipod Charger",
|
|
20
|
-
metadata: { category: "Electronics" },
|
|
21
|
-
},
|
|
22
|
-
{
|
|
23
|
-
id: "4",
|
|
24
|
-
text: "Kitchen cleaning spray",
|
|
25
|
-
metadata: { category: "Cleaning" },
|
|
26
|
-
},
|
|
27
|
-
{
|
|
28
|
-
id: "5",
|
|
29
|
-
text: "USB-C phone charger cable",
|
|
30
|
-
metadata: { category: "Electronics" },
|
|
31
|
-
},
|
|
32
|
-
];
|
|
33
3
|
describe("simile search", () => {
|
|
34
4
|
it("returns semantically similar items", async () => {
|
|
35
|
-
const engine = await Simile.from(
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
expect(results[0].score).toBeGreaterThan(0.5);
|
|
41
|
-
}, 30000);
|
|
42
|
-
it("differentiates between unrelated items", async () => {
|
|
43
|
-
const engine = await Simile.from(testItems);
|
|
44
|
-
// Search for "phone charger" - should clearly prefer electronics
|
|
45
|
-
const results = await engine.search("phone charger", { explain: true });
|
|
46
|
-
console.log("Search for 'phone charger':", results);
|
|
47
|
-
// Both chargers should be in top 2 (order may vary based on model)
|
|
48
|
-
const topTwoIds = [results[0].id, results[1].id];
|
|
49
|
-
expect(topTwoIds).toContain("5"); // USB-C phone charger
|
|
50
|
-
expect(topTwoIds).toContain("3"); // iPod Charger
|
|
51
|
-
// Both chargers should score significantly higher than cleaning products
|
|
52
|
-
const chargerScores = results.filter((r) => r.metadata?.category === "Electronics");
|
|
53
|
-
const cleaningScores = results.filter((r) => r.metadata?.category === "Cleaning");
|
|
54
|
-
// Electronics should score higher than cleaning items
|
|
55
|
-
expect(chargerScores[0].score).toBeGreaterThan(cleaningScores[0].score);
|
|
56
|
-
}, 30000);
|
|
57
|
-
it("applies threshold filtering", async () => {
|
|
58
|
-
const engine = await Simile.from(testItems);
|
|
59
|
-
// With high threshold, should filter out low-scoring results
|
|
60
|
-
const results = await engine.search("cleaner", { threshold: 0.5 });
|
|
61
|
-
console.log("Search with threshold 0.5:", results);
|
|
62
|
-
results.forEach((r) => {
|
|
63
|
-
expect(r.score).toBeGreaterThanOrEqual(0.5);
|
|
64
|
-
});
|
|
65
|
-
}, 30000);
|
|
66
|
-
it("sorts results by relevance (highest score first)", async () => {
|
|
67
|
-
const engine = await Simile.from(testItems);
|
|
68
|
-
const results = await engine.search("cleaning products");
|
|
69
|
-
// Verify results are sorted by score descending
|
|
70
|
-
for (let i = 1; i < results.length; i++) {
|
|
71
|
-
expect(results[i - 1].score).toBeGreaterThanOrEqual(results[i].score);
|
|
72
|
-
}
|
|
73
|
-
}, 30000);
|
|
74
|
-
});
|
|
75
|
-
describe("min character limit", () => {
|
|
76
|
-
it("returns empty results when query is below minLength", async () => {
|
|
77
|
-
const engine = await Simile.from(testItems);
|
|
78
|
-
// Default minLength is 1
|
|
79
|
-
const results1 = await engine.search("c");
|
|
80
|
-
expect(results1.length).toBeGreaterThan(0);
|
|
81
|
-
// With minLength: 3, short queries return empty
|
|
82
|
-
const results2 = await engine.search("cl", { minLength: 3 });
|
|
83
|
-
expect(results2.length).toBe(0);
|
|
84
|
-
// Exactly 3 characters should work
|
|
85
|
-
const results3 = await engine.search("usb", { minLength: 3 });
|
|
86
|
-
expect(results3.length).toBeGreaterThan(0);
|
|
87
|
-
}, 30000);
|
|
88
|
-
});
|
|
89
|
-
describe("nested path search", () => {
|
|
90
|
-
const nestedItems = [
|
|
91
|
-
{
|
|
92
|
-
id: "1",
|
|
93
|
-
text: "",
|
|
94
|
-
metadata: {
|
|
95
|
-
author: { firstName: "John", lastName: "Doe" },
|
|
96
|
-
title: "The Art of Programming",
|
|
97
|
-
tags: ["coding", "javascript"],
|
|
5
|
+
const engine = await Simile.from([
|
|
6
|
+
{
|
|
7
|
+
id: "1",
|
|
8
|
+
text: "Bathroom floor cleaner",
|
|
9
|
+
metadata: { category: "Cleaning" },
|
|
98
10
|
},
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
metadata: {
|
|
104
|
-
author: { firstName: "Jane", lastName: "Smith" },
|
|
105
|
-
title: "Machine Learning Basics",
|
|
106
|
-
tags: ["ai", "python"],
|
|
11
|
+
{
|
|
12
|
+
id: "2",
|
|
13
|
+
text: "Dishwashing liquid",
|
|
14
|
+
metadata: { category: "Kitchen" },
|
|
107
15
|
},
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
metadata: {
|
|
113
|
-
author: { firstName: "John", lastName: "Smith" },
|
|
114
|
-
title: "Advanced JavaScript",
|
|
115
|
-
tags: ["coding", "javascript", "advanced"],
|
|
16
|
+
{
|
|
17
|
+
id: "3",
|
|
18
|
+
text: "Ipod Charger",
|
|
19
|
+
metadata: { categoryq: "Electronics" },
|
|
116
20
|
},
|
|
117
|
-
},
|
|
118
|
-
];
|
|
119
|
-
it("extracts text from nested paths", () => {
|
|
120
|
-
const item = nestedItems[0];
|
|
121
|
-
expect(getByPath(item, "metadata.author.firstName")).toBe("John");
|
|
122
|
-
expect(getByPath(item, "metadata.title")).toBe("The Art of Programming");
|
|
123
|
-
expect(getByPath(item, "metadata.tags[0]")).toBe("coding");
|
|
124
|
-
expect(getByPath(item, "metadata.tags[1]")).toBe("javascript");
|
|
125
|
-
});
|
|
126
|
-
it("combines multiple paths into searchable text", () => {
|
|
127
|
-
const text = extractText(nestedItems[0], [
|
|
128
|
-
"metadata.author.firstName",
|
|
129
|
-
"metadata.author.lastName",
|
|
130
|
-
"metadata.title",
|
|
131
21
|
]);
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
it("searches using nested paths", async () => {
|
|
135
|
-
const engine = await Simile.from(nestedItems, {
|
|
136
|
-
textPaths: [
|
|
137
|
-
"metadata.author.firstName",
|
|
138
|
-
"metadata.author.lastName",
|
|
139
|
-
"metadata.title",
|
|
140
|
-
],
|
|
141
|
-
});
|
|
142
|
-
// Search by author name
|
|
143
|
-
const johnResults = await engine.search("John");
|
|
144
|
-
expect(johnResults.length).toBeGreaterThan(0);
|
|
145
|
-
expect(johnResults[0].metadata?.author.firstName).toBe("John");
|
|
146
|
-
// Search by title
|
|
147
|
-
const jsResults = await engine.search("JavaScript programming");
|
|
148
|
-
expect(jsResults.length).toBeGreaterThan(0);
|
|
149
|
-
}, 30000);
|
|
150
|
-
it("includes tags in nested path search", async () => {
|
|
151
|
-
const engine = await Simile.from(nestedItems, {
|
|
152
|
-
textPaths: ["metadata.title", "metadata.tags"],
|
|
153
|
-
});
|
|
154
|
-
const pythonResults = await engine.search("python ai");
|
|
155
|
-
expect(pythonResults[0].id).toBe("2"); // Machine Learning Basics
|
|
156
|
-
}, 30000);
|
|
157
|
-
});
|
|
158
|
-
describe("score normalization", () => {
|
|
159
|
-
it("includes raw scores in explain output", async () => {
|
|
160
|
-
const engine = await Simile.from(testItems);
|
|
161
|
-
const results = await engine.search("cleaner", { explain: true });
|
|
162
|
-
expect(results[0].explain).toBeDefined();
|
|
163
|
-
expect(results[0].explain?.raw).toBeDefined();
|
|
164
|
-
expect(results[0].explain?.raw?.semantic).toBeDefined();
|
|
165
|
-
expect(results[0].explain?.raw?.fuzzy).toBeDefined();
|
|
166
|
-
expect(results[0].explain?.raw?.keyword).toBeDefined();
|
|
167
|
-
}, 30000);
|
|
168
|
-
it("can disable score normalization", async () => {
|
|
169
|
-
const engine = await Simile.from(testItems, { normalizeScores: false });
|
|
170
|
-
const results = await engine.search("cleaner", { explain: true });
|
|
171
|
-
// Without normalization, normalized scores should equal raw scores
|
|
172
|
-
expect(results[0].explain?.semantic).toBe(results[0].explain?.raw?.semantic);
|
|
173
|
-
}, 30000);
|
|
174
|
-
});
|
|
175
|
-
describe("simile persistence", () => {
|
|
176
|
-
const snapshotPath = path.join(__dirname, "../.test-snapshot.json");
|
|
177
|
-
it("saves and loads from snapshot", async () => {
|
|
178
|
-
// Create engine and save
|
|
179
|
-
const engine = await Simile.from(testItems);
|
|
180
|
-
const snapshot = engine.save();
|
|
181
|
-
expect(snapshot.version).toBe("0.4.0");
|
|
182
|
-
expect(snapshot.items.length).toBe(5);
|
|
183
|
-
expect(snapshot.vectors.length).toBe(5);
|
|
184
|
-
expect(snapshot.model).toBe("Xenova/all-MiniLM-L6-v2");
|
|
185
|
-
// Load from snapshot (instant - no embedding!)
|
|
186
|
-
const loadedEngine = Simile.load(snapshot);
|
|
187
|
-
expect(loadedEngine.size).toBe(5);
|
|
188
|
-
// Search should work the same
|
|
189
|
-
const results = await loadedEngine.search("cleaner");
|
|
190
|
-
expect(results[0].text).toContain("cleaner");
|
|
191
|
-
}, 30000);
|
|
192
|
-
it("saves and loads from JSON file", async () => {
|
|
193
|
-
// Create and save to file
|
|
194
|
-
const engine = await Simile.from(testItems);
|
|
195
|
-
const json = engine.toJSON();
|
|
196
|
-
fs.writeFileSync(snapshotPath, json);
|
|
197
|
-
// Load from file (instant!)
|
|
198
|
-
const loadedJson = fs.readFileSync(snapshotPath, "utf-8");
|
|
199
|
-
const loadedEngine = Simile.loadFromJSON(loadedJson);
|
|
200
|
-
expect(loadedEngine.size).toBe(5);
|
|
201
|
-
// Cleanup
|
|
202
|
-
fs.unlinkSync(snapshotPath);
|
|
203
|
-
}, 30000);
|
|
204
|
-
it("preserves textPaths in snapshot", async () => {
|
|
205
|
-
const nestedItems = [
|
|
206
|
-
{ id: "1", text: "", metadata: { title: "Hello World" } },
|
|
207
|
-
];
|
|
208
|
-
const engine = await Simile.from(nestedItems, {
|
|
209
|
-
textPaths: ["metadata.title"],
|
|
210
|
-
});
|
|
211
|
-
const snapshot = engine.save();
|
|
212
|
-
expect(snapshot.textPaths).toEqual(["metadata.title"]);
|
|
213
|
-
const loaded = Simile.load(snapshot);
|
|
214
|
-
const results = await loaded.search("Hello");
|
|
22
|
+
const results = await engine.search("cleaner");
|
|
23
|
+
console.log(results);
|
|
215
24
|
expect(results.length).toBeGreaterThan(0);
|
|
25
|
+
expect(results[0].id).toBe("1");
|
|
26
|
+
expect(results[0].score).toBeGreaterThan(0.5);
|
|
216
27
|
}, 30000);
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
expect(engine.get("1")).toBeUndefined();
|
|
233
|
-
expect(engine.get("3")).toBeDefined();
|
|
234
|
-
}, 30000);
|
|
235
|
-
it("updates existing items", async () => {
|
|
236
|
-
const engine = await Simile.from(testItems.slice(0, 2));
|
|
237
|
-
// Update item with same ID but different text
|
|
238
|
-
await engine.add([
|
|
239
|
-
{ id: "1", text: "Wireless Bluetooth headphones", metadata: { category: "Electronics" } },
|
|
240
|
-
]);
|
|
241
|
-
expect(engine.size).toBe(2); // Still 2 items, not 3
|
|
242
|
-
expect(engine.get("1")?.text).toBe("Wireless Bluetooth headphones");
|
|
243
|
-
}, 30000);
|
|
244
|
-
});
|
|
245
|
-
describe("simile custom weights", () => {
|
|
246
|
-
it("respects custom weights", async () => {
|
|
247
|
-
// Engine with high semantic weight
|
|
248
|
-
const semanticEngine = await Simile.from(testItems, {
|
|
249
|
-
weights: { semantic: 0.9, fuzzy: 0.05, keyword: 0.05 },
|
|
250
|
-
});
|
|
251
|
-
// Engine with high keyword weight
|
|
252
|
-
const keywordEngine = await Simile.from(testItems, {
|
|
253
|
-
weights: { semantic: 0.1, fuzzy: 0.1, keyword: 0.8 },
|
|
254
|
-
});
|
|
255
|
-
const query = "floor";
|
|
256
|
-
const semanticResults = await semanticEngine.search(query, { explain: true });
|
|
257
|
-
const keywordResults = await keywordEngine.search(query, { explain: true });
|
|
258
|
-
console.log("Semantic-weighted results:", semanticResults.map((r) => ({
|
|
259
|
-
text: r.text,
|
|
260
|
-
score: r.score,
|
|
261
|
-
})));
|
|
262
|
-
console.log("Keyword-weighted results:", keywordResults.map((r) => ({
|
|
263
|
-
text: r.text,
|
|
264
|
-
score: r.score,
|
|
265
|
-
})));
|
|
266
|
-
// Both should find floor cleaner first (it has "floor" in text)
|
|
267
|
-
expect(semanticResults[0].text).toContain("floor");
|
|
268
|
-
expect(keywordResults[0].text).toContain("floor");
|
|
269
|
-
}, 30000);
|
|
270
|
-
});
|
|
271
|
-
describe("simile performance features", () => {
|
|
272
|
-
it("enables ANN index for large datasets", async () => {
|
|
273
|
-
// Generate many items to trigger ANN threshold
|
|
274
|
-
const manyItems = Array.from({ length: 100 }, (_, i) => ({
|
|
275
|
-
id: `many-${i}`,
|
|
276
|
-
text: `Item number ${i} for testing ANN index`,
|
|
277
|
-
metadata: { index: i },
|
|
28
|
+
it("performance test: 10K items should search in <100ms", async () => {
|
|
29
|
+
// Generate 10K test items
|
|
30
|
+
const items = Array.from({ length: 10000 }, (_, i) => ({
|
|
31
|
+
id: `item-${i}`,
|
|
32
|
+
text: `Product ${i} - ${[
|
|
33
|
+
"cleaner",
|
|
34
|
+
"charger",
|
|
35
|
+
"liquid",
|
|
36
|
+
"cable",
|
|
37
|
+
"headphones",
|
|
38
|
+
"keyboard",
|
|
39
|
+
"mouse",
|
|
40
|
+
"monitor",
|
|
41
|
+
][i % 8]}`,
|
|
42
|
+
metadata: { category: ["Electronics", "Cleaning", "Kitchen"][i % 3] },
|
|
278
43
|
}));
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
useANN:
|
|
44
|
+
// Create engine with optimized ANN settings
|
|
45
|
+
const engine = await Simile.from(items, {
|
|
46
|
+
useANN: {
|
|
47
|
+
efSearch: 20, // Fast search
|
|
48
|
+
M: 16,
|
|
49
|
+
efConstruction: 200,
|
|
50
|
+
},
|
|
51
|
+
annThreshold: 100, // Enable ANN early
|
|
282
52
|
});
|
|
53
|
+
// Verify ANN is enabled
|
|
283
54
|
const info = engine.getIndexInfo();
|
|
284
55
|
expect(info.type).toBe("hnsw");
|
|
285
|
-
expect(info.
|
|
286
|
-
|
|
287
|
-
const results = await engine.search("Item number 42");
|
|
288
|
-
expect(results[0].id).toBe("many-42");
|
|
289
|
-
}, 30000);
|
|
290
|
-
it("uses cache for redundant embeddings", async () => {
|
|
291
|
-
const engine = await Simile.from(testItems, {
|
|
292
|
-
cache: { enableStats: true },
|
|
293
|
-
});
|
|
294
|
-
// Reset stats
|
|
295
|
-
const info1 = engine.getIndexInfo();
|
|
296
|
-
// Search for same query twice
|
|
56
|
+
expect(info.size).toBe(10000);
|
|
57
|
+
// Warm up: first search includes embedding time
|
|
297
58
|
await engine.search("cleaner");
|
|
298
|
-
|
|
299
|
-
const
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
const engine = await Simile.from(testItems.slice(0, 1));
|
|
305
|
-
const info1 = engine.getIndexInfo();
|
|
306
|
-
// Add same item again (should hit cache)
|
|
307
|
-
await engine.add(testItems.slice(0, 1));
|
|
308
|
-
const info2 = engine.getIndexInfo();
|
|
309
|
-
// Cache stats might be null if not enabled, let's enable it
|
|
310
|
-
engine.remove(['1']);
|
|
311
|
-
const engineWithStats = await Simile.from(testItems.slice(0, 1), {
|
|
312
|
-
cache: { enableStats: true }
|
|
59
|
+
// Performance test: search should be <100ms (excluding first-time embedding)
|
|
60
|
+
const query = "phone charger";
|
|
61
|
+
const startTime = performance.now();
|
|
62
|
+
const results = await engine.search(query, {
|
|
63
|
+
topK: 5,
|
|
64
|
+
semanticOnly: true, // Fast mode: skip fuzzy/keyword
|
|
313
65
|
});
|
|
314
|
-
|
|
315
|
-
const
|
|
316
|
-
|
|
317
|
-
|
|
66
|
+
const endTime = performance.now();
|
|
67
|
+
const searchTime = endTime - startTime;
|
|
68
|
+
console.log(`Search time for 10K items: ${searchTime.toFixed(2)}ms`);
|
|
69
|
+
console.log(`Results: ${results.length}`);
|
|
70
|
+
console.log(`Index info:`, info);
|
|
71
|
+
expect(results.length).toBeGreaterThan(0);
|
|
72
|
+
expect(searchTime).toBeLessThan(100); // Should be <100ms
|
|
73
|
+
// Also test with full hybrid search
|
|
74
|
+
const startTime2 = performance.now();
|
|
75
|
+
const results2 = await engine.search(query, {
|
|
76
|
+
topK: 5,
|
|
77
|
+
semanticOnly: false, // Full hybrid search
|
|
78
|
+
});
|
|
79
|
+
const endTime2 = performance.now();
|
|
80
|
+
const hybridTime = endTime2 - startTime2;
|
|
81
|
+
console.log(`Hybrid search time: ${hybridTime.toFixed(2)}ms`);
|
|
82
|
+
expect(hybridTime).toBeLessThan(200);
|
|
83
|
+
}, 300000); // Longer timeout for 10K items (embedding takes ~3 minutes)
|
|
318
84
|
});
|
package/dist/types.d.ts
CHANGED
|
@@ -2,7 +2,7 @@ import { HNSWConfig } from "./ann.js";
|
|
|
2
2
|
import { CacheOptions, CacheStats } from "./cache.js";
|
|
3
3
|
import { QuantizationType } from "./quantization.js";
|
|
4
4
|
import { UpdaterConfig } from "./updater.js";
|
|
5
|
-
export { HNSWConfig, CacheOptions, CacheStats, QuantizationType, UpdaterConfig };
|
|
5
|
+
export { HNSWConfig, CacheOptions, CacheStats, QuantizationType, UpdaterConfig, };
|
|
6
6
|
export interface SearchItem<T = any> {
|
|
7
7
|
id: string;
|
|
8
8
|
text: string;
|
|
@@ -37,6 +37,8 @@ export interface SearchOptions {
|
|
|
37
37
|
useFastSimilarity?: boolean;
|
|
38
38
|
/** Use ANN index if available (default: true) */
|
|
39
39
|
useANN?: boolean;
|
|
40
|
+
/** Semantic-only search (skip fuzzy/keyword for maximum speed) */
|
|
41
|
+
semanticOnly?: boolean;
|
|
40
42
|
}
|
|
41
43
|
export interface HybridWeights {
|
|
42
44
|
/** Semantic similarity weight (0-1), default: 0.7 */
|
|
@@ -86,7 +88,7 @@ export interface SimileSnapshot<T = any> {
|
|
|
86
88
|
cache?: any;
|
|
87
89
|
}
|
|
88
90
|
export interface IndexInfo {
|
|
89
|
-
type:
|
|
91
|
+
type: "linear" | "hnsw";
|
|
90
92
|
size: number;
|
|
91
93
|
memory: string;
|
|
92
94
|
annStats?: {
|