simile-search 0.2.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +129 -13
- package/dist/engine.d.ts +10 -2
- package/dist/engine.js +70 -16
- package/dist/engine.test.js +124 -2
- package/dist/index.d.ts +5 -5
- package/dist/index.js +5 -5
- package/dist/ranker.d.ts +1 -1
- package/dist/similarity.d.ts +38 -0
- package/dist/similarity.js +50 -5
- package/dist/types.d.ts +18 -0
- package/dist/utils.d.ts +31 -0
- package/dist/utils.js +66 -0
- package/package.json +8 -1
package/README.md
CHANGED
|
@@ -21,6 +21,9 @@ Simile combines the power of AI embeddings with fuzzy string matching and keywor
|
|
|
21
21
|
- ⚡ **Batch Processing** - Optimized for large catalogs
|
|
22
22
|
- 🔧 **Configurable** - Tune scoring weights for your use case
|
|
23
23
|
- 📦 **Zero API Calls** - Everything runs locally with Transformers.js
|
|
24
|
+
- 🔗 **Nested Path Search** - Search `author.firstName` instead of flat strings
|
|
25
|
+
- 📊 **Score Normalization** - Consistent scoring across different methods
|
|
26
|
+
- ✂️ **Min Character Limit** - Control when search triggers
|
|
24
27
|
|
|
25
28
|
## 📦 Installation
|
|
26
29
|
|
|
@@ -45,8 +48,8 @@ const engine = await Simile.from([
|
|
|
45
48
|
const results = await engine.search('phone charger');
|
|
46
49
|
console.log(results);
|
|
47
50
|
// [
|
|
48
|
-
// { id: '3', text: 'iPhone Charger', score: 0.
|
|
49
|
-
// { id: '4', text: 'USB-C phone charger cable', score: 0.
|
|
51
|
+
// { id: '3', text: 'iPhone Charger', score: 0.92, ... },
|
|
52
|
+
// { id: '4', text: 'USB-C phone charger cable', score: 0.87, ... },
|
|
50
53
|
// ...
|
|
51
54
|
// ]
|
|
52
55
|
```
|
|
@@ -81,13 +84,69 @@ const snapshot = engine.save();
|
|
|
81
84
|
// model: 'Xenova/all-MiniLM-L6-v2',
|
|
82
85
|
// items: [...],
|
|
83
86
|
// vectors: ['base64...', 'base64...'],
|
|
84
|
-
// createdAt: '2024-12-28T...'
|
|
87
|
+
// createdAt: '2024-12-28T...',
|
|
88
|
+
// textPaths: ['metadata.title', ...] // if configured
|
|
85
89
|
// }
|
|
86
90
|
|
|
87
91
|
// Load from snapshot object
|
|
88
92
|
const restored = Simile.load(snapshot);
|
|
89
93
|
```
|
|
90
94
|
|
|
95
|
+
## 🔗 Nested Path Search
|
|
96
|
+
|
|
97
|
+
Search complex objects by specifying paths to extract text from:
|
|
98
|
+
|
|
99
|
+
```typescript
|
|
100
|
+
const books = [
|
|
101
|
+
{
|
|
102
|
+
id: '1',
|
|
103
|
+
text: '', // Can be empty when using textPaths
|
|
104
|
+
metadata: {
|
|
105
|
+
author: { firstName: 'John', lastName: 'Doe' },
|
|
106
|
+
title: 'The Art of Programming',
|
|
107
|
+
tags: ['coding', 'javascript'],
|
|
108
|
+
},
|
|
109
|
+
},
|
|
110
|
+
{
|
|
111
|
+
id: '2',
|
|
112
|
+
text: '',
|
|
113
|
+
metadata: {
|
|
114
|
+
author: { firstName: 'Jane', lastName: 'Smith' },
|
|
115
|
+
title: 'Machine Learning Basics',
|
|
116
|
+
tags: ['ai', 'python'],
|
|
117
|
+
},
|
|
118
|
+
},
|
|
119
|
+
];
|
|
120
|
+
|
|
121
|
+
// Configure which paths to extract and search
|
|
122
|
+
const engine = await Simile.from(books, {
|
|
123
|
+
textPaths: [
|
|
124
|
+
'metadata.author.firstName',
|
|
125
|
+
'metadata.author.lastName',
|
|
126
|
+
'metadata.title',
|
|
127
|
+
'metadata.tags', // Arrays are joined with spaces
|
|
128
|
+
],
|
|
129
|
+
});
|
|
130
|
+
|
|
131
|
+
// Now you can search by author name!
|
|
132
|
+
const results = await engine.search('John programming');
|
|
133
|
+
// Finds "The Art of Programming" by John Doe
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
### Supported Path Formats
|
|
137
|
+
|
|
138
|
+
```typescript
|
|
139
|
+
// Dot notation for nested objects
|
|
140
|
+
'metadata.author.firstName' // → "John"
|
|
141
|
+
|
|
142
|
+
// Array index access
|
|
143
|
+
'metadata.tags[0]' // → "coding"
|
|
144
|
+
'items[0].name' // → nested array access
|
|
145
|
+
|
|
146
|
+
// Arrays without index (joins all elements)
|
|
147
|
+
'metadata.tags' // → "coding javascript"
|
|
148
|
+
```
|
|
149
|
+
|
|
91
150
|
## 🔧 Configuration
|
|
92
151
|
|
|
93
152
|
### Custom Scoring Weights
|
|
@@ -107,6 +166,38 @@ const engine = await Simile.from(items, {
|
|
|
107
166
|
engine.setWeights({ semantic: 0.9, fuzzy: 0.05, keyword: 0.05 });
|
|
108
167
|
```
|
|
109
168
|
|
|
169
|
+
### Score Normalization
|
|
170
|
+
|
|
171
|
+
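By default, each score type (semantic, fuzzy, keyword) is min-max normalized to the 0–1 range across the items scored for the current query, so that a "0.8" semantic score means the same as a "0.8" fuzzy score. This ensures fair comparison across different scoring methods.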
|
|
172
|
+
|
|
173
|
+
```typescript
|
|
174
|
+
// Enabled by default
|
|
175
|
+
const engine = await Simile.from(items, {
|
|
176
|
+
normalizeScores: true, // default
|
|
177
|
+
});
|
|
178
|
+
|
|
179
|
+
// Disable if you want raw scores
|
|
180
|
+
const rawEngine = await Simile.from(items, {
|
|
181
|
+
normalizeScores: false,
|
|
182
|
+
});
|
|
183
|
+
|
|
184
|
+
// With explain: true, you can see both normalized and raw scores
|
|
185
|
+
const results = await engine.search('cleaner', { explain: true });
|
|
186
|
+
// {
|
|
187
|
+
// score: 1.0,
|
|
188
|
+
// explain: {
|
|
189
|
+
// semantic: 1.0, // normalized
|
|
190
|
+
// fuzzy: 1.0, // normalized
|
|
191
|
+
// keyword: 1.0, // normalized
|
|
192
|
+
// raw: {
|
|
193
|
+
// semantic: 0.62, // original score
|
|
194
|
+
// fuzzy: 0.32, // original score
|
|
195
|
+
// keyword: 1.0, // original score
|
|
196
|
+
// }
|
|
197
|
+
// }
|
|
198
|
+
// }
|
|
199
|
+
```
|
|
200
|
+
|
|
110
201
|
### Search Options
|
|
111
202
|
|
|
112
203
|
```typescript
|
|
@@ -115,17 +206,25 @@ const results = await engine.search('cleaner', {
|
|
|
115
206
|
threshold: 0.5, // Minimum score (default: 0)
|
|
116
207
|
explain: true, // Include score breakdown
|
|
117
208
|
filter: (meta) => meta.category === 'Cleaning', // Filter by metadata
|
|
209
|
+
minLength: 3, // Don't search until 3+ characters typed (default: 1)
|
|
118
210
|
});
|
|
211
|
+
```
|
|
119
212
|
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
//
|
|
126
|
-
|
|
213
|
+
### Min Character Limit
|
|
214
|
+
|
|
215
|
+
Prevent unnecessary searches on very short queries:
|
|
216
|
+
|
|
217
|
+
```typescript
|
|
218
|
+
// Don't trigger search until user types at least 3 characters
|
|
219
|
+
const results = await engine.search('cl', { minLength: 3 });
|
|
220
|
+
// Returns [] because query length (2) < minLength (3)
|
|
221
|
+
|
|
222
|
+
const results2 = await engine.search('cle', { minLength: 3 });
|
|
223
|
+
// Returns results because query length (3) >= minLength (3)
|
|
127
224
|
```
|
|
128
225
|
|
|
226
|
+
This is useful for autocomplete/typeahead UIs where you don't want to run a search until the user has typed enough characters to produce meaningful results.
|
|
227
|
+
|
|
129
228
|
## 📝 Dynamic Catalog Management
|
|
130
229
|
|
|
131
230
|
Add, update, or remove items without rebuilding:
|
|
@@ -167,7 +266,11 @@ import {
|
|
|
167
266
|
keywordScore,
|
|
168
267
|
hybridScore,
|
|
169
268
|
vectorToBase64,
|
|
170
|
-
base64ToVector
|
|
269
|
+
base64ToVector,
|
|
270
|
+
getByPath,
|
|
271
|
+
extractText,
|
|
272
|
+
normalizeScore,
|
|
273
|
+
calculateScoreStats,
|
|
171
274
|
} from 'simile-search';
|
|
172
275
|
|
|
173
276
|
// Embed text directly
|
|
@@ -183,6 +286,10 @@ const keyword = keywordScore('phone charger', 'USB phone charger cable');
|
|
|
183
286
|
|
|
184
287
|
// Combine scores
|
|
185
288
|
const score = hybridScore(0.8, 0.6, 0.5, { semantic: 0.7, fuzzy: 0.15, keyword: 0.15 });
|
|
289
|
+
|
|
290
|
+
// Extract nested values
|
|
291
|
+
const firstName = getByPath(obj, 'author.firstName');
|
|
292
|
+
const text = extractText(item, ['metadata.title', 'metadata.tags']);
|
|
186
293
|
```
|
|
187
294
|
|
|
188
295
|
## 📊 API Reference
|
|
@@ -197,7 +304,7 @@ Load from a saved snapshot (instant, no embedding).
|
|
|
197
304
|
Load from JSON string.
|
|
198
305
|
|
|
199
306
|
### `engine.search(query, options?)`
|
|
200
|
-
Search for similar items.
|
|
307
|
+
Search for similar items. **Results are always sorted by relevance (highest score first).**
|
|
201
308
|
|
|
202
309
|
### `engine.save()`
|
|
203
310
|
Export snapshot object for persistence.
|
|
@@ -237,19 +344,27 @@ interface SearchResult<T = any> {
|
|
|
237
344
|
text: string;
|
|
238
345
|
score: number;
|
|
239
346
|
metadata?: T;
|
|
240
|
-
explain?: {
|
|
347
|
+
explain?: {
|
|
348
|
+
semantic: number;
|
|
349
|
+
fuzzy: number;
|
|
350
|
+
keyword: number;
|
|
351
|
+
raw?: { semantic: number; fuzzy: number; keyword: number };
|
|
352
|
+
};
|
|
241
353
|
}
|
|
242
354
|
|
|
243
355
|
interface SearchOptions {
|
|
244
356
|
topK?: number;
|
|
245
357
|
explain?: boolean;
|
|
246
358
|
threshold?: number;
|
|
359
|
+
minLength?: number; // Min query length to trigger search
|
|
247
360
|
filter?: (metadata: any) => boolean;
|
|
248
361
|
}
|
|
249
362
|
|
|
250
363
|
interface SimileConfig {
|
|
251
364
|
weights?: { semantic?: number; fuzzy?: number; keyword?: number };
|
|
252
365
|
model?: string;
|
|
366
|
+
textPaths?: string[]; // Paths for nested object search
|
|
367
|
+
normalizeScores?: boolean; // Enable score normalization (default: true)
|
|
253
368
|
}
|
|
254
369
|
```
|
|
255
370
|
|
|
@@ -266,3 +381,4 @@ MIT © [Aavash Baral](https://github.com/iaavas)
|
|
|
266
381
|
<p align="center">
|
|
267
382
|
Made with ❤️ by <a href="https://github.com/iaavas">Aavash Baral</a>
|
|
268
383
|
</p>
|
|
384
|
+
|
package/dist/engine.d.ts
CHANGED
|
@@ -1,10 +1,14 @@
|
|
|
1
|
-
import { SearchItem, SearchResult, SearchOptions, SimileConfig, SimileSnapshot, HybridWeights } from "./types";
|
|
1
|
+
import { SearchItem, SearchResult, SearchOptions, SimileConfig, SimileSnapshot, HybridWeights } from "./types.js";
|
|
2
2
|
export declare class Simile<T = any> {
|
|
3
3
|
private items;
|
|
4
4
|
private vectors;
|
|
5
5
|
private itemIndex;
|
|
6
6
|
private config;
|
|
7
7
|
private constructor();
|
|
8
|
+
/**
|
|
9
|
+
* Extract searchable text from an item using configured paths.
|
|
10
|
+
*/
|
|
11
|
+
private getSearchableText;
|
|
8
12
|
/**
|
|
9
13
|
* Create a new Simile instance from items.
|
|
10
14
|
* This will embed all items (slow for first run, but cached after).
|
|
@@ -53,7 +57,11 @@ export declare class Simile<T = any> {
|
|
|
53
57
|
*/
|
|
54
58
|
setWeights(weights: HybridWeights): void;
|
|
55
59
|
/**
|
|
56
|
-
* Search for similar items
|
|
60
|
+
* Search for similar items.
|
|
61
|
+
*
|
|
62
|
+
* @param query - The search query
|
|
63
|
+
* @param options - Search options
|
|
64
|
+
* @returns Results sorted by relevance (highest score first)
|
|
57
65
|
*/
|
|
58
66
|
search(query: string, options?: SearchOptions): Promise<SearchResult<T>[]>;
|
|
59
67
|
}
|
package/dist/engine.js
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
|
-
import { embed, embedBatch, vectorToBase64, base64ToVector } from "./embedder";
|
|
2
|
-
import { cosine, fuzzyScore, keywordScore } from "./similarity";
|
|
3
|
-
import { hybridScore, getDefaultWeights } from "./ranker";
|
|
4
|
-
|
|
1
|
+
import { embed, embedBatch, vectorToBase64, base64ToVector } from "./embedder.js";
|
|
2
|
+
import { cosine, fuzzyScore, keywordScore, calculateScoreStats } from "./similarity.js";
|
|
3
|
+
import { hybridScore, getDefaultWeights } from "./ranker.js";
|
|
4
|
+
import { extractText, normalizeScore } from "./utils.js";
|
|
5
|
+
const PACKAGE_VERSION = "0.3.1";
|
|
5
6
|
export class Simile {
|
|
6
7
|
constructor(items, vectors, config = {}) {
|
|
7
8
|
this.items = items;
|
|
@@ -10,15 +11,25 @@ export class Simile {
|
|
|
10
11
|
this.config = {
|
|
11
12
|
weights: config.weights ?? getDefaultWeights(),
|
|
12
13
|
model: config.model ?? "Xenova/all-MiniLM-L6-v2",
|
|
14
|
+
textPaths: config.textPaths ?? [],
|
|
15
|
+
normalizeScores: config.normalizeScores ?? true,
|
|
13
16
|
};
|
|
14
17
|
}
|
|
18
|
+
/**
|
|
19
|
+
* Extract searchable text from an item using configured paths.
|
|
20
|
+
*/
|
|
21
|
+
getSearchableText(item) {
|
|
22
|
+
return extractText(item, this.config.textPaths.length > 0 ? this.config.textPaths : undefined);
|
|
23
|
+
}
|
|
15
24
|
/**
|
|
16
25
|
* Create a new Simile instance from items.
|
|
17
26
|
* This will embed all items (slow for first run, but cached after).
|
|
18
27
|
*/
|
|
19
28
|
static async from(items, config = {}) {
|
|
20
29
|
const model = config.model ?? "Xenova/all-MiniLM-L6-v2";
|
|
21
|
-
const
|
|
30
|
+
const textPaths = config.textPaths ?? [];
|
|
31
|
+
// Extract text using paths if configured
|
|
32
|
+
const texts = items.map((item) => extractText(item, textPaths.length > 0 ? textPaths : undefined));
|
|
22
33
|
const vectors = await embedBatch(texts, model);
|
|
23
34
|
return new Simile(items, vectors, config);
|
|
24
35
|
}
|
|
@@ -28,7 +39,11 @@ export class Simile {
|
|
|
28
39
|
*/
|
|
29
40
|
static load(snapshot, config = {}) {
|
|
30
41
|
const vectors = snapshot.vectors.map(base64ToVector);
|
|
31
|
-
return new Simile(snapshot.items, vectors, {
|
|
42
|
+
return new Simile(snapshot.items, vectors, {
|
|
43
|
+
...config,
|
|
44
|
+
model: snapshot.model,
|
|
45
|
+
textPaths: snapshot.textPaths ?? config.textPaths ?? [],
|
|
46
|
+
});
|
|
32
47
|
}
|
|
33
48
|
/**
|
|
34
49
|
* Load from JSON string (e.g., from file or localStorage)
|
|
@@ -48,6 +63,7 @@ export class Simile {
|
|
|
48
63
|
items: this.items,
|
|
49
64
|
vectors: this.vectors.map(vectorToBase64),
|
|
50
65
|
createdAt: new Date().toISOString(),
|
|
66
|
+
textPaths: this.config.textPaths.length > 0 ? this.config.textPaths : undefined,
|
|
51
67
|
};
|
|
52
68
|
}
|
|
53
69
|
/**
|
|
@@ -60,7 +76,7 @@ export class Simile {
|
|
|
60
76
|
* Add new items to the index
|
|
61
77
|
*/
|
|
62
78
|
async add(items) {
|
|
63
|
-
const texts = items.map((item) => item
|
|
79
|
+
const texts = items.map((item) => this.getSearchableText(item));
|
|
64
80
|
const newVectors = await embedBatch(texts, this.config.model);
|
|
65
81
|
for (let i = 0; i < items.length; i++) {
|
|
66
82
|
const item = items[i];
|
|
@@ -122,31 +138,69 @@ export class Simile {
|
|
|
122
138
|
this.config.weights = { ...this.config.weights, ...weights };
|
|
123
139
|
}
|
|
124
140
|
/**
|
|
125
|
-
* Search for similar items
|
|
141
|
+
* Search for similar items.
|
|
142
|
+
*
|
|
143
|
+
* @param query - The search query
|
|
144
|
+
* @param options - Search options
|
|
145
|
+
* @returns Results sorted by relevance (highest score first)
|
|
126
146
|
*/
|
|
127
147
|
async search(query, options = {}) {
|
|
128
|
-
const { topK = 5, explain = false, filter, threshold = 0, } = options;
|
|
148
|
+
const { topK = 5, explain = false, filter, threshold = 0, minLength = 1, } = options;
|
|
149
|
+
// Min character limit - don't search until query meets minimum length
|
|
150
|
+
if (query.length < minLength) {
|
|
151
|
+
return [];
|
|
152
|
+
}
|
|
129
153
|
const qVector = await embed(query, this.config.model);
|
|
130
|
-
|
|
154
|
+
// First pass: calculate raw scores
|
|
155
|
+
const rawResults = [];
|
|
131
156
|
for (let i = 0; i < this.items.length; i++) {
|
|
132
157
|
const item = this.items[i];
|
|
133
158
|
if (filter && !filter(item.metadata))
|
|
134
159
|
continue;
|
|
160
|
+
const searchableText = this.getSearchableText(item);
|
|
135
161
|
const semantic = cosine(qVector, this.vectors[i]);
|
|
136
|
-
const fuzzy = fuzzyScore(query,
|
|
137
|
-
const keyword = keywordScore(query,
|
|
162
|
+
const fuzzy = fuzzyScore(query, searchableText);
|
|
163
|
+
const keyword = keywordScore(query, searchableText);
|
|
164
|
+
rawResults.push({ index: i, item, semantic, fuzzy, keyword });
|
|
165
|
+
}
|
|
166
|
+
// Calculate score statistics for normalization
|
|
167
|
+
const stats = calculateScoreStats(rawResults);
|
|
168
|
+
// Second pass: normalize scores and compute hybrid score
|
|
169
|
+
const results = [];
|
|
170
|
+
for (const raw of rawResults) {
|
|
171
|
+
let semantic = raw.semantic;
|
|
172
|
+
let fuzzy = raw.fuzzy;
|
|
173
|
+
let keyword = raw.keyword;
|
|
174
|
+
// Normalize scores if enabled
|
|
175
|
+
if (this.config.normalizeScores) {
|
|
176
|
+
semantic = normalizeScore(raw.semantic, stats.semantic.min, stats.semantic.max);
|
|
177
|
+
fuzzy = normalizeScore(raw.fuzzy, stats.fuzzy.min, stats.fuzzy.max);
|
|
178
|
+
keyword = normalizeScore(raw.keyword, stats.keyword.min, stats.keyword.max);
|
|
179
|
+
}
|
|
138
180
|
const score = hybridScore(semantic, fuzzy, keyword, this.config.weights);
|
|
139
181
|
// Apply threshold filter
|
|
140
182
|
if (score < threshold)
|
|
141
183
|
continue;
|
|
142
184
|
results.push({
|
|
143
|
-
id: item.id,
|
|
144
|
-
text: item.text,
|
|
145
|
-
metadata: item.metadata,
|
|
185
|
+
id: raw.item.id,
|
|
186
|
+
text: raw.item.text,
|
|
187
|
+
metadata: raw.item.metadata,
|
|
146
188
|
score,
|
|
147
|
-
explain: explain
|
|
189
|
+
explain: explain
|
|
190
|
+
? {
|
|
191
|
+
semantic,
|
|
192
|
+
fuzzy,
|
|
193
|
+
keyword,
|
|
194
|
+
raw: {
|
|
195
|
+
semantic: raw.semantic,
|
|
196
|
+
fuzzy: raw.fuzzy,
|
|
197
|
+
keyword: raw.keyword,
|
|
198
|
+
},
|
|
199
|
+
}
|
|
200
|
+
: undefined,
|
|
148
201
|
});
|
|
149
202
|
}
|
|
203
|
+
// Sort by relevance (highest score first)
|
|
150
204
|
return results.sort((a, b) => b.score - a.score).slice(0, topK);
|
|
151
205
|
}
|
|
152
206
|
}
|
package/dist/engine.test.js
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { describe, it, expect } from "vitest";
|
|
2
2
|
import { Simile } from "./engine";
|
|
3
|
+
import { getByPath, extractText } from "./utils";
|
|
3
4
|
import * as fs from "fs";
|
|
4
5
|
import * as path from "path";
|
|
5
6
|
const testItems = [
|
|
@@ -50,8 +51,8 @@ describe("simile search", () => {
|
|
|
50
51
|
// Both chargers should score significantly higher than cleaning products
|
|
51
52
|
const chargerScores = results.filter((r) => r.metadata?.category === "Electronics");
|
|
52
53
|
const cleaningScores = results.filter((r) => r.metadata?.category === "Cleaning");
|
|
53
|
-
// Electronics should score
|
|
54
|
-
expect(chargerScores[0].score).toBeGreaterThan(cleaningScores[0].score
|
|
54
|
+
// Electronics should score higher than cleaning items
|
|
55
|
+
expect(chargerScores[0].score).toBeGreaterThan(cleaningScores[0].score);
|
|
55
56
|
}, 30000);
|
|
56
57
|
it("applies threshold filtering", async () => {
|
|
57
58
|
const engine = await Simile.from(testItems);
|
|
@@ -62,6 +63,114 @@ describe("simile search", () => {
|
|
|
62
63
|
expect(r.score).toBeGreaterThanOrEqual(0.5);
|
|
63
64
|
});
|
|
64
65
|
}, 30000);
|
|
66
|
+
it("sorts results by relevance (highest score first)", async () => {
|
|
67
|
+
const engine = await Simile.from(testItems);
|
|
68
|
+
const results = await engine.search("cleaning products");
|
|
69
|
+
// Verify results are sorted by score descending
|
|
70
|
+
for (let i = 1; i < results.length; i++) {
|
|
71
|
+
expect(results[i - 1].score).toBeGreaterThanOrEqual(results[i].score);
|
|
72
|
+
}
|
|
73
|
+
}, 30000);
|
|
74
|
+
});
|
|
75
|
+
describe("min character limit", () => {
|
|
76
|
+
it("returns empty results when query is below minLength", async () => {
|
|
77
|
+
const engine = await Simile.from(testItems);
|
|
78
|
+
// Default minLength is 1
|
|
79
|
+
const results1 = await engine.search("c");
|
|
80
|
+
expect(results1.length).toBeGreaterThan(0);
|
|
81
|
+
// With minLength: 3, short queries return empty
|
|
82
|
+
const results2 = await engine.search("cl", { minLength: 3 });
|
|
83
|
+
expect(results2.length).toBe(0);
|
|
84
|
+
// Exactly 3 characters should work
|
|
85
|
+
const results3 = await engine.search("usb", { minLength: 3 });
|
|
86
|
+
expect(results3.length).toBeGreaterThan(0);
|
|
87
|
+
}, 30000);
|
|
88
|
+
});
|
|
89
|
+
describe("nested path search", () => {
|
|
90
|
+
const nestedItems = [
|
|
91
|
+
{
|
|
92
|
+
id: "1",
|
|
93
|
+
text: "",
|
|
94
|
+
metadata: {
|
|
95
|
+
author: { firstName: "John", lastName: "Doe" },
|
|
96
|
+
title: "The Art of Programming",
|
|
97
|
+
tags: ["coding", "javascript"],
|
|
98
|
+
},
|
|
99
|
+
},
|
|
100
|
+
{
|
|
101
|
+
id: "2",
|
|
102
|
+
text: "",
|
|
103
|
+
metadata: {
|
|
104
|
+
author: { firstName: "Jane", lastName: "Smith" },
|
|
105
|
+
title: "Machine Learning Basics",
|
|
106
|
+
tags: ["ai", "python"],
|
|
107
|
+
},
|
|
108
|
+
},
|
|
109
|
+
{
|
|
110
|
+
id: "3",
|
|
111
|
+
text: "",
|
|
112
|
+
metadata: {
|
|
113
|
+
author: { firstName: "John", lastName: "Smith" },
|
|
114
|
+
title: "Advanced JavaScript",
|
|
115
|
+
tags: ["coding", "javascript", "advanced"],
|
|
116
|
+
},
|
|
117
|
+
},
|
|
118
|
+
];
|
|
119
|
+
it("extracts text from nested paths", () => {
|
|
120
|
+
const item = nestedItems[0];
|
|
121
|
+
expect(getByPath(item, "metadata.author.firstName")).toBe("John");
|
|
122
|
+
expect(getByPath(item, "metadata.title")).toBe("The Art of Programming");
|
|
123
|
+
expect(getByPath(item, "metadata.tags[0]")).toBe("coding");
|
|
124
|
+
expect(getByPath(item, "metadata.tags[1]")).toBe("javascript");
|
|
125
|
+
});
|
|
126
|
+
it("combines multiple paths into searchable text", () => {
|
|
127
|
+
const text = extractText(nestedItems[0], [
|
|
128
|
+
"metadata.author.firstName",
|
|
129
|
+
"metadata.author.lastName",
|
|
130
|
+
"metadata.title",
|
|
131
|
+
]);
|
|
132
|
+
expect(text).toBe("John Doe The Art of Programming");
|
|
133
|
+
});
|
|
134
|
+
it("searches using nested paths", async () => {
|
|
135
|
+
const engine = await Simile.from(nestedItems, {
|
|
136
|
+
textPaths: [
|
|
137
|
+
"metadata.author.firstName",
|
|
138
|
+
"metadata.author.lastName",
|
|
139
|
+
"metadata.title",
|
|
140
|
+
],
|
|
141
|
+
});
|
|
142
|
+
// Search by author name
|
|
143
|
+
const johnResults = await engine.search("John");
|
|
144
|
+
expect(johnResults.length).toBeGreaterThan(0);
|
|
145
|
+
expect(johnResults[0].metadata?.author.firstName).toBe("John");
|
|
146
|
+
// Search by title
|
|
147
|
+
const jsResults = await engine.search("JavaScript programming");
|
|
148
|
+
expect(jsResults.length).toBeGreaterThan(0);
|
|
149
|
+
}, 30000);
|
|
150
|
+
it("includes tags in nested path search", async () => {
|
|
151
|
+
const engine = await Simile.from(nestedItems, {
|
|
152
|
+
textPaths: ["metadata.title", "metadata.tags"],
|
|
153
|
+
});
|
|
154
|
+
const pythonResults = await engine.search("python ai");
|
|
155
|
+
expect(pythonResults[0].id).toBe("2"); // Machine Learning Basics
|
|
156
|
+
}, 30000);
|
|
157
|
+
});
|
|
158
|
+
describe("score normalization", () => {
|
|
159
|
+
it("includes raw scores in explain output", async () => {
|
|
160
|
+
const engine = await Simile.from(testItems);
|
|
161
|
+
const results = await engine.search("cleaner", { explain: true });
|
|
162
|
+
expect(results[0].explain).toBeDefined();
|
|
163
|
+
expect(results[0].explain?.raw).toBeDefined();
|
|
164
|
+
expect(results[0].explain?.raw?.semantic).toBeDefined();
|
|
165
|
+
expect(results[0].explain?.raw?.fuzzy).toBeDefined();
|
|
166
|
+
expect(results[0].explain?.raw?.keyword).toBeDefined();
|
|
167
|
+
}, 30000);
|
|
168
|
+
it("can disable score normalization", async () => {
|
|
169
|
+
const engine = await Simile.from(testItems, { normalizeScores: false });
|
|
170
|
+
const results = await engine.search("cleaner", { explain: true });
|
|
171
|
+
// With normalization disabled, the reported scores should equal the raw scores
|
|
172
|
+
expect(results[0].explain?.semantic).toBe(results[0].explain?.raw?.semantic);
|
|
173
|
+
}, 30000);
|
|
65
174
|
});
|
|
66
175
|
describe("simile persistence", () => {
|
|
67
176
|
const snapshotPath = path.join(__dirname, "../.test-snapshot.json");
|
|
@@ -92,6 +201,19 @@ describe("simile persistence", () => {
|
|
|
92
201
|
// Cleanup
|
|
93
202
|
fs.unlinkSync(snapshotPath);
|
|
94
203
|
}, 30000);
|
|
204
|
+
it("preserves textPaths in snapshot", async () => {
|
|
205
|
+
const nestedItems = [
|
|
206
|
+
{ id: "1", text: "", metadata: { title: "Hello World" } },
|
|
207
|
+
];
|
|
208
|
+
const engine = await Simile.from(nestedItems, {
|
|
209
|
+
textPaths: ["metadata.title"],
|
|
210
|
+
});
|
|
211
|
+
const snapshot = engine.save();
|
|
212
|
+
expect(snapshot.textPaths).toEqual(["metadata.title"]);
|
|
213
|
+
const loaded = Simile.load(snapshot);
|
|
214
|
+
const results = await loaded.search("Hello");
|
|
215
|
+
expect(results.length).toBeGreaterThan(0);
|
|
216
|
+
}, 30000);
|
|
95
217
|
});
|
|
96
218
|
describe("simile dynamic items", () => {
|
|
97
219
|
it("adds new items", async () => {
|
package/dist/index.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
export
|
|
2
|
-
export
|
|
3
|
-
export {
|
|
4
|
-
export {
|
|
5
|
-
export {
|
|
1
|
+
export * from "./types.js";
|
|
2
|
+
export { embed, embedBatch, vectorToBase64, base64ToVector } from "./embedder.js";
|
|
3
|
+
export { cosine, fuzzyScore, keywordScore, calculateScoreStats } from "./similarity.js";
|
|
4
|
+
export { hybridScore, getDefaultWeights } from "./ranker.js";
|
|
5
|
+
export { getByPath, extractText, normalizeScore } from "./utils.js";
|
package/dist/index.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
export
|
|
2
|
-
export
|
|
3
|
-
export {
|
|
4
|
-
export {
|
|
5
|
-
export {
|
|
1
|
+
export * from "./types.js";
|
|
2
|
+
export { embed, embedBatch, vectorToBase64, base64ToVector } from "./embedder.js";
|
|
3
|
+
export { cosine, fuzzyScore, keywordScore, calculateScoreStats } from "./similarity.js";
|
|
4
|
+
export { hybridScore, getDefaultWeights } from "./ranker.js";
|
|
5
|
+
export { getByPath, extractText, normalizeScore } from "./utils.js";
|
package/dist/ranker.d.ts
CHANGED
|
@@ -1,3 +1,3 @@
|
|
|
1
|
-
import { HybridWeights } from "./types";
|
|
1
|
+
import { HybridWeights } from "./types.js";
|
|
2
2
|
export declare function hybridScore(semantic: number, fuzzy: number, keyword: number, weights?: HybridWeights): number;
|
|
3
3
|
export declare function getDefaultWeights(): Required<HybridWeights>;
|
package/dist/similarity.d.ts
CHANGED
|
@@ -1,3 +1,41 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Compute cosine similarity between two vectors.
|
|
3
|
+
* Both vectors must already be normalized to unit length (the embedder's output is), because only their dot product is computed.
|
|
4
|
+
* Returns a value between -1 and 1, where 1 is identical.
|
|
5
|
+
*/
|
|
1
6
|
export declare function cosine(a: Float32Array, b: Float32Array): number;
|
|
7
|
+
/**
|
|
8
|
+
* Compute fuzzy similarity score using Levenshtein distance.
|
|
9
|
+
* Returns a value between 0 and 1, where 1 is an exact (case-insensitive) match.
|
|
10
|
+
*/
|
|
2
11
|
export declare function fuzzyScore(a: string, b: string): number;
|
|
12
|
+
/**
|
|
13
|
+
* Compute keyword match score.
|
|
14
|
+
* Returns the proportion of whitespace-separated query words that occur in the text as case-insensitive substrings (0 to 1).
|
|
15
|
+
*/
|
|
3
16
|
export declare function keywordScore(query: string, text: string): number;
|
|
17
|
+
/**
|
|
18
|
+
* Score normalization statistics for a batch of results.
|
|
19
|
+
*/
|
|
20
|
+
export interface ScoreStats {
|
|
21
|
+
semantic: {
|
|
22
|
+
min: number;
|
|
23
|
+
max: number;
|
|
24
|
+
};
|
|
25
|
+
fuzzy: {
|
|
26
|
+
min: number;
|
|
27
|
+
max: number;
|
|
28
|
+
};
|
|
29
|
+
keyword: {
|
|
30
|
+
min: number;
|
|
31
|
+
max: number;
|
|
32
|
+
};
|
|
33
|
+
}
|
|
34
|
+
/**
|
|
35
|
+
* Calculate min/max statistics for score normalization.
|
|
36
|
+
*/
|
|
37
|
+
export declare function calculateScoreStats(scores: Array<{
|
|
38
|
+
semantic: number;
|
|
39
|
+
fuzzy: number;
|
|
40
|
+
keyword: number;
|
|
41
|
+
}>): ScoreStats;
|
package/dist/similarity.js
CHANGED
|
@@ -1,18 +1,63 @@
|
|
|
1
1
|
import levenshtein from "fast-levenshtein";
|
|
2
|
+
/**
|
|
3
|
+
* Compute cosine similarity between two vectors.
|
|
4
|
+
* Both vectors must already be normalized to unit length (the embedder's output is), because only their dot product is computed.
|
|
5
|
+
* Returns a value between -1 and 1, where 1 is identical.
|
|
6
|
+
*/
|
|
2
7
|
export function cosine(a, b) {
|
|
3
8
|
let dot = 0;
|
|
4
9
|
for (let i = 0; i < a.length; i++)
|
|
5
10
|
dot += a[i] * b[i];
|
|
6
11
|
return dot;
|
|
7
12
|
}
|
|
13
|
+
/**
|
|
14
|
+
* Compute fuzzy similarity score using Levenshtein distance.
|
|
15
|
+
* Returns a value between 0 and 1, where 1 is an exact (case-insensitive) match.
|
|
16
|
+
*/
|
|
8
17
|
export function fuzzyScore(a, b) {
|
|
9
|
-
const
|
|
18
|
+
const aLower = a.toLowerCase();
|
|
19
|
+
const bLower = b.toLowerCase();
|
|
20
|
+
const dist = levenshtein.get(aLower, bLower);
|
|
10
21
|
const maxLen = Math.max(a.length, b.length);
|
|
22
|
+
if (maxLen === 0)
|
|
23
|
+
return 1;
|
|
11
24
|
return 1 - dist / maxLen;
|
|
12
25
|
}
|
|
26
|
+
/**
|
|
27
|
+
* Compute keyword match score.
|
|
28
|
+
* Returns the proportion of whitespace-separated query words that occur in the text as case-insensitive substrings (0 to 1).
|
|
29
|
+
*/
|
|
13
30
|
export function keywordScore(query, text) {
|
|
14
|
-
const
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
31
|
+
const queryWords = query.toLowerCase().split(/\s+/).filter(Boolean);
|
|
32
|
+
if (queryWords.length === 0)
|
|
33
|
+
return 0;
|
|
34
|
+
const textLower = text.toLowerCase();
|
|
35
|
+
const hits = queryWords.filter((w) => textLower.includes(w)).length;
|
|
36
|
+
return hits / queryWords.length;
|
|
37
|
+
}
|
|
38
|
+
/**
|
|
39
|
+
* Calculate min/max statistics for score normalization.
|
|
40
|
+
*/
|
|
41
|
+
export function calculateScoreStats(scores) {
|
|
42
|
+
if (scores.length === 0) {
|
|
43
|
+
return {
|
|
44
|
+
semantic: { min: 0, max: 1 },
|
|
45
|
+
fuzzy: { min: 0, max: 1 },
|
|
46
|
+
keyword: { min: 0, max: 1 },
|
|
47
|
+
};
|
|
48
|
+
}
|
|
49
|
+
const stats = {
|
|
50
|
+
semantic: { min: Infinity, max: -Infinity },
|
|
51
|
+
fuzzy: { min: Infinity, max: -Infinity },
|
|
52
|
+
keyword: { min: Infinity, max: -Infinity },
|
|
53
|
+
};
|
|
54
|
+
for (const score of scores) {
|
|
55
|
+
stats.semantic.min = Math.min(stats.semantic.min, score.semantic);
|
|
56
|
+
stats.semantic.max = Math.max(stats.semantic.max, score.semantic);
|
|
57
|
+
stats.fuzzy.min = Math.min(stats.fuzzy.min, score.fuzzy);
|
|
58
|
+
stats.fuzzy.max = Math.max(stats.fuzzy.max, score.fuzzy);
|
|
59
|
+
stats.keyword.min = Math.min(stats.keyword.min, score.keyword);
|
|
60
|
+
stats.keyword.max = Math.max(stats.keyword.max, score.keyword);
|
|
61
|
+
}
|
|
62
|
+
return stats;
|
|
18
63
|
}
|
package/dist/types.d.ts
CHANGED
|
@@ -12,6 +12,12 @@ export interface SearchResult<T = any> {
|
|
|
12
12
|
semantic: number;
|
|
13
13
|
fuzzy: number;
|
|
14
14
|
keyword: number;
|
|
15
|
+
/** Raw scores before normalization */
|
|
16
|
+
raw?: {
|
|
17
|
+
semantic: number;
|
|
18
|
+
fuzzy: number;
|
|
19
|
+
keyword: number;
|
|
20
|
+
};
|
|
15
21
|
};
|
|
16
22
|
}
|
|
17
23
|
export interface SearchOptions {
|
|
@@ -20,6 +26,8 @@ export interface SearchOptions {
|
|
|
20
26
|
filter?: (metadata: any) => boolean;
|
|
21
27
|
/** Minimum score threshold (0-1). Results below this are filtered out */
|
|
22
28
|
threshold?: number;
|
|
29
|
+
/** Minimum query length to trigger search (default: 1) */
|
|
30
|
+
minLength?: number;
|
|
23
31
|
}
|
|
24
32
|
export interface HybridWeights {
|
|
25
33
|
/** Semantic similarity weight (0-1), default: 0.7 */
|
|
@@ -34,6 +42,14 @@ export interface SimileConfig {
|
|
|
34
42
|
weights?: HybridWeights;
|
|
35
43
|
/** Model to use for embeddings (default: "Xenova/all-MiniLM-L6-v2") */
|
|
36
44
|
model?: string;
|
|
45
|
+
/**
|
|
46
|
+
* Paths to extract searchable text from items.
|
|
47
|
+
* Supports nested paths like "author.firstName" or "tags[0]".
|
|
48
|
+
* If not provided, uses the 'text' field directly.
|
|
49
|
+
*/
|
|
50
|
+
textPaths?: string[];
|
|
51
|
+
/** Whether to normalize scores across different scoring methods (default: true) */
|
|
52
|
+
normalizeScores?: boolean;
|
|
37
53
|
}
|
|
38
54
|
/** Serialized state for persistence */
|
|
39
55
|
export interface SimileSnapshot<T = any> {
|
|
@@ -43,4 +59,6 @@ export interface SimileSnapshot<T = any> {
|
|
|
43
59
|
/** Base64-encoded Float32Array vectors */
|
|
44
60
|
vectors: string[];
|
|
45
61
|
createdAt: string;
|
|
62
|
+
/** Text paths used for extraction */
|
|
63
|
+
textPaths?: string[];
|
|
46
64
|
}
|
package/dist/utils.d.ts
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
/**
 * Extract a value from an object using a dot-notation path.
 * Supports nested paths like "author.firstName" and array access like "tags[0]".
 *
 * @example
 * getByPath({ author: { firstName: "John" } }, "author.firstName") // "John"
 * getByPath({ tags: ["a", "b"] }, "tags[1]") // "b"
 * getByPath({ items: [{ name: "x" }] }, "items[0].name") // "x"
 *
 * @param obj - Root value to traverse.
 * @param path - Dot-separated path; numeric "[n]" segments are treated as keys.
 * @returns The value found at `path`, or `undefined` when `obj` or `path` is
 *   falsy or an intermediate value along the path is null/undefined.
 */
|
|
10
|
+
export declare function getByPath(obj: any, path: string): any;
|
|
11
|
+
/**
 * Extract searchable text from an item using configured paths.
 * If paths are provided, extracts and joins values from those paths.
 * Otherwise, returns the item's 'text' field directly.
 *
 * @example
 * // With paths
 * extractText(
 *   { id: "1", text: "", metadata: { author: { name: "John" }, title: "Hello" } },
 *   ["metadata.author.name", "metadata.title"]
 * ) // "John Hello"
 *
 * // Without paths
 * extractText({ id: "1", text: "Hello World" }) // "Hello World"
 *
 * @param item - Item to read from; each path is resolved against the item itself.
 * @param paths - Optional list of paths. When omitted or empty, `item.text`
 *   (or "" when that is falsy) is returned.
 * @returns The resolved values joined with single spaces and trimmed. Null and
 *   undefined values are skipped; array values contribute their non-null
 *   elements joined with spaces.
 */
|
|
26
|
+
export declare function extractText(item: any, paths?: string[]): string;
|
|
27
|
+
/**
 * Normalize a score to a 0-1 range using min-max normalization.
 * Handles edge cases where min equals max.
 *
 * @param value - Score to normalize.
 * @param min - Lower bound of the observed score range.
 * @param max - Upper bound of the observed score range.
 * @returns When `min` equals `max`: 1 for a positive `value`, otherwise 0.
 *   In every other case the result is based on the min-max ratio
 *   `(value - min) / (max - min)`.
 */
|
|
31
|
+
export declare function normalizeScore(value: number, min: number, max: number): number;
|
package/dist/utils.js
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
/**
 * Extract a value from an object using a dot-notation path.
 * Supports nested paths like "author.firstName" and array access like "tags[0]",
 * including paths that start with an index such as "[0].name".
 *
 * @example
 * getByPath({ author: { firstName: "John" } }, "author.firstName") // "John"
 * getByPath({ tags: ["a", "b"] }, "tags[1]") // "b"
 * getByPath({ items: [{ name: "x" }] }, "items[0].name") // "x"
 * getByPath([{ name: "x" }], "[0].name") // "x"
 *
 * @param {any} obj - Root object (or array) to traverse.
 * @param {string} path - Dot/bracket path to resolve.
 * @returns {any} The value at `path`, or `undefined` when `obj` or `path` is
 *   falsy or an intermediate value along the path is null/undefined.
 */
export function getByPath(obj, path) {
    if (!obj || !path)
        return undefined;
    // Handle array notation: convert "items[0].name" to "items.0.name"
    let normalizedPath = path.replace(/\[(\d+)\]/g, ".$1");
    // A path that begins with an index ("[0].name") normalizes to ".0.name".
    // Drop that artificial leading dot, otherwise the first key is "" and the
    // lookup on a root array always yields undefined.
    if (path.startsWith("[") && normalizedPath.startsWith(".")) {
        normalizedPath = normalizedPath.slice(1);
    }
    const keys = normalizedPath.split(".");
    let current = obj;
    for (const key of keys) {
        // Stop as soon as the chain breaks instead of throwing on a property read.
        if (current === null || current === undefined) {
            return undefined;
        }
        current = current[key];
    }
    return current;
}
|
|
25
|
+
/**
 * Extract searchable text from an item using configured paths.
 * If paths are provided, extracts and joins values from those paths.
 * Otherwise, returns the item's 'text' field directly.
 *
 * Values that are null/undefined, or that produce only blank text (for example
 * an empty string or an empty array), are skipped so the joined result never
 * contains doubled separator spaces.
 *
 * @example
 * // With paths
 * extractText(
 *   { id: "1", text: "", metadata: { author: { name: "John" }, title: "Hello" } },
 *   ["metadata.author.name", "metadata.title"]
 * ) // "John Hello"
 *
 * // Without paths
 * extractText({ id: "1", text: "Hello World" }) // "Hello World"
 *
 * @param {any} item - Item to read from; each path is resolved against the item itself.
 * @param {string[]} [paths] - Paths to extract. When omitted or empty, `item.text` is used.
 * @returns {string} Extracted values joined with single spaces and trimmed.
 */
export function extractText(item, paths) {
    if (!paths || paths.length === 0) {
        return item.text || "";
    }
    const parts = [];
    for (const path of paths) {
        const value = getByPath(item, path);
        if (value === null || value === undefined) {
            continue;
        }
        // Arrays contribute their non-null elements; everything else is stringified.
        const text = Array.isArray(value)
            ? value.filter((v) => v != null).join(" ")
            : String(value);
        // Skip blank contributions: pushing "" would leave a doubled space in
        // the joined text, which is later fed to the matching/embedding steps.
        if (text.trim() !== "") {
            parts.push(text);
        }
    }
    return parts.join(" ").trim();
}
|
|
58
|
+
/**
 * Normalize a score to a 0-1 range using min-max normalization.
 * Handles edge cases where min equals max.
 *
 * The result is always inside [0, 1]: values outside the [min, max] interval
 * are clamped, and inputs that make the ratio undefined (NaN values or
 * infinite bounds) yield 0.
 *
 * @param {number} value - Score to normalize.
 * @param {number} min - Lower bound of the observed score range.
 * @param {number} max - Upper bound of the observed score range.
 * @returns {number} Normalized score in [0, 1]. When `min` equals `max`,
 *   returns 1 for a positive `value` and 0 otherwise.
 */
export function normalizeScore(value, min, max) {
    // Degenerate range: every score is identical, so there is nothing to scale.
    if (max === min)
        return value > 0 ? 1 : 0;
    const ratio = (value - min) / (max - min);
    // NaN appears for NaN inputs or Infinity/-Infinity bounds; treat it as
    // "no score", matching what the max === min branch returns for NaN.
    if (Number.isNaN(ratio))
        return 0;
    // Keep the documented 0-1 contract even when value lies outside [min, max].
    return Math.min(1, Math.max(0, ratio));
}
|
package/package.json
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "simile-search",
|
|
3
|
-
"version": "0.2.0",
|
|
3
|
+
"version": "0.3.1",
|
|
4
4
|
"description": "Offline-first semantic + fuzzy search engine for catalogs, names, and products",
|
|
5
|
+
"type": "module",
|
|
5
6
|
"main": "dist/index.js",
|
|
6
7
|
"types": "dist/index.d.ts",
|
|
7
8
|
"scripts": {
|
|
@@ -34,5 +35,11 @@
|
|
|
34
35
|
"ts-node": "^10.9.2",
|
|
35
36
|
"typescript": "^5.0.0",
|
|
36
37
|
"vitest": "^4.0.16"
|
|
38
|
+
},
|
|
39
|
+
"exports": {
|
|
40
|
+
".": {
|
|
41
|
+
"import": "./dist/index.js",
|
|
42
|
+
"require": "./dist/index.cjs"
|
|
43
|
+
}
|
|
37
44
|
}
|
|
38
45
|
}
|