simile-search 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +268 -0
- package/dist/embedder.d.ts +11 -0
- package/dist/embedder.js +44 -0
- package/dist/engine.d.ts +59 -0
- package/dist/engine.js +152 -0
- package/dist/engine.test.d.ts +1 -0
- package/dist/engine.test.js +148 -0
- package/dist/index.d.ts +5 -0
- package/dist/index.js +5 -0
- package/dist/ranker.d.ts +3 -0
- package/dist/ranker.js +19 -0
- package/dist/similarity.d.ts +3 -0
- package/dist/similarity.js +18 -0
- package/dist/types.d.ts +46 -0
- package/dist/types.js +1 -0
- package/package.json +38 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Aavash Baral
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
<img src="assets/logo.jpeg" alt="Simile Logo" width="200">
|
|
3
|
+
</div>
|
|
4
|
+
|
|
5
|
+
# Simile ๐
|
|
6
|
+
|
|
7
|
+

|
|
8
|
+

|
|
9
|
+

|
|
10
|
+
|
|
11
|
+
**Offline-first semantic + fuzzy search engine for catalogs, names, and products.**
|
|
12
|
+
|
|
13
|
+
Simile combines the power of AI embeddings with fuzzy string matching and keyword search to deliver highly relevant search results—all running locally, no API calls required.
|
|
14
|
+
|
|
15
|
+
## ✨ Features
|
|
16
|
+
|
|
17
|
+
- 🧠 **Semantic Search** - Understands meaning, not just keywords ("phone charger" finds "USB-C cable")
|
|
18
|
+
- 🔤 **Fuzzy Matching** - Handles typos and partial matches gracefully
|
|
19
|
+
- 🎯 **Keyword Boost** - Exact matches get priority
|
|
20
|
+
- 💾 **Persistence** - Save/load embeddings to avoid re-computing
|
|
21
|
+
- ⚡ **Batch Processing** - Optimized for large catalogs
|
|
22
|
+
- 🔧 **Configurable** - Tune scoring weights for your use case
|
|
23
|
+
- 📦 **Zero API Calls** - Everything runs locally with Transformers.js
|
|
24
|
+
|
|
25
|
+
## ๐ฆ Installation
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
npm install simile-search
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## ๐ Quick Start
|
|
32
|
+
|
|
33
|
+
```typescript
|
|
34
|
+
import { Simile } from 'simile-search';
|
|
35
|
+
|
|
36
|
+
// Create a search engine with your items
|
|
37
|
+
const engine = await Simile.from([
|
|
38
|
+
{ id: '1', text: 'Bathroom floor cleaner', metadata: { category: 'Cleaning' } },
|
|
39
|
+
{ id: '2', text: 'Dishwashing liquid', metadata: { category: 'Kitchen' } },
|
|
40
|
+
{ id: '3', text: 'iPhone Charger', metadata: { category: 'Electronics' } },
|
|
41
|
+
{ id: '4', text: 'USB-C phone charger cable', metadata: { category: 'Electronics' } },
|
|
42
|
+
]);
|
|
43
|
+
|
|
44
|
+
// Search!
|
|
45
|
+
const results = await engine.search('phone charger');
|
|
46
|
+
console.log(results);
|
|
47
|
+
// [
|
|
48
|
+
// { id: '3', text: 'iPhone Charger', score: 0.72, ... },
|
|
49
|
+
// { id: '4', text: 'USB-C phone charger cable', score: 0.68, ... },
|
|
50
|
+
// ...
|
|
51
|
+
// ]
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## ๐พ Persistence (Save & Load)
|
|
55
|
+
|
|
56
|
+
The first embedding run can be slow. Save your embeddings to load instantly next time:
|
|
57
|
+
|
|
58
|
+
```typescript
|
|
59
|
+
import { Simile } from 'simile-search';
|
|
60
|
+
import * as fs from 'fs';
|
|
61
|
+
|
|
62
|
+
// First run: embed and save (slow, but only once!)
|
|
63
|
+
const engine = await Simile.from(items);
|
|
64
|
+
fs.writeFileSync('catalog.json', engine.toJSON());
|
|
65
|
+
|
|
66
|
+
// Later: instant load from file (no re-embedding!)
|
|
67
|
+
const json = fs.readFileSync('catalog.json', 'utf-8');
|
|
68
|
+
const loadedEngine = Simile.loadFromJSON(json);
|
|
69
|
+
|
|
70
|
+
// Works exactly the same
|
|
71
|
+
const results = await loadedEngine.search('cleaner');
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### Snapshot Format
|
|
75
|
+
|
|
76
|
+
```typescript
|
|
77
|
+
// For database storage
|
|
78
|
+
const snapshot = engine.save();
|
|
79
|
+
// {
|
|
80
|
+
// version: '0.2.0',
|
|
81
|
+
// model: 'Xenova/all-MiniLM-L6-v2',
|
|
82
|
+
// items: [...],
|
|
83
|
+
// vectors: ['base64...', 'base64...'],
|
|
84
|
+
// createdAt: '2024-12-28T...'
|
|
85
|
+
// }
|
|
86
|
+
|
|
87
|
+
// Load from snapshot object
|
|
88
|
+
const restored = Simile.load(snapshot);
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## ๐ง Configuration
|
|
92
|
+
|
|
93
|
+
### Custom Scoring Weights
|
|
94
|
+
|
|
95
|
+
Tune how much each scoring method contributes:
|
|
96
|
+
|
|
97
|
+
```typescript
|
|
98
|
+
const engine = await Simile.from(items, {
|
|
99
|
+
weights: {
|
|
100
|
+
semantic: 0.7, // AI embedding similarity (default: 0.7)
|
|
101
|
+
fuzzy: 0.15, // Levenshtein distance (default: 0.15)
|
|
102
|
+
keyword: 0.15, // Exact keyword matches (default: 0.15)
|
|
103
|
+
}
|
|
104
|
+
});
|
|
105
|
+
|
|
106
|
+
// Or adjust later
|
|
107
|
+
engine.setWeights({ semantic: 0.9, fuzzy: 0.05, keyword: 0.05 });
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Search Options
|
|
111
|
+
|
|
112
|
+
```typescript
|
|
113
|
+
const results = await engine.search('cleaner', {
|
|
114
|
+
topK: 10, // Max results to return (default: 5)
|
|
115
|
+
threshold: 0.5, // Minimum score (default: 0)
|
|
116
|
+
explain: true, // Include score breakdown
|
|
117
|
+
filter: (meta) => meta.category === 'Cleaning', // Filter by metadata
|
|
118
|
+
});
|
|
119
|
+
|
|
120
|
+
// With explain: true
|
|
121
|
+
// {
|
|
122
|
+
// id: '1',
|
|
123
|
+
// text: 'Bathroom floor cleaner',
|
|
124
|
+
// score: 0.63,
|
|
125
|
+
// explain: { semantic: 0.62, fuzzy: 0.32, keyword: 1.0 }
|
|
126
|
+
// }
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## ๐ Dynamic Catalog Management
|
|
130
|
+
|
|
131
|
+
Add, update, or remove items without rebuilding:
|
|
132
|
+
|
|
133
|
+
```typescript
|
|
134
|
+
// Add new items
|
|
135
|
+
await engine.add([
|
|
136
|
+
{ id: '5', text: 'Wireless headphones', metadata: { category: 'Electronics' } }
|
|
137
|
+
]);
|
|
138
|
+
|
|
139
|
+
// Update existing item (same ID)
|
|
140
|
+
await engine.add([
|
|
141
|
+
{ id: '1', text: 'Premium bathroom cleaner', metadata: { category: 'Cleaning' } }
|
|
142
|
+
]);
|
|
143
|
+
|
|
144
|
+
// Remove items
|
|
145
|
+
engine.remove(['2', '3']);
|
|
146
|
+
|
|
147
|
+
// Get item by ID
|
|
148
|
+
const item = engine.get('1');
|
|
149
|
+
|
|
150
|
+
// Get all items
|
|
151
|
+
const allItems = engine.getAll();
|
|
152
|
+
|
|
153
|
+
// Get count
|
|
154
|
+
console.log(engine.size); // 3
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
## ๐ฏ Advanced: Direct Access to Utilities
|
|
158
|
+
|
|
159
|
+
For custom implementations:
|
|
160
|
+
|
|
161
|
+
```typescript
|
|
162
|
+
import {
|
|
163
|
+
embed,
|
|
164
|
+
embedBatch,
|
|
165
|
+
cosine,
|
|
166
|
+
fuzzyScore,
|
|
167
|
+
keywordScore,
|
|
168
|
+
hybridScore,
|
|
169
|
+
vectorToBase64,
|
|
170
|
+
base64ToVector
|
|
171
|
+
} from 'simile-search';
|
|
172
|
+
|
|
173
|
+
// Embed text directly
|
|
174
|
+
const vector = await embed('hello world');
|
|
175
|
+
|
|
176
|
+
// Batch embed for performance
|
|
177
|
+
const vectors = await embedBatch(['text1', 'text2', 'text3']);
|
|
178
|
+
|
|
179
|
+
// Calculate similarities
|
|
180
|
+
const similarity = cosine(vectorA, vectorB);
|
|
181
|
+
const fuzzy = fuzzyScore('cleaner', 'cleenr');
|
|
182
|
+
const keyword = keywordScore('phone charger', 'USB phone charger cable');
|
|
183
|
+
|
|
184
|
+
// Combine scores
|
|
185
|
+
const score = hybridScore(0.8, 0.6, 0.5, { semantic: 0.7, fuzzy: 0.15, keyword: 0.15 });
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
## ๐ API Reference
|
|
189
|
+
|
|
190
|
+
### `Simile.from(items, config?)`
|
|
191
|
+
Create a new engine from items. Embeds all items (async).
|
|
192
|
+
|
|
193
|
+
### `Simile.load(snapshot, config?)`
|
|
194
|
+
Load from a saved snapshot (instant, no embedding).
|
|
195
|
+
|
|
196
|
+
### `Simile.loadFromJSON(json, config?)`
|
|
197
|
+
Load from JSON string.
|
|
198
|
+
|
|
199
|
+
### `engine.search(query, options?)`
|
|
200
|
+
Search for similar items.
|
|
201
|
+
|
|
202
|
+
### `engine.save()`
|
|
203
|
+
Export snapshot object for persistence.
|
|
204
|
+
|
|
205
|
+
### `engine.toJSON()`
|
|
206
|
+
Export as JSON string.
|
|
207
|
+
|
|
208
|
+
### `engine.add(items)`
|
|
209
|
+
Add or update items (async).
|
|
210
|
+
|
|
211
|
+
### `engine.remove(ids)`
|
|
212
|
+
Remove items by ID.
|
|
213
|
+
|
|
214
|
+
### `engine.get(id)`
|
|
215
|
+
Get single item by ID.
|
|
216
|
+
|
|
217
|
+
### `engine.getAll()`
|
|
218
|
+
Get all items.
|
|
219
|
+
|
|
220
|
+
### `engine.size`
|
|
221
|
+
Number of items.
|
|
222
|
+
|
|
223
|
+
### `engine.setWeights(weights)`
|
|
224
|
+
Update scoring weights.
|
|
225
|
+
|
|
226
|
+
## ๐งช Types
|
|
227
|
+
|
|
228
|
+
```typescript
|
|
229
|
+
interface SearchItem<T = any> {
|
|
230
|
+
id: string;
|
|
231
|
+
text: string;
|
|
232
|
+
metadata?: T;
|
|
233
|
+
}
|
|
234
|
+
|
|
235
|
+
interface SearchResult<T = any> {
|
|
236
|
+
id: string;
|
|
237
|
+
text: string;
|
|
238
|
+
score: number;
|
|
239
|
+
metadata?: T;
|
|
240
|
+
explain?: { semantic: number; fuzzy: number; keyword: number };
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
interface SearchOptions {
|
|
244
|
+
topK?: number;
|
|
245
|
+
explain?: boolean;
|
|
246
|
+
threshold?: number;
|
|
247
|
+
filter?: (metadata: any) => boolean;
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
interface SimileConfig {
|
|
251
|
+
weights?: { semantic?: number; fuzzy?: number; keyword?: number };
|
|
252
|
+
model?: string;
|
|
253
|
+
}
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
## ๐ค Model
|
|
257
|
+
|
|
258
|
+
Simile uses [Xenova/all-MiniLM-L6-v2](https://huggingface.co/Xenova/all-MiniLM-L6-v2) via Transformers.js by default. This model runs entirely in JavaScript—no Python or external APIs required.
|
|
259
|
+
|
|
260
|
+
## ๐ License
|
|
261
|
+
|
|
262
|
+
MIT © [Aavash Baral](https://github.com/iaavas)
|
|
263
|
+
|
|
264
|
+
---
|
|
265
|
+
|
|
266
|
+
<p align="center">
|
|
267
|
+
Made with ❤️ by <a href="https://github.com/iaavas">Aavash Baral</a>
|
|
268
|
+
</p>
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
 * Resolve the feature-extraction pipeline for `model`.
 * `model` is optional.
 */
export declare function getEmbedder(model?: string): Promise<any>;
/** Embed a single text and resolve to its vector. */
export declare function embed(text: string, model?: string): Promise<Float32Array>;
/**
 * Embed several texts with one call.
 * Resolves to one vector per input text.
 */
export declare function embedBatch(texts: string[], model?: string): Promise<Array<Float32Array>>;
/** Encode a Float32Array as a base64 string so it can be stored. */
export declare function vectorToBase64(vector: Float32Array): string;
/** Decode a base64 string back into a Float32Array. */
export declare function base64ToVector(base64: string): Float32Array;
|
package/dist/embedder.js
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import { pipeline } from "@xenova/transformers";
|
|
2
|
+
let extractor;
|
|
3
|
+
let currentModel = "";
|
|
4
|
+
/**
 * Return the feature-extraction pipeline for `model`, building it when needed.
 *
 * The most recently built pipeline is kept in module state and reused for as
 * long as the requested model name stays the same. Asking for a different
 * model builds a new pipeline, which then replaces the cached one.
 *
 * @param model Model identifier handed to `pipeline`
 *              (default: "Xenova/all-MiniLM-L6-v2").
 * @returns The cached pipeline.
 */
export async function getEmbedder(model = "Xenova/all-MiniLM-L6-v2") {
    const reusable = Boolean(extractor) && currentModel === model;
    if (reusable) {
        return extractor;
    }
    extractor = await pipeline("feature-extraction", model);
    currentModel = model;
    return extractor;
}
|
|
11
|
+
/**
 * Embed one text.
 *
 * The embedder is invoked with mean pooling and normalization switched on,
 * and the `data` field of its output is returned.
 *
 * @param text  Text to embed.
 * @param model Optional model identifier forwarded to `getEmbedder`.
 * @returns The `data` of the embedder output.
 */
export async function embed(text, model) {
    const options = { pooling: "mean", normalize: true };
    const run = await getEmbedder(model);
    const { data } = await run(text, options);
    return data;
}
|
|
19
|
+
/**
 * Embed a list of texts and return their vectors in input order.
 *
 * Texts are handled in consecutive chunks of 32. Every text in a chunk is
 * passed to the embedder in its own call (mean pooling, normalized), and the
 * chunk's calls are awaited together before the next chunk starts.
 *
 * @param texts Texts to embed.
 * @param model Optional model identifier forwarded to `getEmbedder`.
 * @returns One `data` value per input text.
 */
export async function embedBatch(texts, model) {
    const CHUNK = 32;
    const run = await getEmbedder(model);
    const vectors = [];
    let start = 0;
    while (start < texts.length) {
        const pending = texts
            .slice(start, start + CHUNK)
            .map((text) => run(text, { pooling: "mean", normalize: true }));
        for (const output of await Promise.all(pending)) {
            vectors.push(output.data);
        }
        start += CHUNK;
    }
    return vectors;
}
|
|
35
|
+
/**
 * Serialize a Float32Array to a base64 string for storage.
 *
 * Only the bytes covered by the view are encoded. A typed array may be a
 * window onto a larger ArrayBuffer (for example the result of `subarray`), so
 * the view's own `byteOffset` and `byteLength` are used rather than the whole
 * backing buffer.
 *
 * @param vector Vector to serialize.
 * @returns Base64 text of the vector's raw bytes.
 */
export function vectorToBase64(vector) {
    const bytes = Buffer.from(vector.buffer, vector.byteOffset, vector.byteLength);
    return bytes.toString("base64");
}
|
|
40
|
+
/**
 * Deserialize a base64 string back to a Float32Array.
 *
 * The decoded bytes are copied into a freshly allocated Float32Array. This
 * keeps the result independent of the decoded Buffer's backing memory and
 * means the view always starts at offset 0, which satisfies the 4-byte
 * alignment a Float32Array requires.
 *
 * Trailing bytes that do not form a complete 4-byte float are ignored.
 *
 * @param base64 Base64 text holding raw float32 bytes.
 * @returns A Float32Array that owns its buffer.
 */
export function base64ToVector(base64) {
    const bytes = Buffer.from(base64, "base64");
    const count = Math.floor(bytes.length / 4);
    const vector = new Float32Array(count);
    new Uint8Array(vector.buffer).set(bytes.subarray(0, count * 4));
    return vector;
}
|
package/dist/engine.d.ts
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import { SearchItem, SearchResult, SearchOptions, SimileConfig, SimileSnapshot, HybridWeights } from "./types";
|
|
2
|
+
/**
 * Search engine over a list of items.
 * `T` is the type of each item's optional metadata.
 */
export declare class Simile<T = any> {
    private items;
    private vectors;
    private itemIndex;
    private config;
    private constructor();
    /**
     * Build an engine from items; every item is embedded first,
     * so the result is asynchronous.
     */
    static from<T>(items: Array<SearchItem<T>>, config?: SimileConfig): Promise<Simile<T>>;
    /**
     * Rebuild an engine from a snapshot produced by `save()`.
     * Synchronous: nothing is embedded.
     */
    static load<T>(snapshot: SimileSnapshot<T>, config?: SimileConfig): Simile<T>;
    /**
     * Rebuild an engine from a JSON string (for example one read
     * from a file or from localStorage).
     */
    static loadFromJSON<T>(json: string, config?: SimileConfig): Simile<T>;
    /**
     * Capture the current state as a snapshot object that can be
     * stored in a file or database and loaded again later.
     */
    save(): SimileSnapshot<T>;
    /**
     * Serialize the current state to a JSON string for file storage.
     */
    toJSON(): string;
    /**
     * Add items to the index.
     */
    add(items: Array<SearchItem<T>>): Promise<void>;
    /**
     * Remove the items with the given IDs.
     */
    remove(ids: string[]): void;
    /**
     * Look up one item by ID; `undefined` when it is not indexed.
     */
    get(id: string): SearchItem<T> | undefined;
    /**
     * Return every indexed item.
     */
    getAll(): Array<SearchItem<T>>;
    /**
     * Number of items currently in the index.
     */
    get size(): number;
    /**
     * Change the scoring weights.
     */
    setWeights(weights: HybridWeights): void;
    /**
     * Search the index for items similar to `query`.
     */
    search(query: string, options?: SearchOptions): Promise<Array<SearchResult<T>>>;
}
|
package/dist/engine.js
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
import { embed, embedBatch, vectorToBase64, base64ToVector } from "./embedder";
|
|
2
|
+
import { cosine, fuzzyScore, keywordScore } from "./similarity";
|
|
3
|
+
import { hybridScore, getDefaultWeights } from "./ranker";
|
|
4
|
+
const PACKAGE_VERSION = "0.2.0";
|
|
5
|
+
export class Simile {
|
|
6
|
+
constructor(items, vectors, config = {}) {
|
|
7
|
+
this.items = items;
|
|
8
|
+
this.vectors = vectors;
|
|
9
|
+
this.itemIndex = new Map(items.map((item, i) => [item.id, i]));
|
|
10
|
+
this.config = {
|
|
11
|
+
weights: config.weights ?? getDefaultWeights(),
|
|
12
|
+
model: config.model ?? "Xenova/all-MiniLM-L6-v2",
|
|
13
|
+
};
|
|
14
|
+
}
|
|
15
|
+
/**
|
|
16
|
+
* Create a new Simile instance from items.
|
|
17
|
+
* This will embed all items (slow for first run, but cached after).
|
|
18
|
+
*/
|
|
19
|
+
static async from(items, config = {}) {
|
|
20
|
+
const model = config.model ?? "Xenova/all-MiniLM-L6-v2";
|
|
21
|
+
const texts = items.map((item) => item.text);
|
|
22
|
+
const vectors = await embedBatch(texts, model);
|
|
23
|
+
return new Simile(items, vectors, config);
|
|
24
|
+
}
|
|
25
|
+
/**
|
|
26
|
+
* Load a Simile instance from a previously saved snapshot.
|
|
27
|
+
* This is INSTANT - no embedding needed!
|
|
28
|
+
*/
|
|
29
|
+
static load(snapshot, config = {}) {
|
|
30
|
+
const vectors = snapshot.vectors.map(base64ToVector);
|
|
31
|
+
return new Simile(snapshot.items, vectors, { ...config, model: snapshot.model });
|
|
32
|
+
}
|
|
33
|
+
/**
|
|
34
|
+
* Load from JSON string (e.g., from file or localStorage)
|
|
35
|
+
*/
|
|
36
|
+
static loadFromJSON(json, config = {}) {
|
|
37
|
+
const snapshot = JSON.parse(json);
|
|
38
|
+
return Simile.load(snapshot, config);
|
|
39
|
+
}
|
|
40
|
+
/**
|
|
41
|
+
* Save the current state to a snapshot object.
|
|
42
|
+
* Store this in a file or database for instant loading later.
|
|
43
|
+
*/
|
|
44
|
+
save() {
|
|
45
|
+
return {
|
|
46
|
+
version: PACKAGE_VERSION,
|
|
47
|
+
model: this.config.model,
|
|
48
|
+
items: this.items,
|
|
49
|
+
vectors: this.vectors.map(vectorToBase64),
|
|
50
|
+
createdAt: new Date().toISOString(),
|
|
51
|
+
};
|
|
52
|
+
}
|
|
53
|
+
/**
|
|
54
|
+
* Export as JSON string for file storage
|
|
55
|
+
*/
|
|
56
|
+
toJSON() {
|
|
57
|
+
return JSON.stringify(this.save());
|
|
58
|
+
}
|
|
59
|
+
/**
|
|
60
|
+
* Add new items to the index
|
|
61
|
+
*/
|
|
62
|
+
async add(items) {
|
|
63
|
+
const texts = items.map((item) => item.text);
|
|
64
|
+
const newVectors = await embedBatch(texts, this.config.model);
|
|
65
|
+
for (let i = 0; i < items.length; i++) {
|
|
66
|
+
const item = items[i];
|
|
67
|
+
const existingIdx = this.itemIndex.get(item.id);
|
|
68
|
+
if (existingIdx !== undefined) {
|
|
69
|
+
// Update existing item
|
|
70
|
+
this.items[existingIdx] = item;
|
|
71
|
+
this.vectors[existingIdx] = newVectors[i];
|
|
72
|
+
}
|
|
73
|
+
else {
|
|
74
|
+
// Add new item
|
|
75
|
+
const newIdx = this.items.length;
|
|
76
|
+
this.items.push(item);
|
|
77
|
+
this.vectors.push(newVectors[i]);
|
|
78
|
+
this.itemIndex.set(item.id, newIdx);
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
/**
|
|
83
|
+
* Remove items by ID
|
|
84
|
+
*/
|
|
85
|
+
remove(ids) {
|
|
86
|
+
const idsToRemove = new Set(ids);
|
|
87
|
+
const newItems = [];
|
|
88
|
+
const newVectors = [];
|
|
89
|
+
for (let i = 0; i < this.items.length; i++) {
|
|
90
|
+
if (!idsToRemove.has(this.items[i].id)) {
|
|
91
|
+
newItems.push(this.items[i]);
|
|
92
|
+
newVectors.push(this.vectors[i]);
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
this.items = newItems;
|
|
96
|
+
this.vectors = newVectors;
|
|
97
|
+
this.itemIndex = new Map(this.items.map((item, i) => [item.id, i]));
|
|
98
|
+
}
|
|
99
|
+
/**
|
|
100
|
+
* Get item by ID
|
|
101
|
+
*/
|
|
102
|
+
get(id) {
|
|
103
|
+
const idx = this.itemIndex.get(id);
|
|
104
|
+
return idx !== undefined ? this.items[idx] : undefined;
|
|
105
|
+
}
|
|
106
|
+
/**
|
|
107
|
+
* Get all items
|
|
108
|
+
*/
|
|
109
|
+
getAll() {
|
|
110
|
+
return [...this.items];
|
|
111
|
+
}
|
|
112
|
+
/**
|
|
113
|
+
* Get the number of items in the index
|
|
114
|
+
*/
|
|
115
|
+
get size() {
|
|
116
|
+
return this.items.length;
|
|
117
|
+
}
|
|
118
|
+
/**
|
|
119
|
+
* Set custom scoring weights
|
|
120
|
+
*/
|
|
121
|
+
setWeights(weights) {
|
|
122
|
+
this.config.weights = { ...this.config.weights, ...weights };
|
|
123
|
+
}
|
|
124
|
+
/**
|
|
125
|
+
* Search for similar items
|
|
126
|
+
*/
|
|
127
|
+
async search(query, options = {}) {
|
|
128
|
+
const { topK = 5, explain = false, filter, threshold = 0, } = options;
|
|
129
|
+
const qVector = await embed(query, this.config.model);
|
|
130
|
+
const results = [];
|
|
131
|
+
for (let i = 0; i < this.items.length; i++) {
|
|
132
|
+
const item = this.items[i];
|
|
133
|
+
if (filter && !filter(item.metadata))
|
|
134
|
+
continue;
|
|
135
|
+
const semantic = cosine(qVector, this.vectors[i]);
|
|
136
|
+
const fuzzy = fuzzyScore(query, item.text);
|
|
137
|
+
const keyword = keywordScore(query, item.text);
|
|
138
|
+
const score = hybridScore(semantic, fuzzy, keyword, this.config.weights);
|
|
139
|
+
// Apply threshold filter
|
|
140
|
+
if (score < threshold)
|
|
141
|
+
continue;
|
|
142
|
+
results.push({
|
|
143
|
+
id: item.id,
|
|
144
|
+
text: item.text,
|
|
145
|
+
metadata: item.metadata,
|
|
146
|
+
score,
|
|
147
|
+
explain: explain ? { semantic, fuzzy, keyword } : undefined,
|
|
148
|
+
});
|
|
149
|
+
}
|
|
150
|
+
return results.sort((a, b) => b.score - a.score).slice(0, topK);
|
|
151
|
+
}
|
|
152
|
+
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
import { describe, it, expect } from "vitest";
|
|
2
|
+
import { Simile } from "./engine";
|
|
3
|
+
import * as fs from "fs";
|
|
4
|
+
import * as path from "path";
|
|
5
|
+
// Shared fixture: five catalog items, each given as [id, text, category].
const testItems = [
    ["1", "Bathroom floor cleaner", "Cleaning"],
    ["2", "Dishwashing liquid", "Kitchen"],
    ["3", "Ipod Charger", "Electronics"],
    ["4", "Kitchen cleaning spray", "Cleaning"],
    ["5", "USB-C phone charger cable", "Electronics"],
].map(([id, text, category]) => ({ id, text, metadata: { category } }));
|
|
32
|
+
// Ranking behaviour: best match first, unrelated items well separated, threshold honoured.
describe("simile search", () => {
    it("returns semantically similar items", async () => {
        const firstThree = testItems.slice(0, 3);
        const engine = await Simile.from(firstThree);
        const results = await engine.search("cleaner", { explain: true });
        console.log("Search for 'cleaner':", results);
        expect(results.length).toBeGreaterThan(0);
        const best = results[0];
        expect(best.id).toBe("1");
        expect(best.score).toBeGreaterThan(0.5);
    }, 30000);
    it("differentiates between unrelated items", async () => {
        const engine = await Simile.from(testItems);
        // A "phone charger" query should clearly favour the electronics items.
        const results = await engine.search("phone charger", { explain: true });
        console.log("Search for 'phone charger':", results);
        // The two chargers must fill the top two slots; their order may vary with the model.
        const [first, second] = results;
        const topTwoIds = [first.id, second.id];
        expect(topTwoIds).toContain("5"); // USB-C phone charger
        expect(topTwoIds).toContain("3"); // iPod Charger
        // Results are sorted, so the first hit per category is that category's best.
        const bestCharger = results.find((r) => r.metadata?.category === "Electronics");
        const bestCleaning = results.find((r) => r.metadata?.category === "Cleaning");
        // The best electronics hit must beat the best cleaning hit by more than 0.4.
        expect(bestCharger.score).toBeGreaterThan(bestCleaning.score + 0.4);
    }, 30000);
    it("applies threshold filtering", async () => {
        const engine = await Simile.from(testItems);
        // Nothing scoring below the threshold may come back.
        const results = await engine.search("cleaner", { threshold: 0.5 });
        console.log("Search with threshold 0.5:", results);
        for (const r of results) {
            expect(r.score).toBeGreaterThanOrEqual(0.5);
        }
    }, 30000);
});
|
|
66
|
+
// Persistence: snapshot objects and JSON files must round-trip into a working engine.
describe("simile persistence", () => {
    const snapshotPath = path.join(__dirname, "../.test-snapshot.json");
    it("saves and loads from snapshot", async () => {
        // Create engine and save
        const engine = await Simile.from(testItems);
        const snapshot = engine.save();
        expect(snapshot.version).toBe("0.2.0");
        expect(snapshot.items.length).toBe(5);
        expect(snapshot.vectors.length).toBe(5);
        expect(snapshot.model).toBe("Xenova/all-MiniLM-L6-v2");
        // Load from snapshot (no embedding of the items)
        const loadedEngine = Simile.load(snapshot);
        expect(loadedEngine.size).toBe(5);
        // Search should work the same
        const results = await loadedEngine.search("cleaner");
        expect(results[0].text).toContain("cleaner");
    }, 30000);
    it("saves and loads from JSON file", async () => {
        const engine = await Simile.from(testItems);
        try {
            // Create and save to file
            fs.writeFileSync(snapshotPath, engine.toJSON());
            // Load from file
            const loadedJson = fs.readFileSync(snapshotPath, "utf-8");
            const loadedEngine = Simile.loadFromJSON(loadedJson);
            expect(loadedEngine.size).toBe(5);
        }
        finally {
            // Cleanup runs even when a step above throws, so a failing run
            // does not leave the snapshot file behind.
            if (fs.existsSync(snapshotPath)) {
                fs.unlinkSync(snapshotPath);
            }
        }
    }, 30000);
});
|
|
96
|
+
// Catalog mutation: add, remove and update-in-place.
describe("simile dynamic items", () => {
    it("adds new items", async () => {
        const engine = await Simile.from(testItems.slice(0, 2));
        expect(engine.size).toBe(2);
        const extras = [testItems[2], testItems[3]];
        await engine.add(extras);
        expect(engine.size).toBe(4);
        const results = await engine.search("charger");
        const foundCharger = results.some((r) => r.id === "3");
        expect(foundCharger).toBe(true);
    }, 30000);
    it("removes items", async () => {
        const engine = await Simile.from(testItems);
        expect(engine.size).toBe(5);
        engine.remove(["1", "2"]);
        expect(engine.size).toBe(3);
        expect(engine.get("1")).toBeUndefined();
        expect(engine.get("3")).toBeDefined();
    }, 30000);
    it("updates existing items", async () => {
        const engine = await Simile.from(testItems.slice(0, 2));
        // Re-adding ID "1" with new text must replace the item, not append one.
        const replacement = { id: "1", text: "Wireless Bluetooth headphones", metadata: { category: "Electronics" } };
        await engine.add([replacement]);
        expect(engine.size).toBe(2); // Still 2 items, not 3
        expect(engine.get("1")?.text).toBe("Wireless Bluetooth headphones");
    }, 30000);
});
|
|
123
|
+
// Weight configuration: both a semantic-heavy and a keyword-heavy engine rank the literal match first.
describe("simile custom weights", () => {
    it("respects custom weights", async () => {
        const summarize = (r) => ({
            text: r.text,
            score: r.score,
        });
        // Engine with high semantic weight
        const semanticEngine = await Simile.from(testItems, {
            weights: { semantic: 0.9, fuzzy: 0.05, keyword: 0.05 },
        });
        // Engine with high keyword weight
        const keywordEngine = await Simile.from(testItems, {
            weights: { semantic: 0.1, fuzzy: 0.1, keyword: 0.8 },
        });
        const query = "floor";
        const semanticResults = await semanticEngine.search(query, { explain: true });
        const keywordResults = await keywordEngine.search(query, { explain: true });
        console.log("Semantic-weighted results:", semanticResults.map((r) => summarize(r)));
        console.log("Keyword-weighted results:", keywordResults.map((r) => summarize(r)));
        // The floor cleaner contains "floor", so it must lead under both weightings.
        for (const ranked of [semanticResults, keywordResults]) {
            expect(ranked[0].text).toContain("floor");
        }
    }, 30000);
});
|
package/dist/index.d.ts
ADDED
package/dist/index.js
ADDED
package/dist/ranker.d.ts
ADDED
package/dist/ranker.js
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/** Weights used for any component the caller does not specify. */
const DEFAULT_WEIGHTS = {
    semantic: 0.7,
    fuzzy: 0.15,
    keyword: 0.15,
};
/**
 * Combine the three component scores into one weighted score.
 *
 * Weights are normalized so they sum to 1 before being applied. A weight that
 * is missing, `undefined` or `null` falls back to its default. If the weights
 * do not add up to a positive number (for example all three are 0) there is
 * nothing to normalize by, and the function returns 0 instead of NaN.
 *
 * @param semantic Semantic similarity score.
 * @param fuzzy    Fuzzy string similarity score.
 * @param keyword  Keyword match score.
 * @param weights  Optional partial weights `{ semantic, fuzzy, keyword }`.
 * @returns The weighted combination of the three scores.
 */
export function hybridScore(semantic, fuzzy, keyword, weights = {}) {
    const w = {
        semantic: weights?.semantic ?? DEFAULT_WEIGHTS.semantic,
        fuzzy: weights?.fuzzy ?? DEFAULT_WEIGHTS.fuzzy,
        keyword: weights?.keyword ?? DEFAULT_WEIGHTS.keyword,
    };
    // Normalize weights to sum to 1
    const total = w.semantic + w.fuzzy + w.keyword;
    if (!(total > 0)) {
        // Also catches a NaN total, which fails every comparison.
        return 0;
    }
    const normalizedSemantic = w.semantic / total;
    const normalizedFuzzy = w.fuzzy / total;
    const normalizedKeyword = w.keyword / total;
    return (normalizedSemantic * semantic +
        normalizedFuzzy * fuzzy +
        normalizedKeyword * keyword);
}
/**
 * Return a fresh copy of the default weights, so callers can modify the
 * result without affecting the defaults.
 */
export function getDefaultWeights() {
    return { ...DEFAULT_WEIGHTS };
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import levenshtein from "fast-levenshtein";
|
|
2
|
+
/**
 * Dot product of `a` and `b`, taken over the indices of `a`.
 *
 * No normalization is performed here, so the result is the cosine similarity
 * only when both inputs are already unit-length.
 *
 * @param a First vector; its length decides how many terms are summed.
 * @param b Second vector.
 * @returns Sum of `a[i] * b[i]`.
 */
export function cosine(a, b) {
    const terms = a.length;
    let sum = 0;
    let idx = 0;
    while (idx < terms) {
        sum += a[idx] * b[idx];
        idx += 1;
    }
    return sum;
}
|
|
8
|
+
/**
 * Case-insensitive string similarity based on Levenshtein distance.
 *
 * Strings that are equal after lowercasing score exactly 1. This also covers
 * two empty strings, where dividing by the maximum length would be 0 / 0.
 * Otherwise the score is `1 - distance / maxLen`, with both the distance and
 * `maxLen` taken from the lowercased strings so the two are consistent even
 * when lowercasing changes a string's length.
 *
 * @param a First string.
 * @param b Second string.
 * @returns Similarity, where 1 means equal ignoring case.
 */
export function fuzzyScore(a, b) {
    const left = a.toLowerCase();
    const right = b.toLowerCase();
    if (left === right) {
        return 1;
    }
    const dist = levenshtein.get(left, right);
    // left !== right here, so at least one string is non-empty and maxLen > 0.
    const maxLen = Math.max(left.length, right.length);
    return 1 - dist / maxLen;
}
|
|
13
|
+
/**
 * Fraction of query terms that occur in `text` (case-insensitive substring match).
 *
 * The query is split on runs of whitespace and empty terms are discarded.
 * An empty term is a substring of every text, so keeping it would count as a
 * hit for any item and inflate the score. A query with no terms scores 0.
 *
 * @param query Search text.
 * @param text  Item text to look in.
 * @returns Hits divided by the number of terms, or 0 when there are no terms.
 */
export function keywordScore(query, text) {
    const terms = query
        .toLowerCase()
        .split(/\s+/)
        .filter((term) => term.length > 0);
    if (terms.length === 0) {
        return 0;
    }
    const haystack = text.toLowerCase();
    const hits = terms.filter((term) => haystack.includes(term)).length;
    return hits / terms.length;
}
|
package/dist/types.d.ts
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
/** One searchable entry; `T` is the type of its optional metadata. */
export interface SearchItem<T = any> {
    id: string;
    text: string;
    metadata?: T;
}
/** One search hit: the matched item's fields plus its score. */
export interface SearchResult<T = any> {
    id: string;
    text: string;
    score: number;
    metadata?: T;
    /** Per-component scores. */
    explain?: {
        semantic: number;
        fuzzy: number;
        keyword: number;
    };
}
/** Per-query options. */
export interface SearchOptions {
    topK?: number;
    explain?: boolean;
    filter?: (metadata: any) => boolean;
    /** Lowest score (0-1) a result may have; anything below is dropped. */
    threshold?: number;
}
/** Relative weights of the three score components. */
export interface HybridWeights {
    /** Weight of semantic similarity (0-1); default 0.7. */
    semantic?: number;
    /** Weight of fuzzy string similarity (0-1); default 0.15. */
    fuzzy?: number;
    /** Weight of keyword matching (0-1); default 0.15. */
    keyword?: number;
}
/** Engine-level configuration. */
export interface SimileConfig {
    /** Custom weights for hybrid scoring. */
    weights?: HybridWeights;
    /** Embedding model name (default: "Xenova/all-MiniLM-L6-v2"). */
    model?: string;
}
/** Serialized engine state used for persistence. */
export interface SimileSnapshot<T = any> {
    version: string;
    model: string;
    items: Array<SearchItem<T>>;
    /** Float32Array vectors, each encoded as base64. */
    vectors: string[];
    createdAt: string;
}
|
package/dist/types.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
package/package.json
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
{
  "name": "simile-search",
  "version": "0.2.0",
  "description": "Offline-first semantic + fuzzy search engine for catalogs, names, and products",
  "main": "dist/index.js",
  "types": "dist/index.d.ts",
  "scripts": {
    "build": "tsc",
    "test": "vitest",
    "prepublishOnly": "npm run build"
  },
  "keywords": [
    "semantic-search",
    "vector-search",
    "fuzzy-search",
    "offline-search",
    "ai-search"
  ],
  "files": [
    "dist",
    "README.md",
    "LICENSE"
  ],
  "author": "Aavash Baral",
  "license": "MIT",
  "dependencies": {
    "@xenova/transformers": "^2.17.2",
    "fast-levenshtein": "^3.0.0"
  },
  "devDependencies": {
    "@types/node": "^25.0.3",
    "@types/fast-levenshtein": "^0.0.4",
    "ts-node": "^10.9.2",
    "typescript": "^5.0.0",
    "vitest": "^4.0.16"
  }
}
|