simile-search 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Aavash Baral
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,268 @@
1
+ <div align="center">
2
+ <img src="assets/logo.jpeg" alt="Simile Logo" width="200">
3
+ </div>
4
+
5
+ # Simile 🔍
6
+
7
+ ![npm](https://img.shields.io/npm/v/simile-search)
8
+ ![npm](https://img.shields.io/npm/dm/simile-search)
9
+ ![license](https://img.shields.io/npm/l/simile-search)
10
+
11
+ **Offline-first semantic + fuzzy search engine for catalogs, names, and products.**
12
+
13
+ Simile combines the power of AI embeddings with fuzzy string matching and keyword search to deliver highly relevant search results—all running locally, no API calls required.
14
+
15
+ ## ✨ Features
16
+
17
+ - 🧠 **Semantic Search** - Understands meaning, not just keywords ("phone charger" finds "USB-C cable")
18
+ - 🔤 **Fuzzy Matching** - Handles typos and partial matches gracefully
19
+ - 🎯 **Keyword Boost** - Exact matches get priority
20
+ - 💾 **Persistence** - Save/load embeddings to avoid re-computing
21
+ - ⚡ **Batch Processing** - Optimized for large catalogs
22
+ - 🔧 **Configurable** - Tune scoring weights for your use case
23
+ - 📦 **Zero API Calls** - Everything runs locally with Transformers.js
24
+
25
+ ## 📦 Installation
26
+
27
+ ```bash
28
+ npm install simile-search
29
+ ```
30
+
31
+ ## 🚀 Quick Start
32
+
33
+ ```typescript
34
+ import { Simile } from 'simile-search';
35
+
36
+ // Create a search engine with your items
37
+ const engine = await Simile.from([
38
+ { id: '1', text: 'Bathroom floor cleaner', metadata: { category: 'Cleaning' } },
39
+ { id: '2', text: 'Dishwashing liquid', metadata: { category: 'Kitchen' } },
40
+ { id: '3', text: 'iPhone Charger', metadata: { category: 'Electronics' } },
41
+ { id: '4', text: 'USB-C phone charger cable', metadata: { category: 'Electronics' } },
42
+ ]);
43
+
44
+ // Search!
45
+ const results = await engine.search('phone charger');
46
+ console.log(results);
47
+ // [
48
+ // { id: '3', text: 'iPhone Charger', score: 0.72, ... },
49
+ // { id: '4', text: 'USB-C phone charger cable', score: 0.68, ... },
50
+ // ...
51
+ // ]
52
+ ```
53
+
54
+ ## 💾 Persistence (Save & Load)
55
+
56
+ The first embedding run can be slow. Save your embeddings to load instantly next time:
57
+
58
+ ```typescript
59
+ import { Simile } from 'simile-search';
60
+ import * as fs from 'fs';
61
+
62
+ // First run: embed and save (slow, but only once!)
63
+ const engine = await Simile.from(items);
64
+ fs.writeFileSync('catalog.json', engine.toJSON());
65
+
66
+ // Later: instant load from file (no re-embedding!)
67
+ const json = fs.readFileSync('catalog.json', 'utf-8');
68
+ const loadedEngine = Simile.loadFromJSON(json);
69
+
70
+ // Works exactly the same
71
+ const results = await loadedEngine.search('cleaner');
72
+ ```
73
+
74
+ ### Snapshot Format
75
+
76
+ ```typescript
77
+ // For database storage
78
+ const snapshot = engine.save();
79
+ // {
80
+ // version: '0.2.0',
81
+ // model: 'Xenova/all-MiniLM-L6-v2',
82
+ // items: [...],
83
+ // vectors: ['base64...', 'base64...'],
84
+ // createdAt: '2024-12-28T...'
85
+ // }
86
+
87
+ // Load from snapshot object
88
+ const restored = Simile.load(snapshot);
89
+ ```
90
+
91
+ ## 🔧 Configuration
92
+
93
+ ### Custom Scoring Weights
94
+
95
+ Tune how much each scoring method contributes:
96
+
97
+ ```typescript
98
+ const engine = await Simile.from(items, {
99
+ weights: {
100
+ semantic: 0.7, // AI embedding similarity (default: 0.7)
101
+ fuzzy: 0.15, // Levenshtein distance (default: 0.15)
102
+ keyword: 0.15, // Exact keyword matches (default: 0.15)
103
+ }
104
+ });
105
+
106
+ // Or adjust later
107
+ engine.setWeights({ semantic: 0.9, fuzzy: 0.05, keyword: 0.05 });
108
+ ```
109
+
110
+ ### Search Options
111
+
112
+ ```typescript
113
+ const results = await engine.search('cleaner', {
114
+ topK: 10, // Max results to return (default: 5)
115
+ threshold: 0.5, // Minimum score (default: 0)
116
+ explain: true, // Include score breakdown
117
+ filter: (meta) => meta.category === 'Cleaning', // Filter by metadata
118
+ });
119
+
120
+ // With explain: true
121
+ // {
122
+ // id: '1',
123
+ // text: 'Bathroom floor cleaner',
124
+ // score: 0.63,
125
+ // explain: { semantic: 0.62, fuzzy: 0.32, keyword: 1.0 }
126
+ // }
127
+ ```
128
+
129
+ ## 📝 Dynamic Catalog Management
130
+
131
+ Add, update, or remove items without rebuilding:
132
+
133
+ ```typescript
134
+ // Add new items
135
+ await engine.add([
136
+ { id: '5', text: 'Wireless headphones', metadata: { category: 'Electronics' } }
137
+ ]);
138
+
139
+ // Update existing item (same ID)
140
+ await engine.add([
141
+ { id: '1', text: 'Premium bathroom cleaner', metadata: { category: 'Cleaning' } }
142
+ ]);
143
+
144
+ // Remove items
145
+ engine.remove(['2', '3']);
146
+
147
+ // Get item by ID
148
+ const item = engine.get('1');
149
+
150
+ // Get all items
151
+ const allItems = engine.getAll();
152
+
153
+ // Get count
154
+ console.log(engine.size); // 3
155
+ ```
156
+
157
+ ## 🎯 Advanced: Direct Access to Utilities
158
+
159
+ For custom implementations:
160
+
161
+ ```typescript
162
+ import {
163
+ embed,
164
+ embedBatch,
165
+ cosine,
166
+ fuzzyScore,
167
+ keywordScore,
168
+ hybridScore,
169
+ vectorToBase64,
170
+ base64ToVector
171
+ } from 'simile-search';
172
+
173
+ // Embed text directly
174
+ const vector = await embed('hello world');
175
+
176
+ // Batch embed for performance
177
+ const vectors = await embedBatch(['text1', 'text2', 'text3']);
178
+
179
+ // Calculate similarities
180
+ const similarity = cosine(vectorA, vectorB);
181
+ const fuzzy = fuzzyScore('cleaner', 'cleenr');
182
+ const keyword = keywordScore('phone charger', 'USB phone charger cable');
183
+
184
+ // Combine scores
185
+ const score = hybridScore(0.8, 0.6, 0.5, { semantic: 0.7, fuzzy: 0.15, keyword: 0.15 });
186
+ ```
187
+
188
+ ## 📊 API Reference
189
+
190
+ ### `Simile.from(items, config?)`
191
+ Create a new engine from items. Embeds all items (async).
192
+
193
+ ### `Simile.load(snapshot, config?)`
194
+ Load from a saved snapshot (instant, no embedding).
195
+
196
+ ### `Simile.loadFromJSON(json, config?)`
197
+ Load from JSON string.
198
+
199
+ ### `engine.search(query, options?)`
200
+ Search for similar items.
201
+
202
+ ### `engine.save()`
203
+ Export snapshot object for persistence.
204
+
205
+ ### `engine.toJSON()`
206
+ Export as JSON string.
207
+
208
+ ### `engine.add(items)`
209
+ Add or update items (async).
210
+
211
+ ### `engine.remove(ids)`
212
+ Remove items by ID.
213
+
214
+ ### `engine.get(id)`
215
+ Get single item by ID.
216
+
217
+ ### `engine.getAll()`
218
+ Get all items.
219
+
220
+ ### `engine.size`
221
+ Number of items.
222
+
223
+ ### `engine.setWeights(weights)`
224
+ Update scoring weights.
225
+
226
+ ## 🧪 Types
227
+
228
+ ```typescript
229
+ interface SearchItem<T = any> {
230
+ id: string;
231
+ text: string;
232
+ metadata?: T;
233
+ }
234
+
235
+ interface SearchResult<T = any> {
236
+ id: string;
237
+ text: string;
238
+ score: number;
239
+ metadata?: T;
240
+ explain?: { semantic: number; fuzzy: number; keyword: number };
241
+ }
242
+
243
+ interface SearchOptions {
244
+ topK?: number;
245
+ explain?: boolean;
246
+ threshold?: number;
247
+ filter?: (metadata: any) => boolean;
248
+ }
249
+
250
+ interface SimileConfig {
251
+ weights?: { semantic?: number; fuzzy?: number; keyword?: number };
252
+ model?: string;
253
+ }
254
+ ```
255
+
256
+ ## 🤖 Model
257
+
258
+ Simile uses [Xenova/all-MiniLM-L6-v2](https://huggingface.co/Xenova/all-MiniLM-L6-v2) via Transformers.js by default. This model runs entirely in JavaScript—no Python or external APIs required.
259
+
260
+ ## 📄 License
261
+
262
+ MIT © [Aavash Baral](https://github.com/iaavas)
263
+
264
+ ---
265
+
266
+ <p align="center">
267
+ Made with ❤️ by <a href="https://github.com/iaavas">Aavash Baral</a>
268
+ </p>
@@ -0,0 +1,11 @@
1
/** Lazily creates (and caches) the Transformers.js feature-extraction pipeline for `model`. */
export declare function getEmbedder(model?: string): Promise<any>;
/** Embed a single text into a mean-pooled, normalized vector. */
export declare function embed(text: string, model?: string): Promise<Float32Array>;
/**
 * Batch embed multiple texts at once for better performance.
 * This is significantly faster than embedding one by one.
 */
export declare function embedBatch(texts: string[], model?: string): Promise<Float32Array[]>;
/** Serialize Float32Array to base64 string for storage */
export declare function vectorToBase64(vector: Float32Array): string;
/** Deserialize base64 string back to Float32Array */
export declare function base64ToVector(base64: string): Float32Array;
@@ -0,0 +1,44 @@
1
+ import { pipeline } from "@xenova/transformers";
2
// Cached pipeline *promise* plus the model it belongs to. Caching the promise
// (rather than the resolved extractor, as before) means concurrent first calls
// all await the same model load instead of racing and loading it twice.
let extractorPromise = null;
let currentModel = "";
/**
 * Lazily create (and cache) the Transformers.js feature-extraction pipeline.
 * Switching `model` replaces the cached pipeline with one for the new model.
 */
export async function getEmbedder(model = "Xenova/all-MiniLM-L6-v2") {
    if (!extractorPromise || currentModel !== model) {
        currentModel = model;
        extractorPromise = pipeline("feature-extraction", model).catch((err) => {
            // Don't cache a failed load: clear the slot so the next call retries.
            extractorPromise = null;
            throw err;
        });
    }
    return extractorPromise;
}
11
/**
 * Embed one piece of text into a mean-pooled, normalized Float32Array,
 * delegating to the shared (cached) feature-extraction pipeline.
 */
export async function embed(text, model) {
    const pipe = await getEmbedder(model);
    const options = { pooling: "mean", normalize: true };
    const result = await pipe(text, options);
    return result.data;
}
19
/**
 * Batch embed multiple texts at once for better performance.
 * This is significantly faster than embedding one by one.
 * Texts are processed in fixed-size chunks to bound peak memory use;
 * within each chunk the embedder calls run concurrently.
 */
export async function embedBatch(texts, model) {
    const pipe = await getEmbedder(model);
    const CHUNK_SIZE = 32; // chunk size chosen for memory efficiency
    const vectors = [];
    for (let start = 0; start < texts.length; start += CHUNK_SIZE) {
        const chunk = texts.slice(start, start + CHUNK_SIZE);
        const embedded = await Promise.all(
            chunk.map((t) => pipe(t, { pooling: "mean", normalize: true }))
        );
        for (const output of embedded) {
            vectors.push(output.data);
        }
    }
    return vectors;
}
35
/**
 * Serialize Float32Array to base64 string for storage.
 * Respects the view's offset/length: `vector` may be a subarray view into a
 * larger ArrayBuffer (typed-array model outputs often are), and serializing
 * the whole underlying buffer — as `Buffer.from(vector.buffer)` alone would —
 * silently corrupts the stored vector.
 */
export function vectorToBase64(vector) {
    const buffer = Buffer.from(vector.buffer, vector.byteOffset, vector.byteLength);
    return buffer.toString("base64");
}
40
/**
 * Deserialize base64 string back to Float32Array.
 * Throws a descriptive Error when the payload is not a whole number of
 * float32 values (previously this surfaced as a cryptic RangeError).
 */
export function base64ToVector(base64) {
    const bytes = Buffer.from(base64, "base64");
    if (bytes.length % 4 !== 0) {
        throw new Error(`Invalid vector encoding: ${bytes.length} bytes is not a multiple of 4`);
    }
    // Copy into a fresh buffer: Buffer.from() may return a view into Node's
    // shared allocation pool, whose offset is not guaranteed to be 4-byte
    // aligned and whose large backing buffer would otherwise stay retained.
    const copy = new Uint8Array(bytes);
    return new Float32Array(copy.buffer);
}
@@ -0,0 +1,59 @@
1
import { SearchItem, SearchResult, SearchOptions, SimileConfig, SimileSnapshot, HybridWeights } from "./types";
/** Hybrid (semantic + fuzzy + keyword) search index over a catalog of items. */
export declare class Simile<T = any> {
    private items;
    private vectors;
    private itemIndex;
    private config;
    private constructor();
    /**
     * Create a new Simile instance from items.
     * This will embed all items (slow for first run, but cached after).
     */
    static from<T>(items: SearchItem<T>[], config?: SimileConfig): Promise<Simile<T>>;
    /**
     * Load a Simile instance from a previously saved snapshot.
     * This is INSTANT - no embedding needed!
     */
    static load<T>(snapshot: SimileSnapshot<T>, config?: SimileConfig): Simile<T>;
    /**
     * Load from JSON string (e.g., from file or localStorage)
     */
    static loadFromJSON<T>(json: string, config?: SimileConfig): Simile<T>;
    /**
     * Save the current state to a snapshot object.
     * Store this in a file or database for instant loading later.
     */
    save(): SimileSnapshot<T>;
    /**
     * Export as JSON string for file storage
     */
    toJSON(): string;
    /**
     * Add new items to the index.
     * Items whose id already exists are updated in place.
     */
    add(items: SearchItem<T>[]): Promise<void>;
    /**
     * Remove items by ID
     */
    remove(ids: string[]): void;
    /**
     * Get item by ID
     */
    get(id: string): SearchItem<T> | undefined;
    /**
     * Get all items
     */
    getAll(): SearchItem<T>[];
    /**
     * Get the number of items in the index
     */
    get size(): number;
    /**
     * Set custom scoring weights (merged over the current ones; unspecified
     * components keep their previous values).
     */
    setWeights(weights: HybridWeights): void;
    /**
     * Search for similar items
     */
    search(query: string, options?: SearchOptions): Promise<SearchResult<T>[]>;
}
package/dist/engine.js ADDED
@@ -0,0 +1,152 @@
1
+ import { embed, embedBatch, vectorToBase64, base64ToVector } from "./embedder";
2
+ import { cosine, fuzzyScore, keywordScore } from "./similarity";
3
+ import { hybridScore, getDefaultWeights } from "./ranker";
4
// Snapshot format version stamped into every save(); matches package.json.
const PACKAGE_VERSION = "0.2.0";
/**
 * Hybrid search index combining semantic embeddings, fuzzy (edit-distance)
 * matching and keyword hits over {id, text, metadata} items.
 * `items` and `vectors` are parallel arrays aligned by index; `itemIndex`
 * maps item id -> array index for O(1) lookups.
 */
export class Simile {
    // NOTE(review): `items` is stored by reference (not copied); a caller
    // mutating the array it passed in would corrupt the index — presumably
    // callers hand ownership over; confirm.
    constructor(items, vectors, config = {}) {
        this.items = items;
        this.vectors = vectors;
        // id -> index map. Duplicate ids would keep only the last index here
        // while both entries stay in `items`; ids are assumed unique.
        this.itemIndex = new Map(items.map((item, i) => [item.id, i]));
        this.config = {
            weights: config.weights ?? getDefaultWeights(),
            model: config.model ?? "Xenova/all-MiniLM-L6-v2",
        };
    }
    /**
     * Create a new Simile instance from items.
     * This will embed all items (slow for first run, but cached after).
     */
    static async from(items, config = {}) {
        const model = config.model ?? "Xenova/all-MiniLM-L6-v2";
        const texts = items.map((item) => item.text);
        const vectors = await embedBatch(texts, model);
        return new Simile(items, vectors, config);
    }
    /**
     * Load a Simile instance from a previously saved snapshot.
     * This is INSTANT - no embedding needed!
     * The snapshot's model overrides any config.model, so queries are embedded
     * with the same model that produced the stored vectors.
     */
    static load(snapshot, config = {}) {
        const vectors = snapshot.vectors.map(base64ToVector);
        return new Simile(snapshot.items, vectors, { ...config, model: snapshot.model });
    }
    /**
     * Load from JSON string (e.g., from file or localStorage)
     * NOTE(review): the parsed snapshot is not schema-validated; malformed
     * input surfaces later as runtime errors — consider validating here.
     */
    static loadFromJSON(json, config = {}) {
        const snapshot = JSON.parse(json);
        return Simile.load(snapshot, config);
    }
    /**
     * Save the current state to a snapshot object.
     * Store this in a file or database for instant loading later.
     */
    save() {
        return {
            version: PACKAGE_VERSION,
            model: this.config.model,
            items: this.items,
            vectors: this.vectors.map(vectorToBase64),
            createdAt: new Date().toISOString(),
        };
    }
    /**
     * Export as JSON string for file storage
     */
    toJSON() {
        return JSON.stringify(this.save());
    }
    /**
     * Add new items to the index.
     * All texts are embedded in one batch; an item whose id already exists
     * replaces the old item (and its vector) in place, new ids are appended.
     */
    async add(items) {
        const texts = items.map((item) => item.text);
        const newVectors = await embedBatch(texts, this.config.model);
        for (let i = 0; i < items.length; i++) {
            const item = items[i];
            const existingIdx = this.itemIndex.get(item.id);
            if (existingIdx !== undefined) {
                // Update existing item
                this.items[existingIdx] = item;
                this.vectors[existingIdx] = newVectors[i];
            }
            else {
                // Add new item
                const newIdx = this.items.length;
                this.items.push(item);
                this.vectors.push(newVectors[i]);
                this.itemIndex.set(item.id, newIdx);
            }
        }
    }
    /**
     * Remove items by ID.
     * Rebuilds the parallel arrays and the id index in one O(n) pass.
     */
    remove(ids) {
        const idsToRemove = new Set(ids);
        const newItems = [];
        const newVectors = [];
        for (let i = 0; i < this.items.length; i++) {
            if (!idsToRemove.has(this.items[i].id)) {
                newItems.push(this.items[i]);
                newVectors.push(this.vectors[i]);
            }
        }
        this.items = newItems;
        this.vectors = newVectors;
        this.itemIndex = new Map(this.items.map((item, i) => [item.id, i]));
    }
    /**
     * Get item by ID
     */
    get(id) {
        const idx = this.itemIndex.get(id);
        return idx !== undefined ? this.items[idx] : undefined;
    }
    /**
     * Get all items (shallow copy — the item objects themselves are shared)
     */
    getAll() {
        return [...this.items];
    }
    /**
     * Get the number of items in the index
     */
    get size() {
        return this.items.length;
    }
    /**
     * Set custom scoring weights (merged over the current ones, so a partial
     * object only overrides the components it specifies).
     */
    setWeights(weights) {
        this.config.weights = { ...this.config.weights, ...weights };
    }
    /**
     * Search for similar items.
     * Embeds the query once, then linearly scans every item computing the
     * hybrid score — O(n) in index size per query.
     */
    async search(query, options = {}) {
        const { topK = 5, explain = false, filter, threshold = 0, } = options;
        const qVector = await embed(query, this.config.model);
        const results = [];
        for (let i = 0; i < this.items.length; i++) {
            const item = this.items[i];
            if (filter && !filter(item.metadata))
                continue;
            const semantic = cosine(qVector, this.vectors[i]);
            const fuzzy = fuzzyScore(query, item.text);
            const keyword = keywordScore(query, item.text);
            const score = hybridScore(semantic, fuzzy, keyword, this.config.weights);
            // Apply threshold filter
            if (score < threshold)
                continue;
            results.push({
                id: item.id,
                text: item.text,
                metadata: item.metadata,
                score,
                // when not requested, `explain` is present as an undefined-valued key
                explain: explain ? { semantic, fuzzy, keyword } : undefined,
            });
        }
        return results.sort((a, b) => b.score - a.score).slice(0, topK);
    }
}
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,148 @@
1
+ import { describe, it, expect } from "vitest";
2
+ import { Simile } from "./engine";
3
+ import * as fs from "fs";
4
+ import * as path from "path";
5
// Fixture: a small catalog spanning two domains (cleaning vs electronics)
// so the suites can assert clear semantic separation between categories.
const testItems = [
    {
        id: "1",
        text: "Bathroom floor cleaner",
        metadata: { category: "Cleaning" },
    },
    {
        id: "2",
        text: "Dishwashing liquid",
        metadata: { category: "Kitchen" },
    },
    {
        id: "3",
        text: "Ipod Charger",
        metadata: { category: "Electronics" },
    },
    {
        id: "4",
        text: "Kitchen cleaning spray",
        metadata: { category: "Cleaning" },
    },
    {
        id: "5",
        text: "USB-C phone charger cable",
        metadata: { category: "Electronics" },
    },
];
// End-to-end ranking behavior (30s timeouts: first run downloads the model).
describe("simile search", () => {
    it("returns semantically similar items", async () => {
        const engine = await Simile.from(testItems.slice(0, 3));
        const results = await engine.search("cleaner", { explain: true });
        console.log("Search for 'cleaner':", results);
        expect(results.length).toBeGreaterThan(0);
        expect(results[0].id).toBe("1");
        expect(results[0].score).toBeGreaterThan(0.5);
    }, 30000);
    it("differentiates between unrelated items", async () => {
        const engine = await Simile.from(testItems);
        // Search for "phone charger" - should clearly prefer electronics
        const results = await engine.search("phone charger", { explain: true });
        console.log("Search for 'phone charger':", results);
        // Both chargers should be in top 2 (order may vary based on model)
        const topTwoIds = [results[0].id, results[1].id];
        expect(topTwoIds).toContain("5"); // USB-C phone charger
        expect(topTwoIds).toContain("3"); // iPod Charger
        // Both chargers should score significantly higher than cleaning products
        const chargerScores = results.filter((r) => r.metadata?.category === "Electronics");
        const cleaningScores = results.filter((r) => r.metadata?.category === "Cleaning");
        // Electronics should score at least 0.4 higher than cleaning items
        expect(chargerScores[0].score).toBeGreaterThan(cleaningScores[0].score + 0.4);
    }, 30000);
    it("applies threshold filtering", async () => {
        const engine = await Simile.from(testItems);
        // With high threshold, should filter out low-scoring results
        const results = await engine.search("cleaner", { threshold: 0.5 });
        console.log("Search with threshold 0.5:", results);
        results.forEach((r) => {
            expect(r.score).toBeGreaterThanOrEqual(0.5);
        });
    }, 30000);
});
// Snapshot round-tripping: in-memory object form and JSON-file form.
describe("simile persistence", () => {
    const snapshotPath = path.join(__dirname, "../.test-snapshot.json");
    it("saves and loads from snapshot", async () => {
        // Create engine and save
        const engine = await Simile.from(testItems);
        const snapshot = engine.save();
        expect(snapshot.version).toBe("0.2.0");
        expect(snapshot.items.length).toBe(5);
        expect(snapshot.vectors.length).toBe(5);
        expect(snapshot.model).toBe("Xenova/all-MiniLM-L6-v2");
        // Load from snapshot (instant - no embedding!)
        const loadedEngine = Simile.load(snapshot);
        expect(loadedEngine.size).toBe(5);
        // Search should work the same
        const results = await loadedEngine.search("cleaner");
        expect(results[0].text).toContain("cleaner");
    }, 30000);
    it("saves and loads from JSON file", async () => {
        // Create and save to file
        const engine = await Simile.from(testItems);
        const json = engine.toJSON();
        fs.writeFileSync(snapshotPath, json);
        // Load from file (instant!)
        const loadedJson = fs.readFileSync(snapshotPath, "utf-8");
        const loadedEngine = Simile.loadFromJSON(loadedJson);
        expect(loadedEngine.size).toBe(5);
        // Cleanup
        fs.unlinkSync(snapshotPath);
    }, 30000);
});
// add()/remove() and update-in-place semantics of the dynamic index.
describe("simile dynamic items", () => {
    it("adds new items", async () => {
        const engine = await Simile.from(testItems.slice(0, 2));
        expect(engine.size).toBe(2);
        await engine.add([testItems[2], testItems[3]]);
        expect(engine.size).toBe(4);
        const results = await engine.search("charger");
        expect(results.some((r) => r.id === "3")).toBe(true);
    }, 30000);
    it("removes items", async () => {
        const engine = await Simile.from(testItems);
        expect(engine.size).toBe(5);
        engine.remove(["1", "2"]);
        expect(engine.size).toBe(3);
        expect(engine.get("1")).toBeUndefined();
        expect(engine.get("3")).toBeDefined();
    }, 30000);
    it("updates existing items", async () => {
        const engine = await Simile.from(testItems.slice(0, 2));
        // Update item with same ID but different text
        await engine.add([
            { id: "1", text: "Wireless Bluetooth headphones", metadata: { category: "Electronics" } },
        ]);
        expect(engine.size).toBe(2); // Still 2 items, not 3
        expect(engine.get("1")?.text).toBe("Wireless Bluetooth headphones");
    }, 30000);
});
// Weight overrides: a literal-token match should win under either weighting.
describe("simile custom weights", () => {
    it("respects custom weights", async () => {
        // Engine with high semantic weight
        const semanticEngine = await Simile.from(testItems, {
            weights: { semantic: 0.9, fuzzy: 0.05, keyword: 0.05 },
        });
        // Engine with high keyword weight
        const keywordEngine = await Simile.from(testItems, {
            weights: { semantic: 0.1, fuzzy: 0.1, keyword: 0.8 },
        });
        const query = "floor";
        const semanticResults = await semanticEngine.search(query, { explain: true });
        const keywordResults = await keywordEngine.search(query, { explain: true });
        console.log("Semantic-weighted results:", semanticResults.map((r) => ({
            text: r.text,
            score: r.score,
        })));
        console.log("Keyword-weighted results:", keywordResults.map((r) => ({
            text: r.text,
            score: r.score,
        })));
        // Both should find floor cleaner first (it has "floor" in text)
        expect(semanticResults[0].text).toContain("floor");
        expect(keywordResults[0].text).toContain("floor");
    }, 30000);
});
@@ -0,0 +1,5 @@
1
+ export { Simile } from "./engine";
2
+ export * from "./types";
3
+ export { embed, embedBatch, vectorToBase64, base64ToVector } from "./embedder";
4
+ export { cosine, fuzzyScore, keywordScore } from "./similarity";
5
+ export { hybridScore, getDefaultWeights } from "./ranker";
package/dist/index.js ADDED
@@ -0,0 +1,5 @@
1
+ export { Simile } from "./engine";
2
+ export * from "./types";
3
+ export { embed, embedBatch, vectorToBase64, base64ToVector } from "./embedder";
4
+ export { cosine, fuzzyScore, keywordScore } from "./similarity";
5
+ export { hybridScore, getDefaultWeights } from "./ranker";
@@ -0,0 +1,3 @@
1
import { HybridWeights } from "./types";
/** Weighted blend of the three component scores; weights are normalized to sum to 1. */
export declare function hybridScore(semantic: number, fuzzy: number, keyword: number, weights?: HybridWeights): number;
/** A fresh copy of the built-in default weights (semantic 0.7 / fuzzy 0.15 / keyword 0.15). */
export declare function getDefaultWeights(): Required<HybridWeights>;
package/dist/ranker.js ADDED
@@ -0,0 +1,19 @@
1
// Default contribution of each component to the final score.
const DEFAULT_WEIGHTS = {
    semantic: 0.7,
    fuzzy: 0.15,
    keyword: 0.15,
};
/**
 * Combine the three component scores into one hybrid score.
 * Caller weights are merged over the defaults and then normalized to sum
 * to 1, so any positive scale of weights produces the same ranking.
 * Degenerate weights (zero/negative/NaN total) previously produced NaN via
 * division by zero; they now fall back to the defaults.
 */
export function hybridScore(semantic, fuzzy, keyword, weights = {}) {
    const merged = { ...DEFAULT_WEIGHTS, ...weights };
    let { semantic: ws, fuzzy: wf, keyword: wk } = merged;
    const total = ws + wf + wk;
    if (!(total > 0)) {
        // Guard: fall back to the (already normalized) defaults.
        ({ semantic: ws, fuzzy: wf, keyword: wk } = DEFAULT_WEIGHTS);
        return ws * semantic + wf * fuzzy + wk * keyword;
    }
    // Normalize weights to sum to 1
    return (ws / total) * semantic + (wf / total) * fuzzy + (wk / total) * keyword;
}
/** Fresh copy of the defaults (safe for callers to mutate). */
export function getDefaultWeights() {
    return { ...DEFAULT_WEIGHTS };
}
@@ -0,0 +1,3 @@
1
/** Dot product of two vectors (equals cosine similarity for L2-normalized inputs). */
export declare function cosine(a: Float32Array, b: Float32Array): number;
/** Case-insensitive normalized Levenshtein similarity in [0, 1]. */
export declare function fuzzyScore(a: string, b: string): number;
/** Fraction of query tokens found as substrings of `text` (case-insensitive). */
export declare function keywordScore(query: string, text: string): number;
@@ -0,0 +1,18 @@
1
+ import levenshtein from "fast-levenshtein";
2
/**
 * Dot product of two equal-length vectors.
 * NOTE(review): this equals cosine similarity only when both vectors are
 * L2-normalized — true for this package's embedder output (normalize: true);
 * confirm for any externally supplied vectors.
 */
export function cosine(a, b) {
    let sum = 0;
    for (let i = 0; i < a.length; i += 1) {
        sum += a[i] * b[i];
    }
    return sum;
}
8
/**
 * Normalized, case-insensitive Levenshtein similarity in [0, 1]
 * (1 = identical, 0 = completely different).
 */
export function fuzzyScore(a, b) {
    const maxLen = Math.max(a.length, b.length);
    // Two empty strings are identical; the guard also prevents 0/0 = NaN below.
    if (maxLen === 0) {
        return 1;
    }
    const dist = levenshtein.get(a.toLowerCase(), b.toLowerCase());
    return 1 - dist / maxLen;
}
13
/**
 * Fraction of whitespace-separated query tokens that appear as substrings
 * of `text` (case-insensitive). Returns a value in [0, 1].
 */
export function keywordScore(query, text) {
    // Split on whitespace runs and drop empties: the old split(" ") produced
    // "" tokens for blank or multi-space queries, and t.includes("") is always
    // true, so an empty query scored a perfect 1.0 against everything.
    const tokens = query.toLowerCase().split(/\s+/).filter(Boolean);
    if (tokens.length === 0) {
        return 0;
    }
    const haystack = text.toLowerCase();
    const hits = tokens.filter((token) => haystack.includes(token)).length;
    return hits / tokens.length;
}
@@ -0,0 +1,46 @@
1
/** A single indexable record: unique id + searchable text + optional payload. */
export interface SearchItem<T = any> {
    id: string;
    text: string;
    metadata?: T;
}
/** One search hit; `explain` is populated only when SearchOptions.explain is true. */
export interface SearchResult<T = any> {
    id: string;
    text: string;
    /** Combined hybrid score (higher is more relevant). */
    score: number;
    metadata?: T;
    explain?: {
        semantic: number;
        fuzzy: number;
        keyword: number;
    };
}
export interface SearchOptions {
    /** Maximum number of results to return (default: 5). */
    topK?: number;
    /** Include a per-component score breakdown on each result. */
    explain?: boolean;
    /** Keep only items whose metadata passes this predicate. */
    filter?: (metadata: any) => boolean;
    /** Minimum score threshold (0-1). Results below this are filtered out */
    threshold?: number;
}
export interface HybridWeights {
    /** Semantic similarity weight (0-1), default: 0.7 */
    semantic?: number;
    /** Fuzzy string similarity weight (0-1), default: 0.15 */
    fuzzy?: number;
    /** Keyword match weight (0-1), default: 0.15 */
    keyword?: number;
}
export interface SimileConfig {
    /** Custom hybrid scoring weights */
    weights?: HybridWeights;
    /** Model to use for embeddings (default: "Xenova/all-MiniLM-L6-v2") */
    model?: string;
}
/** Serialized state for persistence */
export interface SimileSnapshot<T = any> {
    /** Snapshot format version (the package version at save time). */
    version: string;
    /** Embedding model the stored vectors were produced with. */
    model: string;
    items: SearchItem<T>[];
    /** Base64-encoded Float32Array vectors */
    vectors: string[];
    /** ISO-8601 timestamp of when the snapshot was created. */
    createdAt: string;
}
package/dist/types.js ADDED
@@ -0,0 +1 @@
1
+ export {};
package/package.json ADDED
@@ -0,0 +1,38 @@
1
+ {
2
+ "name": "simile-search",
3
+ "version": "0.2.0",
4
+ "description": "Offline-first semantic + fuzzy search engine for catalogs, names, and products",
5
+ "main": "dist/index.js",
6
+ "types": "dist/index.d.ts",
7
+ "scripts": {
8
+ "build": "tsc",
9
+ "test": "vitest",
10
+ "prepublishOnly": "npm run build"
11
+ },
12
+ "keywords": [
13
+ "semantic-search",
14
+ "vector-search",
15
+ "fuzzy-search",
16
+ "offline-search",
17
+ "ai-search"
18
+ ],
19
+ "files": [
20
+ "dist",
21
+ "README.md",
22
+ "LICENSE"
23
+ ],
24
+ "author": "Aavash Baral",
25
+ "license": "MIT",
26
+ "dependencies": {
27
+ "@xenova/transformers": "^2.17.2",
28
+ "fast-levenshtein": "^3.0.0",
29
+ "vitest": "^4.0.16"
30
+ },
31
+ "devDependencies": {
32
+ "@types/node": "^25.0.3",
33
+ "@types/fast-levenshtein": "^0.0.4",
34
+ "ts-node": "^10.9.2",
35
+ "typescript": "^5.0.0",
36
+ "vitest": "^4.0.16"
37
+ }
38
+ }