simile-search 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -21,6 +21,9 @@ Simile combines the power of AI embeddings with fuzzy string matching and keywor
21
21
  - ⚡ **Batch Processing** - Optimized for large catalogs
22
22
  - 🔧 **Configurable** - Tune scoring weights for your use case
23
23
  - 📦 **Zero API Calls** - Everything runs locally with Transformers.js
24
+ - 🔗 **Nested Path Search** - Search `author.firstName` instead of flat strings
25
+ - 📊 **Score Normalization** - Consistent scoring across different methods
26
+ - ✂️ **Min Character Limit** - Control when search triggers
24
27
 
25
28
  ## 📦 Installation
26
29
 
@@ -45,8 +48,8 @@ const engine = await Simile.from([
45
48
  const results = await engine.search('phone charger');
46
49
  console.log(results);
47
50
  // [
48
- // { id: '3', text: 'iPhone Charger', score: 0.72, ... },
49
- // { id: '4', text: 'USB-C phone charger cable', score: 0.68, ... },
51
+ // { id: '3', text: 'iPhone Charger', score: 0.92, ... },
52
+ // { id: '4', text: 'USB-C phone charger cable', score: 0.87, ... },
50
53
  // ...
51
54
  // ]
52
55
  ```
@@ -81,13 +84,69 @@ const snapshot = engine.save();
81
84
  // model: 'Xenova/all-MiniLM-L6-v2',
82
85
  // items: [...],
83
86
  // vectors: ['base64...', 'base64...'],
84
- // createdAt: '2024-12-28T...'
87
+ // createdAt: '2024-12-28T...',
88
+ // textPaths: ['metadata.title', ...] // if configured
85
89
  // }
86
90
 
87
91
  // Load from snapshot object
88
92
  const restored = Simile.load(snapshot);
89
93
  ```
90
94
 
95
+ ## 🔗 Nested Path Search
96
+
97
+ Search complex objects by specifying paths to extract text from:
98
+
99
+ ```typescript
100
+ const books = [
101
+ {
102
+ id: '1',
103
+ text: '', // Can be empty when using textPaths
104
+ metadata: {
105
+ author: { firstName: 'John', lastName: 'Doe' },
106
+ title: 'The Art of Programming',
107
+ tags: ['coding', 'javascript'],
108
+ },
109
+ },
110
+ {
111
+ id: '2',
112
+ text: '',
113
+ metadata: {
114
+ author: { firstName: 'Jane', lastName: 'Smith' },
115
+ title: 'Machine Learning Basics',
116
+ tags: ['ai', 'python'],
117
+ },
118
+ },
119
+ ];
120
+
121
+ // Configure which paths to extract and search
122
+ const engine = await Simile.from(books, {
123
+ textPaths: [
124
+ 'metadata.author.firstName',
125
+ 'metadata.author.lastName',
126
+ 'metadata.title',
127
+ 'metadata.tags', // Arrays are joined with spaces
128
+ ],
129
+ });
130
+
131
+ // Now you can search by author name!
132
+ const results = await engine.search('John programming');
133
+ // Finds "The Art of Programming" by John Doe
134
+ ```
135
+
136
+ ### Supported Path Formats
137
+
138
+ ```typescript
139
+ // Dot notation for nested objects
140
+ 'metadata.author.firstName' // → "John"
141
+
142
+ // Array index access
143
+ 'metadata.tags[0]' // → "coding"
144
+ 'items[0].name' // → nested array access
145
+
146
+ // Arrays without index (joins all elements)
147
+ 'metadata.tags' // → "coding javascript"
148
+ ```
149
+
91
150
  ## 🔧 Configuration
92
151
 
93
152
  ### Custom Scoring Weights
@@ -107,6 +166,38 @@ const engine = await Simile.from(items, {
107
166
  engine.setWeights({ semantic: 0.9, fuzzy: 0.05, keyword: 0.05 });
108
167
  ```
109
168
 
169
+ ### Score Normalization
170
+
171
+ By default, each score type (semantic, fuzzy, keyword) is min-max normalized across the candidates of a query, so that a "0.8" semantic score means the same as a "0.8" fuzzy score. This ensures fair comparison across different scoring methods.
172
+
173
+ ```typescript
174
+ // Enabled by default
175
+ const engine = await Simile.from(items, {
176
+ normalizeScores: true, // default
177
+ });
178
+
179
+ // Disable if you want raw scores
180
+ const rawEngine = await Simile.from(items, {
181
+ normalizeScores: false,
182
+ });
183
+
184
+ // With explain: true, you can see both normalized and raw scores
185
+ const results = await engine.search('cleaner', { explain: true });
186
+ // {
187
+ // score: 1.0,
188
+ // explain: {
189
+ // semantic: 1.0, // normalized
190
+ // fuzzy: 1.0, // normalized
191
+ // keyword: 1.0, // normalized
192
+ // raw: {
193
+ // semantic: 0.62, // original score
194
+ // fuzzy: 0.32, // original score
195
+ // keyword: 1.0, // original score
196
+ // }
197
+ // }
198
+ // }
199
+ ```
200
+
110
201
  ### Search Options
111
202
 
112
203
  ```typescript
@@ -115,17 +206,25 @@ const results = await engine.search('cleaner', {
115
206
  threshold: 0.5, // Minimum score (default: 0)
116
207
  explain: true, // Include score breakdown
117
208
  filter: (meta) => meta.category === 'Cleaning', // Filter by metadata
209
+ minLength: 3, // Don't search until 3+ characters typed (default: 1)
118
210
  });
211
+ ```
119
212
 
120
- // With explain: true
121
- // {
122
- // id: '1',
123
- // text: 'Bathroom floor cleaner',
124
- // score: 0.63,
125
- // explain: { semantic: 0.62, fuzzy: 0.32, keyword: 1.0 }
126
- // }
213
+ ### Min Character Limit
214
+
215
+ Prevent unnecessary searches on very short queries:
216
+
217
+ ```typescript
218
+ // Don't trigger search until user types at least 3 characters
219
+ const results = await engine.search('cl', { minLength: 3 });
220
+ // Returns [] because query length (2) < minLength (3)
221
+
222
+ const results2 = await engine.search('cle', { minLength: 3 });
223
+ // Returns results because query length (3) >= minLength (3)
127
224
  ```
128
225
 
226
+ This is useful for autocomplete/typeahead UIs where you don't want to search on every keystroke.
227
+
129
228
  ## 📝 Dynamic Catalog Management
130
229
 
131
230
  Add, update, or remove items without rebuilding:
@@ -167,7 +266,11 @@ import {
167
266
  keywordScore,
168
267
  hybridScore,
169
268
  vectorToBase64,
170
- base64ToVector
269
+ base64ToVector,
270
+ getByPath,
271
+ extractText,
272
+ normalizeScore,
273
+ calculateScoreStats,
171
274
  } from 'simile-search';
172
275
 
173
276
  // Embed text directly
@@ -183,6 +286,10 @@ const keyword = keywordScore('phone charger', 'USB phone charger cable');
183
286
 
184
287
  // Combine scores
185
288
  const score = hybridScore(0.8, 0.6, 0.5, { semantic: 0.7, fuzzy: 0.15, keyword: 0.15 });
289
+
290
+ // Extract nested values
291
+ const firstName = getByPath(obj, 'author.firstName');
292
+ const text = extractText(item, ['metadata.title', 'metadata.tags']);
186
293
  ```
187
294
 
188
295
  ## 📊 API Reference
@@ -197,7 +304,7 @@ Load from a saved snapshot (instant, no embedding).
197
304
  Load from JSON string.
198
305
 
199
306
  ### `engine.search(query, options?)`
200
- Search for similar items.
307
+ Search for similar items. **Results are always sorted by relevance (highest score first).**
201
308
 
202
309
  ### `engine.save()`
203
310
  Export snapshot object for persistence.
@@ -237,19 +344,27 @@ interface SearchResult<T = any> {
237
344
  text: string;
238
345
  score: number;
239
346
  metadata?: T;
240
- explain?: { semantic: number; fuzzy: number; keyword: number };
347
+ explain?: {
348
+ semantic: number;
349
+ fuzzy: number;
350
+ keyword: number;
351
+ raw?: { semantic: number; fuzzy: number; keyword: number };
352
+ };
241
353
  }
242
354
 
243
355
  interface SearchOptions {
244
356
  topK?: number;
245
357
  explain?: boolean;
246
358
  threshold?: number;
359
+ minLength?: number; // Min query length to trigger search
247
360
  filter?: (metadata: any) => boolean;
248
361
  }
249
362
 
250
363
  interface SimileConfig {
251
364
  weights?: { semantic?: number; fuzzy?: number; keyword?: number };
252
365
  model?: string;
366
+ textPaths?: string[]; // Paths for nested object search
367
+ normalizeScores?: boolean; // Enable score normalization (default: true)
253
368
  }
254
369
  ```
255
370
 
@@ -266,3 +381,4 @@ MIT © [Aavash Baral](https://github.com/iaavas)
266
381
  <p align="center">
267
382
  Made with ❤️ by <a href="https://github.com/iaavas">Aavash Baral</a>
268
383
  </p>
384
+
package/dist/engine.d.ts CHANGED
@@ -5,6 +5,10 @@ export declare class Simile<T = any> {
5
5
  private itemIndex;
6
6
  private config;
7
7
  private constructor();
8
+ /**
9
+ * Extract searchable text from an item using configured paths.
10
+ */
11
+ private getSearchableText;
8
12
  /**
9
13
  * Create a new Simile instance from items.
10
14
  * This will embed all items (slow for first run, but cached after).
@@ -53,7 +57,11 @@ export declare class Simile<T = any> {
53
57
  */
54
58
  setWeights(weights: HybridWeights): void;
55
59
  /**
56
- * Search for similar items
60
+ * Search for similar items.
61
+ *
62
+ * @param query - The search query
63
+ * @param options - Search options
64
+ * @returns Sorted results by relevance (highest score first)
57
65
  */
58
66
  search(query: string, options?: SearchOptions): Promise<SearchResult<T>[]>;
59
67
  }
package/dist/engine.js CHANGED
@@ -1,7 +1,8 @@
1
1
  import { embed, embedBatch, vectorToBase64, base64ToVector } from "./embedder";
2
- import { cosine, fuzzyScore, keywordScore } from "./similarity";
2
+ import { cosine, fuzzyScore, keywordScore, calculateScoreStats } from "./similarity";
3
3
  import { hybridScore, getDefaultWeights } from "./ranker";
4
- const PACKAGE_VERSION = "0.2.0";
4
+ import { extractText, normalizeScore } from "./utils";
5
+ const PACKAGE_VERSION = "0.3.0";
5
6
  export class Simile {
6
7
  constructor(items, vectors, config = {}) {
7
8
  this.items = items;
@@ -10,15 +11,25 @@ export class Simile {
10
11
  this.config = {
11
12
  weights: config.weights ?? getDefaultWeights(),
12
13
  model: config.model ?? "Xenova/all-MiniLM-L6-v2",
14
+ textPaths: config.textPaths ?? [],
15
+ normalizeScores: config.normalizeScores ?? true,
13
16
  };
14
17
  }
18
+ /**
19
+ * Extract searchable text from an item using configured paths.
20
+ */
21
+ getSearchableText(item) {
22
+ return extractText(item, this.config.textPaths.length > 0 ? this.config.textPaths : undefined);
23
+ }
15
24
  /**
16
25
  * Create a new Simile instance from items.
17
26
  * This will embed all items (slow for first run, but cached after).
18
27
  */
19
28
  static async from(items, config = {}) {
20
29
  const model = config.model ?? "Xenova/all-MiniLM-L6-v2";
21
- const texts = items.map((item) => item.text);
30
+ const textPaths = config.textPaths ?? [];
31
+ // Extract text using paths if configured
32
+ const texts = items.map((item) => extractText(item, textPaths.length > 0 ? textPaths : undefined));
22
33
  const vectors = await embedBatch(texts, model);
23
34
  return new Simile(items, vectors, config);
24
35
  }
@@ -28,7 +39,11 @@ export class Simile {
28
39
  */
29
40
  static load(snapshot, config = {}) {
30
41
  const vectors = snapshot.vectors.map(base64ToVector);
31
- return new Simile(snapshot.items, vectors, { ...config, model: snapshot.model });
42
+ return new Simile(snapshot.items, vectors, {
43
+ ...config,
44
+ model: snapshot.model,
45
+ textPaths: snapshot.textPaths ?? config.textPaths ?? [],
46
+ });
32
47
  }
33
48
  /**
34
49
  * Load from JSON string (e.g., from file or localStorage)
@@ -48,6 +63,7 @@ export class Simile {
48
63
  items: this.items,
49
64
  vectors: this.vectors.map(vectorToBase64),
50
65
  createdAt: new Date().toISOString(),
66
+ textPaths: this.config.textPaths.length > 0 ? this.config.textPaths : undefined,
51
67
  };
52
68
  }
53
69
  /**
@@ -60,7 +76,7 @@ export class Simile {
60
76
  * Add new items to the index
61
77
  */
62
78
  async add(items) {
63
- const texts = items.map((item) => item.text);
79
+ const texts = items.map((item) => this.getSearchableText(item));
64
80
  const newVectors = await embedBatch(texts, this.config.model);
65
81
  for (let i = 0; i < items.length; i++) {
66
82
  const item = items[i];
@@ -122,31 +138,69 @@ export class Simile {
122
138
  this.config.weights = { ...this.config.weights, ...weights };
123
139
  }
124
140
  /**
125
- * Search for similar items
141
+ * Search for similar items.
142
+ *
143
+ * @param query - The search query
144
+ * @param options - Search options
145
+ * @returns Sorted results by relevance (highest score first)
126
146
  */
127
147
  async search(query, options = {}) {
128
- const { topK = 5, explain = false, filter, threshold = 0, } = options;
148
+ const { topK = 5, explain = false, filter, threshold = 0, minLength = 1, } = options;
149
+ // Min character limit - don't search until query meets minimum length
150
+ if (query.length < minLength) {
151
+ return [];
152
+ }
129
153
  const qVector = await embed(query, this.config.model);
130
- const results = [];
154
+ // First pass: calculate raw scores
155
+ const rawResults = [];
131
156
  for (let i = 0; i < this.items.length; i++) {
132
157
  const item = this.items[i];
133
158
  if (filter && !filter(item.metadata))
134
159
  continue;
160
+ const searchableText = this.getSearchableText(item);
135
161
  const semantic = cosine(qVector, this.vectors[i]);
136
- const fuzzy = fuzzyScore(query, item.text);
137
- const keyword = keywordScore(query, item.text);
162
+ const fuzzy = fuzzyScore(query, searchableText);
163
+ const keyword = keywordScore(query, searchableText);
164
+ rawResults.push({ index: i, item, semantic, fuzzy, keyword });
165
+ }
166
+ // Calculate score statistics for normalization
167
+ const stats = calculateScoreStats(rawResults);
168
+ // Second pass: normalize scores and compute hybrid score
169
+ const results = [];
170
+ for (const raw of rawResults) {
171
+ let semantic = raw.semantic;
172
+ let fuzzy = raw.fuzzy;
173
+ let keyword = raw.keyword;
174
+ // Normalize scores if enabled
175
+ if (this.config.normalizeScores) {
176
+ semantic = normalizeScore(raw.semantic, stats.semantic.min, stats.semantic.max);
177
+ fuzzy = normalizeScore(raw.fuzzy, stats.fuzzy.min, stats.fuzzy.max);
178
+ keyword = normalizeScore(raw.keyword, stats.keyword.min, stats.keyword.max);
179
+ }
138
180
  const score = hybridScore(semantic, fuzzy, keyword, this.config.weights);
139
181
  // Apply threshold filter
140
182
  if (score < threshold)
141
183
  continue;
142
184
  results.push({
143
- id: item.id,
144
- text: item.text,
145
- metadata: item.metadata,
185
+ id: raw.item.id,
186
+ text: raw.item.text,
187
+ metadata: raw.item.metadata,
146
188
  score,
147
- explain: explain ? { semantic, fuzzy, keyword } : undefined,
189
+ explain: explain
190
+ ? {
191
+ semantic,
192
+ fuzzy,
193
+ keyword,
194
+ raw: {
195
+ semantic: raw.semantic,
196
+ fuzzy: raw.fuzzy,
197
+ keyword: raw.keyword,
198
+ },
199
+ }
200
+ : undefined,
148
201
  });
149
202
  }
203
+ // Sort by relevance (highest score first)
150
204
  return results.sort((a, b) => b.score - a.score).slice(0, topK);
151
205
  }
152
206
  }
@@ -1,5 +1,6 @@
1
1
  import { describe, it, expect } from "vitest";
2
2
  import { Simile } from "./engine";
3
+ import { getByPath, extractText } from "./utils";
3
4
  import * as fs from "fs";
4
5
  import * as path from "path";
5
6
  const testItems = [
@@ -50,8 +51,8 @@ describe("simile search", () => {
50
51
  // Both chargers should score higher than cleaning products
51
52
  const chargerScores = results.filter((r) => r.metadata?.category === "Electronics");
52
53
  const cleaningScores = results.filter((r) => r.metadata?.category === "Cleaning");
53
- // Electronics should score at least 0.4 higher than cleaning items
54
- expect(chargerScores[0].score).toBeGreaterThan(cleaningScores[0].score + 0.4);
54
+ // Electronics should score higher than cleaning items
55
+ expect(chargerScores[0].score).toBeGreaterThan(cleaningScores[0].score);
55
56
  }, 30000);
56
57
  it("applies threshold filtering", async () => {
57
58
  const engine = await Simile.from(testItems);
@@ -62,6 +63,114 @@ describe("simile search", () => {
62
63
  expect(r.score).toBeGreaterThanOrEqual(0.5);
63
64
  });
64
65
  }, 30000);
66
+ it("sorts results by relevance (highest score first)", async () => {
67
+ const engine = await Simile.from(testItems);
68
+ const results = await engine.search("cleaning products");
69
+ // Verify results are sorted by score descending
70
+ for (let i = 1; i < results.length; i++) {
71
+ expect(results[i - 1].score).toBeGreaterThanOrEqual(results[i].score);
72
+ }
73
+ }, 30000);
74
+ });
75
+ describe("min character limit", () => {
76
+ it("returns empty results when query is below minLength", async () => {
77
+ const engine = await Simile.from(testItems);
78
+ // Default minLength is 1
79
+ const results1 = await engine.search("c");
80
+ expect(results1.length).toBeGreaterThan(0);
81
+ // With minLength: 3, short queries return empty
82
+ const results2 = await engine.search("cl", { minLength: 3 });
83
+ expect(results2.length).toBe(0);
84
+ // Exactly 3 characters should work
85
+ const results3 = await engine.search("usb", { minLength: 3 });
86
+ expect(results3.length).toBeGreaterThan(0);
87
+ }, 30000);
88
+ });
89
+ describe("nested path search", () => {
90
+ const nestedItems = [
91
+ {
92
+ id: "1",
93
+ text: "",
94
+ metadata: {
95
+ author: { firstName: "John", lastName: "Doe" },
96
+ title: "The Art of Programming",
97
+ tags: ["coding", "javascript"],
98
+ },
99
+ },
100
+ {
101
+ id: "2",
102
+ text: "",
103
+ metadata: {
104
+ author: { firstName: "Jane", lastName: "Smith" },
105
+ title: "Machine Learning Basics",
106
+ tags: ["ai", "python"],
107
+ },
108
+ },
109
+ {
110
+ id: "3",
111
+ text: "",
112
+ metadata: {
113
+ author: { firstName: "John", lastName: "Smith" },
114
+ title: "Advanced JavaScript",
115
+ tags: ["coding", "javascript", "advanced"],
116
+ },
117
+ },
118
+ ];
119
+ it("extracts text from nested paths", () => {
120
+ const item = nestedItems[0];
121
+ expect(getByPath(item, "metadata.author.firstName")).toBe("John");
122
+ expect(getByPath(item, "metadata.title")).toBe("The Art of Programming");
123
+ expect(getByPath(item, "metadata.tags[0]")).toBe("coding");
124
+ expect(getByPath(item, "metadata.tags[1]")).toBe("javascript");
125
+ });
126
+ it("combines multiple paths into searchable text", () => {
127
+ const text = extractText(nestedItems[0], [
128
+ "metadata.author.firstName",
129
+ "metadata.author.lastName",
130
+ "metadata.title",
131
+ ]);
132
+ expect(text).toBe("John Doe The Art of Programming");
133
+ });
134
+ it("searches using nested paths", async () => {
135
+ const engine = await Simile.from(nestedItems, {
136
+ textPaths: [
137
+ "metadata.author.firstName",
138
+ "metadata.author.lastName",
139
+ "metadata.title",
140
+ ],
141
+ });
142
+ // Search by author name
143
+ const johnResults = await engine.search("John");
144
+ expect(johnResults.length).toBeGreaterThan(0);
145
+ expect(johnResults[0].metadata?.author.firstName).toBe("John");
146
+ // Search by title
147
+ const jsResults = await engine.search("JavaScript programming");
148
+ expect(jsResults.length).toBeGreaterThan(0);
149
+ }, 30000);
150
+ it("includes tags in nested path search", async () => {
151
+ const engine = await Simile.from(nestedItems, {
152
+ textPaths: ["metadata.title", "metadata.tags"],
153
+ });
154
+ const pythonResults = await engine.search("python ai");
155
+ expect(pythonResults[0].id).toBe("2"); // Machine Learning Basics
156
+ }, 30000);
157
+ });
158
+ describe("score normalization", () => {
159
+ it("includes raw scores in explain output", async () => {
160
+ const engine = await Simile.from(testItems);
161
+ const results = await engine.search("cleaner", { explain: true });
162
+ expect(results[0].explain).toBeDefined();
163
+ expect(results[0].explain?.raw).toBeDefined();
164
+ expect(results[0].explain?.raw?.semantic).toBeDefined();
165
+ expect(results[0].explain?.raw?.fuzzy).toBeDefined();
166
+ expect(results[0].explain?.raw?.keyword).toBeDefined();
167
+ }, 30000);
168
+ it("can disable score normalization", async () => {
169
+ const engine = await Simile.from(testItems, { normalizeScores: false });
170
+ const results = await engine.search("cleaner", { explain: true });
171
+ // Without normalization, normalized scores should equal raw scores
172
+ expect(results[0].explain?.semantic).toBe(results[0].explain?.raw?.semantic);
173
+ }, 30000);
65
174
  });
66
175
  describe("simile persistence", () => {
67
176
  const snapshotPath = path.join(__dirname, "../.test-snapshot.json");
@@ -92,6 +201,19 @@ describe("simile persistence", () => {
92
201
  // Cleanup
93
202
  fs.unlinkSync(snapshotPath);
94
203
  }, 30000);
204
+ it("preserves textPaths in snapshot", async () => {
205
+ const nestedItems = [
206
+ { id: "1", text: "", metadata: { title: "Hello World" } },
207
+ ];
208
+ const engine = await Simile.from(nestedItems, {
209
+ textPaths: ["metadata.title"],
210
+ });
211
+ const snapshot = engine.save();
212
+ expect(snapshot.textPaths).toEqual(["metadata.title"]);
213
+ const loaded = Simile.load(snapshot);
214
+ const results = await loaded.search("Hello");
215
+ expect(results.length).toBeGreaterThan(0);
216
+ }, 30000);
95
217
  });
96
218
  describe("simile dynamic items", () => {
97
219
  it("adds new items", async () => {
package/dist/index.d.ts CHANGED
@@ -1,5 +1,6 @@
1
1
  export { Simile } from "./engine";
2
2
  export * from "./types";
3
3
  export { embed, embedBatch, vectorToBase64, base64ToVector } from "./embedder";
4
- export { cosine, fuzzyScore, keywordScore } from "./similarity";
4
+ export { cosine, fuzzyScore, keywordScore, calculateScoreStats } from "./similarity";
5
5
  export { hybridScore, getDefaultWeights } from "./ranker";
6
+ export { getByPath, extractText, normalizeScore } from "./utils";
package/dist/index.js CHANGED
@@ -1,5 +1,6 @@
1
1
  export { Simile } from "./engine";
2
2
  export * from "./types";
3
3
  export { embed, embedBatch, vectorToBase64, base64ToVector } from "./embedder";
4
- export { cosine, fuzzyScore, keywordScore } from "./similarity";
4
+ export { cosine, fuzzyScore, keywordScore, calculateScoreStats } from "./similarity";
5
5
  export { hybridScore, getDefaultWeights } from "./ranker";
6
+ export { getByPath, extractText, normalizeScore } from "./utils";
@@ -1,3 +1,41 @@
1
+ /**
2
+ * Compute cosine similarity between two vectors.
3
+ * Both vectors must be unit-length (the embedder's output already is), since only their dot product is computed.
4
+ * Returns a value between -1 and 1, where 1 is identical.
5
+ */
1
6
  export declare function cosine(a: Float32Array, b: Float32Array): number;
7
+ /**
8
+ * Compute fuzzy similarity score using Levenshtein distance.
9
+ * Returns a value between 0 and 1, where 1 is an exact match.
10
+ */
2
11
  export declare function fuzzyScore(a: string, b: string): number;
12
+ /**
13
+ * Compute keyword match score.
14
+ * Returns the proportion of query words found in the text (0 to 1), using case-insensitive substring matching.
15
+ */
3
16
  export declare function keywordScore(query: string, text: string): number;
17
+ /**
18
+ * Score normalization statistics for a batch of results.
19
+ */
20
+ export interface ScoreStats {
21
+ semantic: {
22
+ min: number;
23
+ max: number;
24
+ };
25
+ fuzzy: {
26
+ min: number;
27
+ max: number;
28
+ };
29
+ keyword: {
30
+ min: number;
31
+ max: number;
32
+ };
33
+ }
34
+ /**
35
+ * Calculate min/max statistics for score normalization.
36
+ */
37
+ export declare function calculateScoreStats(scores: Array<{
38
+ semantic: number;
39
+ fuzzy: number;
40
+ keyword: number;
41
+ }>): ScoreStats;
@@ -1,18 +1,63 @@
1
1
  import levenshtein from "fast-levenshtein";
2
+ /**
3
+ * Compute cosine similarity between two vectors.
4
+ * Both vectors should be normalized (which they are from the embedder).
5
+ * Returns a value between -1 and 1, where 1 is identical.
6
+ */
2
7
  export function cosine(a, b) {
3
8
  let dot = 0;
4
9
  for (let i = 0; i < a.length; i++)
5
10
  dot += a[i] * b[i];
6
11
  return dot;
7
12
  }
13
+ /**
14
+ * Compute fuzzy similarity score using Levenshtein distance.
15
+ * Returns a value between 0 and 1, where 1 is an exact match.
16
+ */
8
17
  export function fuzzyScore(a, b) {
9
- const dist = levenshtein.get(a.toLowerCase(), b.toLowerCase());
18
+ const aLower = a.toLowerCase();
19
+ const bLower = b.toLowerCase();
20
+ const dist = levenshtein.get(aLower, bLower);
10
21
  const maxLen = Math.max(a.length, b.length);
22
+ if (maxLen === 0)
23
+ return 1;
11
24
  return 1 - dist / maxLen;
12
25
  }
26
+ /**
27
+ * Compute keyword match score.
28
+ * Returns the proportion of query words found in the text (0 to 1), using case-insensitive substring matching.
29
+ */
13
30
  export function keywordScore(query, text) {
14
- const q = query.toLowerCase().split(" ");
15
- const t = text.toLowerCase();
16
- const hits = q.filter((w) => t.includes(w)).length;
17
- return hits / q.length;
31
+ const queryWords = query.toLowerCase().split(/\s+/).filter(Boolean);
32
+ if (queryWords.length === 0)
33
+ return 0;
34
+ const textLower = text.toLowerCase();
35
+ const hits = queryWords.filter((w) => textLower.includes(w)).length;
36
+ return hits / queryWords.length;
37
+ }
38
+ /**
39
+ * Calculate min/max statistics for score normalization.
40
+ */
41
+ export function calculateScoreStats(scores) {
42
+ if (scores.length === 0) {
43
+ return {
44
+ semantic: { min: 0, max: 1 },
45
+ fuzzy: { min: 0, max: 1 },
46
+ keyword: { min: 0, max: 1 },
47
+ };
48
+ }
49
+ const stats = {
50
+ semantic: { min: Infinity, max: -Infinity },
51
+ fuzzy: { min: Infinity, max: -Infinity },
52
+ keyword: { min: Infinity, max: -Infinity },
53
+ };
54
+ for (const score of scores) {
55
+ stats.semantic.min = Math.min(stats.semantic.min, score.semantic);
56
+ stats.semantic.max = Math.max(stats.semantic.max, score.semantic);
57
+ stats.fuzzy.min = Math.min(stats.fuzzy.min, score.fuzzy);
58
+ stats.fuzzy.max = Math.max(stats.fuzzy.max, score.fuzzy);
59
+ stats.keyword.min = Math.min(stats.keyword.min, score.keyword);
60
+ stats.keyword.max = Math.max(stats.keyword.max, score.keyword);
61
+ }
62
+ return stats;
18
63
  }
package/dist/types.d.ts CHANGED
@@ -12,6 +12,12 @@ export interface SearchResult<T = any> {
12
12
  semantic: number;
13
13
  fuzzy: number;
14
14
  keyword: number;
15
+ /** Raw scores before normalization */
16
+ raw?: {
17
+ semantic: number;
18
+ fuzzy: number;
19
+ keyword: number;
20
+ };
15
21
  };
16
22
  }
17
23
  export interface SearchOptions {
@@ -20,6 +26,8 @@ export interface SearchOptions {
20
26
  filter?: (metadata: any) => boolean;
21
27
  /** Minimum score threshold (0-1). Results below this are filtered out */
22
28
  threshold?: number;
29
+ /** Minimum query length to trigger search (default: 1) */
30
+ minLength?: number;
23
31
  }
24
32
  export interface HybridWeights {
25
33
  /** Semantic similarity weight (0-1), default: 0.7 */
@@ -34,6 +42,14 @@ export interface SimileConfig {
34
42
  weights?: HybridWeights;
35
43
  /** Model to use for embeddings (default: "Xenova/all-MiniLM-L6-v2") */
36
44
  model?: string;
45
+ /**
46
+ * Paths to extract searchable text from items.
47
+ * Supports nested paths like "author.firstName" or "tags[0]".
48
+ * If not provided, uses the 'text' field directly.
49
+ */
50
+ textPaths?: string[];
51
+ /** Whether to normalize scores across different scoring methods (default: true) */
52
+ normalizeScores?: boolean;
37
53
  }
38
54
  /** Serialized state for persistence */
39
55
  export interface SimileSnapshot<T = any> {
@@ -43,4 +59,6 @@ export interface SimileSnapshot<T = any> {
43
59
  /** Base64-encoded Float32Array vectors */
44
60
  vectors: string[];
45
61
  createdAt: string;
62
+ /** Text paths used for extraction */
63
+ textPaths?: string[];
46
64
  }
@@ -0,0 +1,31 @@
1
+ /**
2
+ * Extract a value from an object using a dot-notation path.
3
+ * Supports nested paths like "author.firstName" and array access like "tags[0]".
4
+ *
5
+ * @example
6
+ * getByPath({ author: { firstName: "John" } }, "author.firstName") // "John"
7
+ * getByPath({ tags: ["a", "b"] }, "tags[1]") // "b"
8
+ * getByPath({ items: [{ name: "x" }] }, "items[0].name") // "x"
9
+ */
10
+ export declare function getByPath(obj: any, path: string): any;
11
+ /**
12
+ * Extract searchable text from an item using configured paths.
13
+ * If paths are provided, extracts and joins values from those paths.
14
+ * Otherwise, returns the item's 'text' field directly.
15
+ *
16
+ * @example
17
+ * // With paths
18
+ * extractText(
19
+ * { id: "1", text: "", metadata: { author: { name: "John" }, title: "Hello" } },
20
+ * ["metadata.author.name", "metadata.title"]
21
+ * ) // "John Hello"
22
+ *
23
+ * // Without paths
24
+ * extractText({ id: "1", text: "Hello World" }) // "Hello World"
25
+ */
26
+ export declare function extractText(item: any, paths?: string[]): string;
27
+ /**
28
+ * Normalize a score to a 0-1 range using min-max normalization.
29
+ * When min equals max, returns 1 if the value is positive and 0 otherwise.
30
+ */
31
+ export declare function normalizeScore(value: number, min: number, max: number): number;
package/dist/utils.js ADDED
@@ -0,0 +1,66 @@
1
+ /**
2
+ * Extract a value from an object using a dot-notation path.
3
+ * Supports nested paths like "author.firstName" and array access like "tags[0]".
4
+ *
5
+ * @example
6
+ * getByPath({ author: { firstName: "John" } }, "author.firstName") // "John"
7
+ * getByPath({ tags: ["a", "b"] }, "tags[1]") // "b"
8
+ * getByPath({ items: [{ name: "x" }] }, "items[0].name") // "x"
9
+ */
10
+ export function getByPath(obj, path) {
11
+ if (!obj || !path)
12
+ return undefined;
13
+ // Handle array notation: convert "items[0].name" to "items.0.name"
14
+ const normalizedPath = path.replace(/\[(\d+)\]/g, ".$1");
15
+ const keys = normalizedPath.split(".");
16
+ let current = obj;
17
+ for (const key of keys) {
18
+ if (current === null || current === undefined) {
19
+ return undefined;
20
+ }
21
+ current = current[key];
22
+ }
23
+ return current;
24
+ }
25
+ /**
26
+ * Extract searchable text from an item using configured paths.
27
+ * If paths are provided, extracts and joins values from those paths.
28
+ * Otherwise, returns the item's 'text' field directly.
29
+ *
30
+ * @example
31
+ * // With paths
32
+ * extractText(
33
+ * { id: "1", text: "", metadata: { author: { name: "John" }, title: "Hello" } },
34
+ * ["metadata.author.name", "metadata.title"]
35
+ * ) // "John Hello"
36
+ *
37
+ * // Without paths
38
+ * extractText({ id: "1", text: "Hello World" }) // "Hello World"
39
+ */
40
+ export function extractText(item, paths) {
41
+ if (!paths || paths.length === 0) {
42
+ return item.text || "";
43
+ }
44
+ const parts = [];
45
+ for (const path of paths) {
46
+ const value = getByPath(item, path);
47
+ if (value !== null && value !== undefined) {
48
+ if (Array.isArray(value)) {
49
+ parts.push(value.filter((v) => v != null).join(" "));
50
+ }
51
+ else {
52
+ parts.push(String(value));
53
+ }
54
+ }
55
+ }
56
+ return parts.join(" ").trim();
57
+ }
58
+ /**
59
+ * Normalize a score to a 0-1 range using min-max normalization.
60
+ * When min equals max, returns 1 if the value is positive and 0 otherwise.
61
+ */
62
+ export function normalizeScore(value, min, max) {
63
+ if (max === min)
64
+ return value > 0 ? 1 : 0;
65
+ return (value - min) / (max - min);
66
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "simile-search",
3
- "version": "0.2.0",
3
+ "version": "0.3.0",
4
4
  "description": "Offline-first semantic + fuzzy search engine for catalogs, names, and products",
5
5
  "main": "dist/index.js",
6
6
  "types": "dist/index.d.ts",