embrix 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +289 -0
- package/dist/index.cjs +708 -0
- package/dist/index.d.cts +547 -0
- package/dist/index.d.ts +547 -0
- package/dist/index.js +676 -0
- package/package.json +67 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 embrix
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
# embrix
|
|
2
|
+
|
|
3
|
+
[](https://www.npmjs.com/package/embrix)
|
|
4
|
+
[](https://www.npmjs.com/package/embrix)
|
|
5
|
+
[](https://opensource.org/licenses/MIT)
|
|
6
|
+
[](https://example.com/paper.pdf)
|
|
7
|
+
|
|
8
|
+
Production-ready local text embeddings using `@xenova/transformers`. Zero external API calls, runs entirely in Node.js.
|
|
9
|
+
|
|
10
|
+
## Features
|
|
11
|
+
|
|
12
|
+
- Local Execution - No API calls, runs entirely on your machine
|
|
13
|
+
- Two Optimized Models - MiniLM and BGE for different use cases
|
|
14
|
+
- Minimal Dependencies - Only `@xenova/transformers` as a runtime dependency
|
|
15
|
+
- Type-Safe - Full TypeScript support with strict typing
|
|
16
|
+
- Efficient - Lazy loading, singleton pattern, batch processing
|
|
17
|
+
- Benchmark Tools - Built-in performance measurement utilities
|
|
18
|
+
|
|
19
|
+
## Installation
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
npm install embrix
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Quick Start
|
|
26
|
+
|
|
27
|
+
```typescript
|
|
28
|
+
import { Embedder, EmbeddingModel, cosineSimilarity } from 'embrix';
|
|
29
|
+
|
|
30
|
+
// Create an embedder
|
|
31
|
+
const embedder = new Embedder(EmbeddingModel.MiniLM);
|
|
32
|
+
|
|
33
|
+
// Generate a single embedding
|
|
34
|
+
const embedding = await embedder.embed("Hello, world!");
|
|
35
|
+
console.log(embedding.length); // 384
|
|
36
|
+
|
|
37
|
+
// Generate batch embeddings
|
|
38
|
+
const embeddings = await embedder.embedBatch([
|
|
39
|
+
"Hello, world!",
|
|
40
|
+
"Goodbye, world!"
|
|
41
|
+
]);
|
|
42
|
+
|
|
43
|
+
// Compare similarity
|
|
44
|
+
const hello = await embedder.embed("Hello!");
|
|
45
|
+
const goodbye = await embedder.embed("Goodbye!");
|
|
46
|
+
const similarity = cosineSimilarity(hello, goodbye);
|
|
47
|
+
console.log(`Similarity: ${similarity}`);
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Supported Models
|
|
51
|
+
|
|
52
|
+
| Model | Enum | Dimensions | Description |
|
|
53
|
+
|-------|------|------------|-------------|
|
|
54
|
+
| all-MiniLM-L6-v2 | `EmbeddingModel.MiniLM` | 384 | Fast and efficient, great for most use cases |
|
|
55
|
+
| bge-small-en-v1.5 | `EmbeddingModel.BGE` | 384 | High quality English embeddings from BAAI |
|
|
56
|
+
|
|
57
|
+
## API Reference
|
|
58
|
+
|
|
59
|
+
### Embedder Class
|
|
60
|
+
|
|
61
|
+
```typescript
|
|
62
|
+
import { Embedder, EmbeddingModel } from 'embrix';
|
|
63
|
+
|
|
64
|
+
const embedder = new Embedder(EmbeddingModel.MiniLM);
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
#### Properties
|
|
68
|
+
|
|
69
|
+
- `dimension: number` - Embedding dimension (384 for both models)
|
|
70
|
+
- `modelName: string` - Human-readable model name
|
|
71
|
+
- `modelType: EmbeddingModel` - The model enum value
|
|
72
|
+
|
|
73
|
+
#### Methods
|
|
74
|
+
|
|
75
|
+
##### `embed(text: string, options?): Promise<Float32Array>`
|
|
76
|
+
|
|
77
|
+
Generate an embedding for a single text.
|
|
78
|
+
|
|
79
|
+
```typescript
|
|
80
|
+
const vector = await embedder.embed("Your text here");
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
##### `embedBatch(texts: string[], options?): Promise<Float32Array[]>`
|
|
84
|
+
|
|
85
|
+
Generate embeddings for multiple texts efficiently.
|
|
86
|
+
|
|
87
|
+
```typescript
|
|
88
|
+
const vectors = await embedder.embedBatch(["Text 1", "Text 2", "Text 3"]);
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
##### `embedWithMetadata(text: string, options?): Promise<EmbeddingResult>`
|
|
92
|
+
|
|
93
|
+
Generate embedding with full metadata.
|
|
94
|
+
|
|
95
|
+
```typescript
|
|
96
|
+
const result = await embedder.embedWithMetadata("Your text");
|
|
97
|
+
console.log(result.model); // EmbeddingModel.MiniLM
|
|
98
|
+
console.log(result.dimension); // 384
|
|
99
|
+
console.log(result.embedding); // Float32Array
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### Similarity Functions
|
|
103
|
+
|
|
104
|
+
```typescript
|
|
105
|
+
import {
|
|
106
|
+
cosineSimilarity,
|
|
107
|
+
dotProduct,
|
|
108
|
+
euclideanDistance,
|
|
109
|
+
manhattanDistance,
|
|
110
|
+
findMostSimilar,
|
|
111
|
+
findKMostSimilar
|
|
112
|
+
} from 'embrix';
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
#### `cosineSimilarity(a: Float32Array, b: Float32Array): number`
|
|
116
|
+
|
|
117
|
+
Calculate cosine similarity between two vectors. Range: [-1, 1].
|
|
118
|
+
|
|
119
|
+
```typescript
|
|
120
|
+
const similarity = cosineSimilarity(vector1, vector2);
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
#### `dotProduct(a: Float32Array, b: Float32Array): number`
|
|
124
|
+
|
|
125
|
+
Calculate dot product of two vectors.
|
|
126
|
+
|
|
127
|
+
```typescript
|
|
128
|
+
const product = dotProduct(vector1, vector2);
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
#### `euclideanDistance(a: Float32Array, b: Float32Array): number`
|
|
132
|
+
|
|
133
|
+
Calculate Euclidean (L2) distance between two vectors.
|
|
134
|
+
|
|
135
|
+
```typescript
|
|
136
|
+
const distance = euclideanDistance(vector1, vector2);
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
#### `findMostSimilar(query: Float32Array, candidates: Float32Array[])`
|
|
140
|
+
|
|
141
|
+
Find the most similar vector to a query.
|
|
142
|
+
|
|
143
|
+
```typescript
|
|
144
|
+
const query = await embedder.embed("search query");
|
|
145
|
+
const docs = await embedder.embedBatch(["doc 1", "doc 2", "doc 3"]);
|
|
146
|
+
const best = findMostSimilar(query, docs);
|
|
147
|
+
console.log(`Best match: index ${best.index}, similarity ${best.similarity}`);
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
#### `findKMostSimilar(query: Float32Array, candidates: Float32Array[], k: number)`
|
|
151
|
+
|
|
152
|
+
Find the k most similar vectors.
|
|
153
|
+
|
|
154
|
+
```typescript
|
|
155
|
+
const top5 = findKMostSimilar(query, docs, 5);
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
### Model Loading
|
|
159
|
+
|
|
160
|
+
```typescript
|
|
161
|
+
import { preloadModel, preloadAllModels, isModelLoaded, clearModelCache } from 'embrix';
|
|
162
|
+
|
|
163
|
+
// Preload a specific model
|
|
164
|
+
await preloadModel(EmbeddingModel.MiniLM);
|
|
165
|
+
|
|
166
|
+
// Preload all models
|
|
167
|
+
await preloadAllModels();
|
|
168
|
+
|
|
169
|
+
// Check if model is loaded
|
|
170
|
+
if (isModelLoaded(EmbeddingModel.MiniLM)) {
|
|
171
|
+
// Model is ready
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
// Clear cache to free memory
|
|
175
|
+
clearModelCache();
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
### Benchmark Utilities
|
|
179
|
+
|
|
180
|
+
```typescript
|
|
181
|
+
import { runBenchmark, runAllBenchmarks, formatBenchmarkResult } from 'embrix';
|
|
182
|
+
|
|
183
|
+
// Benchmark a single model
|
|
184
|
+
const results = await runBenchmark(EmbeddingModel.MiniLM);
|
|
185
|
+
console.log(formatBenchmarkResult(results));
|
|
186
|
+
|
|
187
|
+
// Benchmark all models
|
|
188
|
+
const allResults = await runAllBenchmarks();
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
## CLI Benchmark
|
|
192
|
+
|
|
193
|
+
Run benchmarks from the command line:
|
|
194
|
+
|
|
195
|
+
```bash
|
|
196
|
+
npm run benchmark
|
|
197
|
+
|
|
198
|
+
# Options
|
|
199
|
+
npm run benchmark -- --model minilm
|
|
200
|
+
npm run benchmark -- --batch-size 50
|
|
201
|
+
npm run benchmark -- --help
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
## Example Output
|
|
205
|
+
|
|
206
|
+
```
|
|
207
|
+
============================================================
|
|
208
|
+
Benchmark: all-MiniLM-L6-v2
|
|
209
|
+
Model: minilm
|
|
210
|
+
Dimension: 384
|
|
211
|
+
============================================================
|
|
212
|
+
|
|
213
|
+
Running cold start benchmark...
|
|
214
|
+
Duration: 2345.67ms
|
|
215
|
+
|
|
216
|
+
Running warm start benchmark...
|
|
217
|
+
Duration: 12.34ms
|
|
218
|
+
Speedup: 190.15x faster than cold start
|
|
219
|
+
|
|
220
|
+
Running batch benchmark (100 texts)...
|
|
221
|
+
Total duration: 567.89ms
|
|
222
|
+
Avg per embedding: 5.68ms
|
|
223
|
+
Throughput: 176.09 embeddings/sec
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
## Architecture
|
|
227
|
+
|
|
228
|
+
```
|
|
229
|
+
embrix/
|
|
230
|
+
├── src/
|
|
231
|
+
│ ├── models.ts # Model definitions and metadata
|
|
232
|
+
│ ├── loader.ts # Lazy singleton model loader
|
|
233
|
+
│ ├── embedder.ts # Core embedding class
|
|
234
|
+
│ ├── similarity.ts # Vector similarity functions
|
|
235
|
+
│ ├── benchmark.ts # Performance measurement utilities
|
|
236
|
+
│ └── index.ts # Barrel export
|
|
237
|
+
├── scripts/
|
|
238
|
+
│ └── benchmark.ts # CLI benchmark script
|
|
239
|
+
├── examples/
|
|
240
|
+
│ └── usage.ts # Usage examples
|
|
241
|
+
├── package.json
|
|
242
|
+
├── tsconfig.json
|
|
243
|
+
└── README.md
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
## Design Decisions
|
|
247
|
+
|
|
248
|
+
### Lazy Singleton Loading
|
|
249
|
+
|
|
250
|
+
Models are loaded on-demand and cached in memory. This ensures:
|
|
251
|
+
- Fast subsequent calls (warm start)
|
|
252
|
+
- No duplicate model loads
|
|
253
|
+
- Memory efficiency
|
|
254
|
+
|
|
255
|
+
### Float32Array Throughout
|
|
256
|
+
|
|
257
|
+
All embeddings are returned as `Float32Array` for:
|
|
258
|
+
- Consistency with the underlying tensor output
|
|
259
|
+
- Memory efficiency vs regular arrays
|
|
260
|
+
- Compatibility with WebAssembly and GPU operations
|
|
261
|
+
|
|
262
|
+
### Minimal External Dependencies
|
|
263
|
+
|
|
264
|
+
Only `@xenova/transformers` is required. This keeps the package:
|
|
265
|
+
- Lightweight
|
|
266
|
+
- Secure
|
|
267
|
+
- Easy to audit
|
|
268
|
+
|
|
269
|
+
## Requirements
|
|
270
|
+
|
|
271
|
+
- Node.js >= 18.0.0
|
|
272
|
+
- ~500MB disk space for model cache (first run)
|
|
273
|
+
|
|
274
|
+
## Citation
|
|
275
|
+
|
|
276
|
+
If you use embrix in your research, please cite:
|
|
277
|
+
|
|
278
|
+
```bibtex
|
|
279
|
+
@software{embrix2024,
|
|
280
|
+
title = {embrix: Production-Ready Local Text Embeddings for Node.js},
|
|
281
|
+
author = {embrix contributors},
|
|
282
|
+
year = {2024},
|
|
283
|
+
url = {https://github.com/yourusername/embrix}
|
|
284
|
+
}
|
|
285
|
+
```
|
|
286
|
+
|
|
287
|
+
## License
|
|
288
|
+
|
|
289
|
+
MIT
|