elid 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +142 -6
- package/elid.d.ts +489 -87
- package/elid_bg.js +780 -182
- package/elid_bg.wasm +0 -0
- package/package.json +5 -5
package/README.md
CHANGED
|
@@ -1,11 +1,31 @@
|
|
|
1
|
-
# ELID -
|
|
1
|
+
# ELID - Embedding Locality IDentifier
|
|
2
2
|
|
|
3
3
|
[CI status](https://github.com/ZachHandley/ELID/actions)
|
|
4
4
|
[License: MIT](LICENSE-MIT)
|
|
5
5
|
|
|
6
|
-
|
|
6
|
+
**ELID** enables vector search without a vector store by encoding high-dimensional embeddings into sortable string IDs that preserve locality. Similar vectors produce similar IDs, allowing you to use standard database indexes for similarity search.
|
|
7
7
|
|
|
8
|
-
|
|
8
|
+
ELID also includes a complete suite of fast string similarity algorithms.
|
|
9
|
+
|
|
10
|
+
## Features
|
|
11
|
+
|
|
12
|
+
### Embedding Encoding (Vector Search Without Vector Stores)
|
|
13
|
+
|
|
14
|
+
Convert embeddings from any ML model into compact, sortable identifiers:
|
|
15
|
+
|
|
16
|
+
| Profile | Output | Best For |
|
|
17
|
+
|---------|--------|----------|
|
|
18
|
+
| **Mini128** | 26-char base32hex | Fast similarity via Hamming distance |
|
|
19
|
+
| **Morton10x10** | 20-char base32hex | Database range queries (Z-order) |
|
|
20
|
+
| **Hilbert10x10** | 20-char base32hex | Maximum locality preservation |
|
|
21
|
+
|
|
22
|
+
**Key benefits:**
|
|
23
|
+
- Similar vectors produce similar IDs (locality preservation)
|
|
24
|
+
- IDs are lexicographically sortable for database indexing
|
|
25
|
+
- No vector store required - use any database with string indexes
|
|
26
|
+
- Deterministic: same embedding always produces the same ID
|
|
27
|
+
|
|
28
|
+
### String Similarity Algorithms
|
|
9
29
|
|
|
10
30
|
| Algorithm | Type | Best For |
|
|
11
31
|
|-----------|------|----------|
|
|
@@ -23,8 +43,17 @@ A fast, zero-dependency Rust library for computing string similarity metrics wit
|
|
|
23
43
|
### Rust
|
|
24
44
|
|
|
25
45
|
```toml
|
|
46
|
+
# String similarity only (zero dependencies)
|
|
26
47
|
[dependencies]
|
|
27
|
-
elid = "0.
|
|
48
|
+
elid = "0.3"
|
|
49
|
+
|
|
50
|
+
# Embedding encoding
|
|
51
|
+
[dependencies]
|
|
52
|
+
elid = { version = "0.3", features = ["embeddings"] }
|
|
53
|
+
|
|
54
|
+
# Both features
|
|
55
|
+
[dependencies]
|
|
56
|
+
elid = { version = "0.3", features = ["strings", "embeddings"] }
|
|
28
57
|
```
|
|
29
58
|
|
|
30
59
|
### Python
|
|
@@ -45,6 +74,58 @@ Build with `cargo build --release --features ffi` to get `libelid.so` and `elid.
|
|
|
45
74
|
|
|
46
75
|
## Quick Start
|
|
47
76
|
|
|
77
|
+
### Embedding Encoding (Rust)
|
|
78
|
+
|
|
79
|
+
```rust
|
|
80
|
+
use elid::embeddings::{encode, Profile, Elid};
|
|
81
|
+
|
|
82
|
+
// Get an embedding from your ML model (e.g., OpenAI, Cohere, sentence-transformers)
|
|
83
|
+
let embedding: Vec<f32> = model.embed("Hello, world!")?;
|
|
84
|
+
|
|
85
|
+
// Encode to a sortable ELID
|
|
86
|
+
let profile = Profile::default(); // Mini128
|
|
87
|
+
let elid: Elid = encode(&embedding, &profile)?;
|
|
88
|
+
|
|
89
|
+
println!("ELID: {}", elid); // e.g., "01a3f5g7h9jklmnopqrstuv012" (26-char base32hex for Mini128)
|
|
90
|
+
|
|
91
|
+
// Similar texts produce similar ELIDs
|
|
92
|
+
let elid2 = encode(&model.embed("Hello, universe!")?, &profile)?;
|
|
93
|
+
|
|
94
|
+
// Compare similarity via Hamming distance
|
|
95
|
+
use elid::embeddings::hamming_distance;
|
|
96
|
+
let distance = hamming_distance(&elid, &elid2)?; // Lower = more similar
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### Encoding Profiles
|
|
100
|
+
|
|
101
|
+
```rust
|
|
102
|
+
use elid::embeddings::Profile;
|
|
103
|
+
|
|
104
|
+
// Mini128: 128-bit SimHash (default)
|
|
105
|
+
// Best for: Fast similarity search via Hamming distance
|
|
106
|
+
let mini = Profile::Mini128 {
|
|
107
|
+
seed: 0x454c4944_53494d48, // Deterministic seed
|
|
108
|
+
};
|
|
109
|
+
|
|
110
|
+
// Morton10x10: Z-order curve encoding
|
|
111
|
+
// Best for: Database range queries
|
|
112
|
+
let morton = Profile::Morton10x10 {
|
|
113
|
+
dims: 10,
|
|
114
|
+
bits_per_dim: 10,
|
|
115
|
+
transform_id: None,
|
|
116
|
+
};
|
|
117
|
+
|
|
118
|
+
// Hilbert10x10: Hilbert curve encoding
|
|
119
|
+
// Best for: Maximum locality preservation
|
|
120
|
+
let hilbert = Profile::Hilbert10x10 {
|
|
121
|
+
dims: 10,
|
|
122
|
+
bits_per_dim: 10,
|
|
123
|
+
transform_id: None,
|
|
124
|
+
};
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### String Similarity (Rust)
|
|
128
|
+
|
|
48
129
|
```rust
|
|
49
130
|
use elid::*;
|
|
50
131
|
|
|
@@ -71,9 +152,14 @@ let (idx, score) = find_best_match("app", &candidates);
|
|
|
71
152
|
```python
|
|
72
153
|
import elid
|
|
73
154
|
|
|
155
|
+
# String similarity
|
|
74
156
|
elid.levenshtein("kitten", "sitting") # 3
|
|
75
157
|
elid.jaro_winkler("martha", "marhta") # 0.961
|
|
76
158
|
elid.simhash_similarity("iPhone 14", "iPhone 15") # 0.922
|
|
159
|
+
|
|
160
|
+
# Embedding encoding (with embeddings feature)
|
|
161
|
+
embedding = model.embed("Hello, world!")
|
|
162
|
+
elid_str = elid.encode_embedding(embedding)
|
|
77
163
|
```
|
|
78
164
|
|
|
79
165
|
### JavaScript
|
|
@@ -102,12 +188,62 @@ let opts = SimilarityOpts {
|
|
|
102
188
|
let distance = levenshtein_with_opts(" HELLO ", "hello", &opts); // 0
|
|
103
189
|
```
|
|
104
190
|
|
|
191
|
+
## Feature Flags
|
|
192
|
+
|
|
193
|
+
| Feature | Description | Dependencies |
|
|
194
|
+
|---------|-------------|--------------|
|
|
195
|
+
| `strings` | String similarity algorithms (default) | None |
|
|
196
|
+
| `embeddings` | Embedding encoding (default) | rand, blake3, etc. |
|
|
197
|
+
| `wasm` | WebAssembly bindings (includes embeddings) | wasm-bindgen, js-sys, getrandom |
|
|
198
|
+
| `python` | Python bindings via PyO3 (includes embeddings) | pyo3, numpy, rayon |
|
|
199
|
+
| `ffi` | C FFI bindings | None (enables unsafe) |
|
|
200
|
+
|
|
105
201
|
## Performance
|
|
106
202
|
|
|
107
|
-
- Zero external dependencies for
|
|
203
|
+
- Zero external dependencies for string-only use
|
|
108
204
|
- O(min(m,n)) space-optimized Levenshtein
|
|
109
205
|
- 1.4M+ string comparisons per second (Python benchmarks)
|
|
110
|
-
- ~96KB WASM binary
|
|
206
|
+
- ~96KB WASM binary (strings only)
|
|
207
|
+
- Embedding encoding: <1ms per vector
|
|
208
|
+
|
|
209
|
+
## Use Cases
|
|
210
|
+
|
|
211
|
+
### Vector Search Without Vector Stores
|
|
212
|
+
|
|
213
|
+
Store ELIDs directly in PostgreSQL, SQLite, or any database:
|
|
214
|
+
|
|
215
|
+
```sql
|
|
216
|
+
-- Create index on ELID column
|
|
217
|
+
CREATE INDEX idx_documents_elid ON documents(elid);
|
|
218
|
+
|
|
219
|
+
-- Find similar documents using string prefix matching
|
|
220
|
+
SELECT * FROM documents
|
|
221
|
+
WHERE elid LIKE 'abc%' -- Prefix match for locality
|
|
222
|
+
ORDER BY elid;
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
### Deduplication
|
|
226
|
+
|
|
227
|
+
Use SimHash to find near-duplicate content:
|
|
228
|
+
|
|
229
|
+
```rust
|
|
230
|
+
let hash1 = simhash("The quick brown fox");
|
|
231
|
+
let hash2 = simhash("The quick brown dog");
|
|
232
|
+
let similarity = simhash_similarity_from_hashes(hash1, hash2);
|
|
233
|
+
if similarity > 0.9 {
|
|
234
|
+
println!("Likely duplicates!");
|
|
235
|
+
}
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
### Fuzzy Search
|
|
239
|
+
|
|
240
|
+
Find matches with typo tolerance:
|
|
241
|
+
|
|
242
|
+
```rust
|
|
243
|
+
let candidates = vec!["apple", "application", "apply", "banana"];
|
|
244
|
+
let matches = find_matches_above_threshold("aple", &candidates, 0.7);
|
|
245
|
+
// Returns: [("apple", 0.8), ...]
|
|
246
|
+
```
|
|
111
247
|
|
|
112
248
|
## Building
|
|
113
249
|
|