elid 0.2.1 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,11 +1,31 @@
1
- # ELID - Efficient Levenshtein and String Similarity Library
1
+ # ELID - Embedding Locality IDentifier
2
2
 
3
3
  [![CI](https://github.com/ZachHandley/ELID/actions/workflows/ci.yml/badge.svg)](https://github.com/ZachHandley/ELID/actions)
4
4
  [![License](https://img.shields.io/badge/license-MIT%2FApache--2.0-blue.svg)](LICENSE-MIT)
5
5
 
6
- A fast, zero-dependency Rust library for computing string similarity metrics with bindings for Python, JavaScript (WASM), and C.
6
+ **ELID** enables vector search without a vector store by encoding high-dimensional embeddings into sortable string IDs that preserve locality. Similar vectors produce similar IDs, allowing you to use standard database indexes for similarity search.
7
7
 
8
- ## Algorithms
8
+ ELID also includes a complete suite of fast string similarity algorithms.
9
+
10
+ ## Features
11
+
12
+ ### Embedding Encoding (Vector Search Without Vector Stores)
13
+
14
+ Convert embeddings from any ML model into compact, sortable identifiers:
15
+
16
+ | Profile | Output | Best For |
17
+ |---------|--------|----------|
18
+ | **Mini128** | 26-char base32hex | Fast similarity via Hamming distance |
19
+ | **Morton10x10** | 20-char base32hex | Database range queries (Z-order) |
20
+ | **Hilbert10x10** | 20-char base32hex | Maximum locality preservation |
21
+
22
+ **Key benefits:**
23
+ - Similar vectors produce similar IDs (locality preservation)
24
+ - IDs are lexicographically sortable for database indexing
25
+ - No vector store required - use any database with string indexes
26
+ - Deterministic: same embedding always produces the same ID
27
+
28
+ ### String Similarity Algorithms
9
29
 
10
30
  | Algorithm | Type | Best For |
11
31
  |-----------|------|----------|
@@ -23,8 +43,17 @@ A fast, zero-dependency Rust library for computing string similarity metrics wit
23
43
  ### Rust
24
44
 
25
45
  ```toml
46
+ # String similarity only (zero dependencies)
26
47
  [dependencies]
27
- elid = "0.2.1"
48
+ elid = "0.3"
49
+
50
+ # Embedding encoding
51
+ [dependencies]
52
+ elid = { version = "0.3", features = ["embeddings"] }
53
+
54
+ # Both features
55
+ [dependencies]
56
+ elid = { version = "0.3", features = ["strings", "embeddings"] }
28
57
  ```
29
58
 
30
59
  ### Python
@@ -45,6 +74,58 @@ Build with `cargo build --release --features ffi` to get `libelid.so` and `elid.
45
74
 
46
75
  ## Quick Start
47
76
 
77
+ ### Embedding Encoding (Rust)
78
+
79
+ ```rust
80
+ use elid::embeddings::{encode, Profile, Elid};
81
+
82
+ // Get an embedding from your ML model (e.g., OpenAI, Cohere, sentence-transformers)
83
+ let embedding: Vec<f32> = model.embed("Hello, world!")?;
84
+
85
+ // Encode to a sortable ELID
86
+ let profile = Profile::default(); // Mini128
87
+ let elid: Elid = encode(&embedding, &profile)?;
88
+
89
+ println!("ELID: {}", elid); // e.g., a 26-character base32hex string such as "01a3f5g7h9jklmnopqrstuv012"
90
+
91
+ // Similar texts produce similar ELIDs
92
+ let elid2 = encode(&model.embed("Hello, universe!")?, &profile)?;
93
+
94
+ // Compare similarity via Hamming distance
95
+ use elid::embeddings::hamming_distance;
96
+ let distance = hamming_distance(&elid, &elid2)?; // Lower = more similar
97
+ ```
98
+
99
+ ### Encoding Profiles
100
+
101
+ ```rust
102
+ use elid::embeddings::Profile;
103
+
104
+ // Mini128: 128-bit SimHash (default)
105
+ // Best for: Fast similarity search via Hamming distance
106
+ let mini = Profile::Mini128 {
107
+ seed: 0x454c4944_53494d48, // Deterministic seed
108
+ };
109
+
110
+ // Morton10x10: Z-order curve encoding
111
+ // Best for: Database range queries
112
+ let morton = Profile::Morton10x10 {
113
+ dims: 10,
114
+ bits_per_dim: 10,
115
+ transform_id: None,
116
+ };
117
+
118
+ // Hilbert10x10: Hilbert curve encoding
119
+ // Best for: Maximum locality preservation
120
+ let hilbert = Profile::Hilbert10x10 {
121
+ dims: 10,
122
+ bits_per_dim: 10,
123
+ transform_id: None,
124
+ };
125
+ ```
126
+
127
+ ### String Similarity (Rust)
128
+
48
129
  ```rust
49
130
  use elid::*;
50
131
 
@@ -71,9 +152,14 @@ let (idx, score) = find_best_match("app", &candidates);
71
152
  ```python
72
153
  import elid
73
154
 
155
+ # String similarity
74
156
  elid.levenshtein("kitten", "sitting") # 3
75
157
  elid.jaro_winkler("martha", "marhta") # 0.961
76
158
  elid.simhash_similarity("iPhone 14", "iPhone 15") # 0.922
159
+
160
+ # Embedding encoding (with embeddings feature)
161
+ embedding = model.embed("Hello, world!")
162
+ elid_str = elid.encode_embedding(embedding)
77
163
  ```
78
164
 
79
165
  ### JavaScript
@@ -102,12 +188,62 @@ let opts = SimilarityOpts {
102
188
  let distance = levenshtein_with_opts(" HELLO ", "hello", &opts); // 0
103
189
  ```
104
190
 
191
+ ## Feature Flags
192
+
193
+ | Feature | Description | Dependencies |
194
+ |---------|-------------|--------------|
195
+ | `strings` | String similarity algorithms (default) | None |
196
+ | `embeddings` | Embedding encoding (default) | rand, blake3, etc. |
197
+ | `wasm` | WebAssembly bindings (includes embeddings) | wasm-bindgen, js-sys, getrandom |
198
+ | `python` | Python bindings via PyO3 (includes embeddings) | pyo3, numpy, rayon |
199
+ | `ffi` | C FFI bindings | None (enables unsafe) |
200
+
105
201
  ## Performance
106
202
 
107
- - Zero external dependencies for core algorithms
203
+ - Zero external dependencies for string-only use
108
204
  - O(min(m,n)) space-optimized Levenshtein
109
205
  - 1.4M+ string comparisons per second (Python benchmarks)
110
- - ~96KB WASM binary
206
+ - ~96KB WASM binary (strings only)
207
+ - Embedding encoding: <1ms per vector
208
+
209
+ ## Use Cases
210
+
211
+ ### Vector Search Without Vector Stores
212
+
213
+ Store ELIDs directly in PostgreSQL, SQLite, or any database:
214
+
215
+ ```sql
216
+ -- Create index on ELID column
217
+ CREATE INDEX idx_documents_elid ON documents(elid);
218
+
219
+ -- Find similar documents using string prefix matching
220
+ SELECT * FROM documents
221
+ WHERE elid LIKE 'abc%' -- Prefix match for locality
222
+ ORDER BY elid;
223
+ ```
224
+
225
+ ### Deduplication
226
+
227
+ Use SimHash to find near-duplicate content:
228
+
229
+ ```rust
230
+ let hash1 = simhash("The quick brown fox");
231
+ let hash2 = simhash("The quick brown dog");
232
+ let similarity = simhash_similarity_from_hashes(hash1, hash2);
233
+ if similarity > 0.9 {
234
+ println!("Likely duplicates!");
235
+ }
236
+ ```
237
+
238
+ ### Fuzzy Search
239
+
240
+ Find matches with typo tolerance:
241
+
242
+ ```rust
243
+ let candidates = vec!["apple", "application", "apply", "banana"];
244
+ let matches = find_matches_above_threshold("aple", &candidates, 0.7);
245
+ // Returns: [("apple", 0.8), ...]
246
+ ```
111
247
 
112
248
  ## Building
113
249