lavinhash 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +501 -0
- package/lavinhash.d.ts +47 -0
- package/lavinhash.js +5 -0
- package/lavinhash_bg.js +294 -0
- package/lavinhash_bg.wasm +0 -0
- package/package.json +46 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 LavinHash Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,501 @@
|
|
|
1
|
+
# LavinHash
|
|
2
|
+
|
|
3
|
+
High-performance fuzzy hashing library implementing the Dual-Layer Adaptive Hashing (DLAH) algorithm for detecting file and text similarity.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
LavinHash is a Rust-based fuzzy hashing library that analyzes both structural patterns and content features to compute similarity scores between data. The library uses a dual-layer approach that separates structural similarity (topology) from content similarity (semantic features), providing accurate similarity detection even for modified or partially similar data.
|
|
8
|
+
|
|
9
|
+
**Key Features:**
|
|
10
|
+
|
|
11
|
+
- Dual-layer similarity analysis (structure + content)
|
|
12
|
+
- Adaptive scaling for constant-time comparison regardless of file size
|
|
13
|
+
- Cross-platform support (Linux, macOS, Windows, WebAssembly)
|
|
14
|
+
- High performance with SIMD optimizations and parallel processing
|
|
15
|
+
- Multiple language bindings (JavaScript/TypeScript, with more planned)
|
|
16
|
+
- Deterministic hashing across all platforms
|
|
17
|
+
|
|
18
|
+
## Installation
|
|
19
|
+
|
|
20
|
+
### JavaScript/TypeScript (npm)
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
npm install lavinhash
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
### Rust (crates.io)
|
|
27
|
+
|
|
28
|
+
```toml
|
|
29
|
+
[dependencies]
|
|
30
|
+
lavinhash = "1.0"
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
### Building from Source
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
git clone https://github.com/RafaCalRob/LavinHash.git
|
|
37
|
+
cd LavinHash
|
|
38
|
+
cargo build --release
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Quick Start
|
|
42
|
+
|
|
43
|
+
### React (Vite, Create React App, Next.js)
|
|
44
|
+
|
|
45
|
+
```javascript
|
|
46
|
+
import { wasm_compare_data } from 'lavinhash';
|
|
47
|
+
|
|
48
|
+
function App() {
|
|
49
|
+
const checkSimilarity = () => {
|
|
50
|
+
const encoder = new TextEncoder();
|
|
51
|
+
const text1 = encoder.encode("The quick brown fox jumps over the lazy dog");
|
|
52
|
+
const text2 = encoder.encode("The quick brown fox leaps over the lazy dog");
|
|
53
|
+
|
|
54
|
+
const similarity = wasm_compare_data(text1, text2);
|
|
55
|
+
console.log(`Similarity: ${similarity}%`); // Output: Similarity: 95%
|
|
56
|
+
};
|
|
57
|
+
|
|
58
|
+
return <button onClick={checkSimilarity}>Check Similarity</button>;
|
|
59
|
+
}
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### Angular
|
|
63
|
+
|
|
64
|
+
```typescript
|
|
65
|
+
import { Component } from '@angular/core';
|
|
66
|
+
import { wasm_compare_data } from 'lavinhash';
|
|
67
|
+
|
|
68
|
+
@Component({
|
|
69
|
+
selector: 'app-root',
|
|
70
|
+
template: '<button (click)="checkSimilarity()">Check Similarity</button>'
|
|
71
|
+
})
|
|
72
|
+
export class AppComponent {
|
|
73
|
+
checkSimilarity() {
|
|
74
|
+
const encoder = new TextEncoder();
|
|
75
|
+
const text1 = encoder.encode("Sample text");
|
|
76
|
+
const text2 = encoder.encode("Sample text modified");
|
|
77
|
+
|
|
78
|
+
const similarity = wasm_compare_data(text1, text2);
|
|
79
|
+
console.log(`Similarity: ${similarity}%`);
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
### Vue 3 (Vite, Nuxt 3)
|
|
85
|
+
|
|
86
|
+
```vue
|
|
87
|
+
<script setup>
|
|
88
|
+
import { wasm_compare_data } from 'lavinhash';
|
|
89
|
+
|
|
90
|
+
const checkSimilarity = () => {
|
|
91
|
+
const encoder = new TextEncoder();
|
|
92
|
+
const text1 = encoder.encode("Sample text");
|
|
93
|
+
const text2 = encoder.encode("Sample text modified");
|
|
94
|
+
|
|
95
|
+
const similarity = wasm_compare_data(text1, text2);
|
|
96
|
+
console.log(`Similarity: ${similarity}%`);
|
|
97
|
+
};
|
|
98
|
+
</script>
|
|
99
|
+
|
|
100
|
+
<template>
|
|
101
|
+
<button @click="checkSimilarity">Check Similarity</button>
|
|
102
|
+
</template>
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
### Vanilla JavaScript (with bundler)
|
|
106
|
+
|
|
107
|
+
```javascript
|
|
108
|
+
import { wasm_compare_data } from 'lavinhash';
|
|
109
|
+
|
|
110
|
+
const encoder = new TextEncoder();
|
|
111
|
+
const text1 = encoder.encode("Sample text");
|
|
112
|
+
const text2 = encoder.encode("Sample text modified");
|
|
113
|
+
|
|
114
|
+
const similarity = wasm_compare_data(text1, text2);
|
|
115
|
+
console.log(`Similarity: ${similarity}%`);
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### Rust
|
|
119
|
+
|
|
120
|
+
```rust
|
|
121
|
+
use lavinhash::{generate_hash, compare_hashes, HashConfig};
|
|
122
|
+
|
|
123
|
+
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
124
|
+
let data1 = b"Document content version 1";
|
|
125
|
+
let data2 = b"Document content version 2";
|
|
126
|
+
|
|
127
|
+
let config = HashConfig::default();
|
|
128
|
+
|
|
129
|
+
let hash1 = generate_hash(data1, &config)?;
|
|
130
|
+
let hash2 = generate_hash(data2, &config)?;
|
|
131
|
+
|
|
132
|
+
let similarity = compare_hashes(&hash1, &hash2, 0.3);
|
|
133
|
+
println!("Similarity: {}%", similarity);
|
|
134
|
+
|
|
135
|
+
Ok(())
|
|
136
|
+
}
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
## API Reference
|
|
140
|
+
|
|
141
|
+
### JavaScript/WASM API
|
|
142
|
+
|
|
143
|
+
#### `wasm_generate_hash(data: Uint8Array): Uint8Array`
|
|
144
|
+
|
|
145
|
+
Generates a fuzzy hash fingerprint from input data.
|
|
146
|
+
|
|
147
|
+
**Parameters:**
|
|
148
|
+
- `data`: Input data as Uint8Array
|
|
149
|
+
|
|
150
|
+
**Returns:**
|
|
151
|
+
- Serialized fingerprint (approximately 1KB)
|
|
152
|
+
|
|
153
|
+
**Example:**
|
|
154
|
+
```javascript
|
|
155
|
+
const data = new TextEncoder().encode("Text to hash");
|
|
156
|
+
const hash = wasm_generate_hash(data);
|
|
157
|
+
console.log(`Hash size: ${hash.length} bytes`);
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
#### `wasm_compare_hashes(hash_a: Uint8Array, hash_b: Uint8Array): number`
|
|
161
|
+
|
|
162
|
+
Compares two previously generated hashes.
|
|
163
|
+
|
|
164
|
+
**Parameters:**
|
|
165
|
+
- `hash_a`: First fingerprint
|
|
166
|
+
- `hash_b`: Second fingerprint
|
|
167
|
+
|
|
168
|
+
**Returns:**
|
|
169
|
+
- Similarity score (0-100)
|
|
170
|
+
|
|
171
|
+
**Example:**
|
|
172
|
+
```javascript
|
|
173
|
+
const hash1 = wasm_generate_hash(data1);
|
|
174
|
+
const hash2 = wasm_generate_hash(data2);
|
|
175
|
+
const similarity = wasm_compare_hashes(hash1, hash2);
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
#### `wasm_compare_data(data_a: Uint8Array, data_b: Uint8Array): number`
|
|
179
|
+
|
|
180
|
+
Generates hashes and compares in a single operation.
|
|
181
|
+
|
|
182
|
+
**Parameters:**
|
|
183
|
+
- `data_a`: First data array
|
|
184
|
+
- `data_b`: Second data array
|
|
185
|
+
|
|
186
|
+
**Returns:**
|
|
187
|
+
- Similarity score (0-100)
|
|
188
|
+
|
|
189
|
+
**Example:**
|
|
190
|
+
```javascript
|
|
191
|
+
const similarity = wasm_compare_data(text1, text2);
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
### Rust API
|
|
195
|
+
|
|
196
|
+
#### `generate_hash(data: &[u8], config: &HashConfig) -> Result<FuzzyFingerprint, FingerprintError>`
|
|
197
|
+
|
|
198
|
+
Generates a fuzzy hash from input data.
|
|
199
|
+
|
|
200
|
+
**Parameters:**
|
|
201
|
+
- `data`: Input data slice
|
|
202
|
+
- `config`: Configuration options
|
|
203
|
+
|
|
204
|
+
**Returns:**
|
|
205
|
+
- `Ok(FuzzyFingerprint)`: Generated fingerprint
|
|
206
|
+
- `Err(FingerprintError)`: Error if data is invalid
|
|
207
|
+
|
|
208
|
+
#### `compare_hashes(hash_a: &FuzzyFingerprint, hash_b: &FuzzyFingerprint, alpha: f32) -> u8`
|
|
209
|
+
|
|
210
|
+
Compares two fingerprints.
|
|
211
|
+
|
|
212
|
+
**Parameters:**
|
|
213
|
+
- `hash_a`: First fingerprint
|
|
214
|
+
- `hash_b`: Second fingerprint
|
|
215
|
+
- `alpha`: Weight coefficient (0.0-1.0, default 0.3)
|
|
216
|
+
|
|
217
|
+
**Returns:**
|
|
218
|
+
- Similarity score (0-100)
|
|
219
|
+
|
|
220
|
+
#### `HashConfig`
|
|
221
|
+
|
|
222
|
+
Configuration structure for hash generation.
|
|
223
|
+
|
|
224
|
+
**Fields:**
|
|
225
|
+
- `enable_parallel: bool` - Enable parallel processing for large files (default: true)
|
|
226
|
+
- `alpha: f32` - Weight given to structural similarity; content similarity is weighted by `1 - alpha` (default: 0.3)
|
|
227
|
+
- `min_modulus: u64` - Feature density control; lower values select more features (higher sensitivity), higher values select fewer (default: 16)
|
|
228
|
+
|
|
229
|
+
**Example:**
|
|
230
|
+
```rust
|
|
231
|
+
let mut config = HashConfig::default();
|
|
232
|
+
config.alpha = 0.5; // 50% structure, 50% content
|
|
233
|
+
config.enable_parallel = false; // Disable parallel processing
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
## Algorithm Details
|
|
237
|
+
|
|
238
|
+
### DLAH (Dual-Layer Adaptive Hashing)
|
|
239
|
+
|
|
240
|
+
LavinHash implements a three-phase pipeline:
|
|
241
|
+
|
|
242
|
+
**Phase I: Adaptive Normalization**
|
|
243
|
+
- Case folding (A-Z to a-z)
|
|
244
|
+
- Whitespace normalization
|
|
245
|
+
- Control character filtering
|
|
246
|
+
- Zero-copy iterator-based processing
|
|
247
|
+
|
|
248
|
+
**Phase II: Structural Hash**
|
|
249
|
+
- Shannon entropy calculation with adaptive block sizing
|
|
250
|
+
- Quantization to 4-bit nibbles
|
|
251
|
+
- Compact vector representation
|
|
252
|
+
- Levenshtein distance for comparison
|
|
253
|
+
|
|
254
|
+
**Phase III: Content Hash**
|
|
255
|
+
- BuzHash rolling hash algorithm
|
|
256
|
+
- Adaptive modulus scaling
|
|
257
|
+
- 8192-bit Bloom filter (1KB)
|
|
258
|
+
- Jaccard similarity for comparison
|
|
259
|
+
|
|
260
|
+
### Similarity Formula
|
|
261
|
+
|
|
262
|
+
```
|
|
263
|
+
Similarity = α × Levenshtein(Structure) + (1-α) × Jaccard(Content)
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
Where:
|
|
267
|
+
- `α = 0.3` (default) - 30% weight to structure, 70% to content
|
|
268
|
+
- Levenshtein: Structural similarity derived from the normalized edit distance between entropy vectors
|
|
269
|
+
- Jaccard: Set similarity on Bloom filter features
|
|
270
|
+
|
|
271
|
+
### Performance Characteristics
|
|
272
|
+
|
|
273
|
+
**Time Complexity:**
|
|
274
|
+
- Hash generation: O(n) where n is data size
|
|
275
|
+
- Hash comparison: O(1) - constant time regardless of file size
|
|
276
|
+
|
|
277
|
+
**Space Complexity:**
|
|
278
|
+
- Fingerprint size: ~1KB + O(log n) structural data
|
|
279
|
+
- Memory usage: O(1) for comparison, O(n) for generation
|
|
280
|
+
|
|
281
|
+
**Throughput:**
|
|
282
|
+
- Single-threaded: ~500 MB/s
|
|
283
|
+
- Multi-threaded: ~2 GB/s (files larger than 1MB)
|
|
284
|
+
|
|
285
|
+
## Configuration
|
|
286
|
+
|
|
287
|
+
### Basic Configuration
|
|
288
|
+
|
|
289
|
+
```rust
|
|
290
|
+
use lavinhash::HashConfig;
|
|
291
|
+
|
|
292
|
+
let config = HashConfig {
|
|
293
|
+
enable_parallel: true,
|
|
294
|
+
alpha: 0.3,
|
|
295
|
+
min_modulus: 16,
|
|
296
|
+
};
|
|
297
|
+
```
|
|
298
|
+
|
|
299
|
+
### Advanced Configuration
|
|
300
|
+
|
|
301
|
+
**Adjusting Structure vs Content Weight:**
|
|
302
|
+
|
|
303
|
+
```rust
|
|
304
|
+
// More weight to structure (topology)
|
|
305
|
+
config.alpha = 0.5; // 50% structure, 50% content
|
|
306
|
+
|
|
307
|
+
// More weight to content (features)
|
|
308
|
+
config.alpha = 0.1; // 10% structure, 90% content
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
**Controlling Feature Density:**
|
|
312
|
+
|
|
313
|
+
```rust
|
|
314
|
+
// Higher sensitivity (more features)
|
|
315
|
+
config.min_modulus = 8;
|
|
316
|
+
|
|
317
|
+
// Lower sensitivity (fewer features)
|
|
318
|
+
config.min_modulus = 32;
|
|
319
|
+
```
|
|
320
|
+
|
|
321
|
+
**Parallel Processing:**
|
|
322
|
+
|
|
323
|
+
```rust
|
|
324
|
+
// Force sequential processing
|
|
325
|
+
config.enable_parallel = false;
|
|
326
|
+
|
|
327
|
+
// Enable automatic parallel processing for files > 1MB
|
|
328
|
+
config.enable_parallel = true;
|
|
329
|
+
```
|
|
330
|
+
|
|
331
|
+
## Use Cases
|
|
332
|
+
|
|
333
|
+
### Document Similarity Detection
|
|
334
|
+
|
|
335
|
+
Compare different versions of documents to detect modifications and measure similarity.
|
|
336
|
+
|
|
337
|
+
```javascript
|
|
338
|
+
import { wasm_compare_data } from 'lavinhash';
|
|
339
|
+
|
|
340
|
+
// In a React/Vue/Angular app with file upload
|
|
341
|
+
async function compareDocuments(file1, file2) {
|
|
342
|
+
const buffer1 = await file1.arrayBuffer();
|
|
343
|
+
const buffer2 = await file2.arrayBuffer();
|
|
344
|
+
|
|
345
|
+
const data1 = new Uint8Array(buffer1);
|
|
346
|
+
const data2 = new Uint8Array(buffer2);
|
|
347
|
+
|
|
348
|
+
const similarity = wasm_compare_data(data1, data2);
|
|
349
|
+
console.log(`Similarity: ${similarity}%`);
|
|
350
|
+
return similarity;
|
|
351
|
+
}
|
|
352
|
+
```
|
|
353
|
+
|
|
354
|
+
### Duplicate Detection
|
|
355
|
+
|
|
356
|
+
Identify duplicate or near-duplicate files in large datasets.
|
|
357
|
+
|
|
358
|
+
```rust
|
|
359
|
+
let files = vec![file1, file2, file3];
|
|
360
|
+
let hashes: Vec<_> = files.iter()
|
|
361
|
+
.map(|f| generate_hash(f, &config).unwrap())
|
|
362
|
+
.collect();
|
|
363
|
+
|
|
364
|
+
for i in 0..hashes.len() {
|
|
365
|
+
for j in i+1..hashes.len() {
|
|
366
|
+
let sim = compare_hashes(&hashes[i], &hashes[j], 0.3);
|
|
367
|
+
if sim > 90 {
|
|
368
|
+
println!("Files {} and {} are similar: {}%", i, j, sim);
|
|
369
|
+
}
|
|
370
|
+
}
|
|
371
|
+
}
|
|
372
|
+
```
|
|
373
|
+
|
|
374
|
+
### Version Tracking
|
|
375
|
+
|
|
376
|
+
Track changes between different versions of files or content.
|
|
377
|
+
|
|
378
|
+
```javascript
|
|
379
|
+
import { wasm_generate_hash, wasm_compare_hashes } from 'lavinhash';
|
|
380
|
+
|
|
381
|
+
// Compare multiple file versions
|
|
382
|
+
async function trackVersions(files) {
|
|
383
|
+
const encoder = new TextEncoder();
|
|
384
|
+
|
|
385
|
+
const hashes = files.map(content => {
|
|
386
|
+
const data = encoder.encode(content);
|
|
387
|
+
return wasm_generate_hash(data);
|
|
388
|
+
});
|
|
389
|
+
|
|
390
|
+
const results = [];
|
|
391
|
+
for (let i = 0; i < hashes.length - 1; i++) {
|
|
392
|
+
const sim = wasm_compare_hashes(hashes[i], hashes[i + 1]);
|
|
393
|
+
results.push({
|
|
394
|
+
from: `v${i+1}`,
|
|
395
|
+
to: `v${i+2}`,
|
|
396
|
+
similarity: sim
|
|
397
|
+
});
|
|
398
|
+
}
|
|
399
|
+
|
|
400
|
+
return results;
|
|
401
|
+
}
|
|
402
|
+
```
|
|
403
|
+
|
|
404
|
+
## Framework Compatibility
|
|
405
|
+
|
|
406
|
+
LavinHash works seamlessly with all modern JavaScript frameworks and build tools:
|
|
407
|
+
|
|
408
|
+
- **React**: Vite, Create React App, Next.js, Remix
|
|
409
|
+
- **Angular**: Angular CLI (v12+)
|
|
410
|
+
- **Vue**: Vue 3, Nuxt 3, Vite
|
|
411
|
+
- **Svelte**: SvelteKit, Vite
|
|
412
|
+
- **Build Tools**: Webpack 5+, Vite, Rollup, Parcel, esbuild
|
|
413
|
+
|
|
414
|
+
The library uses ES modules and is optimized for modern bundlers.
|
|
415
|
+
|
|
416
|
+
## Building WASM
|
|
417
|
+
|
|
418
|
+
To build the WebAssembly bindings:
|
|
419
|
+
|
|
420
|
+
```bash
|
|
421
|
+
# Install wasm-pack
|
|
422
|
+
cargo install wasm-pack
|
|
423
|
+
|
|
424
|
+
# Build for modern bundlers (React, Angular, Vue, etc.)
|
|
425
|
+
wasm-pack build --target bundler --out-dir pkg --out-name lavinhash
|
|
426
|
+
|
|
427
|
+
# The compiled files will be in the pkg/ directory
|
|
428
|
+
```
|
|
429
|
+
|
|
430
|
+
## Testing
|
|
431
|
+
|
|
432
|
+
### Rust Tests
|
|
433
|
+
|
|
434
|
+
```bash
|
|
435
|
+
# Run all tests
|
|
436
|
+
cargo test
|
|
437
|
+
|
|
438
|
+
# Run tests with output
|
|
439
|
+
cargo test -- --nocapture
|
|
440
|
+
|
|
441
|
+
# Run specific test
|
|
442
|
+
cargo test test_generate_hash_basic
|
|
443
|
+
```
|
|
444
|
+
|
|
445
|
+
### Benchmarks
|
|
446
|
+
|
|
447
|
+
```bash
|
|
448
|
+
# Run benchmarks
|
|
449
|
+
cargo bench
|
|
450
|
+
```
|
|
451
|
+
|
|
452
|
+
## Technical Specifications
|
|
453
|
+
|
|
454
|
+
**Fingerprint Format:**
|
|
455
|
+
|
|
456
|
+
```
|
|
457
|
+
Offset | Field | Type | Size
|
|
458
|
+
-------|------------------|----------|-------------
|
|
459
|
+
0x00 | Magic | u8 | 1 byte (0x48)
|
|
460
|
+
0x01 | Version | u8 | 1 byte (0x01)
|
|
461
|
+
0x02 | Struct Length | u16 LE | 2 bytes
|
|
462
|
+
0x04 | Content Bloom | u64[128] | 1024 bytes
|
|
463
|
+
0x404 | Structural Data | u8[] | Variable
|
|
464
|
+
```
|
|
465
|
+
|
|
466
|
+
**Cross-Platform Determinism:**
|
|
467
|
+
- Identical input produces identical hash on all platforms
|
|
468
|
+
- Little-endian byte ordering
|
|
469
|
+
- IEEE 754 floating-point compliance
|
|
470
|
+
|
|
471
|
+
**Thread Safety:**
|
|
472
|
+
- Hash generation is thread-safe
|
|
473
|
+
- Parallel processing uses Rayon for data parallelism
|
|
474
|
+
- No global state or locks
|
|
475
|
+
|
|
476
|
+
## Examples
|
|
477
|
+
|
|
478
|
+
See the `examples/` directory for complete working examples:
|
|
479
|
+
|
|
480
|
+
- `basic_usage.rs` - Rust usage examples
|
|
481
|
+
- `javascript_example.js` - Node.js integration
|
|
482
|
+
- `browser_example.html` - Browser-based demo
|
|
483
|
+
|
|
484
|
+
## Documentation
|
|
485
|
+
|
|
486
|
+
- **API Documentation**: Available at [docs.rs/lavinhash](https://docs.rs/lavinhash)
|
|
487
|
+
- **Technical Specification**: See `docs/TECHNICAL.md` in the repository
|
|
488
|
+
- **Contributing Guide**: See `CONTRIBUTING.md`
|
|
489
|
+
|
|
490
|
+
## License
|
|
491
|
+
|
|
492
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
493
|
+
|
|
494
|
+
## Repository
|
|
495
|
+
|
|
496
|
+
Source code: [https://github.com/RafaCalRob/LavinHash](https://github.com/RafaCalRob/LavinHash)
|
|
497
|
+
|
|
498
|
+
## Support
|
|
499
|
+
|
|
500
|
+
For bug reports and feature requests, please open an issue on GitHub:
|
|
501
|
+
[https://github.com/RafaCalRob/LavinHash/issues](https://github.com/RafaCalRob/LavinHash/issues)
|
package/lavinhash.d.ts
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
/* tslint:disable */
|
|
2
|
+
/* eslint-disable */
|
|
3
|
+
|
|
4
|
+
/**
 * Initializes the WASM module.
 */
export function init(): void;
|
|
8
|
+
|
|
9
|
+
/**
 * Generates a hash for each input and compares them in one step (WASM wrapper).
 *
 * @param data_a - First data as Uint8Array.
 * @param data_b - Second data as Uint8Array.
 * @returns Similarity score 0-100.
 */
export function wasm_compare_data(data_a: Uint8Array, data_b: Uint8Array): number;
|
|
20
|
+
|
|
21
|
+
/**
 * Compares two fuzzy hashes (WASM wrapper).
 *
 * @param hash_a - First fingerprint (serialized).
 * @param hash_b - Second fingerprint (serialized).
 * @returns Similarity score 0-100.
 */
export function wasm_compare_hashes(hash_a: Uint8Array, hash_b: Uint8Array): number;
|
|
32
|
+
|
|
33
|
+
/**
 * Gets the fingerprint size in bytes (WASM wrapper).
 *
 * @param hash - Fingerprint whose size is requested.
 * @returns Fingerprint size in bytes.
 */
export function wasm_fingerprint_size(hash: Uint8Array): number;
|
|
37
|
+
|
|
38
|
+
/**
 * Generates a fuzzy hash from data (WASM wrapper).
 *
 * @param data - Input data as Uint8Array.
 * @returns Serialized fingerprint as Uint8Array.
 */
export function wasm_generate_hash(data: Uint8Array): Uint8Array;
|
package/lavinhash.js
ADDED
package/lavinhash_bg.js
ADDED
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
// Object every wrapper in this file calls into (`wasm.memory`, `wasm.init`, ...).
// It stays undefined until __wbg_set_wasm() has been called.
let wasm;

/**
 * Installs the object used as `wasm` by the rest of this module.
 *
 * @param {object} val - Value stored in the module-level `wasm` binding;
 *   presumably the exports of the instantiated WebAssembly module.
 */
export function __wbg_set_wasm(val) {
    wasm = val;
}
|
|
5
|
+
|
|
6
|
+
/**
 * Stores a JS value in the `heap` table and returns the slot index it occupies.
 *
 * Free slots form a linked list: `heap_next` is the first free slot and each
 * free slot holds the index of the next free one.
 *
 * @param {*} obj - Value to store.
 * @returns {number} Index of the slot now holding `obj`.
 */
function addHeapObject(obj) {
    const slot = heap_next;
    if (slot === heap.length) {
        // The free list ran off the end of the table: grow it by one slot
        // whose "next free" link points one past the new end.
        heap.push(slot + 1);
    }
    heap_next = heap[slot];
    heap[slot] = obj;
    return slot;
}
|
|
14
|
+
|
|
15
|
+
/**
 * Releases a `heap` slot by pushing it onto the front of the free list.
 *
 * Indices below 132 are never released; that range matches the 128 + 4
 * entries the table is created with.
 *
 * @param {number} idx - Slot index to release.
 */
function dropObject(idx) {
    if (!(idx < 132)) {
        heap[idx] = heap_next;
        heap_next = idx;
    }
}
|
|
20
|
+
|
|
21
|
+
/**
 * Returns a view (not a copy) of `len` bytes of WebAssembly memory starting at `ptr`.
 *
 * @param {number} ptr - Byte offset; reinterpreted as an unsigned 32-bit value.
 * @param {number} len - Number of bytes in the view.
 * @returns {Uint8Array} Subarray of the cached memory view.
 */
function getArrayU8FromWasm0(ptr, len) {
    const start = ptr >>> 0;
    const end = start + len;
    return getUint8ArrayMemory0().subarray(start, end);
}
|
|
25
|
+
|
|
26
|
+
// Lazily created DataView over `wasm.memory.buffer`.
let cachedDataViewMemory0 = null;

/**
 * Returns a DataView over the current WebAssembly memory, recreating the
 * cached view when its buffer can no longer be trusted.
 *
 * @returns {DataView} View over `wasm.memory.buffer`.
 */
function getDataViewMemory0() {
    let stale = cachedDataViewMemory0 === null;
    if (!stale) {
        const { buffer } = cachedDataViewMemory0;
        // Where `detached` is available, rely on it; where the property is
        // missing (undefined), fall back to an identity check against the
        // memory's current buffer.
        stale = buffer.detached === true
            || (buffer.detached === undefined && buffer !== wasm.memory.buffer);
    }
    if (stale) {
        cachedDataViewMemory0 = new DataView(wasm.memory.buffer);
    }
    return cachedDataViewMemory0;
}
|
|
33
|
+
|
|
34
|
+
/**
 * Decodes `len` bytes of WebAssembly memory at `ptr` into a JS string.
 *
 * @param {number} ptr - Byte offset; reinterpreted as an unsigned 32-bit value.
 * @param {number} len - Number of bytes to decode.
 * @returns {string} Result of `decodeText` for that range.
 */
function getStringFromWasm0(ptr, len) {
    return decodeText(ptr >>> 0, len);
}
|
|
38
|
+
|
|
39
|
+
// Lazily created byte view over `wasm.memory.buffer`.
let cachedUint8ArrayMemory0 = null;

/**
 * Returns a Uint8Array over the WebAssembly memory, rebuilding the cached
 * view when it does not exist yet or reports a zero byte length.
 *
 * @returns {Uint8Array} View over `wasm.memory.buffer`.
 */
function getUint8ArrayMemory0() {
    const usable = cachedUint8ArrayMemory0 !== null
        && cachedUint8ArrayMemory0.byteLength !== 0;
    if (!usable) {
        cachedUint8ArrayMemory0 = new Uint8Array(wasm.memory.buffer);
    }
    return cachedUint8ArrayMemory0;
}
|
|
46
|
+
|
|
47
|
+
/**
 * Looks up the value stored at a `heap` slot without releasing the slot.
 *
 * @param {number} idx - Slot index.
 * @returns {*} Whatever is stored at `heap[idx]`.
 */
function getObject(idx) {
    const stored = heap[idx];
    return stored;
}
|
|
48
|
+
|
|
49
|
+
// Table of JS values referenced by index. It starts with 128 undefined
// entries followed by four fixed entries: undefined, null, true, false.
let heap = [...new Array(128).fill(undefined), undefined, null, true, false];

// Index of the first free slot; initially one past the pre-filled entries.
let heap_next = heap.length;
|
|
53
|
+
|
|
54
|
+
/**
 * Copies a byte array into freshly allocated WebAssembly memory.
 *
 * Side effect: sets `WASM_VECTOR_LEN` to `arg.length` so the caller can read
 * the length right after the call.
 *
 * @param {Uint8Array} arg - Bytes to copy.
 * @param {Function} malloc - Allocator called as `malloc(size, align)`.
 * @returns {number} Unsigned offset of the copied bytes.
 */
function passArray8ToWasm0(arg, malloc) {
    const byteCount = arg.length * 1;
    const dest = malloc(byteCount, 1) >>> 0;
    // Fetch the memory view only after allocating, as the original does.
    const memory = getUint8ArrayMemory0();
    memory.set(arg, dest);
    WASM_VECTOR_LEN = arg.length;
    return dest;
}
|
|
60
|
+
|
|
61
|
+
/**
 * Writes a JS string into WebAssembly memory as UTF-8.
 *
 * Side effect: sets `WASM_VECTOR_LEN` to the number of bytes written.
 *
 * @param {string} arg - String to encode.
 * @param {Function} malloc - Allocator called as `malloc(size, align)`.
 * @param {Function} [realloc] - Resizer called as
 *   `realloc(ptr, oldSize, newSize, align)`. When omitted, the string is
 *   encoded up front and copied in one step.
 * @returns {number} Unsigned offset of the encoded bytes.
 */
function passStringToWasm0(arg, malloc, realloc) {
    if (realloc === undefined) {
        const encoded = cachedTextEncoder.encode(arg);
        const dest = malloc(encoded.length, 1) >>> 0;
        getUint8ArrayMemory0().subarray(dest, dest + encoded.length).set(encoded);
        WASM_VECTOR_LEN = encoded.length;
        return dest;
    }

    // Start with one byte per UTF-16 code unit, which is exact for ASCII.
    let capacity = arg.length;
    let ptr = malloc(capacity, 1) >>> 0;

    const mem = getUint8ArrayMemory0();

    // Fast path: copy leading ASCII characters byte by byte and stop at the
    // first code unit above 0x7F.
    let written = 0;
    while (written < capacity) {
        const code = arg.charCodeAt(written);
        if (code > 0x7F) break;
        mem[ptr + written] = code;
        written++;
    }

    if (written !== capacity) {
        // Encode the remainder with the text encoder, reserving three bytes
        // per remaining code unit.
        const rest = written === 0 ? arg : arg.slice(written);
        const previousCapacity = capacity;
        capacity = written + rest.length * 3;
        ptr = realloc(ptr, previousCapacity, capacity, 1) >>> 0;

        // Take a fresh memory view after reallocating.
        const view = getUint8ArrayMemory0().subarray(ptr + written, ptr + capacity);
        const ret = cachedTextEncoder.encodeInto(rest, view);

        written += ret.written;
        // Resize the allocation to the number of bytes actually written.
        ptr = realloc(ptr, capacity, written, 1) >>> 0;
    }

    WASM_VECTOR_LEN = written;
    return ptr;
}
|
|
97
|
+
|
|
98
|
+
/**
 * Reads the value stored at a `heap` slot and then releases the slot.
 *
 * @param {number} idx - Slot index.
 * @returns {*} Value that was stored at `heap[idx]`.
 */
function takeObject(idx) {
    const value = heap[idx];
    dropObject(idx);
    return value;
}
|
|
103
|
+
|
|
104
|
+
// Builds a strict UTF-8 decoder (errors on invalid input, BOM kept) and
// calls decode() once with no input before handing it out.
function freshUtf8Decoder() {
    const decoder = new TextDecoder('utf-8', { ignoreBOM: true, fatal: true });
    decoder.decode();
    return decoder;
}

let cachedTextDecoder = freshUtf8Decoder();

// Byte budget per decoder instance. NOTE(review): the name suggests a
// Safari-specific limit; nothing else in this file confirms it.
const MAX_SAFARI_DECODE_BYTES = 2146435072;
let numBytesDecoded = 0;

/**
 * Decodes `len` bytes of WebAssembly memory at `ptr` as UTF-8.
 *
 * Keeps a running total of decoded bytes and swaps in a new decoder once the
 * total reaches MAX_SAFARI_DECODE_BYTES, restarting the count at `len`.
 *
 * @param {number} ptr - Byte offset into WebAssembly memory.
 * @param {number} len - Number of bytes to decode.
 * @returns {string} Decoded text.
 */
function decodeText(ptr, len) {
    numBytesDecoded += len;
    if (numBytesDecoded >= MAX_SAFARI_DECODE_BYTES) {
        cachedTextDecoder = freshUtf8Decoder();
        numBytesDecoded = len;
    }
    const bytes = getUint8ArrayMemory0().subarray(ptr, ptr + len);
    return cachedTextDecoder.decode(bytes);
}
|
|
117
|
+
|
|
118
|
+
// Shared UTF-8 encoder used when passing strings to WebAssembly.
const cachedTextEncoder = new TextEncoder();

// Fallback for encoders that lack encodeInto(): encode into a temporary
// buffer, copy it into the destination view, and report the counts.
const hasNativeEncodeInto = 'encodeInto' in cachedTextEncoder;
if (!hasNativeEncodeInto) {
    cachedTextEncoder.encodeInto = function (source, destination) {
        const encoded = cachedTextEncoder.encode(source);
        destination.set(encoded);
        const result = { read: source.length, written: encoded.length };
        return result;
    };
}
|
|
130
|
+
|
|
131
|
+
// Length of the buffer most recently written by passArray8ToWasm0 or
// passStringToWasm0; callers read it immediately after those calls.
let WASM_VECTOR_LEN = 0;
|
|
132
|
+
|
|
133
|
+
/**
 * Initializes the WASM module by calling the `init` export of `wasm`.
 *
 * @returns {void}
 */
export function init() {
    wasm.init();
}
|
|
139
|
+
|
|
140
|
+
/**
 * Generates a hash for each input and compares them in one step (WASM wrapper).
 *
 * Both inputs are copied into WebAssembly memory. The result is read back
 * from a 16-byte area reserved on the wasm stack pointer.
 *
 * @param {Uint8Array} data_a - First data as Uint8Array.
 * @param {Uint8Array} data_b - Second data as Uint8Array.
 * @returns {number} Similarity score 0-100.
 * @throws {*} The value taken from the object heap when the wasm call sets
 *   its error flag.
 */
export function wasm_compare_data(data_a, data_b) {
    try {
        const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
        const ptrA = passArray8ToWasm0(data_a, wasm.__wbindgen_export2);
        const lenA = WASM_VECTOR_LEN;
        const ptrB = passArray8ToWasm0(data_b, wasm.__wbindgen_export2);
        const lenB = WASM_VECTOR_LEN;
        wasm.wasm_compare_data(retptr, ptrA, lenA, ptrB, lenB);

        // Return-area layout (little-endian i32): [value, error handle, error flag].
        const view = getDataViewMemory0();
        const score = view.getInt32(retptr, true);
        const errorHandle = view.getInt32(retptr + 4, true);
        const failed = view.getInt32(retptr + 8, true);
        if (failed) {
            throw takeObject(errorHandle);
        }
        return score;
    } finally {
        // Always give the reserved stack space back.
        wasm.__wbindgen_add_to_stack_pointer(16);
    }
}
|
|
172
|
+
|
|
173
|
+
/**
 * Compares two fuzzy hashes (WASM wrapper).
 *
 * Both fingerprints are copied into WebAssembly memory. The result is read
 * back from a 16-byte area reserved on the wasm stack pointer.
 *
 * @param {Uint8Array} hash_a - First fingerprint (serialized).
 * @param {Uint8Array} hash_b - Second fingerprint (serialized).
 * @returns {number} Similarity score 0-100.
 * @throws {*} The value taken from the object heap when the wasm call sets
 *   its error flag.
 */
export function wasm_compare_hashes(hash_a, hash_b) {
    try {
        const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
        const ptrA = passArray8ToWasm0(hash_a, wasm.__wbindgen_export2);
        const lenA = WASM_VECTOR_LEN;
        const ptrB = passArray8ToWasm0(hash_b, wasm.__wbindgen_export2);
        const lenB = WASM_VECTOR_LEN;
        wasm.wasm_compare_hashes(retptr, ptrA, lenA, ptrB, lenB);

        // Return-area layout (little-endian i32): [value, error handle, error flag].
        const view = getDataViewMemory0();
        const score = view.getInt32(retptr, true);
        const errorHandle = view.getInt32(retptr + 4, true);
        const failed = view.getInt32(retptr + 8, true);
        if (failed) {
            throw takeObject(errorHandle);
        }
        return score;
    } finally {
        // Always give the reserved stack space back.
        wasm.__wbindgen_add_to_stack_pointer(16);
    }
}
|
|
205
|
+
|
|
206
|
+
/**
 * Gets the fingerprint size in bytes (WASM wrapper).
 *
 * @param {Uint8Array} hash - Fingerprint to measure.
 * @returns {number} Size reported by the wasm call, as an unsigned 32-bit value.
 * @throws {*} The value taken from the object heap when the wasm call sets
 *   its error flag.
 */
export function wasm_fingerprint_size(hash) {
    try {
        const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
        const hashPtr = passArray8ToWasm0(hash, wasm.__wbindgen_export2);
        const hashLen = WASM_VECTOR_LEN;
        wasm.wasm_fingerprint_size(retptr, hashPtr, hashLen);

        // Return-area layout (little-endian i32): [value, error handle, error flag].
        const view = getDataViewMemory0();
        const size = view.getInt32(retptr, true);
        const errorHandle = view.getInt32(retptr + 4, true);
        const failed = view.getInt32(retptr + 8, true);
        if (failed) {
            throw takeObject(errorHandle);
        }
        // The slot was read as a signed i32; convert it to unsigned.
        return size >>> 0;
    } finally {
        // Always give the reserved stack space back.
        wasm.__wbindgen_add_to_stack_pointer(16);
    }
}
|
|
228
|
+
|
|
229
|
+
/**
 * Generates a fuzzy hash from data (WASM wrapper).
 *
 * @param {Uint8Array} data - Input data as Uint8Array.
 * @returns {Uint8Array} Serialized fingerprint, copied out of WebAssembly memory.
 * @throws {*} The value taken from the object heap when the wasm call sets
 *   its error flag.
 */
export function wasm_generate_hash(data) {
    try {
        const retptr = wasm.__wbindgen_add_to_stack_pointer(-16);
        const dataPtr = passArray8ToWasm0(data, wasm.__wbindgen_export2);
        const dataLen = WASM_VECTOR_LEN;
        wasm.wasm_generate_hash(retptr, dataPtr, dataLen);

        // Return-area layout (little-endian i32):
        // [result ptr, result len, error handle, error flag].
        const view = getDataViewMemory0();
        const outPtr = view.getInt32(retptr, true);
        const outLen = view.getInt32(retptr + 4, true);
        const errorHandle = view.getInt32(retptr + 8, true);
        const failed = view.getInt32(retptr + 12, true);
        if (failed) {
            throw takeObject(errorHandle);
        }

        // slice() copies the bytes, so the result does not alias wasm memory.
        const fingerprint = getArrayU8FromWasm0(outPtr, outLen).slice();
        // Hand the wasm-side buffer back; `__wbindgen_export` is presumably
        // the deallocator (called with ptr, size, align) - TODO confirm.
        wasm.__wbindgen_export(outPtr, outLen * 1, 1);
        return fingerprint;
    } finally {
        // Always give the reserved stack space back.
        wasm.__wbindgen_add_to_stack_pointer(16);
    }
}
|
|
260
|
+
|
|
261
|
+
/**
 * Import shim: logs a string held in WebAssembly memory with `console.error`,
 * then passes the same (ptr, len) to `wasm.__wbindgen_export`, even if
 * decoding or logging throws.
 *
 * @param {number} arg0 - Offset of the message bytes.
 * @param {number} arg1 - Length of the message in bytes.
 */
export function __wbg_error_7534b8e9a36f1ab4(arg0, arg1) {
    const messagePtr = arg0;
    const messageLen = arg1;
    try {
        console.error(getStringFromWasm0(messagePtr, messageLen));
    } finally {
        wasm.__wbindgen_export(messagePtr, messageLen, 1);
    }
}
|
|
272
|
+
|
|
273
|
+
/**
 * Import shim: creates a new `Error` and stores it in the object heap.
 *
 * @returns {number} Heap index of the new Error.
 */
export function __wbg_new_8a6f238a6ece86ea() {
    return addHeapObject(new Error());
}
|
|
277
|
+
|
|
278
|
+
/**
 * Import shim: copies the `stack` property of a heap object into WebAssembly
 * memory and writes the resulting (ptr, len) pair at `arg0`.
 *
 * @param {number} arg0 - Offset where ptr (at +0) and len (at +4) are written
 *   as little-endian i32 values.
 * @param {number} arg1 - Heap index of the object whose `stack` is read.
 */
export function __wbg_stack_0ed75d68575b0f3c(arg0, arg1) {
    const stackText = getObject(arg1).stack;
    const stackPtr = passStringToWasm0(stackText, wasm.__wbindgen_export2, wasm.__wbindgen_export3);
    const stackLen = WASM_VECTOR_LEN;
    // Fetch the view after the string copy, as the original does.
    const view = getDataViewMemory0();
    view.setInt32(arg0 + 4, stackLen, true);
    view.setInt32(arg0, stackPtr, true);
}
|
|
285
|
+
|
|
286
|
+
/**
 * Cast intrinsic for `Ref(String) -> Externref`: decodes a string from
 * WebAssembly memory and stores it in the object heap.
 *
 * @param {number} arg0 - Offset of the string bytes.
 * @param {number} arg1 - Length of the string in bytes.
 * @returns {number} Heap index of the decoded string.
 */
export function __wbindgen_cast_2241b6af4c4b2941(arg0, arg1) {
    const text = getStringFromWasm0(arg0, arg1);
    const handle = addHeapObject(text);
    return handle;
}
|
|
291
|
+
|
|
292
|
+
/**
 * Import shim: releases the heap slot at `arg0` and discards the value it held.
 *
 * @param {number} arg0 - Heap index to release.
 */
export function __wbindgen_object_drop_ref(arg0) {
    // The value returned by takeObject() is deliberately unused.
    takeObject(arg0);
}
|
|
Binary file
|
package/package.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "lavinhash",
|
|
3
|
+
"type": "module",
|
|
4
|
+
"collaborators": [
|
|
5
|
+
"LavinHash Contributors"
|
|
6
|
+
],
|
|
7
|
+
"description": "High-performance fuzzy hashing library implementing the DLAH (Dual-Layer Adaptive Hashing) algorithm",
|
|
8
|
+
"version": "1.0.0",
|
|
9
|
+
"license": "MIT",
|
|
10
|
+
"repository": {
|
|
11
|
+
"type": "git",
|
|
12
|
+
"url": "https://github.com/RafaCalRob/lavinhash"
|
|
13
|
+
},
|
|
14
|
+
"files": [
|
|
15
|
+
"lavinhash_bg.wasm",
|
|
16
|
+
"lavinhash.js",
|
|
17
|
+
"lavinhash_bg.js",
|
|
18
|
+
"lavinhash.d.ts"
|
|
19
|
+
],
|
|
20
|
+
"main": "lavinhash.js",
|
|
21
|
+
"homepage": "https://bdovenbird.com/lavinhash/",
|
|
22
|
+
"types": "lavinhash.d.ts",
|
|
23
|
+
"sideEffects": [
|
|
24
|
+
"./lavinhash.js",
|
|
25
|
+
"./snippets/*"
|
|
26
|
+
],
|
|
27
|
+
"keywords": [
|
|
28
|
+
"fuzzy-hashing",
|
|
29
|
+
"similarity",
|
|
30
|
+
"hash",
|
|
31
|
+
"fingerprint",
|
|
32
|
+
"dlah",
|
|
33
|
+
"wasm",
|
|
34
|
+
"webassembly",
|
|
35
|
+
"rust",
|
|
36
|
+
"duplicate-detection",
|
|
37
|
+
"text-similarity",
|
|
38
|
+
"file-similarity",
|
|
39
|
+
"content-hashing",
|
|
40
|
+
"bloom-filter",
|
|
41
|
+
"react",
|
|
42
|
+
"angular",
|
|
43
|
+
"vue",
|
|
44
|
+
"typescript"
|
|
45
|
+
]
|
|
46
|
+
}
|