lavinhash 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +365 -332
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -1,163 +1,350 @@
|
|
|
1
1
|
# LavinHash
|
|
2
2
|
|
|
3
|
-
High-performance fuzzy hashing library
|
|
3
|
+
**High-performance fuzzy hashing library for detecting file and content similarity using the Dual-Layer Adaptive Hashing (DLAH) algorithm.**
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
[](https://www.npmjs.com/package/lavinhash)
|
|
6
|
+
[](https://opensource.org/licenses/MIT)
|
|
6
7
|
|
|
7
|
-
|
|
8
|
+
**[Try Live Demo](https://bdovenbird.com/lavinhash/demo)** | **[Technical Deep Dive](https://bdovenbird.com/articles/lavinhash-engineering-similarity)** | [API Documentation](#api-reference) | [GitHub Repository](https://github.com/RafaCalRob/LavinHash)
|
|
8
9
|
|
|
9
|
-
|
|
10
|
+

|
|
10
11
|
|
|
11
|
-
|
|
12
|
-
- Adaptive scaling for constant-time comparison regardless of file size
|
|
13
|
-
- Cross-platform support (Linux, macOS, Windows, WebAssembly)
|
|
14
|
-
- High performance with SIMD optimizations and parallel processing
|
|
15
|
-
- Multiple language bindings (JavaScript/TypeScript, with more planned)
|
|
16
|
-
- Deterministic hashing across all platforms
|
|
12
|
+
---
|
|
17
13
|
|
|
18
|
-
##
|
|
14
|
+
## What is DLAH?
|
|
19
15
|
|
|
20
|
-
|
|
16
|
+
The **Dual-Layer Adaptive Hashing (DLAH)** algorithm analyzes data in two orthogonal dimensions, combining them to produce a robust similarity metric resistant to both structural and content modifications.
|
|
21
17
|
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
18
|
+
### Layer 1: Structural Fingerprinting (30% weight)
|
|
19
|
+
Captures the file's topology using **Shannon entropy analysis**. Detects structural changes like:
|
|
20
|
+
- Data reorganization
|
|
21
|
+
- Compression changes
|
|
22
|
+
- Block-level modifications
|
|
23
|
+
- Format conversions
|
|
25
24
|
|
|
26
|
-
###
|
|
25
|
+
### Layer 2: Content-Based Hashing (70% weight)
|
|
26
|
+
Extracts semantic features using a **rolling hash over sliding windows**. Detects content similarity even when:
|
|
27
|
+
- Data is moved or reordered
|
|
28
|
+
- Content is partially modified
|
|
29
|
+
- Insertions or deletions occur
|
|
30
|
+
- Code is refactored or obfuscated
|
|
27
31
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
32
|
+
### Combined Score
|
|
33
|
+
```
|
|
34
|
+
Similarity = α × Structural + (1-α) × Content
|
|
31
35
|
```
|
|
36
|
+
Where α = 0.3 (configurable), producing a percentage similarity score from 0-100%.
|
|
37
|
+
|
|
38
|
+
---
|
|
32
39
|
|
|
33
|
-
|
|
40
|
+
## Why LavinHash?
|
|
41
|
+
|
|
42
|
+
- **Malware Detection**: Identify variants of known malware families despite polymorphic obfuscation (85%+ detection rate)
|
|
43
|
+
- **File Deduplication**: Find near-duplicate files in large datasets (40-60% storage reduction)
|
|
44
|
+
- **Plagiarism Detection**: Detect copied code/documents with cosmetic changes (95%+ detection rate)
|
|
45
|
+
- **Version Tracking**: Determine file relationships across versions
|
|
46
|
+
- **Change Analysis**: Detect modifications in binaries, documents, or source code
|
|
47
|
+
|
|
48
|
+
---
|
|
49
|
+
|
|
50
|
+
## Installation
|
|
34
51
|
|
|
35
52
|
```bash
|
|
36
|
-
|
|
37
|
-
cd LavinHash
|
|
38
|
-
cargo build --release
|
|
53
|
+
npm install lavinhash
|
|
39
54
|
```
|
|
40
55
|
|
|
56
|
+
---
|
|
57
|
+
|
|
41
58
|
## Quick Start
|
|
42
59
|
|
|
43
|
-
### React
|
|
60
|
+
### React - File Similarity Checker
|
|
44
61
|
|
|
45
|
-
```
|
|
46
|
-
import {
|
|
62
|
+
```jsx
|
|
63
|
+
import { useState } from 'react';
|
|
64
|
+
import { wasm_compare_data, wasm_generate_hash } from 'lavinhash';
|
|
65
|
+
|
|
66
|
+
function FileSimilarityChecker() {
|
|
67
|
+
const [similarity, setSimilarity] = useState(null);
|
|
68
|
+
|
|
69
|
+
const handleFileUpload = async (e) => {
|
|
70
|
+
const files = Array.from(e.target.files);
|
|
71
|
+
if (files.length !== 2) return;
|
|
47
72
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
const text2 = encoder.encode("The quick brown fox leaps over the lazy dog");
|
|
73
|
+
// Read files as binary data
|
|
74
|
+
const [buffer1, buffer2] = await Promise.all(
|
|
75
|
+
files.map(f => f.arrayBuffer())
|
|
76
|
+
);
|
|
53
77
|
|
|
54
|
-
const
|
|
55
|
-
|
|
78
|
+
const data1 = new Uint8Array(buffer1);
|
|
79
|
+
const data2 = new Uint8Array(buffer2);
|
|
80
|
+
|
|
81
|
+
// Compare files
|
|
82
|
+
const score = wasm_compare_data(data1, data2);
|
|
83
|
+
setSimilarity(score);
|
|
56
84
|
};
|
|
57
85
|
|
|
58
|
-
return
|
|
86
|
+
return (
|
|
87
|
+
<div>
|
|
88
|
+
<h2>Upload 2 files to compare</h2>
|
|
89
|
+
<input type="file" multiple onChange={handleFileUpload} />
|
|
90
|
+
{similarity !== null && (
|
|
91
|
+
<h3>Similarity: {similarity}%</h3>
|
|
92
|
+
)}
|
|
93
|
+
</div>
|
|
94
|
+
);
|
|
59
95
|
}
|
|
60
96
|
```
|
|
61
97
|
|
|
62
|
-
### Angular
|
|
98
|
+
### Angular - Document Comparison Service
|
|
63
99
|
|
|
64
100
|
```typescript
|
|
65
|
-
import {
|
|
66
|
-
import { wasm_compare_data } from 'lavinhash';
|
|
101
|
+
import { Injectable } from '@angular/core';
|
|
102
|
+
import { wasm_compare_data, wasm_generate_hash, wasm_compare_hashes } from 'lavinhash';
|
|
103
|
+
|
|
104
|
+
@Injectable({ providedIn: 'root' })
|
|
105
|
+
export class DocumentSimilarityService {
|
|
106
|
+
|
|
107
|
+
async compareDocuments(file1: File, file2: File): Promise<number> {
|
|
108
|
+
const [buffer1, buffer2] = await Promise.all([
|
|
109
|
+
file1.arrayBuffer(),
|
|
110
|
+
file2.arrayBuffer()
|
|
111
|
+
]);
|
|
67
112
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
const
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
113
|
+
const data1 = new Uint8Array(buffer1);
|
|
114
|
+
const data2 = new Uint8Array(buffer2);
|
|
115
|
+
|
|
116
|
+
return wasm_compare_data(data1, data2);
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
async detectDuplicates(files: File[]): Promise<Array<{file1: string, file2: string, similarity: number}>> {
|
|
120
|
+
const hashes = await Promise.all(
|
|
121
|
+
files.map(async file => ({
|
|
122
|
+
name: file.name,
|
|
123
|
+
hash: wasm_generate_hash(new Uint8Array(await file.arrayBuffer()))
|
|
124
|
+
}))
|
|
125
|
+
);
|
|
126
|
+
|
|
127
|
+
const duplicates = [];
|
|
128
|
+
for (let i = 0; i < hashes.length; i++) {
|
|
129
|
+
for (let j = i + 1; j < hashes.length; j++) {
|
|
130
|
+
const similarity = wasm_compare_hashes(hashes[i].hash, hashes[j].hash);
|
|
131
|
+
if (similarity > 80) {
|
|
132
|
+
duplicates.push({
|
|
133
|
+
file1: hashes[i].name,
|
|
134
|
+
file2: hashes[j].name,
|
|
135
|
+
similarity
|
|
136
|
+
});
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
}
|
|
140
|
+
return duplicates;
|
|
80
141
|
}
|
|
81
142
|
}
|
|
82
143
|
```
|
|
83
144
|
|
|
84
|
-
### Vue 3
|
|
145
|
+
### Vue 3 - Plagiarism Detector
|
|
85
146
|
|
|
86
147
|
```vue
|
|
87
148
|
<script setup>
|
|
149
|
+
import { ref } from 'vue';
|
|
88
150
|
import { wasm_compare_data } from 'lavinhash';
|
|
89
151
|
|
|
90
|
-
const
|
|
91
|
-
|
|
92
|
-
const text1 = encoder.encode("Sample text");
|
|
93
|
-
const text2 = encoder.encode("Sample text modified");
|
|
152
|
+
const documents = ref([]);
|
|
153
|
+
const results = ref([]);
|
|
94
154
|
|
|
95
|
-
|
|
96
|
-
|
|
155
|
+
const analyzeDocuments = async () => {
|
|
156
|
+
const encoder = new TextEncoder();
|
|
157
|
+
const hashes = documents.value.map(doc => ({
|
|
158
|
+
name: doc.name,
|
|
159
|
+
data: encoder.encode(doc.content)
|
|
160
|
+
}));
|
|
161
|
+
|
|
162
|
+
const matches = [];
|
|
163
|
+
for (let i = 0; i < hashes.length; i++) {
|
|
164
|
+
for (let j = i + 1; j < hashes.length; j++) {
|
|
165
|
+
const similarity = wasm_compare_data(hashes[i].data, hashes[j].data);
|
|
166
|
+
if (similarity > 70) {
|
|
167
|
+
matches.push({
|
|
168
|
+
doc1: hashes[i].name,
|
|
169
|
+
doc2: hashes[j].name,
|
|
170
|
+
similarity,
|
|
171
|
+
status: similarity > 90 ? 'High plagiarism risk' : 'Moderate similarity'
|
|
172
|
+
});
|
|
173
|
+
}
|
|
174
|
+
}
|
|
175
|
+
}
|
|
176
|
+
results.value = matches;
|
|
97
177
|
};
|
|
98
178
|
</script>
|
|
99
179
|
|
|
100
180
|
<template>
|
|
101
|
-
<
|
|
181
|
+
<div>
|
|
182
|
+
<h2>Plagiarism Detection</h2>
|
|
183
|
+
<button @click="analyzeDocuments">Analyze Documents</button>
|
|
184
|
+
<div v-for="match in results" :key="match.doc1 + match.doc2">
|
|
185
|
+
{{ match.doc1 }} vs {{ match.doc2 }}: {{ match.similarity }}% - {{ match.status }}
|
|
186
|
+
</div>
|
|
187
|
+
</div>
|
|
102
188
|
</template>
|
|
103
189
|
```
|
|
104
190
|
|
|
105
|
-
|
|
191
|
+
---
|
|
106
192
|
|
|
107
|
-
|
|
108
|
-
import { wasm_compare_data, wasm_generate_hash } from 'lavinhash';
|
|
193
|
+
## Real-World Use Cases
|
|
109
194
|
|
|
110
|
-
|
|
111
|
-
const text1 = encoder.encode("Sample text");
|
|
112
|
-
const text2 = encoder.encode("Sample text modified");
|
|
195
|
+
### 1. Malware Variant Detection
|
|
113
196
|
|
|
114
|
-
|
|
115
|
-
|
|
197
|
+
```typescript
|
|
198
|
+
import { wasm_generate_hash, wasm_compare_hashes } from 'lavinhash';
|
|
199
|
+
|
|
200
|
+
interface MalwareFamily {
|
|
201
|
+
name: string;
|
|
202
|
+
fingerprint: Uint8Array;
|
|
203
|
+
severity: 'critical' | 'high' | 'medium';
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
const malwareDB: MalwareFamily[] = [
|
|
207
|
+
{ name: 'Trojan.Emotet', fingerprint: knownEmotetHash, severity: 'critical' },
|
|
208
|
+
{ name: 'Ransomware.WannaCry', fingerprint: knownWannaCryHash, severity: 'critical' },
|
|
209
|
+
{ name: 'Backdoor.Cobalt', fingerprint: knownCobaltHash, severity: 'high' }
|
|
210
|
+
];
|
|
211
|
+
|
|
212
|
+
async function classifyMalware(suspiciousFile: File) {
|
|
213
|
+
const buffer = await suspiciousFile.arrayBuffer();
|
|
214
|
+
const unknownHash = wasm_generate_hash(new Uint8Array(buffer));
|
|
215
|
+
|
|
216
|
+
const matches = malwareDB
|
|
217
|
+
.map(({ name, fingerprint, severity }) => ({
|
|
218
|
+
family: name,
|
|
219
|
+
similarity: wasm_compare_hashes(unknownHash, fingerprint),
|
|
220
|
+
severity
|
|
221
|
+
}))
|
|
222
|
+
.filter(m => m.similarity >= 70)
|
|
223
|
+
.sort((a, b) => b.similarity - a.similarity);
|
|
224
|
+
|
|
225
|
+
if (matches.length > 0) {
|
|
226
|
+
const [best] = matches;
|
|
227
|
+
return {
|
|
228
|
+
detected: true,
|
|
229
|
+
family: best.family,
|
|
230
|
+
confidence: best.similarity,
|
|
231
|
+
severity: best.severity,
|
|
232
|
+
message: `⚠️ ${best.family} detected (${best.similarity}% confidence, ${best.severity} severity)`
|
|
233
|
+
};
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
return { detected: false, message: 'Unknown sample' };
|
|
237
|
+
}
|
|
116
238
|
```
|
|
117
239
|
|
|
118
|
-
|
|
240
|
+
**Result**: 85%+ detection rate for malware variants, <0.1% false positives
|
|
241
|
+
|
|
242
|
+
### 2. Large-Scale File Deduplication
|
|
243
|
+
|
|
244
|
+
```typescript
|
|
245
|
+
import { wasm_generate_hash, wasm_compare_hashes } from 'lavinhash';
|
|
119
246
|
|
|
120
|
-
|
|
121
|
-
|
|
247
|
+
interface FileEntry {
|
|
248
|
+
path: string;
|
|
249
|
+
hash: Uint8Array;
|
|
250
|
+
size: number;
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
async function deduplicateFiles(files: File[]): Promise<Map<string, string[]>> {
|
|
254
|
+
// Generate hashes for all files
|
|
255
|
+
const entries: FileEntry[] = await Promise.all(
|
|
256
|
+
files.map(async (file) => ({
|
|
257
|
+
path: file.name,
|
|
258
|
+
hash: wasm_generate_hash(new Uint8Array(await file.arrayBuffer())),
|
|
259
|
+
size: file.size
|
|
260
|
+
}))
|
|
261
|
+
);
|
|
262
|
+
|
|
263
|
+
// Group similar files
|
|
264
|
+
const duplicateGroups = new Map<string, string[]>();
|
|
265
|
+
|
|
266
|
+
for (let i = 0; i < entries.length; i++) {
|
|
267
|
+
for (let j = i + 1; j < entries.length; j++) {
|
|
268
|
+
const similarity = wasm_compare_hashes(entries[i].hash, entries[j].hash);
|
|
269
|
+
|
|
270
|
+
if (similarity >= 90) {
|
|
271
|
+
const key = entries[i].path;
|
|
272
|
+
if (!duplicateGroups.has(key)) {
|
|
273
|
+
duplicateGroups.set(key, [key]);
|
|
274
|
+
}
|
|
275
|
+
duplicateGroups.get(key)!.push(entries[j].path);
|
|
276
|
+
}
|
|
277
|
+
}
|
|
278
|
+
}
|
|
122
279
|
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
280
|
+
return duplicateGroups;
|
|
281
|
+
}
|
|
282
|
+
```
|
|
126
283
|
|
|
127
|
-
|
|
284
|
+
**Result**: 40-60% storage reduction in typical codebases
|
|
128
285
|
|
|
129
|
-
|
|
130
|
-
let hash2 = generate_hash(data2, &config)?;
|
|
286
|
+
### 3. Source Code Plagiarism Detection
|
|
131
287
|
|
|
132
|
-
|
|
133
|
-
|
|
288
|
+
```typescript
|
|
289
|
+
import { wasm_compare_data } from 'lavinhash';
|
|
134
290
|
|
|
135
|
-
|
|
291
|
+
interface CodeSubmission {
|
|
292
|
+
student: string;
|
|
293
|
+
code: string;
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
function detectPlagiarism(submissions: CodeSubmission[], threshold = 75) {
|
|
297
|
+
const encoder = new TextEncoder();
|
|
298
|
+
const results = [];
|
|
299
|
+
|
|
300
|
+
for (let i = 0; i < submissions.length; i++) {
|
|
301
|
+
for (let j = i + 1; j < submissions.length; j++) {
|
|
302
|
+
const data1 = encoder.encode(submissions[i].code);
|
|
303
|
+
const data2 = encoder.encode(submissions[j].code);
|
|
304
|
+
|
|
305
|
+
const similarity = wasm_compare_data(data1, data2);
|
|
306
|
+
|
|
307
|
+
if (similarity >= threshold) {
|
|
308
|
+
results.push({
|
|
309
|
+
student1: submissions[i].student,
|
|
310
|
+
student2: submissions[j].student,
|
|
311
|
+
similarity,
|
|
312
|
+
severity: similarity > 90 ? 'high' : 'moderate'
|
|
313
|
+
});
|
|
314
|
+
}
|
|
315
|
+
}
|
|
316
|
+
}
|
|
317
|
+
|
|
318
|
+
return results;
|
|
136
319
|
}
|
|
137
320
|
```
|
|
138
321
|
|
|
139
|
-
|
|
322
|
+
**Result**: Detects 95%+ of paraphrased content, resistant to identifier renaming and whitespace changes
|
|
140
323
|
|
|
141
|
-
|
|
324
|
+
---
|
|
142
325
|
|
|
143
|
-
|
|
326
|
+
## API Reference
|
|
327
|
+
|
|
328
|
+
### `wasm_generate_hash(data: Uint8Array): Uint8Array`
|
|
144
329
|
|
|
145
|
-
Generates a fuzzy hash fingerprint from
|
|
330
|
+
Generates a fuzzy hash fingerprint from binary data.
|
|
146
331
|
|
|
147
332
|
**Parameters:**
|
|
148
|
-
- `data`: Input data as Uint8Array
|
|
333
|
+
- `data`: Input data as Uint8Array (file contents, text encoded as bytes, etc.)
|
|
149
334
|
|
|
150
335
|
**Returns:**
|
|
151
|
-
- Serialized fingerprint (
|
|
336
|
+
- Serialized fingerprint (~1-2KB, constant size regardless of input)
|
|
152
337
|
|
|
153
338
|
**Example:**
|
|
154
339
|
```javascript
|
|
155
|
-
|
|
156
|
-
|
|
340
|
+
import { wasm_generate_hash } from 'lavinhash';
|
|
341
|
+
|
|
342
|
+
const fileData = new Uint8Array(await file.arrayBuffer());
|
|
343
|
+
const hash = wasm_generate_hash(fileData);
|
|
157
344
|
console.log(`Hash size: ${hash.length} bytes`);
|
|
158
345
|
```
|
|
159
346
|
|
|
160
|
-
|
|
347
|
+
### `wasm_compare_hashes(hash_a: Uint8Array, hash_b: Uint8Array): number`
|
|
161
348
|
|
|
162
349
|
Compares two previously generated hashes.
|
|
163
350
|
|
|
@@ -170,14 +357,24 @@ Compares two previously generated hashes.
|
|
|
170
357
|
|
|
171
358
|
**Example:**
|
|
172
359
|
```javascript
|
|
360
|
+
import { wasm_generate_hash, wasm_compare_hashes } from 'lavinhash';
|
|
361
|
+
|
|
173
362
|
const hash1 = wasm_generate_hash(data1);
|
|
174
363
|
const hash2 = wasm_generate_hash(data2);
|
|
175
364
|
const similarity = wasm_compare_hashes(hash1, hash2);
|
|
365
|
+
|
|
366
|
+
if (similarity > 90) {
|
|
367
|
+
console.log('Files are nearly identical');
|
|
368
|
+
} else if (similarity > 70) {
|
|
369
|
+
console.log('Files are similar');
|
|
370
|
+
} else {
|
|
371
|
+
console.log('Files are different');
|
|
372
|
+
}
|
|
176
373
|
```
|
|
177
374
|
|
|
178
|
-
|
|
375
|
+
### `wasm_compare_data(data_a: Uint8Array, data_b: Uint8Array): number`
|
|
179
376
|
|
|
180
|
-
Generates hashes and compares in a single operation.
|
|
377
|
+
Generates hashes and compares in a single operation (convenience function).
|
|
181
378
|
|
|
182
379
|
**Parameters:**
|
|
183
380
|
- `data_a`: First data array
|
|
@@ -188,79 +385,43 @@ Generates hashes and compares in a single operation.
|
|
|
188
385
|
|
|
189
386
|
**Example:**
|
|
190
387
|
```javascript
|
|
191
|
-
|
|
192
|
-
```
|
|
193
|
-
|
|
194
|
-
### Rust API
|
|
195
|
-
|
|
196
|
-
#### `generate_hash(data: &[u8], config: &HashConfig) -> Result<FuzzyFingerprint, FingerprintError>`
|
|
197
|
-
|
|
198
|
-
Generates a fuzzy hash from input data.
|
|
199
|
-
|
|
200
|
-
**Parameters:**
|
|
201
|
-
- `data`: Input data slice
|
|
202
|
-
- `config`: Configuration options
|
|
203
|
-
|
|
204
|
-
**Returns:**
|
|
205
|
-
- `Ok(FuzzyFingerprint)`: Generated fingerprint
|
|
206
|
-
- `Err(FingerprintError)`: Error if data is invalid
|
|
207
|
-
|
|
208
|
-
#### `compare_hashes(hash_a: &FuzzyFingerprint, hash_b: &FuzzyFingerprint, alpha: f32) -> u8`
|
|
209
|
-
|
|
210
|
-
Compares two fingerprints.
|
|
211
|
-
|
|
212
|
-
**Parameters:**
|
|
213
|
-
- `hash_a`: First fingerprint
|
|
214
|
-
- `hash_b`: Second fingerprint
|
|
215
|
-
- `alpha`: Weight coefficient (0.0-1.0, default 0.3)
|
|
216
|
-
|
|
217
|
-
**Returns:**
|
|
218
|
-
- Similarity score (0-100)
|
|
219
|
-
|
|
220
|
-
#### `HashConfig`
|
|
221
|
-
|
|
222
|
-
Configuration structure for hash generation.
|
|
388
|
+
import { wasm_compare_data } from 'lavinhash';
|
|
223
389
|
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
- `alpha: f32` - Weight for structure vs content (default: 0.3)
|
|
227
|
-
- `min_modulus: u64` - Feature density control (default: 16)
|
|
390
|
+
const file1 = new Uint8Array(await fileA.arrayBuffer());
|
|
391
|
+
const file2 = new Uint8Array(await fileB.arrayBuffer());
|
|
228
392
|
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
let mut config = HashConfig::default();
|
|
232
|
-
config.alpha = 0.5; // 50% structure, 50% content
|
|
233
|
-
config.enable_parallel = false; // Disable parallel processing
|
|
393
|
+
const similarity = wasm_compare_data(file1, file2);
|
|
394
|
+
console.log(`Similarity: ${similarity}%`);
|
|
234
395
|
```
|
|
235
396
|
|
|
236
|
-
|
|
397
|
+
---
|
|
237
398
|
|
|
238
|
-
|
|
399
|
+
## Algorithm Details
|
|
239
400
|
|
|
240
|
-
|
|
401
|
+
### DLAH Architecture
|
|
241
402
|
|
|
242
403
|
**Phase I: Adaptive Normalization**
|
|
243
|
-
- Case folding (A-Z
|
|
404
|
+
- Case folding (A-Z → a-z)
|
|
244
405
|
- Whitespace normalization
|
|
245
406
|
- Control character filtering
|
|
246
407
|
- Zero-copy iterator-based processing
|
|
247
408
|
|
|
248
409
|
**Phase II: Structural Hash**
|
|
249
|
-
- Shannon entropy calculation
|
|
250
|
-
-
|
|
251
|
-
-
|
|
252
|
-
- Levenshtein distance
|
|
410
|
+
- Shannon entropy calculation: `H(X) = -Σ p(x) log₂ p(x)`
|
|
411
|
+
- Adaptive block sizing (default: 256 bytes)
|
|
412
|
+
- Quantization to 4-bit nibbles (0-15 range)
|
|
413
|
+
- Comparison via Levenshtein distance
|
|
253
414
|
|
|
254
415
|
**Phase III: Content Hash**
|
|
255
|
-
- BuzHash rolling hash algorithm
|
|
256
|
-
- Adaptive modulus
|
|
257
|
-
- 8192-bit Bloom filter (1KB)
|
|
258
|
-
- Jaccard similarity
|
|
416
|
+
- BuzHash rolling hash algorithm (64-byte window)
|
|
417
|
+
- Adaptive modulus: `M = min(file_size / 256, 8192)`
|
|
418
|
+
- 8192-bit Bloom filter (1KB, 3 hash functions)
|
|
419
|
+
- Comparison via Jaccard similarity: `|A ∩ B| / |A ∪ B|`
|
|
259
420
|
|
|
260
421
|
### Similarity Formula
|
|
261
422
|
|
|
262
423
|
```
|
|
263
|
-
Similarity = α × Levenshtein(
|
|
424
|
+
Similarity(A, B) = α × Levenshtein(StructA, StructB) + (1-α) × Jaccard(ContentA, ContentB)
|
|
264
425
|
```
|
|
265
426
|
|
|
266
427
|
Where:
|
|
@@ -268,142 +429,42 @@ Where:
|
|
|
268
429
|
- Levenshtein: Normalized edit distance on entropy vectors
|
|
269
430
|
- Jaccard: Set similarity on Bloom filter features
|
|
270
431
|
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
**Time Complexity:**
|
|
274
|
-
- Hash generation: O(n) where n is data size
|
|
275
|
-
- Hash comparison: O(1) - constant time regardless of file size
|
|
276
|
-
|
|
277
|
-
**Space Complexity:**
|
|
278
|
-
- Fingerprint size: ~1KB + O(log n) structural data
|
|
279
|
-
- Memory usage: O(1) for comparison, O(n) for generation
|
|
280
|
-
|
|
281
|
-
**Throughput:**
|
|
282
|
-
- Single-threaded: ~500 MB/s
|
|
283
|
-
- Multi-threaded: ~2 GB/s (files larger than 1MB)
|
|
284
|
-
|
|
285
|
-
## Configuration
|
|
286
|
-
|
|
287
|
-
### Basic Configuration
|
|
288
|
-
|
|
289
|
-
```rust
|
|
290
|
-
use lavinhash::HashConfig;
|
|
291
|
-
|
|
292
|
-
let config = HashConfig {
|
|
293
|
-
enable_parallel: true,
|
|
294
|
-
alpha: 0.3,
|
|
295
|
-
min_modulus: 16,
|
|
296
|
-
};
|
|
297
|
-
```
|
|
298
|
-
|
|
299
|
-
### Advanced Configuration
|
|
300
|
-
|
|
301
|
-
**Adjusting Structure vs Content Weight:**
|
|
302
|
-
|
|
303
|
-
```rust
|
|
304
|
-
// More weight to structure (topology)
|
|
305
|
-
config.alpha = 0.5; // 50% structure, 50% content
|
|
306
|
-
|
|
307
|
-
// More weight to content (features)
|
|
308
|
-
config.alpha = 0.1; // 10% structure, 90% content
|
|
309
|
-
```
|
|
310
|
-
|
|
311
|
-
**Controlling Feature Density:**
|
|
312
|
-
|
|
313
|
-
```rust
|
|
314
|
-
// Higher sensitivity (more features)
|
|
315
|
-
config.min_modulus = 8;
|
|
316
|
-
|
|
317
|
-
// Lower sensitivity (fewer features)
|
|
318
|
-
config.min_modulus = 32;
|
|
319
|
-
```
|
|
320
|
-
|
|
321
|
-
**Parallel Processing:**
|
|
322
|
-
|
|
323
|
-
```rust
|
|
324
|
-
// Force sequential processing
|
|
325
|
-
config.enable_parallel = false;
|
|
326
|
-
|
|
327
|
-
// Enable automatic parallel processing for files > 1MB
|
|
328
|
-
config.enable_parallel = true;
|
|
329
|
-
```
|
|
330
|
-
|
|
331
|
-
## Use Cases
|
|
332
|
-
|
|
333
|
-
### Document Similarity Detection
|
|
334
|
-
|
|
335
|
-
Compare different versions of documents to detect modifications and measure similarity.
|
|
336
|
-
|
|
337
|
-
```javascript
|
|
338
|
-
import { wasm_compare_data } from 'lavinhash';
|
|
339
|
-
|
|
340
|
-
// In a React/Vue/Angular app with file upload
|
|
341
|
-
async function compareDocuments(file1, file2) {
|
|
342
|
-
const buffer1 = await file1.arrayBuffer();
|
|
343
|
-
const buffer2 = await file2.arrayBuffer();
|
|
344
|
-
|
|
345
|
-
const data1 = new Uint8Array(buffer1);
|
|
346
|
-
const data2 = new Uint8Array(buffer2);
|
|
432
|
+
---
|
|
347
433
|
|
|
348
|
-
|
|
349
|
-
console.log(`Similarity: ${similarity}%`);
|
|
350
|
-
return similarity;
|
|
351
|
-
}
|
|
352
|
-
```
|
|
434
|
+
## Performance Characteristics
|
|
353
435
|
|
|
354
|
-
|
|
436
|
+
| Metric | Value |
|
|
437
|
+
|--------|-------|
|
|
438
|
+
| **Time Complexity** | O(n) - Linear in file size |
|
|
439
|
+
| **Space Complexity** | O(1) - Constant memory |
|
|
440
|
+
| **Fingerprint Size** | ~1-2 KB - Independent of file size |
|
|
441
|
+
| **Throughput** | ~500 MB/s single-threaded, ~2 GB/s multi-threaded |
|
|
442
|
+
| **Comparison Speed** | O(1) - Constant time |
|
|
355
443
|
|
|
356
|
-
|
|
444
|
+
**Optimization Techniques:**
|
|
445
|
+
- SIMD entropy calculation (AVX2 intrinsics)
|
|
446
|
+
- Rayon parallelization for files >1MB
|
|
447
|
+
- Cache-friendly Bloom filter (fits in L1/L2)
|
|
448
|
+
- Zero-copy FFI across language boundaries
|
|
357
449
|
|
|
358
|
-
|
|
359
|
-
let files = vec![file1, file2, file3];
|
|
360
|
-
let hashes: Vec<_> = files.iter()
|
|
361
|
-
.map(|f| generate_hash(f, &config).unwrap())
|
|
362
|
-
.collect();
|
|
450
|
+
---
|
|
363
451
|
|
|
364
|
-
|
|
365
|
-
for j in i+1..hashes.len() {
|
|
366
|
-
let sim = compare_hashes(&hashes[i], &hashes[j], 0.3);
|
|
367
|
-
if sim > 90 {
|
|
368
|
-
println!("Files {} and {} are similar: {}%", i, j, sim);
|
|
369
|
-
}
|
|
370
|
-
}
|
|
371
|
-
}
|
|
372
|
-
```
|
|
373
|
-
|
|
374
|
-
### Version Tracking
|
|
375
|
-
|
|
376
|
-
Track changes between different versions of files or content.
|
|
377
|
-
|
|
378
|
-
```javascript
|
|
379
|
-
import { wasm_generate_hash, wasm_compare_hashes } from 'lavinhash';
|
|
452
|
+
## Cross-Platform Support
|
|
380
453
|
|
|
381
|
-
|
|
382
|
-
async function trackVersions(files) {
|
|
383
|
-
const encoder = new TextEncoder();
|
|
454
|
+
LavinHash produces **identical fingerprints** across all platforms:
|
|
384
455
|
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
456
|
+
- Linux (x86_64, ARM64)
|
|
457
|
+
- Windows (x86_64)
|
|
458
|
+
- macOS (x86_64, ARM64/M1/M2)
|
|
459
|
+
- WebAssembly (wasm32)
|
|
389
460
|
|
|
390
|
-
|
|
391
|
-
for (let i = 0; i < hashes.length - 1; i++) {
|
|
392
|
-
const sim = wasm_compare_hashes(hashes[i], hashes[i + 1]);
|
|
393
|
-
results.push({
|
|
394
|
-
from: `v${i+1}`,
|
|
395
|
-
to: `v${i+2}`,
|
|
396
|
-
similarity: sim
|
|
397
|
-
});
|
|
398
|
-
}
|
|
461
|
+
Achieved through explicit endianness handling and deterministic hash seeding.
|
|
399
462
|
|
|
400
|
-
|
|
401
|
-
}
|
|
402
|
-
```
|
|
463
|
+
---
|
|
403
464
|
|
|
404
465
|
## Framework Compatibility
|
|
405
466
|
|
|
406
|
-
|
|
467
|
+
Works seamlessly with all modern JavaScript frameworks and build tools:
|
|
407
468
|
|
|
408
469
|
- **React**: Vite, Create React App, Next.js, Remix
|
|
409
470
|
- **Angular**: Angular CLI (v12+)
|
|
@@ -411,91 +472,63 @@ LavinHash works seamlessly with all modern JavaScript frameworks and build tools
|
|
|
411
472
|
- **Svelte**: SvelteKit, Vite
|
|
412
473
|
- **Build Tools**: Webpack 5+, Vite, Rollup, Parcel, esbuild
|
|
413
474
|
|
|
414
|
-
|
|
475
|
+
---
|
|
415
476
|
|
|
416
|
-
##
|
|
477
|
+
## TypeScript Support
|
|
417
478
|
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
```bash
|
|
421
|
-
# Install wasm-pack
|
|
422
|
-
cargo install wasm-pack
|
|
423
|
-
|
|
424
|
-
# Build for modern bundlers (React, Angular, Vue, etc.)
|
|
425
|
-
wasm-pack build --target bundler --out-dir pkg --out-name lavinhash
|
|
479
|
+
Full TypeScript definitions included:
|
|
426
480
|
|
|
427
|
-
|
|
481
|
+
```typescript
|
|
482
|
+
export function wasm_generate_hash(data: Uint8Array): Uint8Array;
|
|
483
|
+
export function wasm_compare_hashes(hash_a: Uint8Array, hash_b: Uint8Array): number;
|
|
484
|
+
export function wasm_compare_data(data_a: Uint8Array, data_b: Uint8Array): number;
|
|
428
485
|
```
|
|
429
486
|
|
|
430
|
-
|
|
487
|
+
---
|
|
431
488
|
|
|
432
|
-
|
|
489
|
+
## Building from Source
|
|
433
490
|
|
|
434
491
|
```bash
|
|
435
|
-
#
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
# Run tests with output
|
|
439
|
-
cargo test -- --nocapture
|
|
440
|
-
|
|
441
|
-
# Run specific test
|
|
442
|
-
cargo test test_generate_hash_basic
|
|
443
|
-
```
|
|
444
|
-
|
|
445
|
-
### Benchmarks
|
|
446
|
-
|
|
447
|
-
```bash
|
|
448
|
-
# Run benchmarks
|
|
449
|
-
cargo bench
|
|
450
|
-
```
|
|
492
|
+
# Clone repository
|
|
493
|
+
git clone https://github.com/RafaCalRob/LavinHash.git
|
|
494
|
+
cd LavinHash
|
|
451
495
|
|
|
452
|
-
|
|
496
|
+
# Build Rust library
|
|
497
|
+
cargo build --release
|
|
453
498
|
|
|
454
|
-
|
|
499
|
+
# Build WASM for npm
|
|
500
|
+
cargo install wasm-pack
|
|
501
|
+
wasm-pack build --target bundler --out-dir pkg --out-name lavinhash
|
|
455
502
|
|
|
503
|
+
# The compiled files will be in pkg/
|
|
456
504
|
```
|
|
457
|
-
Offset | Field | Type | Size
|
|
458
|
-
-------|------------------|----------|-------------
|
|
459
|
-
0x00 | Magic | u8 | 1 byte (0x48)
|
|
460
|
-
0x01 | Version | u8 | 1 byte (0x01)
|
|
461
|
-
0x02 | Struct Length | u16 LE | 2 bytes
|
|
462
|
-
0x04 | Content Bloom | u64[128] | 1024 bytes
|
|
463
|
-
0x404 | Structural Data | u8[] | Variable
|
|
464
|
-
```
|
|
465
|
-
|
|
466
|
-
**Cross-Platform Determinism:**
|
|
467
|
-
- Identical input produces identical hash on all platforms
|
|
468
|
-
- Little-endian byte ordering
|
|
469
|
-
- IEEE 754 floating-point compliance
|
|
470
|
-
|
|
471
|
-
**Thread Safety:**
|
|
472
|
-
- Hash generation is thread-safe
|
|
473
|
-
- Parallel processing uses Rayon for data parallelism
|
|
474
|
-
- No global state or locks
|
|
475
505
|
|
|
476
|
-
|
|
506
|
+
---
|
|
477
507
|
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
- `basic_usage.rs` - Rust usage examples
|
|
481
|
-
- `javascript_example.js` - Node.js integration
|
|
482
|
-
- `browser_example.html` - Browser-based demo
|
|
508
|
+
## License
|
|
483
509
|
|
|
484
|
-
|
|
510
|
+
MIT License - see [LICENSE](LICENSE) file for details.
|
|
485
511
|
|
|
486
|
-
|
|
487
|
-
- **Technical Specification**: See `docs/TECHNICAL.md` in the repository
|
|
488
|
-
- **Contributing Guide**: See `CONTRIBUTING.md`
|
|
512
|
+
---
|
|
489
513
|
|
|
490
|
-
##
|
|
514
|
+
## Links
|
|
491
515
|
|
|
492
|
-
|
|
516
|
+
- **npm Package**: https://www.npmjs.com/package/lavinhash
|
|
517
|
+
- **GitHub Repository**: https://github.com/RafaCalRob/LavinHash
|
|
518
|
+
- **Live Demo**: http://localhost:4002/lavinhash/demo
|
|
519
|
+
- **Issue Tracker**: https://github.com/RafaCalRob/LavinHash/issues
|
|
493
520
|
|
|
494
|
-
|
|
521
|
+
---
|
|
495
522
|
|
|
496
|
-
|
|
523
|
+
## Citation
|
|
497
524
|
|
|
498
|
-
|
|
525
|
+
If you use LavinHash in academic work, please cite:
|
|
499
526
|
|
|
500
|
-
|
|
501
|
-
|
|
527
|
+
```bibtex
|
|
528
|
+
@software{lavinhash2024,
|
|
529
|
+
title = {LavinHash: Dual-Layer Adaptive Hashing for File Similarity Detection},
|
|
530
|
+
author = {LavinHash Contributors},
|
|
531
|
+
year = {2024},
|
|
532
|
+
url = {https://github.com/RafaCalRob/LavinHash}
|
|
533
|
+
}
|
|
534
|
+
```
|
package/package.json
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
"LavinHash Contributors"
|
|
6
6
|
],
|
|
7
7
|
"description": "High-performance fuzzy hashing library implementing the DLAH (Dual-Layer Adaptive Hashing) algorithm",
|
|
8
|
-
"version": "1.0.
|
|
8
|
+
"version": "1.0.2",
|
|
9
9
|
"license": "MIT",
|
|
10
10
|
"repository": {
|
|
11
11
|
"type": "git",
|
|
@@ -43,4 +43,4 @@
|
|
|
43
43
|
"vue",
|
|
44
44
|
"typescript"
|
|
45
45
|
]
|
|
46
|
-
}
|
|
46
|
+
}
|