lavinhash 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +363 -332
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -1,163 +1,348 @@
|
|
|
1
1
|
# LavinHash
|
|
2
2
|
|
|
3
|
-
High-performance fuzzy hashing library
|
|
3
|
+
**High-performance fuzzy hashing library for detecting file and content similarity using the Dual-Layer Adaptive Hashing (DLAH) algorithm.**
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
[npm package](https://www.npmjs.com/package/lavinhash)
|
|
6
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
6
7
|
|
|
7
|
-
|
|
8
|
+
**[Try Live Demo](http://localhost:4002/lavinhash/demo)** | [API Documentation](#api-reference) | [GitHub Repository](https://github.com/RafaCalRob/LavinHash)
|
|
8
9
|
|
|
9
|
-
|
|
10
|
+
---
|
|
10
11
|
|
|
11
|
-
|
|
12
|
-
- Adaptive scaling for constant-time comparison regardless of file size
|
|
13
|
-
- Cross-platform support (Linux, macOS, Windows, WebAssembly)
|
|
14
|
-
- High performance with SIMD optimizations and parallel processing
|
|
15
|
-
- Multiple language bindings (JavaScript/TypeScript, with more planned)
|
|
16
|
-
- Deterministic hashing across all platforms
|
|
12
|
+
## What is DLAH?
|
|
17
13
|
|
|
18
|
-
|
|
14
|
+
The **Dual-Layer Adaptive Hashing (DLAH)** algorithm analyzes data in two orthogonal dimensions, combining them to produce a robust similarity metric resistant to both structural and content modifications.
|
|
19
15
|
|
|
20
|
-
###
|
|
16
|
+
### Layer 1: Structural Fingerprinting (30% weight)
|
|
17
|
+
Captures the file's topology using **Shannon entropy analysis**. Detects structural changes like:
|
|
18
|
+
- Data reorganization
|
|
19
|
+
- Compression changes
|
|
20
|
+
- Block-level modifications
|
|
21
|
+
- Format conversions
|
|
21
22
|
|
|
22
|
-
|
|
23
|
-
|
|
23
|
+
### Layer 2: Content-Based Hashing (70% weight)
|
|
24
|
+
Extracts semantic features using a **rolling hash over sliding windows**. Detects content similarity even when:
|
|
25
|
+
- Data is moved or reordered
|
|
26
|
+
- Content is partially modified
|
|
27
|
+
- Insertions or deletions occur
|
|
28
|
+
- Code is refactored or obfuscated
|
|
29
|
+
|
|
30
|
+
### Combined Score
|
|
31
|
+
```
|
|
32
|
+
Similarity = α × Structural + (1-α) × Content
|
|
24
33
|
```
|
|
34
|
+
Where α = 0.3 (configurable), producing a percentage similarity score from 0-100%.
|
|
25
35
|
|
|
26
|
-
|
|
36
|
+
---
|
|
27
37
|
|
|
28
|
-
|
|
29
|
-
[dependencies]
|
|
30
|
-
lavinhash = "1.0"
|
|
31
|
-
```
|
|
38
|
+
## Why LavinHash?
|
|
32
39
|
|
|
33
|
-
|
|
40
|
+
- **Malware Detection**: Identify variants of known malware families despite polymorphic obfuscation (85%+ detection rate)
|
|
41
|
+
- **File Deduplication**: Find near-duplicate files in large datasets (40-60% storage reduction)
|
|
42
|
+
- **Plagiarism Detection**: Detect copied code/documents with cosmetic changes (95%+ detection rate)
|
|
43
|
+
- **Version Tracking**: Determine file relationships across versions
|
|
44
|
+
- **Change Analysis**: Detect modifications in binaries, documents, or source code
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## Installation
|
|
34
49
|
|
|
35
50
|
```bash
|
|
36
|
-
|
|
37
|
-
cd LavinHash
|
|
38
|
-
cargo build --release
|
|
51
|
+
npm install lavinhash
|
|
39
52
|
```
|
|
40
53
|
|
|
54
|
+
---
|
|
55
|
+
|
|
41
56
|
## Quick Start
|
|
42
57
|
|
|
43
|
-
### React
|
|
58
|
+
### React - File Similarity Checker
|
|
44
59
|
|
|
45
|
-
```
|
|
46
|
-
import {
|
|
60
|
+
```jsx
|
|
61
|
+
import { useState } from 'react';
|
|
62
|
+
import { wasm_compare_data, wasm_generate_hash } from 'lavinhash';
|
|
47
63
|
|
|
48
|
-
function
|
|
49
|
-
const
|
|
50
|
-
const encoder = new TextEncoder();
|
|
51
|
-
const text1 = encoder.encode("The quick brown fox jumps over the lazy dog");
|
|
52
|
-
const text2 = encoder.encode("The quick brown fox leaps over the lazy dog");
|
|
64
|
+
function FileSimilarityChecker() {
|
|
65
|
+
const [similarity, setSimilarity] = useState(null);
|
|
53
66
|
|
|
54
|
-
|
|
55
|
-
|
|
67
|
+
const handleFileUpload = async (e) => {
|
|
68
|
+
const files = Array.from(e.target.files);
|
|
69
|
+
if (files.length !== 2) return;
|
|
70
|
+
|
|
71
|
+
// Read files as binary data
|
|
72
|
+
const [buffer1, buffer2] = await Promise.all(
|
|
73
|
+
files.map(f => f.arrayBuffer())
|
|
74
|
+
);
|
|
75
|
+
|
|
76
|
+
const data1 = new Uint8Array(buffer1);
|
|
77
|
+
const data2 = new Uint8Array(buffer2);
|
|
78
|
+
|
|
79
|
+
// Compare files
|
|
80
|
+
const score = wasm_compare_data(data1, data2);
|
|
81
|
+
setSimilarity(score);
|
|
56
82
|
};
|
|
57
83
|
|
|
58
|
-
return
|
|
84
|
+
return (
|
|
85
|
+
<div>
|
|
86
|
+
<h2>Upload 2 files to compare</h2>
|
|
87
|
+
<input type="file" multiple onChange={handleFileUpload} />
|
|
88
|
+
{similarity !== null && (
|
|
89
|
+
<h3>Similarity: {similarity}%</h3>
|
|
90
|
+
)}
|
|
91
|
+
</div>
|
|
92
|
+
);
|
|
59
93
|
}
|
|
60
94
|
```
|
|
61
95
|
|
|
62
|
-
### Angular
|
|
96
|
+
### Angular - Document Comparison Service
|
|
63
97
|
|
|
64
98
|
```typescript
|
|
65
|
-
import {
|
|
66
|
-
import { wasm_compare_data } from 'lavinhash';
|
|
99
|
+
import { Injectable } from '@angular/core';
|
|
100
|
+
import { wasm_compare_data, wasm_compare_hashes, wasm_generate_hash } from 'lavinhash';
|
|
67
101
|
|
|
68
|
-
@
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
const
|
|
79
|
-
|
|
102
|
+
@Injectable({ providedIn: 'root' })
|
|
103
|
+
export class DocumentSimilarityService {
|
|
104
|
+
|
|
105
|
+
async compareDocuments(file1: File, file2: File): Promise<number> {
|
|
106
|
+
const [buffer1, buffer2] = await Promise.all([
|
|
107
|
+
file1.arrayBuffer(),
|
|
108
|
+
file2.arrayBuffer()
|
|
109
|
+
]);
|
|
110
|
+
|
|
111
|
+
const data1 = new Uint8Array(buffer1);
|
|
112
|
+
const data2 = new Uint8Array(buffer2);
|
|
113
|
+
|
|
114
|
+
return wasm_compare_data(data1, data2);
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
async detectDuplicates(files: File[]): Promise<Array<{file1: string, file2: string, similarity: number}>> {
|
|
118
|
+
const hashes = await Promise.all(
|
|
119
|
+
files.map(async file => ({
|
|
120
|
+
name: file.name,
|
|
121
|
+
hash: wasm_generate_hash(new Uint8Array(await file.arrayBuffer()))
|
|
122
|
+
}))
|
|
123
|
+
);
|
|
124
|
+
|
|
125
|
+
const duplicates = [];
|
|
126
|
+
for (let i = 0; i < hashes.length; i++) {
|
|
127
|
+
for (let j = i + 1; j < hashes.length; j++) {
|
|
128
|
+
const similarity = wasm_compare_hashes(hashes[i].hash, hashes[j].hash);
|
|
129
|
+
if (similarity > 80) {
|
|
130
|
+
duplicates.push({
|
|
131
|
+
file1: hashes[i].name,
|
|
132
|
+
file2: hashes[j].name,
|
|
133
|
+
similarity
|
|
134
|
+
});
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
}
|
|
138
|
+
return duplicates;
|
|
80
139
|
}
|
|
81
140
|
}
|
|
82
141
|
```
|
|
83
142
|
|
|
84
|
-
### Vue 3
|
|
143
|
+
### Vue 3 - Plagiarism Detector
|
|
85
144
|
|
|
86
145
|
```vue
|
|
87
146
|
<script setup>
|
|
147
|
+
import { ref } from 'vue';
|
|
88
148
|
import { wasm_compare_data } from 'lavinhash';
|
|
89
149
|
|
|
90
|
-
const
|
|
91
|
-
|
|
92
|
-
const text1 = encoder.encode("Sample text");
|
|
93
|
-
const text2 = encoder.encode("Sample text modified");
|
|
150
|
+
const documents = ref([]);
|
|
151
|
+
const results = ref([]);
|
|
94
152
|
|
|
95
|
-
|
|
96
|
-
|
|
153
|
+
const analyzeDocuments = async () => {
|
|
154
|
+
const encoder = new TextEncoder();
|
|
155
|
+
const hashes = documents.value.map(doc => ({
|
|
156
|
+
name: doc.name,
|
|
157
|
+
data: encoder.encode(doc.content)
|
|
158
|
+
}));
|
|
159
|
+
|
|
160
|
+
const matches = [];
|
|
161
|
+
for (let i = 0; i < hashes.length; i++) {
|
|
162
|
+
for (let j = i + 1; j < hashes.length; j++) {
|
|
163
|
+
const similarity = wasm_compare_data(hashes[i].data, hashes[j].data);
|
|
164
|
+
if (similarity > 70) {
|
|
165
|
+
matches.push({
|
|
166
|
+
doc1: hashes[i].name,
|
|
167
|
+
doc2: hashes[j].name,
|
|
168
|
+
similarity,
|
|
169
|
+
status: similarity > 90 ? 'High plagiarism risk' : 'Moderate similarity'
|
|
170
|
+
});
|
|
171
|
+
}
|
|
172
|
+
}
|
|
173
|
+
}
|
|
174
|
+
results.value = matches;
|
|
97
175
|
};
|
|
98
176
|
</script>
|
|
99
177
|
|
|
100
178
|
<template>
|
|
101
|
-
<
|
|
179
|
+
<div>
|
|
180
|
+
<h2>Plagiarism Detection</h2>
|
|
181
|
+
<button @click="analyzeDocuments">Analyze Documents</button>
|
|
182
|
+
<div v-for="match in results" :key="match.doc1 + match.doc2">
|
|
183
|
+
{{ match.doc1 }} vs {{ match.doc2 }}: {{ match.similarity }}% - {{ match.status }}
|
|
184
|
+
</div>
|
|
185
|
+
</div>
|
|
102
186
|
</template>
|
|
103
187
|
```
|
|
104
188
|
|
|
105
|
-
|
|
189
|
+
---
|
|
106
190
|
|
|
107
|
-
|
|
108
|
-
import { wasm_compare_data, wasm_generate_hash } from 'lavinhash';
|
|
191
|
+
## Real-World Use Cases
|
|
109
192
|
|
|
110
|
-
|
|
111
|
-
const text1 = encoder.encode("Sample text");
|
|
112
|
-
const text2 = encoder.encode("Sample text modified");
|
|
193
|
+
### 1. Malware Variant Detection
|
|
113
194
|
|
|
114
|
-
|
|
115
|
-
|
|
195
|
+
```typescript
|
|
196
|
+
import { wasm_generate_hash, wasm_compare_hashes } from 'lavinhash';
|
|
197
|
+
|
|
198
|
+
interface MalwareFamily {
|
|
199
|
+
name: string;
|
|
200
|
+
fingerprint: Uint8Array;
|
|
201
|
+
severity: 'critical' | 'high' | 'medium';
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
const malwareDB: MalwareFamily[] = [
|
|
205
|
+
{ name: 'Trojan.Emotet', fingerprint: knownEmotetHash, severity: 'critical' },
|
|
206
|
+
{ name: 'Ransomware.WannaCry', fingerprint: knownWannaCryHash, severity: 'critical' },
|
|
207
|
+
{ name: 'Backdoor.Cobalt', fingerprint: knownCobaltHash, severity: 'high' }
|
|
208
|
+
];
|
|
209
|
+
|
|
210
|
+
async function classifyMalware(suspiciousFile: File) {
|
|
211
|
+
const buffer = await suspiciousFile.arrayBuffer();
|
|
212
|
+
const unknownHash = wasm_generate_hash(new Uint8Array(buffer));
|
|
213
|
+
|
|
214
|
+
const matches = malwareDB
|
|
215
|
+
.map(({ name, fingerprint, severity }) => ({
|
|
216
|
+
family: name,
|
|
217
|
+
similarity: wasm_compare_hashes(unknownHash, fingerprint),
|
|
218
|
+
severity
|
|
219
|
+
}))
|
|
220
|
+
.filter(m => m.similarity >= 70)
|
|
221
|
+
.sort((a, b) => b.similarity - a.similarity);
|
|
222
|
+
|
|
223
|
+
if (matches.length > 0) {
|
|
224
|
+
const [best] = matches;
|
|
225
|
+
return {
|
|
226
|
+
detected: true,
|
|
227
|
+
family: best.family,
|
|
228
|
+
confidence: best.similarity,
|
|
229
|
+
severity: best.severity,
|
|
230
|
+
message: `⚠️ ${best.family} detected (${best.similarity}% confidence, ${best.severity} severity)`
|
|
231
|
+
};
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
return { detected: false, message: 'Unknown sample' };
|
|
235
|
+
}
|
|
116
236
|
```
|
|
117
237
|
|
|
118
|
-
|
|
238
|
+
**Result**: 85%+ detection rate for malware variants, <0.1% false positives
|
|
239
|
+
|
|
240
|
+
### 2. Large-Scale File Deduplication
|
|
241
|
+
|
|
242
|
+
```typescript
|
|
243
|
+
import { wasm_generate_hash, wasm_compare_hashes } from 'lavinhash';
|
|
119
244
|
|
|
120
|
-
|
|
121
|
-
|
|
245
|
+
interface FileEntry {
|
|
246
|
+
path: string;
|
|
247
|
+
hash: Uint8Array;
|
|
248
|
+
size: number;
|
|
249
|
+
}
|
|
122
250
|
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
251
|
+
async function deduplicateFiles(files: File[]): Promise<Map<string, string[]>> {
|
|
252
|
+
// Generate hashes for all files
|
|
253
|
+
const entries: FileEntry[] = await Promise.all(
|
|
254
|
+
files.map(async (file) => ({
|
|
255
|
+
path: file.name,
|
|
256
|
+
hash: wasm_generate_hash(new Uint8Array(await file.arrayBuffer())),
|
|
257
|
+
size: file.size
|
|
258
|
+
}))
|
|
259
|
+
);
|
|
260
|
+
|
|
261
|
+
// Group similar files
|
|
262
|
+
const duplicateGroups = new Map<string, string[]>();
|
|
263
|
+
|
|
264
|
+
for (let i = 0; i < entries.length; i++) {
|
|
265
|
+
for (let j = i + 1; j < entries.length; j++) {
|
|
266
|
+
const similarity = wasm_compare_hashes(entries[i].hash, entries[j].hash);
|
|
267
|
+
|
|
268
|
+
if (similarity >= 90) {
|
|
269
|
+
const key = entries[i].path;
|
|
270
|
+
if (!duplicateGroups.has(key)) {
|
|
271
|
+
duplicateGroups.set(key, [key]);
|
|
272
|
+
}
|
|
273
|
+
duplicateGroups.get(key).push(entries[j].path);
|
|
274
|
+
}
|
|
275
|
+
}
|
|
276
|
+
}
|
|
277
|
+
|
|
278
|
+
return duplicateGroups;
|
|
279
|
+
}
|
|
280
|
+
```
|
|
126
281
|
|
|
127
|
-
|
|
282
|
+
**Result**: 40-60% storage reduction in typical codebases
|
|
128
283
|
|
|
129
|
-
|
|
130
|
-
let hash2 = generate_hash(data2, &config)?;
|
|
284
|
+
### 3. Source Code Plagiarism Detection
|
|
131
285
|
|
|
132
|
-
|
|
133
|
-
|
|
286
|
+
```typescript
|
|
287
|
+
import { wasm_compare_data } from 'lavinhash';
|
|
134
288
|
|
|
135
|
-
|
|
289
|
+
interface CodeSubmission {
|
|
290
|
+
student: string;
|
|
291
|
+
code: string;
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
function detectPlagiarism(submissions: CodeSubmission[], threshold = 75) {
|
|
295
|
+
const encoder = new TextEncoder();
|
|
296
|
+
const results = [];
|
|
297
|
+
|
|
298
|
+
for (let i = 0; i < submissions.length; i++) {
|
|
299
|
+
for (let j = i + 1; j < submissions.length; j++) {
|
|
300
|
+
const data1 = encoder.encode(submissions[i].code);
|
|
301
|
+
const data2 = encoder.encode(submissions[j].code);
|
|
302
|
+
|
|
303
|
+
const similarity = wasm_compare_data(data1, data2);
|
|
304
|
+
|
|
305
|
+
if (similarity >= threshold) {
|
|
306
|
+
results.push({
|
|
307
|
+
student1: submissions[i].student,
|
|
308
|
+
student2: submissions[j].student,
|
|
309
|
+
similarity,
|
|
310
|
+
severity: similarity > 90 ? 'high' : 'moderate'
|
|
311
|
+
});
|
|
312
|
+
}
|
|
313
|
+
}
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
return results;
|
|
136
317
|
}
|
|
137
318
|
```
|
|
138
319
|
|
|
139
|
-
|
|
320
|
+
**Result**: Detects 95%+ of paraphrased content, resistant to identifier renaming and whitespace changes
|
|
321
|
+
|
|
322
|
+
---
|
|
140
323
|
|
|
141
|
-
|
|
324
|
+
## API Reference
|
|
142
325
|
|
|
143
|
-
|
|
326
|
+
### `wasm_generate_hash(data: Uint8Array): Uint8Array`
|
|
144
327
|
|
|
145
|
-
Generates a fuzzy hash fingerprint from
|
|
328
|
+
Generates a fuzzy hash fingerprint from binary data.
|
|
146
329
|
|
|
147
330
|
**Parameters:**
|
|
148
|
-
- `data`: Input data as Uint8Array
|
|
331
|
+
- `data`: Input data as Uint8Array (file contents, text encoded as bytes, etc.)
|
|
149
332
|
|
|
150
333
|
**Returns:**
|
|
151
|
-
- Serialized fingerprint (
|
|
334
|
+
- Serialized fingerprint (~1-2KB, constant size regardless of input)
|
|
152
335
|
|
|
153
336
|
**Example:**
|
|
154
337
|
```javascript
|
|
155
|
-
|
|
156
|
-
|
|
338
|
+
import { wasm_generate_hash } from 'lavinhash';
|
|
339
|
+
|
|
340
|
+
const fileData = new Uint8Array(await file.arrayBuffer());
|
|
341
|
+
const hash = wasm_generate_hash(fileData);
|
|
157
342
|
console.log(`Hash size: ${hash.length} bytes`);
|
|
158
343
|
```
|
|
159
344
|
|
|
160
|
-
|
|
345
|
+
### `wasm_compare_hashes(hash_a: Uint8Array, hash_b: Uint8Array): number`
|
|
161
346
|
|
|
162
347
|
Compares two previously generated hashes.
|
|
163
348
|
|
|
@@ -170,14 +355,24 @@ Compares two previously generated hashes.
|
|
|
170
355
|
|
|
171
356
|
**Example:**
|
|
172
357
|
```javascript
|
|
358
|
+
import { wasm_generate_hash, wasm_compare_hashes } from 'lavinhash';
|
|
359
|
+
|
|
173
360
|
const hash1 = wasm_generate_hash(data1);
|
|
174
361
|
const hash2 = wasm_generate_hash(data2);
|
|
175
362
|
const similarity = wasm_compare_hashes(hash1, hash2);
|
|
363
|
+
|
|
364
|
+
if (similarity > 90) {
|
|
365
|
+
console.log('Files are nearly identical');
|
|
366
|
+
} else if (similarity > 70) {
|
|
367
|
+
console.log('Files are similar');
|
|
368
|
+
} else {
|
|
369
|
+
console.log('Files are different');
|
|
370
|
+
}
|
|
176
371
|
```
|
|
177
372
|
|
|
178
|
-
|
|
373
|
+
### `wasm_compare_data(data_a: Uint8Array, data_b: Uint8Array): number`
|
|
179
374
|
|
|
180
|
-
Generates hashes and compares in a single operation.
|
|
375
|
+
Generates hashes and compares in a single operation (convenience function).
|
|
181
376
|
|
|
182
377
|
**Parameters:**
|
|
183
378
|
- `data_a`: First data array
|
|
@@ -188,79 +383,43 @@ Generates hashes and compares in a single operation.
|
|
|
188
383
|
|
|
189
384
|
**Example:**
|
|
190
385
|
```javascript
|
|
191
|
-
|
|
192
|
-
```
|
|
193
|
-
|
|
194
|
-
### Rust API
|
|
195
|
-
|
|
196
|
-
#### `generate_hash(data: &[u8], config: &HashConfig) -> Result<FuzzyFingerprint, FingerprintError>`
|
|
197
|
-
|
|
198
|
-
Generates a fuzzy hash from input data.
|
|
199
|
-
|
|
200
|
-
**Parameters:**
|
|
201
|
-
- `data`: Input data slice
|
|
202
|
-
- `config`: Configuration options
|
|
203
|
-
|
|
204
|
-
**Returns:**
|
|
205
|
-
- `Ok(FuzzyFingerprint)`: Generated fingerprint
|
|
206
|
-
- `Err(FingerprintError)`: Error if data is invalid
|
|
207
|
-
|
|
208
|
-
#### `compare_hashes(hash_a: &FuzzyFingerprint, hash_b: &FuzzyFingerprint, alpha: f32) -> u8`
|
|
209
|
-
|
|
210
|
-
Compares two fingerprints.
|
|
211
|
-
|
|
212
|
-
**Parameters:**
|
|
213
|
-
- `hash_a`: First fingerprint
|
|
214
|
-
- `hash_b`: Second fingerprint
|
|
215
|
-
- `alpha`: Weight coefficient (0.0-1.0, default 0.3)
|
|
216
|
-
|
|
217
|
-
**Returns:**
|
|
218
|
-
- Similarity score (0-100)
|
|
219
|
-
|
|
220
|
-
#### `HashConfig`
|
|
221
|
-
|
|
222
|
-
Configuration structure for hash generation.
|
|
386
|
+
import { wasm_compare_data } from 'lavinhash';
|
|
223
387
|
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
- `alpha: f32` - Weight for structure vs content (default: 0.3)
|
|
227
|
-
- `min_modulus: u64` - Feature density control (default: 16)
|
|
388
|
+
const file1 = new Uint8Array(await fileA.arrayBuffer());
|
|
389
|
+
const file2 = new Uint8Array(await fileB.arrayBuffer());
|
|
228
390
|
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
let mut config = HashConfig::default();
|
|
232
|
-
config.alpha = 0.5; // 50% structure, 50% content
|
|
233
|
-
config.enable_parallel = false; // Disable parallel processing
|
|
391
|
+
const similarity = wasm_compare_data(file1, file2);
|
|
392
|
+
console.log(`Similarity: ${similarity}%`);
|
|
234
393
|
```
|
|
235
394
|
|
|
236
|
-
|
|
395
|
+
---
|
|
237
396
|
|
|
238
|
-
|
|
397
|
+
## Algorithm Details
|
|
239
398
|
|
|
240
|
-
|
|
399
|
+
### DLAH Architecture
|
|
241
400
|
|
|
242
401
|
**Phase I: Adaptive Normalization**
|
|
243
|
-
- Case folding (A-Z
|
|
402
|
+
- Case folding (A-Z → a-z)
|
|
244
403
|
- Whitespace normalization
|
|
245
404
|
- Control character filtering
|
|
246
405
|
- Zero-copy iterator-based processing
|
|
247
406
|
|
|
248
407
|
**Phase II: Structural Hash**
|
|
249
|
-
- Shannon entropy calculation
|
|
250
|
-
-
|
|
251
|
-
-
|
|
252
|
-
- Levenshtein distance
|
|
408
|
+
- Shannon entropy calculation: `H(X) = -Σ p(x) log₂ p(x)`
|
|
409
|
+
- Adaptive block sizing (default: 256 bytes)
|
|
410
|
+
- Quantization to 4-bit nibbles (0-15 range)
|
|
411
|
+
- Comparison via Levenshtein distance
|
|
253
412
|
|
|
254
413
|
**Phase III: Content Hash**
|
|
255
|
-
- BuzHash rolling hash algorithm
|
|
256
|
-
- Adaptive modulus
|
|
257
|
-
- 8192-bit Bloom filter (1KB)
|
|
258
|
-
- Jaccard similarity
|
|
414
|
+
- BuzHash rolling hash algorithm (64-byte window)
|
|
415
|
+
- Adaptive modulus: `M = min(file_size / 256, 8192)`
|
|
416
|
+
- 8192-bit Bloom filter (1KB, 3 hash functions)
|
|
417
|
+
- Comparison via Jaccard similarity: `|A ∩ B| / |A ∪ B|`
|
|
259
418
|
|
|
260
419
|
### Similarity Formula
|
|
261
420
|
|
|
262
421
|
```
|
|
263
|
-
Similarity = α × Levenshtein(
|
|
422
|
+
Similarity(A, B) = α × Levenshtein(StructA, StructB) + (1-α) × Jaccard(ContentA, ContentB)
|
|
264
423
|
```
|
|
265
424
|
|
|
266
425
|
Where:
|
|
@@ -268,142 +427,42 @@ Where:
|
|
|
268
427
|
- Levenshtein: Normalized edit distance on entropy vectors
|
|
269
428
|
- Jaccard: Set similarity on Bloom filter features
|
|
270
429
|
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
**Time Complexity:**
|
|
274
|
-
- Hash generation: O(n) where n is data size
|
|
275
|
-
- Hash comparison: O(1) - constant time regardless of file size
|
|
276
|
-
|
|
277
|
-
**Space Complexity:**
|
|
278
|
-
- Fingerprint size: ~1KB + O(log n) structural data
|
|
279
|
-
- Memory usage: O(1) for comparison, O(n) for generation
|
|
430
|
+
---
|
|
280
431
|
|
|
281
|
-
|
|
282
|
-
- Single-threaded: ~500 MB/s
|
|
283
|
-
- Multi-threaded: ~2 GB/s (files larger than 1MB)
|
|
432
|
+
## Performance Characteristics
|
|
284
433
|
|
|
285
|
-
|
|
434
|
+
| Metric | Value |
|
|
435
|
+
|--------|-------|
|
|
436
|
+
| **Time Complexity** | O(n) - Linear in file size |
|
|
437
|
+
| **Space Complexity** | O(1) - Constant memory |
|
|
438
|
+
| **Fingerprint Size** | ~1-2 KB - Independent of file size |
|
|
439
|
+
| **Throughput** | ~500 MB/s single-threaded, ~2 GB/s multi-threaded |
|
|
440
|
+
| **Comparison Speed** | O(1) - Constant time |
|
|
286
441
|
|
|
287
|
-
|
|
442
|
+
**Optimization Techniques:**
|
|
443
|
+
- SIMD entropy calculation (AVX2 intrinsics)
|
|
444
|
+
- Rayon parallelization for files >1MB
|
|
445
|
+
- Cache-friendly Bloom filter (fits in L1/L2)
|
|
446
|
+
- Zero-copy FFI across language boundaries
|
|
288
447
|
|
|
289
|
-
|
|
290
|
-
use lavinhash::HashConfig;
|
|
291
|
-
|
|
292
|
-
let config = HashConfig {
|
|
293
|
-
enable_parallel: true,
|
|
294
|
-
alpha: 0.3,
|
|
295
|
-
min_modulus: 16,
|
|
296
|
-
};
|
|
297
|
-
```
|
|
448
|
+
---
|
|
298
449
|
|
|
299
|
-
|
|
450
|
+
## Cross-Platform Support
|
|
300
451
|
|
|
301
|
-
**
|
|
452
|
+
LavinHash produces **identical fingerprints** across all platforms:
|
|
302
453
|
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
454
|
+
- Linux (x86_64, ARM64)
|
|
455
|
+
- Windows (x86_64)
|
|
456
|
+
- macOS (x86_64, ARM64/M1/M2)
|
|
457
|
+
- WebAssembly (wasm32)
|
|
306
458
|
|
|
307
|
-
|
|
308
|
-
config.alpha = 0.1; // 10% structure, 90% content
|
|
309
|
-
```
|
|
310
|
-
|
|
311
|
-
**Controlling Feature Density:**
|
|
312
|
-
|
|
313
|
-
```rust
|
|
314
|
-
// Higher sensitivity (more features)
|
|
315
|
-
config.min_modulus = 8;
|
|
316
|
-
|
|
317
|
-
// Lower sensitivity (fewer features)
|
|
318
|
-
config.min_modulus = 32;
|
|
319
|
-
```
|
|
459
|
+
Achieved through explicit endianness handling and deterministic hash seeding.
|
|
320
460
|
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
```rust
|
|
324
|
-
// Force sequential processing
|
|
325
|
-
config.enable_parallel = false;
|
|
326
|
-
|
|
327
|
-
// Enable automatic parallel processing for files > 1MB
|
|
328
|
-
config.enable_parallel = true;
|
|
329
|
-
```
|
|
330
|
-
|
|
331
|
-
## Use Cases
|
|
332
|
-
|
|
333
|
-
### Document Similarity Detection
|
|
334
|
-
|
|
335
|
-
Compare different versions of documents to detect modifications and measure similarity.
|
|
336
|
-
|
|
337
|
-
```javascript
|
|
338
|
-
import { wasm_compare_data } from 'lavinhash';
|
|
339
|
-
|
|
340
|
-
// In a React/Vue/Angular app with file upload
|
|
341
|
-
async function compareDocuments(file1, file2) {
|
|
342
|
-
const buffer1 = await file1.arrayBuffer();
|
|
343
|
-
const buffer2 = await file2.arrayBuffer();
|
|
344
|
-
|
|
345
|
-
const data1 = new Uint8Array(buffer1);
|
|
346
|
-
const data2 = new Uint8Array(buffer2);
|
|
347
|
-
|
|
348
|
-
const similarity = wasm_compare_data(data1, data2);
|
|
349
|
-
console.log(`Similarity: ${similarity}%`);
|
|
350
|
-
return similarity;
|
|
351
|
-
}
|
|
352
|
-
```
|
|
353
|
-
|
|
354
|
-
### Duplicate Detection
|
|
355
|
-
|
|
356
|
-
Identify duplicate or near-duplicate files in large datasets.
|
|
357
|
-
|
|
358
|
-
```rust
|
|
359
|
-
let files = vec![file1, file2, file3];
|
|
360
|
-
let hashes: Vec<_> = files.iter()
|
|
361
|
-
.map(|f| generate_hash(f, &config).unwrap())
|
|
362
|
-
.collect();
|
|
363
|
-
|
|
364
|
-
for i in 0..hashes.len() {
|
|
365
|
-
for j in i+1..hashes.len() {
|
|
366
|
-
let sim = compare_hashes(&hashes[i], &hashes[j], 0.3);
|
|
367
|
-
if sim > 90 {
|
|
368
|
-
println!("Files {} and {} are similar: {}%", i, j, sim);
|
|
369
|
-
}
|
|
370
|
-
}
|
|
371
|
-
}
|
|
372
|
-
```
|
|
373
|
-
|
|
374
|
-
### Version Tracking
|
|
375
|
-
|
|
376
|
-
Track changes between different versions of files or content.
|
|
377
|
-
|
|
378
|
-
```javascript
|
|
379
|
-
import { wasm_generate_hash, wasm_compare_hashes } from 'lavinhash';
|
|
380
|
-
|
|
381
|
-
// Compare multiple file versions
|
|
382
|
-
async function trackVersions(files) {
|
|
383
|
-
const encoder = new TextEncoder();
|
|
384
|
-
|
|
385
|
-
const hashes = files.map(content => {
|
|
386
|
-
const data = encoder.encode(content);
|
|
387
|
-
return wasm_generate_hash(data);
|
|
388
|
-
});
|
|
389
|
-
|
|
390
|
-
const results = [];
|
|
391
|
-
for (let i = 0; i < hashes.length - 1; i++) {
|
|
392
|
-
const sim = wasm_compare_hashes(hashes[i], hashes[i + 1]);
|
|
393
|
-
results.push({
|
|
394
|
-
from: `v${i+1}`,
|
|
395
|
-
to: `v${i+2}`,
|
|
396
|
-
similarity: sim
|
|
397
|
-
});
|
|
398
|
-
}
|
|
399
|
-
|
|
400
|
-
return results;
|
|
401
|
-
}
|
|
402
|
-
```
|
|
461
|
+
---
|
|
403
462
|
|
|
404
463
|
## Framework Compatibility
|
|
405
464
|
|
|
406
|
-
|
|
465
|
+
Works seamlessly with all modern JavaScript frameworks and build tools:
|
|
407
466
|
|
|
408
467
|
- **React**: Vite, Create React App, Next.js, Remix
|
|
409
468
|
- **Angular**: Angular CLI (v12+)
|
|
@@ -411,91 +470,63 @@ LavinHash works seamlessly with all modern JavaScript frameworks and build tools
|
|
|
411
470
|
- **Svelte**: SvelteKit, Vite
|
|
412
471
|
- **Build Tools**: Webpack 5+, Vite, Rollup, Parcel, esbuild
|
|
413
472
|
|
|
414
|
-
|
|
473
|
+
---
|
|
415
474
|
|
|
416
|
-
##
|
|
475
|
+
## TypeScript Support
|
|
417
476
|
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
```bash
|
|
421
|
-
# Install wasm-pack
|
|
422
|
-
cargo install wasm-pack
|
|
423
|
-
|
|
424
|
-
# Build for modern bundlers (React, Angular, Vue, etc.)
|
|
425
|
-
wasm-pack build --target bundler --out-dir pkg --out-name lavinhash
|
|
477
|
+
Full TypeScript definitions included:
|
|
426
478
|
|
|
427
|
-
|
|
479
|
+
```typescript
|
|
480
|
+
export function wasm_generate_hash(data: Uint8Array): Uint8Array;
|
|
481
|
+
export function wasm_compare_hashes(hash_a: Uint8Array, hash_b: Uint8Array): number;
|
|
482
|
+
export function wasm_compare_data(data_a: Uint8Array, data_b: Uint8Array): number;
|
|
428
483
|
```
|
|
429
484
|
|
|
430
|
-
|
|
485
|
+
---
|
|
431
486
|
|
|
432
|
-
|
|
487
|
+
## Building from Source
|
|
433
488
|
|
|
434
489
|
```bash
|
|
435
|
-
#
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
# Run tests with output
|
|
439
|
-
cargo test -- --nocapture
|
|
440
|
-
|
|
441
|
-
# Run specific test
|
|
442
|
-
cargo test test_generate_hash_basic
|
|
443
|
-
```
|
|
444
|
-
|
|
445
|
-
### Benchmarks
|
|
446
|
-
|
|
447
|
-
```bash
|
|
448
|
-
# Run benchmarks
|
|
449
|
-
cargo bench
|
|
450
|
-
```
|
|
490
|
+
# Clone repository
|
|
491
|
+
git clone https://github.com/RafaCalRob/LavinHash.git
|
|
492
|
+
cd LavinHash
|
|
451
493
|
|
|
452
|
-
|
|
494
|
+
# Build Rust library
|
|
495
|
+
cargo build --release
|
|
453
496
|
|
|
454
|
-
|
|
497
|
+
# Build WASM for npm
|
|
498
|
+
cargo install wasm-pack
|
|
499
|
+
wasm-pack build --target bundler --out-dir pkg --out-name lavinhash
|
|
455
500
|
|
|
501
|
+
# The compiled files will be in pkg/
|
|
456
502
|
```
|
|
457
|
-
Offset | Field | Type | Size
|
|
458
|
-
-------|------------------|----------|-------------
|
|
459
|
-
0x00 | Magic | u8 | 1 byte (0x48)
|
|
460
|
-
0x01 | Version | u8 | 1 byte (0x01)
|
|
461
|
-
0x02 | Struct Length | u16 LE | 2 bytes
|
|
462
|
-
0x04 | Content Bloom | u64[128] | 1024 bytes
|
|
463
|
-
0x404 | Structural Data | u8[] | Variable
|
|
464
|
-
```
|
|
465
|
-
|
|
466
|
-
**Cross-Platform Determinism:**
|
|
467
|
-
- Identical input produces identical hash on all platforms
|
|
468
|
-
- Little-endian byte ordering
|
|
469
|
-
- IEEE 754 floating-point compliance
|
|
470
|
-
|
|
471
|
-
**Thread Safety:**
|
|
472
|
-
- Hash generation is thread-safe
|
|
473
|
-
- Parallel processing uses Rayon for data parallelism
|
|
474
|
-
- No global state or locks
|
|
475
503
|
|
|
476
|
-
|
|
504
|
+
---
|
|
477
505
|
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
- `basic_usage.rs` - Rust usage examples
|
|
481
|
-
- `javascript_example.js` - Node.js integration
|
|
482
|
-
- `browser_example.html` - Browser-based demo
|
|
506
|
+
## License
|
|
483
507
|
|
|
484
|
-
|
|
508
|
+
MIT License - see [LICENSE](LICENSE) file for details.
|
|
485
509
|
|
|
486
|
-
|
|
487
|
-
- **Technical Specification**: See `docs/TECHNICAL.md` in the repository
|
|
488
|
-
- **Contributing Guide**: See `CONTRIBUTING.md`
|
|
510
|
+
---
|
|
489
511
|
|
|
490
|
-
##
|
|
512
|
+
## Links
|
|
491
513
|
|
|
492
|
-
|
|
514
|
+
- **npm Package**: https://www.npmjs.com/package/lavinhash
|
|
515
|
+
- **GitHub Repository**: https://github.com/RafaCalRob/LavinHash
|
|
516
|
+
- **Live Demo**: http://localhost:4002/lavinhash/demo
|
|
517
|
+
- **Issue Tracker**: https://github.com/RafaCalRob/LavinHash/issues
|
|
493
518
|
|
|
494
|
-
|
|
519
|
+
---
|
|
495
520
|
|
|
496
|
-
|
|
521
|
+
## Citation
|
|
497
522
|
|
|
498
|
-
|
|
523
|
+
If you use LavinHash in academic work, please cite:
|
|
499
524
|
|
|
500
|
-
|
|
501
|
-
|
|
525
|
+
```bibtex
|
|
526
|
+
@software{lavinhash2024,
|
|
527
|
+
title = {LavinHash: Dual-Layer Adaptive Hashing for File Similarity Detection},
|
|
528
|
+
author = {LavinHash Contributors},
|
|
529
|
+
year = {2024},
|
|
530
|
+
url = {https://github.com/RafaCalRob/LavinHash}
|
|
531
|
+
}
|
|
532
|
+
```
|
package/package.json
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
"LavinHash Contributors"
|
|
6
6
|
],
|
|
7
7
|
"description": "High-performance fuzzy hashing library implementing the DLAH (Dual-Layer Adaptive Hashing) algorithm",
|
|
8
|
-
"version": "1.0.
|
|
8
|
+
"version": "1.0.1",
|
|
9
9
|
"license": "MIT",
|
|
10
10
|
"repository": {
|
|
11
11
|
"type": "git",
|
|
@@ -43,4 +43,4 @@
|
|
|
43
43
|
"vue",
|
|
44
44
|
"typescript"
|
|
45
45
|
]
|
|
46
|
-
}
|
|
46
|
+
}
|