mammoth-hasher 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +8 -0
- data/lib/mammoth-hasher.rb +16 -19
- data/test/test_mammoth-hasher.rb +24 -0
- metadata +8 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e8a53d70781a6348d58af444f42747ffcb700b64
|
4
|
+
data.tar.gz: caa0fff7b6d9053006daf8b1bcb2cd4e4f0e5c5f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4dcad29b2156bbe8343a2f5fd93c637b1ec447ffdb2f062b5fc6eb5dab3cc9857ab7c325aa29a3fe862c2ee2622f8f211e04d24f8e3da2204747bba2f2531947
|
7
|
+
data.tar.gz: 7b224f29d208466247a2a89b7e0e60190ab42a4b088bd01a72bbc982cdefbd6389f0369e3304c70838a6f5f6b591667e189f03cf4b73a3d6a45c91befb489a1d
|
data/Rakefile
ADDED
data/lib/mammoth-hasher.rb
CHANGED
@@ -13,7 +13,7 @@ class MammothHasher
|
|
13
13
|
# algorithm parameters
|
14
14
|
# WARNING: if you change them, the resulting hash will be different !
|
15
15
|
number_of_chunks = 100
|
16
|
-
length_of_chunks =
|
16
|
+
length_of_chunks = 4
|
17
17
|
|
18
18
|
# we get the file size (in bytes), used as PRNG (Pseudo Random Number Generator)
|
19
19
|
filesize = File.size filename
|
@@ -22,18 +22,22 @@ class MammothHasher
|
|
22
22
|
# the MD5 of the whole file than to apply our custom algorithm
|
23
23
|
if filesize <= number_of_chunks*length_of_chunks
|
24
24
|
file = File.open(filename, 'r')
|
25
|
-
|
25
|
+
hash = Digest::MD5.file(file).hexdigest
|
26
26
|
file.close
|
27
27
|
puts (Time.now - time_start).to_s + " seconds" if debug
|
28
|
-
return
|
28
|
+
return hash
|
29
29
|
end
|
30
30
|
|
31
31
|
# we initialize the PRNG
|
32
32
|
prng = Random.new filesize
|
33
33
|
|
34
|
-
# we get
|
35
|
-
|
36
|
-
|
34
|
+
# we always get a chunk at the offset 0 (beginning of file)
|
35
|
+
# because that's where the magic number indicating the file type is
|
36
|
+
# so making sure that it's still the same may help prevent some attacks
|
37
|
+
offsets = [0]
|
38
|
+
|
39
|
+
# we get (number_of_chunks - 1) other offsets between 0 and (filesize - length_of_chunks)
|
40
|
+
for i in 1..(number_of_chunks-1)
|
37
41
|
offsets << prng.rand(filesize - length_of_chunks)
|
38
42
|
end
|
39
43
|
|
@@ -41,24 +45,17 @@ class MammothHasher
|
|
41
45
|
# (in order to optimize the way the file will be read (in only one direction))
|
42
46
|
offsets.sort
|
43
47
|
|
44
|
-
# we
|
45
|
-
|
46
|
-
# first, we compute the hash of the first bytes of the file,
|
47
|
-
# because that's where the magic number indicating the file type is
|
48
|
-
# so making sure that it's still the same may be safer
|
49
|
-
hashes << Digest::MD5.new.hexdigest(File.read(filename, 100))
|
50
|
-
# for each offset, we compute the hash of the following bytes
|
51
|
-
# and we concatenate these hashes
|
48
|
+
# we concatenate all the bytes from all the chunks at the offsets we chose
|
49
|
+
bytes = ""
|
52
50
|
for offset in offsets
|
53
|
-
|
51
|
+
bytes = "#{bytes}#{File.read(filename, length_of_chunks, offset)}"
|
54
52
|
end
|
55
53
|
|
56
|
-
# we compute the final hash, which is the hash of the concatenation
|
57
|
-
|
58
|
-
final_hash = Digest::MD5.new.hexdigest hashes
|
54
|
+
# we compute the final hash, which is the hash of the concatenation of all chunks
|
55
|
+
hash = Digest::MD5.new.hexdigest bytes
|
59
56
|
|
60
57
|
puts (Time.now - time_start).to_s + " seconds" if debug
|
61
58
|
|
62
|
-
return
|
59
|
+
return hash
|
63
60
|
end
|
64
61
|
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'digest'
|
3
|
+
require 'mammoth-hasher'
|
4
|
+
|
5
|
+
class MammothHasherTest < Test::Unit::TestCase
|
6
|
+
# for small files, instead of using our custom algorithm,
|
7
|
+
# it's simpler to use the md5 hash directly,
|
8
|
+
# so here we test that the MammothHasher hash is the same as the MD5 hash
|
9
|
+
def test_small_file_hash
|
10
|
+
filename = "test/fixtures/small.txt"
|
11
|
+
file = File.open(filename, 'r')
|
12
|
+
assert_equal MammothHasher.hash(filename), Digest::MD5.file(file).hexdigest
|
13
|
+
end
|
14
|
+
|
15
|
+
def test_hash_size
|
16
|
+
filename = "test/fixtures/large.txt"
|
17
|
+
assert_equal MammothHasher.hash(filename).length, 32
|
18
|
+
end
|
19
|
+
|
20
|
+
def test_hash_result
|
21
|
+
filename = "test/fixtures/large.txt"
|
22
|
+
assert_equal MammothHasher.hash(filename), "d5d198a347f02adafa6e1749ad594340"
|
23
|
+
end
|
24
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mammoth-hasher
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Vincent Marquet
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-06-07 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: A library to compute fingerprints for big files, when runnning usual
|
14
14
|
algorithms as MD5 is too long.
|
@@ -17,10 +17,12 @@ executables: []
|
|
17
17
|
extensions: []
|
18
18
|
extra_rdoc_files: []
|
19
19
|
files:
|
20
|
+
- Rakefile
|
20
21
|
- lib/mammoth-hasher.rb
|
22
|
+
- test/test_mammoth-hasher.rb
|
21
23
|
homepage: http://github.com/vmarquet/ruby-mammoth-hasher
|
22
24
|
licenses:
|
23
|
-
-
|
25
|
+
- MIT
|
24
26
|
metadata: {}
|
25
27
|
post_install_message:
|
26
28
|
rdoc_options: []
|
@@ -38,8 +40,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
38
40
|
version: '0'
|
39
41
|
requirements: []
|
40
42
|
rubyforge_project:
|
41
|
-
rubygems_version: 2.4.
|
43
|
+
rubygems_version: 2.4.6
|
42
44
|
signing_key:
|
43
45
|
specification_version: 4
|
44
46
|
summary: A library to compute fingerprints for big files.
|
45
|
-
test_files:
|
47
|
+
test_files:
|
48
|
+
- test/test_mammoth-hasher.rb
|