mammoth-hasher 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/mammoth-hasher.rb +64 -0
  3. metadata +45 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 5c93a9f01f7da01607911a7f7c5df3acc1ff0957
4
+ data.tar.gz: 1704ea09e5aeb7422ed007826d66363c2ec6737d
5
+ SHA512:
6
+ metadata.gz: b6040ec90679e24ecd5e1bbdf08a6ac719ac1a0621ec4fa8767a092b1864b448aa47a0bc52c1940fa5060f825392bc060030d25b0ff14b17c3e0ea0827729cc2
7
+ data.tar.gz: 9c5802cf32e2743a6f99448f62e108880c3ad1963b6052de8442b52aee504c7e02237d088879beb2819854caa0c4d0ccc305f84d5dd0e936fb4f54cd1f35614e
@@ -0,0 +1,64 @@
1
+ require 'digest' # needed for the md5 hash algorithm
2
+
3
# Computes a fast fingerprint for big files by hashing a deterministic sample
# of chunks instead of the whole content.
#
# Small files (at most NUMBER_OF_CHUNKS * LENGTH_OF_CHUNKS bytes) are hashed
# entirely with MD5. Bigger files are fingerprinted from their first
# HEADER_LENGTH bytes plus a set of chunks whose offsets are drawn from a PRNG
# seeded with the file size, so the same file always yields the same result.
#
# NOTE: this is a change-detection fingerprint, not a cryptographic digest:
# bytes outside the sampled chunks do not influence the result.
class MammothHasher
  # Algorithm parameters.
  # WARNING: changing any of them changes every resulting fingerprint.
  NUMBER_OF_CHUNKS = 100
  LENGTH_OF_CHUNKS = 100
  # Number of leading bytes that are always hashed (where the magic number
  # identifying the file type usually lives).
  HEADER_LENGTH = 100

  # Returns the fingerprint of the file at +filename+ as a 32-character
  # hexadecimal MD5 string.
  #
  # NOTE(review): defining +self.hash+ replaces the +hash+ method the class
  # object inherits from Object, so MammothHasher itself cannot be used where
  # an argument-less +hash+ call is expected (e.g. as a Hash key).
  #
  # @param filename [String] path of the file to fingerprint (expanded with
  #   File.expand_path before use)
  # @param debug [Boolean] when true, prints the elapsed time to stdout
  # @return [String] hexadecimal fingerprint
  # @raise [ArgumentError] if +filename+ is nil, is not a String, or does not
  #   exist
  def self.hash(filename, debug = false)
    time_start = Time.now if debug

    raise ArgumentError, "give the filename as a parameter (got nil)" if filename.nil?
    raise ArgumentError, "filename must be a string" unless filename.is_a?(String)

    filename = File.expand_path(filename)
    raise ArgumentError, "#{filename} does not exist" unless File.exist?(filename)

    # The file size (in bytes) selects the strategy and seeds the PRNG.
    filesize = File.size(filename)

    final_hash =
      if filesize <= NUMBER_OF_CHUNKS * LENGTH_OF_CHUNKS
        # For a small file, hashing everything is quicker than sampling.
        # Digest::MD5.file opens and closes the file itself, so no handle
        # has to be managed here.
        Digest::MD5.file(filename).hexdigest
      else
        sampled_hash(filename, filesize)
      end

    puts "#{Time.now - time_start} seconds" if debug

    final_hash
  end

  # Fingerprints a big file from its header and pseudo-randomly chosen chunks.
  # Returns the MD5 of the concatenated per-chunk MD5 hex digests.
  def self.sampled_hash(filename, filesize)
    # Seeding with the file size makes the offsets reproducible for any file
    # of that size.
    prng = Random.new(filesize)

    # NUMBER_OF_CHUNKS + 1 offsets are drawn (the historical loop ran over the
    # inclusive range 0..number_of_chunks). Each lies in
    # [0, filesize - LENGTH_OF_CHUNKS), so every chunk fits inside the file.
    # The offsets are deliberately used in generation order: the historical
    # implementation called `offsets.sort` but discarded the result, and
    # sorting now would change every fingerprint already computed.
    offsets = Array.new(NUMBER_OF_CHUNKS + 1) { prng.rand(filesize - LENGTH_OF_CHUNKS) }

    # A single binary-mode handle is reused for all reads; the block form
    # guarantees it is closed even if a read or digest raises.
    chunk_hashes = File.open(filename, 'rb') do |file|
      header_hash = Digest::MD5.hexdigest(file.read(HEADER_LENGTH))

      sampled = offsets.map do |offset|
        file.seek(offset)
        Digest::MD5.hexdigest(file.read(LENGTH_OF_CHUNKS))
      end

      [header_hash, *sampled]
    end

    Digest::MD5.hexdigest(chunk_hashes.join)
  end
  private_class_method :sampled_hash
end
metadata ADDED
@@ -0,0 +1,45 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mammoth-hasher
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Vincent Marquet
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-02-21 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: A library to compute fingerprints for big files, when running usual
14
+ algorithms such as MD5 takes too long.
15
+ email:
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/mammoth-hasher.rb
21
+ homepage: http://github.com/vmarquet/ruby-mammoth-hasher
22
+ licenses:
23
+ - WTFPL
24
+ metadata: {}
25
+ post_install_message:
26
+ rdoc_options: []
27
+ require_paths:
28
+ - lib
29
+ required_ruby_version: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ requirements: []
40
+ rubyforge_project:
41
+ rubygems_version: 2.4.5
42
+ signing_key:
43
+ specification_version: 4
44
+ summary: A library to compute fingerprints for big files.
45
+ test_files: []