mammoth-hasher 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/mammoth-hasher.rb +64 -0
  3. metadata +45 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 5c93a9f01f7da01607911a7f7c5df3acc1ff0957
4
+ data.tar.gz: 1704ea09e5aeb7422ed007826d66363c2ec6737d
5
+ SHA512:
6
+ metadata.gz: b6040ec90679e24ecd5e1bbdf08a6ac719ac1a0621ec4fa8767a092b1864b448aa47a0bc52c1940fa5060f825392bc060030d25b0ff14b17c3e0ea0827729cc2
7
+ data.tar.gz: 9c5802cf32e2743a6f99448f62e108880c3ad1963b6052de8442b52aee504c7e02237d088879beb2819854caa0c4d0ccc305f84d5dd0e936fb4f54cd1f35614e
@@ -0,0 +1,64 @@
1
+ require 'digest' # needed for the md5 hash algorithm
2
+
3
# Computes a fast, deterministic fingerprint for (potentially very large) files.
#
# Small files are fingerprinted with a plain MD5 of their whole content.
# Larger files are fingerprinted by hashing the first bytes of the file plus a
# fixed number of pseudo-randomly placed chunks, then hashing the concatenation
# of those digests. The pseudo-random generator is seeded with the file size,
# so the same file always yields the same offsets and the same fingerprint.
class MammothHasher
  # Algorithm parameters.
  # WARNING: changing any of them changes the resulting fingerprints.
  #
  # NUMBER_OF_CHUNKS + 1 offsets are drawn (the original loop ran over the
  # inclusive range 0..100, i.e. 101 iterations); that count is kept so
  # fingerprints stay identical to those produced by earlier releases.
  NUMBER_OF_CHUNKS = 100
  # Number of bytes hashed at each sampled offset.
  LENGTH_OF_CHUNKS = 100
  # Number of leading bytes hashed first; this is where a file-type magic
  # number usually lives.
  HEADER_LENGTH = 100

  # Returns the fingerprint of the file at +filename+ as an MD5 hex string.
  #
  # NOTE: this class method shadows the +hash+ method every object (including
  # this class object) inherits from Object; the name is kept because it is the
  # public interface of the gem.
  #
  # @param filename [String] path to the file (expanded with File.expand_path)
  # @param debug [Boolean] when truthy, prints the elapsed time to stdout
  # @return [String] 32-character hexadecimal digest
  # @raise [ArgumentError] if +filename+ is nil, is not a String, or does not exist
  def self.hash(filename, debug = false)
    started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC) if debug

    raise ArgumentError, "give the filename as a parameter (got nil)" if filename.nil?
    raise ArgumentError, "filename must be a string" unless filename.is_a?(String)

    filename = File.expand_path(filename)
    raise ArgumentError, "#{filename} does not exist" unless File.exist?(filename)

    # The file size (in bytes) doubles as the seed of the PRNG.
    filesize = File.size(filename)

    final_hash =
      if filesize <= NUMBER_OF_CHUNKS * LENGTH_OF_CHUNKS
        # For a small file, hashing everything is quicker than sampling.
        # Digest opens and closes the file itself, so no handle can leak.
        Digest::MD5.file(filename).hexdigest
      else
        sampled_fingerprint(filename, filesize)
      end

    puts "#{Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at} seconds" if debug

    final_hash
  end

  # Hashes the header and the pseudo-randomly sampled chunks of a big file and
  # returns the MD5 hex digest of the concatenated per-chunk hex digests.
  def self.sampled_fingerprint(filename, filesize)
    prng = Random.new(filesize)
    offsets = Array.new(NUMBER_OF_CHUNKS + 1) { prng.rand(filesize - LENGTH_OF_CHUNKS) }

    # The file is opened once (binary mode) instead of once per chunk, and the
    # block form guarantees the handle is closed even if a read fails.
    File.open(filename, 'rb') do |io|
      header_digest = Digest::MD5.hexdigest(io.read(HEADER_LENGTH))

      # Chunks are *read* in ascending offset order so the file is traversed
      # in one direction only, but their digests are stored at the position
      # of the draw. The original code called `offsets.sort` and discarded
      # the result, so digests have always been concatenated in draw order;
      # keeping that order keeps existing fingerprints valid.
      chunk_digests = Array.new(offsets.size)
      offsets.each_with_index.sort.each do |offset, index|
        io.seek(offset)
        chunk_digests[index] = Digest::MD5.hexdigest(io.read(LENGTH_OF_CHUNKS))
      end

      Digest::MD5.hexdigest(header_digest + chunk_digests.join)
    end
  end
  private_class_method :sampled_fingerprint
end
metadata ADDED
@@ -0,0 +1,45 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mammoth-hasher
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Vincent Marquet
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-02-21 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: A library to compute fingerprints for big files, when running usual
14
+ algorithms such as MD5 takes too long.
15
+ email:
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/mammoth-hasher.rb
21
+ homepage: http://github.com/vmarquet/ruby-mammoth-hasher
22
+ licenses:
23
+ - WTFPL
24
+ metadata: {}
25
+ post_install_message:
26
+ rdoc_options: []
27
+ require_paths:
28
+ - lib
29
+ required_ruby_version: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ requirements: []
40
+ rubyforge_project:
41
+ rubygems_version: 2.4.5
42
+ signing_key:
43
+ specification_version: 4
44
+ summary: A library to compute fingerprints for big files.
45
+ test_files: []