RubyGems - mammoth-hasher - Versions diffs - 0.1.0 - Mend

mammoth-hasher 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 5c93a9f01f7da01607911a7f7c5df3acc1ff0957
+  data.tar.gz: 1704ea09e5aeb7422ed007826d66363c2ec6737d
+SHA512:
+  metadata.gz: b6040ec90679e24ecd5e1bbdf08a6ac719ac1a0621ec4fa8767a092b1864b448aa47a0bc52c1940fa5060f825392bc060030d25b0ff14b17c3e0ea0827729cc2
+  data.tar.gz: 9c5802cf32e2743a6f99448f62e108880c3ad1963b6052de8442b52aee504c7e02237d088879beb2819854caa0c4d0ccc305f84d5dd0e936fb4f54cd1f35614e

data/lib/mammoth-hasher.rb ADDED Viewed

@@ -0,0 +1,64 @@
require 'digest'  # provides Digest::MD5

# Computes a quick fingerprint for a file.
#
# Files of at most NUMBER_OF_CHUNKS * LENGTH_OF_CHUNKS bytes are fingerprinted
# with the MD5 of their whole content. Bigger files are only sampled: the MD5
# of the first HEADER_LENGTH bytes and of a set of pseudo-randomly placed
# chunks are concatenated, and the MD5 of that concatenation is the result.
#
# Because only samples are read, two big files that differ solely outside the
# sampled regions get the same fingerprint.
class MammothHasher
  # Algorithm parameters.
  # WARNING: changing any of them changes every fingerprint produced.
  NUMBER_OF_CHUNKS = 100
  LENGTH_OF_CHUNKS = 100
  # Size of the leading block that is always hashed (it contains the magic
  # number identifying the file type).
  HEADER_LENGTH = 100

  # Returns the fingerprint of the file at +filename+.
  #
  # NOTE: this class-level method shadows Object#hash for the class object
  # itself; the name is kept because it is the public entry point.
  #
  # @param filename [String] path of the file; it is passed through
  #   File.expand_path before use
  # @param debug [Boolean] when truthy, prints the elapsed time to stdout
  # @return [String] 32-character hexadecimal MD5 digest
  # @raise [ArgumentError] if +filename+ is nil, is not a String, or does not
  #   exist
  def self.hash(filename, debug = false)
    time_start = Time.now if debug
    raise ArgumentError, "give the filename as a parameter (got nil)" if filename.nil?
    raise ArgumentError, "filename must be a string" unless filename.is_a?(String)

    filename = File.expand_path(filename)
    raise ArgumentError, "#{filename} does not exist" unless File.exist?(filename)

    filesize = File.size(filename)
    final_hash =
      if filesize <= NUMBER_OF_CHUNKS * LENGTH_OF_CHUNKS
        # Small file: hashing everything is quicker than sampling.
        # Digest::MD5.file opens and closes the file on its own.
        Digest::MD5.file(filename).hexdigest
      else
        sampled_hash(filename, filesize)
      end

    puts "#{Time.now - time_start} seconds" if debug
    final_hash
  end

  # Fingerprints a big file from its header and pseudo-random chunks.
  #
  # @param filename [String] expanded path of an existing file
  # @param filesize [Integer] size of that file in bytes; also the PRNG seed,
  #   so the sampled offsets are reproducible for a given size
  # @return [String] 32-character hexadecimal MD5 digest
  def self.sampled_hash(filename, filesize)
    prng = Random.new(filesize)
    # The released algorithm draws NUMBER_OF_CHUNKS + 1 offsets (inclusive
    # range 0..NUMBER_OF_CHUNKS) and uses them in generation order: they were
    # never actually sorted. Both quirks are part of the fingerprint format
    # and must stay, otherwise existing fingerprints would no longer match.
    offsets = Array.new(NUMBER_OF_CHUNKS + 1) { prng.rand(filesize - LENGTH_OF_CHUNKS) }

    hashes = String.new
    # One binary-mode handle for all reads; the block form closes it even if
    # a read or a digest raises.
    File.open(filename, 'rb') do |file|
      hashes << Digest::MD5.hexdigest(file.read(HEADER_LENGTH))
      offsets.each do |offset|
        file.seek(offset)
        hashes << Digest::MD5.hexdigest(file.read(LENGTH_OF_CHUNKS))
      end
    end

    # The final fingerprint is the hash of the concatenated chunk hashes.
    Digest::MD5.hexdigest(hashes)
  end
  private_class_method :sampled_hash
end

metadata ADDED Viewed

@@ -0,0 +1,45 @@
+--- !ruby/object:Gem::Specification
+name: mammoth-hasher
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Vincent Marquet
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2015-02-21 00:00:00.000000000 Z
+dependencies: []
+description: A library to compute fingerprints for big files, when running usual
+  algorithms such as MD5 takes too long.
+email:
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/mammoth-hasher.rb
+homepage: http://github.com/vmarquet/ruby-mammoth-hasher
+licenses:
+- WTFPL
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.4.5
+signing_key:
+specification_version: 4
+summary: A library to compute fingerprints for big files.
+test_files: []