sampling-hash 0.0.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +6 -14
- data/lib/sampling-hash.rb +19 -6
- data/lib/sampling-hash/hash.rb +97 -0
- data/lib/sampling-hash/sampler.rb +60 -0
- data/lib/sampling-hash/sampling-io.rb +6 -31
- data/lib/sampling-hash/version.rb +1 -1
- metadata +25 -24
    
        checksums.yaml
    CHANGED
    
    | @@ -1,15 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 | 
            -
             | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
             | 
| 5 | 
            -
             | 
| 6 | 
            -
             | 
| 7 | 
            -
             | 
| 8 | 
            -
              metadata.gz: !binary |-
         | 
| 9 | 
            -
                MjRjZDZjMGVjZTE1ZDczMDA1Y2NkYmJiMmY4NjkyODk0Y2I3MjdhMGU0ODFi
         | 
| 10 | 
            -
                ODRkMmYzMTBiNDk5ZjVlMDVmZmIwY2M2MmY2Y2IzMWZmYzBiNjY5Mjg1NzBi
         | 
| 11 | 
            -
                ZDlmZTJmMjg3YzllOGI1NTk0M2I5OTVkNGRkMTdmYzU5MTRlOTQ=
         | 
| 12 | 
            -
              data.tar.gz: !binary |-
         | 
| 13 | 
            -
                MjNmNThiZTRhNGQxNDMxOGJjOGMxZGJhYzBhNTE1NmY1MjkwNDg2NGIyYzk4
         | 
| 14 | 
            -
                ZTg5YTAzNGJlYjVkY2Y1ZDE3YzdkMzZhNmM2OTBkZDMzZWJlYmE1NjYzNjI2
         | 
| 15 | 
            -
                NjdlNDNjNWZkYWFlZTM5MjU2YjNmNzMzNzY1NjRhYTIxZDU1MTk=
         | 
| 2 | 
            +
            SHA256:
         | 
| 3 | 
            +
              metadata.gz: e944cda00228ac5a7405198955de90e17134f243b828daead2fab77155103030
         | 
| 4 | 
            +
              data.tar.gz: de8b6e76b09460cfad680c842edbe195922e9db5a48de1fde050be2d30156d47
         | 
| 5 | 
            +
            SHA512:
         | 
| 6 | 
            +
              metadata.gz: a0a7043b089963ecdff6986b9026aef765cb12cfe0630283b8c6719385c80b7531650c8a4875b8923025b3b239ec8641d9d9bb180be971e39420f6242614402c
         | 
| 7 | 
            +
              data.tar.gz: '0295e1b2f327a286b3bc1029dc15fd141d92f2a3c424404f9ea9534b71dfc45a3bf1d893b9c34edfc62abbd013309905b8bd460a4ec0d8b06ff3058e622db26c'
         | 
    
        data/lib/sampling-hash.rb
    CHANGED
    
    | @@ -1,18 +1,31 @@ | |
| 1 | 
            +
            require 'sampling-hash/hash'
         | 
| 2 | 
            +
            require 'sampling-hash/sampler'
         | 
| 1 3 | 
             
            require 'sampling-hash/sampling-io'
         | 
| 2 4 | 
             
            require 'sampling-hash/version'
         | 
| 3 5 | 
             
            require 'xxhash'
         | 
| 4 6 |  | 
| 5 7 | 
             
            module SamplingHash
              # Hashes samples of the file at +path+ instead of its full content.
              #
              # We default to 64 bit xxhash.
              #
              # @param path [String] path of the file to hash.
              # @param seed [Integer, nil] seed for the default hasher; the file size
              #   is used when nil.
              # @param hash [#update, #digest, nil] hasher to feed; a 64 bit streaming
              #   xxhash seeded with +seed+ is created when nil.
              # @return whatever the hasher's +digest+ returns.
              # @raise [ArgumentError] if +path+ is not a regular file.
              def self.hash(path, seed = nil, hash = nil)
                digest_file(path, seed) do |effective_seed|
                  hash || XXhash::XXhashInternal::StreamingHash64.new(effective_seed)
                end
              end

              # Same as +hash+, but always uses the 32 bit streaming xxhash.
              #
              # @param path [String] path of the file to hash.
              # @param seed [Integer, nil] hasher seed; the file size is used when nil.
              # @raise [ArgumentError] if +path+ is not a regular file.
              def self.hash32(path, seed = nil)
                digest_file(path, seed) { |effective_seed| XXhash::XXhashInternal::StreamingHash32.new(effective_seed) }
              end

              # Same as +hash+, but always uses the 64 bit streaming xxhash.
              #
              # @param path [String] path of the file to hash.
              # @param seed [Integer, nil] hasher seed; the file size is used when nil.
              # @raise [ArgumentError] if +path+ is not a regular file.
              def self.hash64(path, seed = nil)
                digest_file(path, seed) { |effective_seed| XXhash::XXhashInternal::StreamingHash64.new(effective_seed) }
              end

              # Shared implementation: validates +path+, resolves the seed, obtains the
              # hasher from the given block and feeds it every sample of the file.
              def self.digest_file(path, seed)
                # The check must run before File.size is consulted, otherwise a missing
                # file surfaces as Errno::ENOENT instead of the ArgumentError below.
                raise ArgumentError, 'file not found' unless File.file?(path)

                hasher = yield(seed || File.size(path))

                File.open(path, 'r') do |fd|
                  SamplingIO.new(fd).samples { |chunk| hasher.update(chunk) }
                  hasher.digest
                end
              end
              private_class_method :digest_file
            end
         | 
| @@ -0,0 +1,97 @@ | |
| 1 | 
            +
            module SamplingHash
              # Streaming front end for the sampling hash: data is pushed in arbitrary
              # chunks via +update+, and only the byte ranges named by the sampler are
              # forwarded to the underlying hasher.
              class Hash
                # @param size [Integer] total size of the data stream; passed to the
                #   default Sampler and used as the default seed.
                # @param seed [Integer] seed for the default hasher.
                # @param sampler [#samples, nil] object whose +samples+ returns
                #   [offset, size] pairs; a Sampler for +size+ is built when nil.
                # @param xxhash [#update, #digest] hasher receiving the completed samples.
                def initialize(size, seed = size, sampler = nil, xxhash = XXhash::XXhashInternal::StreamingHash64.new(seed))
                  @sampler = sampler || Sampler.new(size)
                  @xxhash = xxhash

                  # Position in data stream (in bytes).
                  @position = 0

                  # Current sample.
                  @current_sample        = nil # The data collected so far.
                  @current_sample_offset = 0   # The offset (within the stream).
                  @current_sample_size   = 0   # The sample size.
                  @next                  = 0   # The next sample index.

                  # Start.
                  next_sample
                end

                # Consumes the next chunk of the stream.
                #
                # Sample offsets and sizes are byte counts, so the chunk is walked by
                # bytes (bytesize/byteslice) rather than by characters; otherwise
                # multibyte strings would put the wrong bytes into the samples.
                #
                # @param chunk [String] the next piece of the stream.
                # @return [nil]
                def update(chunk)
                  total = chunk.bytesize
                  pos = 0
                  while pos < total
                    used = advance(chunk, pos, total - pos)
                    @position += used
                    pos += used
                  end
                end

                # @return the digest of the underlying hasher.
                def digest
                  @xxhash.digest
                end

              private

                # Processes up to +len+ bytes of +chunk+ starting at byte +pos+ and
                # returns how many bytes were consumed.
                def advance(chunk, pos, len)
                  if in_sample?
                    # Use some bytes: as many as the sample still needs, at most len.
                    take = [missing_sample_bytes, len].min
                    # Append as binary so chunks of differing encodings never clash.
                    @current_sample << chunk.byteslice(pos, take).force_encoding(Encoding::BINARY)
                    finish_sample if missing_sample_bytes.zero?
                    take
                  elsif samples_left?
                    # Discard some bytes until the next sample starts.
                    [missing_gap_bytes, len].min
                  else
                    # Discard the rest.
                    len
                  end
                end

                # True while the stream position lies inside the current sample's range.
                def in_sample?
                  samples_left? && @position >= @current_sample_offset && @position < @current_sample_offset + @current_sample_size
                end

                # True until the sampler's list of samples is exhausted.
                def samples_left?
                  !!@current_sample
                end

                # Bytes still needed to complete the current sample.
                def missing_sample_bytes
                  @current_sample_size - @current_sample.bytesize
                end

                # Bytes between the stream position and the start of the current sample.
                def missing_gap_bytes
                  @current_sample_offset - @position
                end

                # Hands the completed sample to the hasher and moves on to the next one.
                def finish_sample
                  @xxhash.update(@current_sample)
                  next_sample
                end

                # Loads the next [offset, size] pair, or marks the samples as exhausted.
                def next_sample
                  if @next < @sampler.samples.size
                    @current_sample = String.new
                    @current_sample_offset, @current_sample_size = @sampler.samples[@next]
                    @next += 1
                  else
                    @current_sample = nil
                  end
                end
              end
            end
         | 
| @@ -0,0 +1,60 @@ | |
| 1 | 
            +
            module SamplingHash
              # Computes which [offset, size] byte ranges of a data stream of a given
              # size are to be sampled.
              class Sampler
                include Enumerable

                # samples: Array of [offset, size] pairs, in ascending offset order.
                # size:    Sum of all sample sizes (in bytes).
                attr_reader :samples, :size

                # Calculates sample offsets.
                #
                # Parameters:
                # - size: Total size of the data (in bytes).
                # - sample_size: Size of a sample (in bytes).
                # - header_samples: Number of samples at front of data always to be included.
                # - minimum_samples: Minimum number of samples to be included.
                # - remaining_factor: If size is greater than minimum_samples * sample_size, this specifies the
                #              linear factor function used to determine the additional data used.
                def initialize(size, sample_size = 1024, header_samples = 1000, minimum_samples = 5000, remaining_factor = 0.001)
                  @samples = []

                  if size > minimum_samples * sample_size
                    add_spread_samples(size, sample_size, header_samples, minimum_samples, remaining_factor)
                  else
                    add_all_samples(size, sample_size)
                  end

                  @size = @samples.sum { |_offset, length| length }
                end

                # Yields each [offset, size] pair.
                def each(&block)
                  @samples.each(&block)
                end

              private

                # Data is small enough: simply take everything, in sample_size pieces
                # plus one shorter trailing piece if the size is not a multiple.
                def add_all_samples(size, sample_size)
                  total_full_samples = size / sample_size
                  last_sample_size   = size - (total_full_samples * sample_size)

                  total_full_samples.times { |i| @samples << [i * sample_size, sample_size] }
                  @samples << [total_full_samples * sample_size, last_sample_size] if last_sample_size != 0
                end

                # Data is large: take the continuous header samples, then spread the
                # remaining samples evenly over the rest of the data.
                def add_spread_samples(size, sample_size, header_samples, minimum_samples, remaining_factor)
                  # Continuous header samples first. Never place header samples past the
                  # end of the data (possible when header_samples > minimum_samples).
                  header_count = [header_samples, size / sample_size].min
                  header_count.times { |i| @samples << [i * sample_size, sample_size] }

                  # Spread the rest.
                  start_offset                       = header_count * sample_size
                  remaining_size                     = size - start_offset

                  remaining_minimum_samples          = [0, minimum_samples - header_count].max
                  remaining_minimum_sampling_size    = remaining_minimum_samples * sample_size

                  remaining_additional_size          = remaining_size - remaining_minimum_sampling_size
                  remaining_additional_sampling_size = remaining_additional_size * remaining_factor
                  remaining_additional_samples       = (remaining_additional_sampling_size / sample_size).truncate

                  remaining_total_samples            = remaining_minimum_samples + remaining_additional_samples

                  # Nothing left to spread; bail out before dividing by zero below.
                  return unless remaining_total_samples > 0

                  remaining_total_sampling_size      = remaining_minimum_sampling_size + remaining_additional_sampling_size

                  remaining_unsampled_size           = remaining_size - remaining_total_sampling_size
                  remaining_sampling_gap             = (remaining_unsampled_size / remaining_total_samples).truncate

                  # NOTE: We cannot run past the end of the data, since
                  # remaining_additional_samples was truncated to a whole number of samples.
                  remaining_total_samples.times do |i|
                    @samples << [start_offset + i * (sample_size + remaining_sampling_gap), sample_size]
                  end
                end
              end
            end
         | 
| @@ -1,42 +1,17 @@ | |
| 1 1 | 
             
            module SamplingHash
              # Reads the sampled byte ranges of an IO object.
              class SamplingIO
                # @param io [IO] the stream to read from; it is repositioned with seek
                #   for every sample.
                # @param sampler [#each, nil] yields [offset, size] pairs; when nil a
                #   Sampler for the size reported by io.stat is created.
                # @raise [ArgumentError] if +io+ is not an IO.
                def initialize(io, sampler = nil)
                  raise ArgumentError, 'first parameter should be IO' unless io.kind_of?(IO)

                  @io = io
                  @sampler = sampler || Sampler.new(io.stat.size)
                end

                # Seeks to every sample offset in turn and yields the data read there.
                #
                # NOTE(review): IO#read presumably yields nil when an offset lies at or
                # past the end of the stream - confirm that callers tolerate this.
                #
                # @yieldparam chunk [String] the data read for one sample.
                # @return [Enumerator] over the chunks when no block is given; otherwise
                #   whatever the sampler's +each+ returns.
                def samples
                  return enum_for(:samples) unless block_given?

                  @sampler.each do |offset, size|
                    @io.seek(offset, IO::SEEK_SET)
                    yield @io.read(size)
                  end
                end
              end
            end
         | 
    
        metadata
    CHANGED
    
    | @@ -1,69 +1,71 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: sampling-hash
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0. | 
| 4 | 
            +
              version: 0.1.2
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 | 
            -
            -  | 
| 7 | 
            +
            - Projective Technology GmbH
         | 
| 8 8 | 
             
            autorequire: 
         | 
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date:  | 
| 11 | 
            +
            date: 2021-04-15 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: xxhash
         | 
| 15 15 | 
             
              requirement: !ruby/object:Gem::Requirement
         | 
| 16 16 | 
             
                requirements:
         | 
| 17 | 
            -
                - -  | 
| 17 | 
            +
                - - "~>"
         | 
| 18 18 | 
             
                  - !ruby/object:Gem::Version
         | 
| 19 | 
            -
                    version: 0. | 
| 19 | 
            +
                    version: '0.3'
         | 
| 20 20 | 
             
              type: :runtime
         | 
| 21 21 | 
             
              prerelease: false
         | 
| 22 22 | 
             
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 23 23 | 
             
                requirements:
         | 
| 24 | 
            -
                - -  | 
| 24 | 
            +
                - - "~>"
         | 
| 25 25 | 
             
                  - !ruby/object:Gem::Version
         | 
| 26 | 
            -
                    version: 0. | 
| 26 | 
            +
                    version: '0.3'
         | 
| 27 27 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 28 28 | 
             
              name: rake
         | 
| 29 29 | 
             
              requirement: !ruby/object:Gem::Requirement
         | 
| 30 30 | 
             
                requirements:
         | 
| 31 | 
            -
                - -  | 
| 31 | 
            +
                - - ">="
         | 
| 32 32 | 
             
                  - !ruby/object:Gem::Version
         | 
| 33 | 
            -
                    version:  | 
| 33 | 
            +
                    version: 12.3.3
         | 
| 34 34 | 
             
              type: :development
         | 
| 35 35 | 
             
              prerelease: false
         | 
| 36 36 | 
             
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 37 37 | 
             
                requirements:
         | 
| 38 | 
            -
                - -  | 
| 38 | 
            +
                - - ">="
         | 
| 39 39 | 
             
                  - !ruby/object:Gem::Version
         | 
| 40 | 
            -
                    version:  | 
| 40 | 
            +
                    version: 12.3.3
         | 
| 41 41 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 42 42 | 
             
              name: minitest
         | 
| 43 43 | 
             
              requirement: !ruby/object:Gem::Requirement
         | 
| 44 44 | 
             
                requirements:
         | 
| 45 | 
            -
                - -  | 
| 45 | 
            +
                - - "~>"
         | 
| 46 46 | 
             
                  - !ruby/object:Gem::Version
         | 
| 47 | 
            -
                    version: ' | 
| 47 | 
            +
                    version: '5.5'
         | 
| 48 48 | 
             
              type: :development
         | 
| 49 49 | 
             
              prerelease: false
         | 
| 50 50 | 
             
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 51 51 | 
             
                requirements:
         | 
| 52 | 
            -
                - -  | 
| 52 | 
            +
                - - "~>"
         | 
| 53 53 | 
             
                  - !ruby/object:Gem::Version
         | 
| 54 | 
            -
                    version: ' | 
| 54 | 
            +
                    version: '5.5'
         | 
| 55 55 | 
             
            description: Calculates deterministic hashes from file samples
         | 
| 56 | 
            -
            email:
         | 
| 57 | 
            -
            - malte.rohde@flavoursys.com
         | 
| 56 | 
            +
            email: technology@projective.io
         | 
| 58 57 | 
             
            executables: []
         | 
| 59 58 | 
             
            extensions: []
         | 
| 60 59 | 
             
            extra_rdoc_files: []
         | 
| 61 60 | 
             
            files:
         | 
| 62 61 | 
             
            - lib/sampling-hash.rb
         | 
| 63 | 
            -
            - lib/sampling-hash/ | 
| 62 | 
            +
            - lib/sampling-hash/hash.rb
         | 
| 63 | 
            +
            - lib/sampling-hash/sampler.rb
         | 
| 64 64 | 
             
            - lib/sampling-hash/sampling-io.rb
         | 
| 65 | 
            -
             | 
| 66 | 
            -
             | 
| 65 | 
            +
            - lib/sampling-hash/version.rb
         | 
| 66 | 
            +
            homepage: https://github.com/projectivetech/sampling-hash
         | 
| 67 | 
            +
            licenses:
         | 
| 68 | 
            +
            - MIT
         | 
| 67 69 | 
             
            metadata: {}
         | 
| 68 70 | 
             
            post_install_message: 
         | 
| 69 71 | 
             
            rdoc_options: []
         | 
| @@ -71,17 +73,16 @@ require_paths: | |
| 71 73 | 
             
            - lib
         | 
| 72 74 | 
             
            required_ruby_version: !ruby/object:Gem::Requirement
         | 
| 73 75 | 
             
              requirements:
         | 
| 74 | 
            -
              - -  | 
| 76 | 
            +
              - - ">="
         | 
| 75 77 | 
             
                - !ruby/object:Gem::Version
         | 
| 76 78 | 
             
                  version: '0'
         | 
| 77 79 | 
             
            required_rubygems_version: !ruby/object:Gem::Requirement
         | 
| 78 80 | 
             
              requirements:
         | 
| 79 | 
            -
              - -  | 
| 81 | 
            +
              - - ">="
         | 
| 80 82 | 
             
                - !ruby/object:Gem::Version
         | 
| 81 83 | 
             
                  version: '0'
         | 
| 82 84 | 
             
            requirements: []
         | 
| 83 | 
            -
             | 
| 84 | 
            -
            rubygems_version: 2.0.3
         | 
| 85 | 
            +
            rubygems_version: 3.2.5
         | 
| 85 86 | 
             
            signing_key: 
         | 
| 86 87 | 
             
            specification_version: 4
         | 
| 87 88 | 
             
            summary: Sampling hash algorithm for large files
         |