sampling-hash 0.0.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,7 @@
1
1
  ---
2
- !binary "U0hBMQ==":
3
- metadata.gz: !binary |-
4
- MDAwYWQ2MTFmMWQ4ZTdlNDYxY2RhZDhjZGQ4M2MyODBjMTViMmJiOQ==
5
- data.tar.gz: !binary |-
6
- MTk2ZGJjOGYxYjIyM2QwZmI4ZGE1NWNiNjE3M2ZlMjcxYTU0YjVhNQ==
7
- !binary "U0hBNTEy":
8
- metadata.gz: !binary |-
9
- MjRjZDZjMGVjZTE1ZDczMDA1Y2NkYmJiMmY4NjkyODk0Y2I3MjdhMGU0ODFi
10
- ODRkMmYzMTBiNDk5ZjVlMDVmZmIwY2M2MmY2Y2IzMWZmYzBiNjY5Mjg1NzBi
11
- ZDlmZTJmMjg3YzllOGI1NTk0M2I5OTVkNGRkMTdmYzU5MTRlOTQ=
12
- data.tar.gz: !binary |-
13
- MjNmNThiZTRhNGQxNDMxOGJjOGMxZGJhYzBhNTE1NmY1MjkwNDg2NGIyYzk4
14
- ZTg5YTAzNGJlYjVkY2Y1ZDE3YzdkMzZhNmM2OTBkZDMzZWJlYmE1NjYzNjI2
15
- NjdlNDNjNWZkYWFlZTM5MjU2YjNmNzMzNzY1NjRhYTIxZDU1MTk=
2
+ SHA256:
3
+ metadata.gz: e944cda00228ac5a7405198955de90e17134f243b828daead2fab77155103030
4
+ data.tar.gz: de8b6e76b09460cfad680c842edbe195922e9db5a48de1fde050be2d30156d47
5
+ SHA512:
6
+ metadata.gz: a0a7043b089963ecdff6986b9026aef765cb12cfe0630283b8c6719385c80b7531650c8a4875b8923025b3b239ec8641d9d9bb180be971e39420f6242614402c
7
+ data.tar.gz: '0295e1b2f327a286b3bc1029dc15fd141d92f2a3c424404f9ea9534b71dfc45a3bf1d893b9c34edfc62abbd013309905b8bd460a4ec0d8b06ff3058e622db26c'
data/lib/sampling-hash.rb CHANGED
@@ -1,18 +1,31 @@
1
+ require 'sampling-hash/hash'
2
+ require 'sampling-hash/sampler'
1
3
  require 'sampling-hash/sampling-io'
2
4
  require 'sampling-hash/version'
3
5
  require 'xxhash'
4
6
 
5
7
  module SamplingHash
6
- def self.hash(path, seed)
8
+ # We default to 64 bit xxhash.
9
+ def self.hash(path, seed = File.size(path), hash = XXhash::XXhashInternal::StreamingHash64.new(seed))
7
10
  raise ArgumentError, 'file not found' unless File.file?(path)
8
11
 
9
- hash = XXhash::Internal::StreamingHash.new(seed)
10
- sio = SamplingIO.new(File.open(path, 'r'))
12
+ File.open(path, 'r') do |fd|
13
+
14
+ sio = SamplingIO.new(fd)
15
+ sio.samples do |chunk|
16
+ hash.update(chunk)
17
+ end
18
+
19
+ hash.digest
11
20
 
12
- while chunk = sio.sample
13
- hash.update(chunk)
14
21
  end
22
+ end
23
+
24
# Computes the sampling hash of the file at +path+ with the 32 bit
# streaming xxhash.
#
# Parameters:
# - path: Path of the file to hash.
# - seed: Seed handed to the xxhash state; defaults to the file size.
#
# Delegates to SamplingHash.hash and returns whatever it returns.
def self.hash32(path, seed = File.size(path))
  # The constant is spelled XXhash (lower-case "h"), as in SamplingHash.hash;
  # the previous XXHash spelling raised NameError.
  hash(path, seed, XXhash::XXhashInternal::StreamingHash32.new(seed))
end
15
27
 
16
- hash.digest
28
# Computes the sampling hash of the file at +path+ with the 64 bit
# streaming xxhash.
#
# Parameters:
# - path: Path of the file to hash.
# - seed: Seed handed to the xxhash state; defaults to the file size.
#
# Delegates to SamplingHash.hash and returns whatever it returns.
def self.hash64(path, seed = File.size(path))
  # The constant is spelled XXhash (lower-case "h"), as in SamplingHash.hash;
  # the previous XXHash spelling raised NameError.
  hash(path, seed, XXhash::XXhashInternal::StreamingHash64.new(seed))
end
18
31
  end
@@ -0,0 +1,97 @@
1
module SamplingHash
  # Streaming variant of the sampling hash.
  #
  # The caller pushes the complete data stream through #update in chunks of
  # any size. Only the byte ranges listed by the sampler are forwarded to the
  # underlying xxhash state; everything else is discarded.
  class Hash
    # Parameters:
    # - size: Total size of the data stream (in bytes); used to build the
    #   default sampler.
    # - seed: Seed for the default xxhash state; defaults to size.
    # - sampler: Object whose #samples returns an array of [offset, size]
    #   pairs; defaults to Sampler.new(size).
    # - xxhash: Hash state responding to #update and #digest.
    def initialize(size, seed = size, sampler = nil, xxhash = XXhash::XXhashInternal::StreamingHash64.new(seed))
      @sampler = sampler || Sampler.new(size)
      @xxhash = xxhash

      # Position in data stream (in bytes).
      @position = 0

      # Current sample.
      @current_sample = nil       # The data collected so far (nil: no samples left).
      @current_sample_offset = 0  # The offset (within the stream).
      @current_sample_size = 0    # The sample size.
      @next = 0                   # The next sample index.

      # Start.
      next_sample
    end

    # Consumes the next chunk of the data stream.
    #
    # Offsets and sizes are byte based, so the chunk is measured with
    # bytesize (not the character count) to stay correct for strings in a
    # multibyte encoding. Returns nil.
    def update(chunk)
      total = chunk.bytesize
      pos = 0
      while pos < total
        used = advance(chunk, pos, total - pos)
        @position += used
        pos += used
      end
    end

    # Returns the digest of the underlying xxhash state.
    def digest
      @xxhash.digest
    end

    private

    # Processes up to len bytes of chunk starting at byte pos and returns the
    # number of bytes consumed.
    def advance(chunk, pos, len)
      if in_sample?
        # Use some bytes.
        msb = missing_sample_bytes
        if msb > len
          update_sample bytes_of(chunk, pos, len)
          len
        else
          finish_sample bytes_of(chunk, pos, msb)
          msb
        end
      elsif samples_left?
        # Discard some bytes until the next sample starts.
        [missing_gap_bytes, len].min
      else
        # Discard the rest.
        len
      end
    end

    # Extracts len raw bytes starting at byte pos, tagged as binary so that
    # appending to the (binary) sample buffer can never hit an encoding
    # mismatch.
    def bytes_of(chunk, pos, len)
      chunk.byteslice(pos, len).b
    end

    def in_sample?
      return false unless samples_left?

      sample_end = @current_sample_offset + @current_sample_size
      @position >= @current_sample_offset && @position < sample_end
    end

    def samples_left?
      !@current_sample.nil?
    end

    def missing_sample_bytes
      @current_sample_size - @current_sample.bytesize
    end

    def missing_gap_bytes
      @current_sample_offset - @position
    end

    # Appends in place; the buffer is a private String created in
    # next_sample, so no copy per append is needed.
    def update_sample(data)
      @current_sample << data
    end

    # Completes the current sample, feeds it to xxhash and moves on.
    def finish_sample(data)
      @current_sample << data
      @xxhash.update(@current_sample)
      next_sample
    end

    # Loads the next [offset, size] pair from the sampler, or marks the
    # sample list as exhausted.
    def next_sample
      if @next < @sampler.samples.size
        @current_sample = String.new
        @current_sample_offset, @current_sample_size = @sampler.samples[@next]
        @next += 1
      else
        @current_sample = nil
      end
    end
  end
end
@@ -0,0 +1,60 @@
1
module SamplingHash
  # Calculates which byte ranges ("samples") of a data stream get hashed.
  #
  # #samples is an array of [offset, length] pairs in ascending offset order;
  # #size is the sum of all sample lengths.
  class Sampler
    include Enumerable

    attr_reader :samples, :size

    # Calculates sample offsets.
    #
    # Parameters:
    # - size:             Size of the data (in bytes).
    # - sample_size:      Size of a sample (in bytes).
    # - header_samples:   Number of samples at front of data always to be included.
    # - minimum_samples:  Minimum number of samples to be included.
    # - remaining_factor: If size is greater than minimum_samples * sample_size, this specifies the
    #                     linear factor function used to determine the additional data used.
    def initialize(size, sample_size = 1024, header_samples = 1000, minimum_samples = 5000, remaining_factor = 0.001)
      @samples = []

      if size > minimum_samples * sample_size
        add_header_samples(header_samples, sample_size)
        add_spread_samples(size, sample_size, header_samples, minimum_samples, remaining_factor)
      else
        add_all_samples(size, sample_size)
      end

      @size = @samples.inject(0) { |total, (_offset, length)| total + length }
    end

    # Iterates over the [offset, length] pairs.
    def each(&block)
      @samples.each(&block)
    end

    private

    # Continuous header samples first.
    def add_header_samples(header_samples, sample_size)
      header_samples.times { |i| @samples << [i * sample_size, sample_size] }
    end

    # Spreads the remaining samples evenly over the data behind the header.
    def add_spread_samples(size, sample_size, header_samples, minimum_samples, remaining_factor)
      start_offset = header_samples * sample_size
      remaining_size = size - start_offset

      minimum_count = [0, minimum_samples - header_samples].max
      minimum_bytes = minimum_count * sample_size

      additional_bytes = (remaining_size - minimum_bytes) * remaining_factor
      additional_count = (additional_bytes / sample_size).truncate

      total_count = minimum_count + additional_count

      # Nothing to spread (e.g. minimum_samples <= header_samples on a small
      # remainder). Without this guard the gap computation divides by zero
      # and raises FloatDomainError.
      return unless total_count > 0

      unsampled_bytes = remaining_size - (minimum_bytes + additional_bytes)
      gap = (unsampled_bytes / total_count).truncate
      stride = sample_size + gap

      # NOTE: We cannot run past the end of the data since additional_count
      # and gap are both truncated.
      total_count.times { |i| @samples << [start_offset + i * stride, sample_size] }
    end

    # Small input: simply take everything, in sample_size pieces plus a
    # shorter trailing piece when size is not a multiple of sample_size.
    def add_all_samples(size, sample_size)
      full_samples = size / sample_size
      tail_size = size - full_samples * sample_size

      full_samples.times { |i| @samples << [i * sample_size, sample_size] }
      @samples << [full_samples * sample_size, tail_size] if tail_size != 0
    end
  end
end
@@ -1,42 +1,17 @@
1
1
  module SamplingHash
2
2
  class SamplingIO
3
- def initialize(io)
3
+ def initialize(io, sampler = nil)
4
4
  raise ArgumentError, 'first parameter should be IO' unless io.kind_of?(IO)
5
5
 
6
6
  @io = io
7
- @chunk = 0
8
- end
9
-
10
- def sample
11
- return nil if @chunk > samples
12
-
13
- @io.seek(offset, IO::SEEK_SET)
14
- @chunk += 1
15
- @io.read(CHUNK_SIZE)
16
- end
17
-
18
- private
19
-
20
- CHUNK_SIZE = 256
21
-
22
- def file_size
23
- @file_size ||= @io.stat.size
24
- end
25
-
26
- def reduce
27
- (Math.log(file_size / 1000) * 1000).truncate & 0xFFFFFF00
28
- end
29
-
30
- def samples_size
31
- @samples_size ||= (file_size < 3000 ? file_size : reduce)
7
+ @sampler = sampler || Sampler.new(io.stat.size)
32
8
  end
33
9
 
34
10
  def samples
35
- samples_size / CHUNK_SIZE + 1
36
- end
37
-
38
- def offset
39
- @chunk * CHUNK_SIZE
11
+ @sampler.each do |offset, size|
12
+ @io.seek(offset, IO::SEEK_SET)
13
+ yield @io.read(size)
14
+ end
40
15
  end
41
16
  end
42
17
  end
@@ -1,3 +1,3 @@
1
1
  module SamplingHash
2
- VERSION = '0.0.1'
2
+ VERSION = '0.1.2'
3
3
  end
metadata CHANGED
@@ -1,69 +1,71 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sampling-hash
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
- - Malte Rohde
7
+ - Projective Technology GmbH
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-09-04 00:00:00.000000000 Z
11
+ date: 2021-04-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: xxhash
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ! '>='
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: 0.2.0
19
+ version: '0.3'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ! '>='
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: 0.2.0
26
+ version: '0.3'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - ! '>='
31
+ - - ">="
32
32
  - !ruby/object:Gem::Version
33
- version: '0'
33
+ version: 12.3.3
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - ! '>='
38
+ - - ">="
39
39
  - !ruby/object:Gem::Version
40
- version: '0'
40
+ version: 12.3.3
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: minitest
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - ! '>='
45
+ - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '0'
47
+ version: '5.5'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - ! '>='
52
+ - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '0'
54
+ version: '5.5'
55
55
  description: Calculates deterministic hashes from file samples
56
- email:
57
- - malte.rohde@flavoursys.com
56
+ email: technology@projective.io
58
57
  executables: []
59
58
  extensions: []
60
59
  extra_rdoc_files: []
61
60
  files:
62
61
  - lib/sampling-hash.rb
63
- - lib/sampling-hash/version.rb
62
+ - lib/sampling-hash/hash.rb
63
+ - lib/sampling-hash/sampler.rb
64
64
  - lib/sampling-hash/sampling-io.rb
65
- homepage: http://github.com/flavoursys/sampling-hash
66
- licenses: []
65
+ - lib/sampling-hash/version.rb
66
+ homepage: https://github.com/projectivetech/sampling-hash
67
+ licenses:
68
+ - MIT
67
69
  metadata: {}
68
70
  post_install_message:
69
71
  rdoc_options: []
@@ -71,17 +73,16 @@ require_paths:
71
73
  - lib
72
74
  required_ruby_version: !ruby/object:Gem::Requirement
73
75
  requirements:
74
- - - ! '>='
76
+ - - ">="
75
77
  - !ruby/object:Gem::Version
76
78
  version: '0'
77
79
  required_rubygems_version: !ruby/object:Gem::Requirement
78
80
  requirements:
79
- - - ! '>='
81
+ - - ">="
80
82
  - !ruby/object:Gem::Version
81
83
  version: '0'
82
84
  requirements: []
83
- rubyforge_project:
84
- rubygems_version: 2.0.3
85
+ rubygems_version: 3.2.5
85
86
  signing_key:
86
87
  specification_version: 4
87
88
  summary: Sampling hash algorithm for large files