sampling-hash 0.0.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,7 @@
1
1
  ---
2
- !binary "U0hBMQ==":
3
- metadata.gz: !binary |-
4
- MDAwYWQ2MTFmMWQ4ZTdlNDYxY2RhZDhjZGQ4M2MyODBjMTViMmJiOQ==
5
- data.tar.gz: !binary |-
6
- MTk2ZGJjOGYxYjIyM2QwZmI4ZGE1NWNiNjE3M2ZlMjcxYTU0YjVhNQ==
7
- !binary "U0hBNTEy":
8
- metadata.gz: !binary |-
9
- MjRjZDZjMGVjZTE1ZDczMDA1Y2NkYmJiMmY4NjkyODk0Y2I3MjdhMGU0ODFi
10
- ODRkMmYzMTBiNDk5ZjVlMDVmZmIwY2M2MmY2Y2IzMWZmYzBiNjY5Mjg1NzBi
11
- ZDlmZTJmMjg3YzllOGI1NTk0M2I5OTVkNGRkMTdmYzU5MTRlOTQ=
12
- data.tar.gz: !binary |-
13
- MjNmNThiZTRhNGQxNDMxOGJjOGMxZGJhYzBhNTE1NmY1MjkwNDg2NGIyYzk4
14
- ZTg5YTAzNGJlYjVkY2Y1ZDE3YzdkMzZhNmM2OTBkZDMzZWJlYmE1NjYzNjI2
15
- NjdlNDNjNWZkYWFlZTM5MjU2YjNmNzMzNzY1NjRhYTIxZDU1MTk=
2
+ SHA256:
3
+ metadata.gz: e944cda00228ac5a7405198955de90e17134f243b828daead2fab77155103030
4
+ data.tar.gz: de8b6e76b09460cfad680c842edbe195922e9db5a48de1fde050be2d30156d47
5
+ SHA512:
6
+ metadata.gz: a0a7043b089963ecdff6986b9026aef765cb12cfe0630283b8c6719385c80b7531650c8a4875b8923025b3b239ec8641d9d9bb180be971e39420f6242614402c
7
+ data.tar.gz: '0295e1b2f327a286b3bc1029dc15fd141d92f2a3c424404f9ea9534b71dfc45a3bf1d893b9c34edfc62abbd013309905b8bd460a4ec0d8b06ff3058e622db26c'
data/lib/sampling-hash.rb CHANGED
@@ -1,18 +1,31 @@
1
+ require 'sampling-hash/hash'
2
+ require 'sampling-hash/sampler'
1
3
  require 'sampling-hash/sampling-io'
2
4
  require 'sampling-hash/version'
3
5
  require 'xxhash'
4
6
 
5
7
  module SamplingHash
6
- def self.hash(path, seed)
8
+ # We default to 64 bit xxhash.
9
+ def self.hash(path, seed = File.size(path), hash = XXhash::XXhashInternal::StreamingHash64.new(seed))
7
10
  raise ArgumentError, 'file not found' unless File.file?(path)
8
11
 
9
- hash = XXhash::Internal::StreamingHash.new(seed)
10
- sio = SamplingIO.new(File.open(path, 'r'))
12
+ File.open(path, 'r') do |fd|
13
+
14
+ sio = SamplingIO.new(fd)
15
+ sio.samples do |chunk|
16
+ hash.update(chunk)
17
+ end
18
+
19
+ hash.digest
11
20
 
12
- while chunk = sio.sample
13
- hash.update(chunk)
14
21
  end
22
+ end
23
+
24
+ def self.hash32(path, seed = File.size(path))
25
+ hash path, seed, XXHash::XXhashInternal::StreamingHash32.new(seed)
26
+ end
15
27
 
16
- hash.digest
28
+ def self.hash64(path, seed = File.size(path))
29
+ hash path, seed, XXHash::XXhashInternal::StreamingHash64.new(seed)
17
30
  end
18
31
  end
@@ -0,0 +1,97 @@
1
+ module SamplingHash
2
+ class Hash
3
+ def initialize(size, seed = size, sampler = nil, xxhash = XXhash::XXhashInternal::StreamingHash64.new(seed))
4
+ @sampler = sampler || Sampler.new(size)
5
+ @xxhash = xxhash
6
+
7
+ # Position in data stream.
8
+ @position = 0
9
+
10
+ # Current sample.
11
+ @current_sample = nil # The data.
12
+ @current_sample_offset = 0 # The offset (within the stream).
13
+ @current_sample_size = 0 # The sample size.
14
+ @next = 0 # The next sample index.
15
+
16
+ # Start.
17
+ next_sample
18
+ end
19
+
20
+ def update(chunk)
21
+ pos = 0
22
+ while pos < chunk.size
23
+ len = chunk.size - pos
24
+ used = advance(chunk, pos, len)
25
+ @position += used
26
+ pos += used
27
+ end
28
+ end
29
+
30
+ def digest
31
+ @xxhash.digest
32
+ end
33
+
34
+ private
35
+
36
+ def advance(chunk, pos, len)
37
+ if in_sample?
38
+ # Use some bytes.
39
+ msb = missing_sample_bytes
40
+ if msb > len
41
+ update_sample chunk[pos..(pos + len - 1)]
42
+ len
43
+ else
44
+ finish_sample chunk[pos..(pos + msb - 1)]
45
+ msb
46
+ end
47
+ elsif samples_left?
48
+ # Discard some bytes until the next sample starts.
49
+ mgb = missing_gap_bytes
50
+ if mgb > len
51
+ len
52
+ else
53
+ mgb
54
+ end
55
+ else
56
+ # Discard the rest.
57
+ len
58
+ end
59
+ end
60
+
61
+ def in_sample?
62
+ samples_left? && @position >= @current_sample_offset && @position < @current_sample_offset + @current_sample_size
63
+ end
64
+
65
+ def samples_left?
66
+ !!@current_sample
67
+ end
68
+
69
+ def missing_sample_bytes
70
+ @current_sample_size - @current_sample.size
71
+ end
72
+
73
+ def missing_gap_bytes
74
+ @current_sample_offset - @position
75
+ end
76
+
77
+ def update_sample(data)
78
+ @current_sample += data
79
+ end
80
+
81
+ def finish_sample(data)
82
+ @current_sample += data
83
+ @xxhash.update(@current_sample)
84
+ next_sample
85
+ end
86
+
87
+ def next_sample
88
+ if @next < @sampler.samples.size
89
+ @current_sample = String.new
90
+ @current_sample_offset, @current_sample_size = @sampler.samples[@next]
91
+ @next += 1
92
+ else
93
+ @current_sample = nil
94
+ end
95
+ end
96
+ end
97
+ end
@@ -0,0 +1,60 @@
1
+ module SamplingHash
2
+ class Sampler
3
+ include Enumerable
4
+
5
+ attr_reader :samples, :size
6
+
7
+ # Calculates sample offsets.
8
+ #
9
+ # Parameters:
10
+ # - sample_size: Size of a sample (in bytes).
11
+ # - header_samples: Number of samples at front of data always to be included.
12
+ # - minimum_samples: Minimum number of samples to be included.
13
+ # - remaining_factor: If size is greater than minimum_samples * sample_size, this specifies the
14
+ # linear factor function used to determine the additional data used.
15
+ def initialize(size, sample_size = 1024, header_samples = 1000, minimum_samples = 5000, remaining_factor = 0.001)
16
+ @samples = []
17
+
18
+ minimum_sampling_size = minimum_samples * sample_size
19
+ if (size > minimum_sampling_size)
20
+ # Continuous header samples first.
21
+ header_samples.times { |i| @samples << [i * sample_size, sample_size] }
22
+
23
+ # Spread the rest.
24
+ start_offset = header_samples * sample_size
25
+ remaining_size = size - start_offset
26
+
27
+ remaining_minimum_samples = [0, minimum_samples - header_samples].max
28
+ remaining_minimum_sampling_size = remaining_minimum_samples * sample_size
29
+
30
+ remaining_additional_size = remaining_size - remaining_minimum_sampling_size
31
+ remaining_additional_sampling_size = remaining_additional_size * remaining_factor
32
+ remaining_additional_samples = (remaining_additional_sampling_size / sample_size).truncate
33
+
34
+ remaining_total_samples = remaining_minimum_samples + remaining_additional_samples
35
+ remaining_total_sampling_size = remaining_minimum_sampling_size + remaining_additional_sampling_size
36
+
37
+ remaining_unsampled_size = remaining_size - remaining_total_sampling_size
38
+ remaining_sampling_gap = (remaining_unsampled_size / remaining_total_samples).truncate
39
+
40
+ # NOTE: We can not overflow since we calculated the remaining_additional_samples with integer division.
41
+ remaining_total_samples.times do |i|
42
+ @samples << [start_offset + i * (sample_size + remaining_sampling_gap), sample_size]
43
+ end
44
+ else
45
+ total_full_samples = size / sample_size
46
+ last_sample_size = size - ((size / sample_size) * sample_size)
47
+
48
+ # Simply take them all.
49
+ total_full_samples.times { |i| @samples << [i * sample_size, sample_size] }
50
+ @samples << [total_full_samples * sample_size, last_sample_size] if last_sample_size != 0
51
+ end
52
+
53
+ @size = @samples.inject(0) { |i, v| i + v[1] }
54
+ end
55
+
56
+ def each(&block)
57
+ @samples.each(&block)
58
+ end
59
+ end
60
+ end
@@ -1,42 +1,17 @@
1
1
  module SamplingHash
2
2
  class SamplingIO
3
- def initialize(io)
3
+ def initialize(io, sampler = nil)
4
4
  raise ArgumentError, 'first parameter should be IO' unless io.kind_of?(IO)
5
5
 
6
6
  @io = io
7
- @chunk = 0
8
- end
9
-
10
- def sample
11
- return nil if @chunk > samples
12
-
13
- @io.seek(offset, IO::SEEK_SET)
14
- @chunk += 1
15
- @io.read(CHUNK_SIZE)
16
- end
17
-
18
- private
19
-
20
- CHUNK_SIZE = 256
21
-
22
- def file_size
23
- @file_size ||= @io.stat.size
24
- end
25
-
26
- def reduce
27
- (Math.log(file_size / 1000) * 1000).truncate & 0xFFFFFF00
28
- end
29
-
30
- def samples_size
31
- @samples_size ||= (file_size < 3000 ? file_size : reduce)
7
+ @sampler = sampler || Sampler.new(io.stat.size)
32
8
  end
33
9
 
34
10
  def samples
35
- samples_size / CHUNK_SIZE + 1
36
- end
37
-
38
- def offset
39
- @chunk * CHUNK_SIZE
11
+ @sampler.each do |offset, size|
12
+ @io.seek(offset, IO::SEEK_SET)
13
+ yield @io.read(size)
14
+ end
40
15
  end
41
16
  end
42
17
  end
@@ -1,3 +1,3 @@
1
1
  module SamplingHash
2
- VERSION = '0.0.1'
2
+ VERSION = '0.1.2'
3
3
  end
metadata CHANGED
@@ -1,69 +1,71 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sampling-hash
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
- - Malte Rohde
7
+ - Projective Technology GmbH
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-09-04 00:00:00.000000000 Z
11
+ date: 2021-04-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: xxhash
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ! '>='
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: 0.2.0
19
+ version: '0.3'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ! '>='
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: 0.2.0
26
+ version: '0.3'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - ! '>='
31
+ - - ">="
32
32
  - !ruby/object:Gem::Version
33
- version: '0'
33
+ version: 12.3.3
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - ! '>='
38
+ - - ">="
39
39
  - !ruby/object:Gem::Version
40
- version: '0'
40
+ version: 12.3.3
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: minitest
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - ! '>='
45
+ - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '0'
47
+ version: '5.5'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - ! '>='
52
+ - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '0'
54
+ version: '5.5'
55
55
  description: Calculates deterministic hashes from file samples
56
- email:
57
- - malte.rohde@flavoursys.com
56
+ email: technology@projective.io
58
57
  executables: []
59
58
  extensions: []
60
59
  extra_rdoc_files: []
61
60
  files:
62
61
  - lib/sampling-hash.rb
63
- - lib/sampling-hash/version.rb
62
+ - lib/sampling-hash/hash.rb
63
+ - lib/sampling-hash/sampler.rb
64
64
  - lib/sampling-hash/sampling-io.rb
65
- homepage: http://github.com/flavoursys/sampling-hash
66
- licenses: []
65
+ - lib/sampling-hash/version.rb
66
+ homepage: https://github.com/projectivetech/sampling-hash
67
+ licenses:
68
+ - MIT
67
69
  metadata: {}
68
70
  post_install_message:
69
71
  rdoc_options: []
@@ -71,17 +73,16 @@ require_paths:
71
73
  - lib
72
74
  required_ruby_version: !ruby/object:Gem::Requirement
73
75
  requirements:
74
- - - ! '>='
76
+ - - ">="
75
77
  - !ruby/object:Gem::Version
76
78
  version: '0'
77
79
  required_rubygems_version: !ruby/object:Gem::Requirement
78
80
  requirements:
79
- - - ! '>='
81
+ - - ">="
80
82
  - !ruby/object:Gem::Version
81
83
  version: '0'
82
84
  requirements: []
83
- rubyforge_project:
84
- rubygems_version: 2.0.3
85
+ rubygems_version: 3.2.5
85
86
  signing_key:
86
87
  specification_version: 4
87
88
  summary: Sampling hash algorithm for large files