sampling-hash 0.0.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +6 -14
- data/lib/sampling-hash.rb +19 -6
- data/lib/sampling-hash/hash.rb +97 -0
- data/lib/sampling-hash/sampler.rb +60 -0
- data/lib/sampling-hash/sampling-io.rb +6 -31
- data/lib/sampling-hash/version.rb +1 -1
- metadata +25 -24
checksums.yaml
CHANGED
@@ -1,15 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
metadata.gz: !binary |-
|
9
|
-
MjRjZDZjMGVjZTE1ZDczMDA1Y2NkYmJiMmY4NjkyODk0Y2I3MjdhMGU0ODFi
|
10
|
-
ODRkMmYzMTBiNDk5ZjVlMDVmZmIwY2M2MmY2Y2IzMWZmYzBiNjY5Mjg1NzBi
|
11
|
-
ZDlmZTJmMjg3YzllOGI1NTk0M2I5OTVkNGRkMTdmYzU5MTRlOTQ=
|
12
|
-
data.tar.gz: !binary |-
|
13
|
-
MjNmNThiZTRhNGQxNDMxOGJjOGMxZGJhYzBhNTE1NmY1MjkwNDg2NGIyYzk4
|
14
|
-
ZTg5YTAzNGJlYjVkY2Y1ZDE3YzdkMzZhNmM2OTBkZDMzZWJlYmE1NjYzNjI2
|
15
|
-
NjdlNDNjNWZkYWFlZTM5MjU2YjNmNzMzNzY1NjRhYTIxZDU1MTk=
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: e944cda00228ac5a7405198955de90e17134f243b828daead2fab77155103030
|
4
|
+
data.tar.gz: de8b6e76b09460cfad680c842edbe195922e9db5a48de1fde050be2d30156d47
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: a0a7043b089963ecdff6986b9026aef765cb12cfe0630283b8c6719385c80b7531650c8a4875b8923025b3b239ec8641d9d9bb180be971e39420f6242614402c
|
7
|
+
data.tar.gz: '0295e1b2f327a286b3bc1029dc15fd141d92f2a3c424404f9ea9534b71dfc45a3bf1d893b9c34edfc62abbd013309905b8bd460a4ec0d8b06ff3058e622db26c'
|
data/lib/sampling-hash.rb
CHANGED
@@ -1,18 +1,31 @@
|
|
1
|
+
require 'sampling-hash/hash'
|
2
|
+
require 'sampling-hash/sampler'
|
1
3
|
require 'sampling-hash/sampling-io'
|
2
4
|
require 'sampling-hash/version'
|
3
5
|
require 'xxhash'
|
4
6
|
|
5
7
|
module SamplingHash
  # Hashes samples of the file at +path+ instead of its whole content.
  #
  # Parameters:
  # - path: Path of the file to hash.
  # - seed: Seed for the default hash object. Defaults to the file size.
  # - hash: Object receiving the samples through #update and returning the
  #         result from #digest. Defaults to a 64 bit streaming xxhash
  #         created with +seed+.
  #
  # Returns whatever hash.digest returns.
  # Raises ArgumentError ('file not found') if +path+ is not a regular file.
  def self.hash(path, seed = nil, hash = nil)
    # The defaults are resolved after this check: File.size in a default
    # argument would raise Errno::ENOENT before the check could run.
    raise ArgumentError, 'file not found' unless File.file?(path)

    seed ||= File.size(path)
    hash ||= XXhash::XXhashInternal::StreamingHash64.new(seed)

    File.open(path, 'r') do |fd|
      SamplingIO.new(fd).samples { |chunk| hash.update(chunk) }

      hash.digest
    end
  end

  # Same as ::hash, using a 32 bit streaming xxhash.
  def self.hash32(path, seed = nil)
    seed ||= default_seed(path)
    hash(path, seed, XXhash::XXhashInternal::StreamingHash32.new(seed))
  end

  # Same as ::hash, using a 64 bit streaming xxhash.
  def self.hash64(path, seed = nil)
    seed ||= default_seed(path)
    hash(path, seed, XXhash::XXhashInternal::StreamingHash64.new(seed))
  end

  # Returns the default seed (the file size) for +path+, raising the same
  # ArgumentError as ::hash when +path+ is not a regular file.
  def self.default_seed(path)
    raise ArgumentError, 'file not found' unless File.file?(path)

    File.size(path)
  end
  private_class_method :default_seed
end
|
@@ -0,0 +1,97 @@
|
|
1
|
+
module SamplingHash
  # Push-style sampling hash: data is fed in arbitrary chunks through #update
  # and only the byte ranges listed by the sampler are forwarded to the
  # underlying hash object.
  class Hash
    # Parameters:
    # - size:    Total size of the data stream (in bytes); only used to build
    #            the default sampler.
    # - seed:    Seed for the default hash object. Defaults to +size+.
    # - sampler: Object whose #samples returns an array of [offset, size]
    #            pairs (in bytes). Defaults to Sampler.new(size).
    # - xxhash:  Object receiving complete samples through #update and
    #            returning the result from #digest.
    def initialize(size, seed = size, sampler = nil, xxhash = XXhash::XXhashInternal::StreamingHash64.new(seed))
      @sampler = sampler || Sampler.new(size)
      @xxhash = xxhash

      # Position in data stream (in bytes).
      @position = 0

      # Current sample.
      @current_sample = nil      # The data collected so far.
      @current_sample_offset = 0 # The offset (within the stream).
      @current_sample_size = 0   # The sample size.
      @next = 0                  # The next sample index.

      # Start.
      next_sample
    end

    # Feeds the next chunk of the stream. Returns nil.
    def update(chunk)
      # Sampler offsets are byte offsets, so work on a binary view of the
      # chunk; counting characters would misplace samples in multi-byte text.
      bytes = chunk.b
      total = bytes.bytesize
      pos = 0

      while pos < total
        used = advance(bytes, pos, total - pos)
        @position += used
        pos += used
      end
    end

    # Returns the digest of the underlying hash object. A sample that has
    # not been completely received yet is not part of it.
    def digest
      @xxhash.digest
    end

    private

    # Consumes up to +len+ bytes of +chunk+ starting at +pos+ and returns the
    # number of bytes consumed. Returns 0 only after moving on to the next
    # sample, so the loop in #update always makes progress.
    def advance(chunk, pos, len)
      if in_sample?
        # Use some bytes.
        take = [missing_sample_bytes, len].min
        @current_sample << chunk.byteslice(pos, take)
        finish_sample if missing_sample_bytes.zero?
        take
      elsif samples_left?
        # Discard some bytes until the next sample starts.
        gap = missing_gap_bytes
        if gap > 0
          [gap, len].min
        else
          # The sample is empty or starts behind the current position, so it
          # can never be entered. Skip it instead of looping forever.
          next_sample
          0
        end
      else
        # Discard the rest.
        len
      end
    end

    def in_sample?
      samples_left? && @position >= @current_sample_offset && @position < @current_sample_offset + @current_sample_size
    end

    def samples_left?
      !!@current_sample
    end

    def missing_sample_bytes
      @current_sample_size - @current_sample.bytesize
    end

    def missing_gap_bytes
      @current_sample_offset - @position
    end

    # Hands the completed sample to the hash object and moves on.
    def finish_sample
      @xxhash.update(@current_sample)
      next_sample
    end

    # Loads the next [offset, size] pair, or marks the samples as exhausted.
    def next_sample
      if @next < @sampler.samples.size
        @current_sample = String.new
        @current_sample_offset, @current_sample_size = @sampler.samples[@next]
        @next += 1
      else
        @current_sample = nil
      end
    end
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
module SamplingHash
  # Decides which byte ranges of a data stream are sampled.
  #
  # #samples is an array of [offset, size] pairs (in bytes); #size is the sum
  # of all sample sizes. Enumerating a Sampler yields those pairs.
  class Sampler
    include Enumerable

    attr_reader :samples, :size

    # Calculates sample offsets.
    #
    # Parameters:
    # - size:             Size of the data (in bytes).
    # - sample_size:      Size of a sample (in bytes).
    # - header_samples:   Number of samples at front of data always to be included.
    # - minimum_samples:  Minimum number of samples to be included.
    # - remaining_factor: If size is greater than minimum_samples * sample_size, this specifies the
    #                     linear factor function used to determine the additional data used.
    def initialize(size, sample_size = 1024, header_samples = 1000, minimum_samples = 5000, remaining_factor = 0.001)
      @samples =
        if size > minimum_samples * sample_size
          sparse_samples(size, sample_size, header_samples, minimum_samples, remaining_factor)
        else
          dense_samples(size, sample_size)
        end

      @size = @samples.inject(0) { |total, (_offset, length)| total + length }
    end

    def each(&block)
      @samples.each(&block)
    end

    private

    # Small data: simply take everything, as full samples plus one shorter
    # trailing sample when size is not a multiple of sample_size.
    def dense_samples(size, sample_size)
      full_count, tail_size = size.divmod(sample_size)

      samples = full_count.times.map { |i| [i * sample_size, sample_size] }
      samples << [full_count * sample_size, tail_size] unless tail_size.zero?
      samples
    end

    # Large data: continuous header samples first, then equally spaced
    # samples spread over the rest.
    def sparse_samples(size, sample_size, header_samples, minimum_samples, remaining_factor)
      # Never place header samples beyond the end of the data. This only
      # matters when header_samples exceeds minimum_samples.
      header_count = [header_samples, size / sample_size].min
      samples = header_count.times.map { |i| [i * sample_size, sample_size] }

      start_offset = header_count * sample_size
      remaining_size = size - start_offset

      remaining_minimum_samples = [0, minimum_samples - header_count].max
      remaining_minimum_sampling_size = remaining_minimum_samples * sample_size

      remaining_additional_size = remaining_size - remaining_minimum_sampling_size
      remaining_additional_sampling_size = remaining_additional_size * remaining_factor
      remaining_additional_samples = (remaining_additional_sampling_size / sample_size).truncate

      remaining_total_samples = remaining_minimum_samples + remaining_additional_samples

      # Nothing to spread: stop here rather than dividing by zero below.
      return samples unless remaining_total_samples > 0

      remaining_total_sampling_size = remaining_minimum_sampling_size + remaining_additional_sampling_size
      remaining_unsampled_size = remaining_size - remaining_total_sampling_size
      remaining_sampling_gap = (remaining_unsampled_size / remaining_total_samples).truncate

      # NOTE: We can not overflow since remaining_additional_samples and the
      # gap are both truncated (rounded down).
      stride = sample_size + remaining_sampling_gap
      remaining_total_samples.times do |i|
        samples << [start_offset + i * stride, sample_size]
      end

      samples
    end
  end
end
|
@@ -1,42 +1,17 @@
|
|
1
1
|
module SamplingHash
  # Reads the samples chosen by a sampler from a seekable IO object.
  class SamplingIO
    # Parameters:
    # - io:      The IO to read from. Raises ArgumentError if it is not an IO.
    # - sampler: Object whose #each yields [offset, size] pairs (in bytes).
    #            Defaults to a Sampler built from the size reported by io.stat.
    def initialize(io, sampler = nil)
      raise ArgumentError, 'first parameter should be IO' unless io.kind_of?(IO)

      @io = io
      @sampler = sampler || Sampler.new(io.stat.size)
    end

    # Seeks to every sample offset and yields the data read there.
    #
    # IO#read returns nil at end of file, so the block receives nil for a
    # sample that starts at or behind the end of the data.
    #
    # Returns an Enumerator over the sample data when called without a block.
    def samples
      return enum_for(:samples) unless block_given?

      @sampler.each do |offset, size|
        @io.seek(offset, IO::SEEK_SET)
        yield @io.read(size)
      end
    end
  end
end
|
metadata
CHANGED
@@ -1,69 +1,71 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sampling-hash
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
|
-
-
|
7
|
+
- Projective Technology GmbH
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-04-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: xxhash
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- -
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: 0.
|
19
|
+
version: '0.3'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- -
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 0.
|
26
|
+
version: '0.3'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rake
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- -
|
31
|
+
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
33
|
+
version: 12.3.3
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- -
|
38
|
+
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
40
|
+
version: 12.3.3
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: minitest
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- -
|
45
|
+
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
47
|
+
version: '5.5'
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- -
|
52
|
+
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
54
|
+
version: '5.5'
|
55
55
|
description: Calculates deterministic hashes from file samples
|
56
|
-
email:
|
57
|
-
- malte.rohde@flavoursys.com
|
56
|
+
email: technology@projective.io
|
58
57
|
executables: []
|
59
58
|
extensions: []
|
60
59
|
extra_rdoc_files: []
|
61
60
|
files:
|
62
61
|
- lib/sampling-hash.rb
|
63
|
-
- lib/sampling-hash/
|
62
|
+
- lib/sampling-hash/hash.rb
|
63
|
+
- lib/sampling-hash/sampler.rb
|
64
64
|
- lib/sampling-hash/sampling-io.rb
|
65
|
-
|
66
|
-
|
65
|
+
- lib/sampling-hash/version.rb
|
66
|
+
homepage: https://github.com/projectivetech/sampling-hash
|
67
|
+
licenses:
|
68
|
+
- MIT
|
67
69
|
metadata: {}
|
68
70
|
post_install_message:
|
69
71
|
rdoc_options: []
|
@@ -71,17 +73,16 @@ require_paths:
|
|
71
73
|
- lib
|
72
74
|
required_ruby_version: !ruby/object:Gem::Requirement
|
73
75
|
requirements:
|
74
|
-
- -
|
76
|
+
- - ">="
|
75
77
|
- !ruby/object:Gem::Version
|
76
78
|
version: '0'
|
77
79
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
78
80
|
requirements:
|
79
|
-
- -
|
81
|
+
- - ">="
|
80
82
|
- !ruby/object:Gem::Version
|
81
83
|
version: '0'
|
82
84
|
requirements: []
|
83
|
-
|
84
|
-
rubygems_version: 2.0.3
|
85
|
+
rubygems_version: 3.2.5
|
85
86
|
signing_key:
|
86
87
|
specification_version: 4
|
87
88
|
summary: Sampling hash algorithm for large files
|