sampling-hash 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ MDAwYWQ2MTFmMWQ4ZTdlNDYxY2RhZDhjZGQ4M2MyODBjMTViMmJiOQ==
5
+ data.tar.gz: !binary |-
6
+ MTk2ZGJjOGYxYjIyM2QwZmI4ZGE1NWNiNjE3M2ZlMjcxYTU0YjVhNQ==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ MjRjZDZjMGVjZTE1ZDczMDA1Y2NkYmJiMmY4NjkyODk0Y2I3MjdhMGU0ODFi
10
+ ODRkMmYzMTBiNDk5ZjVlMDVmZmIwY2M2MmY2Y2IzMWZmYzBiNjY5Mjg1NzBi
11
+ ZDlmZTJmMjg3YzllOGI1NTk0M2I5OTVkNGRkMTdmYzU5MTRlOTQ=
12
+ data.tar.gz: !binary |-
13
+ MjNmNThiZTRhNGQxNDMxOGJjOGMxZGJhYzBhNTE1NmY1MjkwNDg2NGIyYzk4
14
+ ZTg5YTAzNGJlYjVkY2Y1ZDE3YzdkMzZhNmM2OTBkZDMzZWJlYmE1NjYzNjI2
15
+ NjdlNDNjNWZkYWFlZTM5MjU2YjNmNzMzNzY1NjRhYTIxZDU1MTk=
@@ -0,0 +1,42 @@
1
module SamplingHash
  # Hands out successive fixed-size chunks of an IO, stopping once a
  # size-dependent chunk budget has been used up.
  #
  # Chunk k is read from byte offset k * CHUNK_SIZE, so the chunks returned
  # are consecutive and together form a prefix of the underlying stream.
  # NOTE(review): the gem describes itself as a *sampling* hash; confirm that
  # reading only a contiguous prefix (rather than spread-out offsets) is
  # intended.
  class SamplingIO
    # Number of bytes requested from the IO per sample.
    CHUNK_SIZE = 256

    # @param io [IO] stream to read from; must support #seek, #read and #stat
    # @raise [ArgumentError] if +io+ is not an IO instance
    def initialize(io)
      unless io.is_a?(IO)
        raise ArgumentError, 'first parameter should be IO'
      end

      @io = io
      @chunk = 0
    end

    # Returns the next chunk, or nil when there is nothing more to hand out.
    #
    # nil is returned either because the chunk budget is exhausted or because
    # the read itself hit end-of-file (IO#read with a length returns nil
    # there).
    #
    # @return [String, nil] up to CHUNK_SIZE bytes, or nil
    def sample
      if @chunk > chunk_budget
        nil
      else
        @io.seek(@chunk * CHUNK_SIZE, IO::SEEK_SET)
        @chunk += 1
        @io.read(CHUNK_SIZE)
      end
    end

    private

    # Size of the underlying file in bytes; looked up once, then cached.
    def byte_size
      @file_size ||= @io.stat.size
    end

    # Logarithmically scaled byte budget used for files of 3000 bytes or
    # more: ln(size / 1000) * 1000, truncated, with the low 8 bits cleared.
    def scaled_budget
      kilo_units = byte_size / 1000 # integer division
      scaled = (Math.log(kilo_units) * 1000).to_i
      scaled & 0xFFFFFF00
    end

    # Byte budget for sampling: the whole file below 3000 bytes, otherwise
    # the logarithmically scaled value. Cached after the first call.
    def byte_budget
      @samples_size ||= begin
        size = byte_size
        if size < 3000
          size
        else
          scaled_budget
        end
      end
    end

    # Highest chunk index that #sample will still attempt to read.
    def chunk_budget
      1 + byte_budget / CHUNK_SIZE
    end
  end
end
@@ -0,0 +1,3 @@
1
module SamplingHash
  # Gem version string; frozen so the shared constant cannot be mutated in
  # place by callers.
  VERSION = '0.0.1'.freeze
end
@@ -0,0 +1,18 @@
1
+ require 'sampling-hash/sampling-io'
2
+ require 'sampling-hash/version'
3
+ require 'xxhash'
4
+
5
module SamplingHash
  # Computes an xxHash digest over the chunks that SamplingIO yields for the
  # file at +path+.
  #
  # NOTE(review): defining a singleton +hash+ taking two required arguments
  # replaces the module object's inherited zero-argument #hash; kept as-is
  # because it is the gem's public entry point.
  #
  # @param path [String] path to an existing regular file
  # @param seed seed passed through to XXhash::Internal::StreamingHash.new
  # @return the value of StreamingHash#digest after all chunks were fed in
  # @raise [ArgumentError] if +path+ does not refer to a regular file
  def self.hash(path, seed)
    raise ArgumentError, 'file not found' unless File.file?(path)

    hash = XXhash::Internal::StreamingHash.new(seed)

    # Block form guarantees the file descriptor is closed when hashing
    # finishes or raises; the original left the File open until GC.
    File.open(path, 'r') do |file|
      sio = SamplingIO.new(file)

      while (chunk = sio.sample)
        hash.update(chunk)
      end
    end

    hash.digest
  end
end
metadata ADDED
@@ -0,0 +1,88 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sampling-hash
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Malte Rohde
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-09-04 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: xxhash
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ! '>='
18
+ - !ruby/object:Gem::Version
19
+ version: 0.2.0
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ! '>='
25
+ - !ruby/object:Gem::Version
26
+ version: 0.2.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ! '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: minitest
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ! '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description: Calculates deterministic hashes from file samples
56
+ email:
57
+ - malte.rohde@flavoursys.com
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - lib/sampling-hash.rb
63
+ - lib/sampling-hash/version.rb
64
+ - lib/sampling-hash/sampling-io.rb
65
+ homepage: http://github.com/flavoursys/sampling-hash
66
+ licenses: []
67
+ metadata: {}
68
+ post_install_message:
69
+ rdoc_options: []
70
+ require_paths:
71
+ - lib
72
+ required_ruby_version: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - ! '>='
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
77
+ required_rubygems_version: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - ! '>='
80
+ - !ruby/object:Gem::Version
81
+ version: '0'
82
+ requirements: []
83
+ rubyforge_project:
84
+ rubygems_version: 2.0.3
85
+ signing_key:
86
+ specification_version: 4
87
+ summary: Sampling hash algorithm for large files
88
+ test_files: []