digestif 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +5 -0
- data/Gemfile.lock +40 -0
- data/History.txt +0 -0
- data/LICENSE +19 -0
- data/README.textile +58 -0
- data/Rakefile +17 -0
- data/bin/digestif +7 -0
- data/features/basic.feature +22 -0
- data/features/fast_hash.feature +42 -0
- data/features/input.feature +33 -0
- data/features/step_definitions/digest_steps.rb +22 -0
- data/features/step_definitions/stack_trace_output_steps.rb +13 -0
- data/features/support/env.rb +3 -0
- data/lib/digestif.rb +1 -0
- data/lib/digestif/cli.rb +101 -0
- data/lib/digestif/hasher.rb +28 -0
- data/lib/digestif/version.rb +9 -0
- metadata +120 -0
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
aruba (0.3.2)
|
5
|
+
childprocess (~> 0.1.6)
|
6
|
+
cucumber (~> 0.10.0)
|
7
|
+
rspec (~> 2.3.0)
|
8
|
+
builder (2.1.2)
|
9
|
+
childprocess (0.1.6)
|
10
|
+
ffi (~> 0.6.3)
|
11
|
+
cucumber (0.10.0)
|
12
|
+
builder (>= 2.1.2)
|
13
|
+
diff-lcs (~> 1.1.2)
|
14
|
+
gherkin (~> 2.3.2)
|
15
|
+
json (~> 1.4.6)
|
16
|
+
term-ansicolor (~> 1.0.5)
|
17
|
+
diff-lcs (1.1.2)
|
18
|
+
ffi (0.6.3)
|
19
|
+
rake (>= 0.8.7)
|
20
|
+
gherkin (2.3.3)
|
21
|
+
json (~> 1.4.6)
|
22
|
+
json (1.4.6)
|
23
|
+
rake (0.8.7)
|
24
|
+
rspec (2.3.0)
|
25
|
+
rspec-core (~> 2.3.0)
|
26
|
+
rspec-expectations (~> 2.3.0)
|
27
|
+
rspec-mocks (~> 2.3.0)
|
28
|
+
rspec-core (2.3.1)
|
29
|
+
rspec-expectations (2.3.0)
|
30
|
+
diff-lcs (~> 1.1.2)
|
31
|
+
rspec-mocks (2.3.0)
|
32
|
+
term-ansicolor (1.0.5)
|
33
|
+
|
34
|
+
PLATFORMS
|
35
|
+
ruby
|
36
|
+
|
37
|
+
DEPENDENCIES
|
38
|
+
aruba
|
39
|
+
cucumber
|
40
|
+
rake
|
data/History.txt
ADDED
File without changes
|
data/LICENSE
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
Copyright (c) 2011 Andrew Roberts
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
5
|
+
in the Software without restriction, including without limitation the rights
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
8
|
+
furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in
|
11
|
+
all copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
THE SOFTWARE.
|
data/README.textile
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
h1. Digestif
|
2
|
+
|
3
|
+
An aid for creating hash digests of large files
|
4
|
+
|
5
|
+
h2. Synopsis
|
6
|
+
|
7
|
+
Digestif lets you create fast checksums of large files by
|
8
|
+
skipping sections of the file. It was created with compressed media
|
9
|
+
files in mind, which generally have such a high information density
|
10
|
+
that we can get away with a checksum that doesn't actually consider all
|
11
|
+
the bits. Someday I'd like to understand the likelihood-of-collision
|
12
|
+
implications for specific compression algorithms (mp3, h.264, xvid, et al.),
|
13
|
+
but right now I'm going to settle for guessing at where "good enough for me"
|
14
|
+
might lie.
|
15
|
+
|
16
|
+
One side-effect of this approach is that the error-detecting nature of
|
17
|
+
digests is, of course, lost. This is really more of an inescapable artifact
|
18
|
+
of the problem we're trying to solve. To create a hash of a really large
|
19
|
+
file, the biggest bottleneck with modern computers is streaming
|
20
|
+
5-10 gigs off of the disk. The actual checksumming is not hard.
|
21
|
+
By looking at less data, we speed up the hash process immensely, and
|
22
|
+
we incur the cost of vulnerability to undetected file corruption. Because the
|
23
|
+
purpose I have in mind for this tool is identity checking, not
|
24
|
+
corruption detection, this issue is not a problem for me.
|
25
|
+
|
26
|
+
h2. Installation TODO
|
27
|
+
|
28
|
+
h2. Usage
|
29
|
+
|
30
|
+
Just like md5 on the command line, but it only works on files, not on
|
31
|
+
streaming data (can't seek a stream).
|
32
|
+
|
33
|
+
<pre>
|
34
|
+
digestif some_large_file
|
35
|
+
</pre>
|
36
|
+
|
37
|
+
Since this program is designed to get around file limitations specifically, it
|
38
|
+
didn't make sense for me to invest in making streams work.
|
39
|
+
|
40
|
+
For a detailed look at the options, see
|
41
|
+
|
42
|
+
<pre>
|
43
|
+
digestif --help
|
44
|
+
</pre>
|
45
|
+
|
46
|
+
h2. Motivation
|
47
|
+
|
48
|
+
I wrote digestif to solve a problem for a media catalogue I was working on.
|
49
|
+
I wanted a filename-independent way to evaluate whether or not a file was in
|
50
|
+
the catalogue yet, but the files were so large that streaming the whole file
|
51
|
+
off of the hard drive was too slow for the response time I was hoping for.
|
52
|
+
(Interested parties, I was getting 5 gigs hashed using md5 in about 2.4
|
53
|
+
minutes.)
|
54
|
+
|
55
|
+
h2. Author
|
56
|
+
|
57
|
+
Copyright 2011 Andrew Roberts
|
58
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# Rake tasks for digestif: running the cucumber features (the default task)
# and packaging the gem described by digestif.gemspec.
require 'rubygems'
require 'cucumber'
require 'cucumber/rake/task'
require 'rake/gempackagetask'

desc 'Default: run the cucumber features.'
task :default => :features

Cucumber::Rake::Task.new(:features) do |t|
  t.cucumber_opts = "features --format pretty"
end

# Let RubyGems evaluate the gemspec instead of eval-ing a hand-built string,
# so errors inside the gemspec are reported against the gemspec file itself.
# The result is still published through the $specification global, as before.
$specification = Gem::Specification.load('digestif.gemspec')
raise "could not load digestif.gemspec" unless $specification

Rake::GemPackageTask.new($specification) do |package|
  package.need_zip = true
  package.need_tar = true
end
|
data/bin/digestif
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Feature: Basic application operation
|
2
|
+
In order to compare files based on content
|
3
|
+
As a user
|
4
|
+
I want to be told the hash digest of files
|
5
|
+
|
6
|
+
Background:
|
7
|
+
Given a file named "test_file" with:
|
8
|
+
"""
|
9
|
+
This file is a test file for md5 to hash
|
10
|
+
"""
|
11
|
+
And a file named "test_file_2" with:
|
12
|
+
"""
|
13
|
+
This is another test file.
|
14
|
+
"""
|
15
|
+
|
16
|
+
Scenario: Hashing a file
|
17
|
+
When I run "digestif -d md5 test_file"
|
18
|
+
Then the output should be a digest
|
19
|
+
|
20
|
+
Scenario: Hashing 2 files
|
21
|
+
When I run "digestif -d sha1 test_file test_file_2"
|
22
|
+
Then the output should be 2 digests
|
@@ -0,0 +1,42 @@
|
|
1
|
+
Feature: Hash files quickly
|
2
|
+
In order to hash large files quickly
|
3
|
+
As a user
|
4
|
+
I want to ensure that the hasher does not look at the whole file
|
5
|
+
|
6
|
+
Scenario: Changing a file without affecting the hash
|
7
|
+
# Given a file named "input" with:
|
8
|
+
# """
|
9
|
+
# This "feature" is really more of an inescapable artifact of the
|
10
|
+
# problem we're trying to solve. To create a hash of a really large
|
11
|
+
# file, the biggest bottleneck with modern computers is streaming
|
12
|
+
# 5-10 gigs off of the disk. The actual checksumming is not hard.
|
13
|
+
|
14
|
+
# By looking at less data, we speed up the hash process immensely, and
|
15
|
+
# we incur the cost of vulnerability of file corruption. Because the
|
16
|
+
# purpose I have in mind for this tool is identity checking, not
|
17
|
+
# corruption detection, this issue is not a problem for me.
|
18
|
+
# """
|
19
|
+
# And a file named "modified" with:
|
20
|
+
# """
|
21
|
+
# Th ea " ea mo f ne ab rt t he
|
22
|
+
# problem we're trying to solve. To create a hash of a really large
|
23
|
+
# file, the biggest bottleneck with modern computers is streaming
|
24
|
+
# 5-10 gigs off of the disk. The actual checksumming is not hard.
|
25
|
+
|
26
|
+
# By looking at less data, we speed up the hash process immensely, and
|
27
|
+
# we incur the cost of vulnerability of file corruption. Because the
|
28
|
+
# purpose I have in mind for this tool is identity checking, not
|
29
|
+
# corruption detection, this issue is not a problem for me.
|
30
|
+
# """
|
31
|
+
Given a file named "input" with:
|
32
|
+
"""
|
33
|
+
two words, and not a moment too soon
|
34
|
+
"""
|
35
|
+
And a file named "modified" with:
|
36
|
+
"""
|
37
|
+
tw0000rd0000nd0000 a0000en0000o 0000
|
38
|
+
"""
|
39
|
+
When I run "digestif -s 4 -r 2 input"
|
40
|
+
And I run "digestif -s 4 -r 2 modified"
|
41
|
+
Then the output should be 2 identical digests
|
42
|
+
|
@@ -0,0 +1,33 @@
|
|
1
|
+
Feature: Application input handling
|
2
|
+
In order to understand what was wrong with my input
|
3
|
+
As a user
|
4
|
+
I should be presented with sensible error messages
|
5
|
+
|
6
|
+
Scenario: program invoked with bad options
|
7
|
+
When I run "digestif --campari"
|
8
|
+
Then the output should not contain a stack trace
|
9
|
+
And there should be an error message
|
10
|
+
And the exit status should not be 0
|
11
|
+
|
12
|
+
When I run "digestif -d campari"
|
13
|
+
Then the output should not contain a stack trace
|
14
|
+
And there should be an error message
|
15
|
+
And the exit status should not be 0
|
16
|
+
|
17
|
+
Scenario: Program invoked on nonexistent file
|
18
|
+
When I run "digestif nonexistent_file"
|
19
|
+
Then the output should not contain a stack trace
|
20
|
+
And there should be an error message
|
21
|
+
And the exit status should not be 0
|
22
|
+
|
23
|
+
Scenario: Program invoked on existent and nonexistent files, together
|
24
|
+
Given a file named "test_file" with:
|
25
|
+
"""
|
26
|
+
test data inside
|
27
|
+
"""
|
28
|
+
When I run "digestif test_file test_file_2"
|
29
|
+
Then there should be an error message
|
30
|
+
And the output should not contain a stack trace
|
31
|
+
And the output should not contain a digest
|
32
|
+
|
33
|
+
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'aruba/api'
|
2
|
+
|
3
|
+
# Passes when the combined output is exactly +count+ lines ("a" means one),
# each of which consists solely of lowercase letters and digits.
Then /^the output should be (a|\d+) digest(?:s?)$/ do |count|
  count = 1 if count == 'a'
  count = count.to_i

  lines = all_output.split("\n")
  lines.size.should == count
  lines.each { |line| line.should match(/^[a-z0-9]+$/) }
end

# Passes when no line of the combined output looks like a bare digest.
Then /^the output should not contain a digest$/ do
  # Double quotes are required here: '\n' in single quotes is a literal
  # backslash followed by "n", so the output was never split into lines and
  # the check only worked because ^ and $ are per-line anchors in Ruby.
  all_output.split("\n").each { |l| l.should_not match(/^[a-z0-9]+$/) }
end

# Passes when the combined output is exactly +count+ lines that are all equal
# to the first line.
Then /^the output should be (\d+) identical digests$/ do |count|
  count = count.to_i

  lines = all_output.split("\n")
  lines.size.should == count
  lines.each { |line| line.should == lines[0] }
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'aruba/api'
|
2
|
+
|
3
|
+
# Passes when the combined output has no line shaped like a Ruby backtrace
# frame ("from /path/file.rb:12:in `method'").
Then /^the output should not contain a stack trace$/ do
  all_output.should_not match(/from \/.+:\d+:in `\w+'/)
end

# Passes when stderr contains a line starting with the "digestif: " prefix.
Then /^there should be an error message$/ do
  all_stderr.should match(/^digestif: /)
end

# Passes only when the combined output is the empty string.
Then /^the output should be empty$/ do
  # \A and \z anchor the whole string. The per-line anchors ^ and $ would
  # also accept non-empty output that merely contains a blank line
  # (for example "a\n\nb").
  all_output.should match(/\A\z/)
end
|
data/lib/digestif.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'digestif/cli'
|
data/lib/digestif/cli.rb
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
require 'optparse'
|
2
|
+
require 'ostruct'
|
3
|
+
require 'digestif/hasher'
|
4
|
+
require 'digestif/version'
|
5
|
+
|
6
|
+
module Digestif
  # Command-line front end for digestif: parses the option switches, checks
  # that every named file exists, and prints one digest per file.
  class CLI
    # Builds a CLI for +args+ and runs it immediately.
    #
    # @param args [Array<String>] command-line arguments (typically ARGV)
    def self.run(args)
      new(args).run
    end

    attr_accessor :args, :options

    # Stores the arguments and parses the option switches out of them.
    #
    # Parsing is destructive (OptionParser#parse!), so afterwards +args+ holds
    # only the non-option arguments, which are treated as file names.
    #
    # @param args [Array<String>] command-line arguments
    def initialize(args)
      self.args = args
      self.options = parse_options
    end

    # Prints the digest of every file named in +args+, one per line.
    #
    # All files are checked for existence before any hashing starts, so a
    # missing file ends the run (via #error) before any digest is printed.
    def run
      # validate files first - fail fast
      args.each do |file|
        # File.exist? rather than File.exists?: the plural alias was
        # deprecated and is gone from Ruby 3.2 onwards.
        error "file not found: #{file}" unless File.exist?(file)
      end

      # engage hasher
      args.each do |file|
        puts Hasher.new(file, options).digest
      end
    end

    # Parses option switches out of +args+ (in place).
    #
    # Defaults: digest :sha1, seek_size 1024, read_size 512. Unknown or
    # malformed options, and chunk sizes that could never make progress
    # through a file, are reported through #error, which exits the process.
    #
    # @return [OpenStruct] responds to +digest+, +seek_size+ and +read_size+
    def parse_options
      # defaults
      options = OpenStruct.new
      options.digest = :sha1
      options.seek_size = 1024
      options.read_size = 512

      parser = OptionParser.new do |p|
        p.banner = "Usage: digestif [options] filename"

        p.separator ""
        p.separator "Options:"
        p.separator ""

        p.on("-d", "--digest DIGEST", [:md5, :sha1],
             "Digest algorithm to use. Currently supported:",
             " md5", " sha1", ' ') do |digest|
          options.digest = digest
        end

        p.on("-r", "--read-size SIZE", Integer,
             "Size of chunk to read, in bytes " +
             "(#{options.read_size})") do |size|
          options.read_size = size
        end

        p.on("-s", "--seek-size SIZE", Integer,
             "Size of chunk to skip after each read, in bytes " +
             "(#{options.seek_size})") do |size|
          options.seek_size = size
        end

        p.separator ""
        p.separator "Common options:"
        p.separator ""

        # The description used to be a copy of the --help text.
        p.on_tail("-v", "--version", "Show version") do
          puts Digestif.version_string
          exit 0
        end

        p.on_tail("-h", "--help", "Show this message") do
          puts p
          exit 0
        end
      end

      begin
        parser.parse!(args)
      rescue OptionParser::ParseError => e
        error e
      end

      # A read size of zero (with a zero or negative skip) would never advance
      # through the file, and a negative read size makes IO#read raise; reject
      # both here so the user gets a plain message instead of a hang or trace.
      error "read size must be greater than 0" unless options.read_size > 0
      error "seek size must not be negative" if options.seek_size < 0

      options
    end

    # Writes "digestif: <message>" to stderr and exits the process.
    #
    # @param error_obj_or_str [#to_s] message, or an exception whose message
    #   should be shown
    # @param code [Integer] process exit status (defaults to 1)
    def error(error_obj_or_str, code = 1)
      # Interpolation calls #to_s, which every Object responds to, so the old
      # respond_to?/inspect fallback could never be taken.
      $stderr.puts "digestif: #{error_obj_or_str}"
      exit code
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'digest/sha1'
|
2
|
+
require 'digest/md5'
|
3
|
+
|
4
|
+
module Digestif
  # Computes a digest over a sample of a file: it alternately reads
  # +options.read_size+ bytes into the digest and skips +options.seek_size+
  # bytes, until the end of the file is reached.
  class Hasher
    attr_accessor :options, :filename

    # @param filename [String] path of the file to hash
    # @param options [#digest, #read_size, #seek_size] digest algorithm name
    #   (e.g. :sha1 or :md5) and the read/skip chunk sizes in bytes
    def initialize(filename, options)
      self.filename = filename
      self.options = options
    end

    # Hashes the sampled chunks of the file.
    #
    # @return [String] hex digest of the concatenated chunks that were read
    # @raise [ArgumentError] if read_size is not positive or seek_size is
    #   negative (such sizes could loop forever or fail inside IO#read)
    def digest
      read_size = options.read_size
      seek_size = options.seek_size
      raise ArgumentError, "read_size must be greater than 0" unless read_size > 0
      raise ArgumentError, "seek_size must not be negative" if seek_size < 0

      # :sha1 -> Digest::SHA1, :md5 -> Digest::MD5
      hasher = Digest.const_get(options.digest.to_s.upcase).new

      File.open(filename, 'rb') do |f|
        # IO#read(n) returns nil at (or beyond) end of file, so the read
        # itself is the loop condition and nil can never reach #update.
        while (chunk = f.read(read_size))
          hasher.update(chunk)
          f.seek(seek_size, IO::SEEK_CUR)
        end
      end

      hasher.hexdigest
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: digestif
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 21
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 1
|
8
|
+
- 0
|
9
|
+
- 1
|
10
|
+
version: 1.0.1
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Andrew Roberts
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2011-01-12 00:00:00 -05:00
|
19
|
+
default_executable: digestif
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: cucumber
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
version: "0"
|
33
|
+
type: :development
|
34
|
+
version_requirements: *id001
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: aruba
|
37
|
+
prerelease: false
|
38
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
hash: 3
|
44
|
+
segments:
|
45
|
+
- 0
|
46
|
+
version: "0"
|
47
|
+
type: :development
|
48
|
+
version_requirements: *id002
|
49
|
+
description: |-
|
50
|
+
Digestif lets you create fast checksums of
|
51
|
+
large files by skipping sections of the file. It was created
|
52
|
+
with compressed media files in mind, which generally have such
|
53
|
+
a high information density that we can get away with a checksum
|
54
|
+
that doesn't actually consider all the bits.
|
55
|
+
email: adroberts@gmail.com
|
56
|
+
executables:
|
57
|
+
- digestif
|
58
|
+
extensions: []
|
59
|
+
|
60
|
+
extra_rdoc_files: []
|
61
|
+
|
62
|
+
files:
|
63
|
+
- Gemfile
|
64
|
+
- Gemfile.lock
|
65
|
+
- History.txt
|
66
|
+
- LICENSE
|
67
|
+
- Rakefile
|
68
|
+
- README.textile
|
69
|
+
- lib/digestif/cli.rb
|
70
|
+
- lib/digestif/hasher.rb
|
71
|
+
- lib/digestif/version.rb
|
72
|
+
- lib/digestif.rb
|
73
|
+
- features/basic.feature
|
74
|
+
- features/fast_hash.feature
|
75
|
+
- features/input.feature
|
76
|
+
- features/step_definitions/digest_steps.rb
|
77
|
+
- features/step_definitions/stack_trace_output_steps.rb
|
78
|
+
- features/support/env.rb
|
79
|
+
- bin/digestif
|
80
|
+
has_rdoc: true
|
81
|
+
homepage: http://github.com/aroberts/digestif
|
82
|
+
licenses: []
|
83
|
+
|
84
|
+
post_install_message:
|
85
|
+
rdoc_options: []
|
86
|
+
|
87
|
+
require_paths:
|
88
|
+
- lib
|
89
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
90
|
+
none: false
|
91
|
+
requirements:
|
92
|
+
- - ">="
|
93
|
+
- !ruby/object:Gem::Version
|
94
|
+
hash: 3
|
95
|
+
segments:
|
96
|
+
- 0
|
97
|
+
version: "0"
|
98
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
99
|
+
none: false
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
hash: 3
|
104
|
+
segments:
|
105
|
+
- 0
|
106
|
+
version: "0"
|
107
|
+
requirements: []
|
108
|
+
|
109
|
+
rubyforge_project:
|
110
|
+
rubygems_version: 1.3.7
|
111
|
+
signing_key:
|
112
|
+
specification_version: 3
|
113
|
+
summary: Easy digest generation for large files
|
114
|
+
test_files:
|
115
|
+
- features/basic.feature
|
116
|
+
- features/fast_hash.feature
|
117
|
+
- features/input.feature
|
118
|
+
- features/step_definitions/digest_steps.rb
|
119
|
+
- features/step_definitions/stack_trace_output_steps.rb
|
120
|
+
- features/support/env.rb
|