dupe-magick 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Tim Koopmans
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,17 @@
1
+ = dupe-magick
2
+
3
+ Description goes here.
4
+
5
+ == Note on Patches/Pull Requests
6
+
7
+ * Fork the project.
8
+ * Make your feature addition or bug fix.
9
+ * Add tests for it. This is important so I don't break it in a
10
+ future version unintentionally.
11
+ * Commit, do not mess with rakefile, version, or history.
12
+ (If you want to have your own version, that is fine, but bump the version in a separate commit so that I can ignore it when I pull.)
13
+ * Send me a pull request. Bonus points for topic branches.
14
+
15
+ == Copyright
16
+
17
+ Copyright (c) 2010 Tim Koopmans. See LICENSE for details.
@@ -0,0 +1,53 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
# Packaging tasks. Jeweler is optional: without it the gem-building tasks are
# simply not defined and a hint is printed instead.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "dupe-magick"
    gem.summary = %Q{Detect image duplicates with RMagick}
    gem.description = %Q{If you need to look for duplicates which can't otherwise be found via md5 checksums e.g. images of different formats or slightly lossy, then this gem can compare or find duplicates based on euclidian distances between points in an 8x8x8 RGB vector cube.}
    gem.email = "tim.koops@gmail.com"
    gem.homepage = "http://github.com/90kts/dupe-magick"
    gem.authors = ["Tim Koopmans"]
    gem.add_development_dependency "thoughtbot-shoulda", ">= 0"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# Unit tests: every test/**/test_*.rb file, with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# Coverage task. When rcov is missing, a stub task explains how to install it.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# :check_dependencies is defined by Jeweler. Only hook it into :test when it
# exists, otherwise `rake test` fails on machines without Jeweler even though
# the block above treats Jeweler as optional.
task :test => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :test

# 'rake/rdoctask' was removed from newer Rake releases in favour of RDoc's own
# task class; prefer that and fall back to the legacy one on old toolchains.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  # strip: the VERSION file ends with a newline that must not leak into the title
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "dupe-magick #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 1.0.0
@@ -0,0 +1,22 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'dupe-magick'
4
+ require 'optparse'
5
+
6
# Command-line entry point: compare one source image against every file found
# under a target path and print a similarity verdict for each.
options = {}
parser = OptionParser.new do |opts|
  opts.banner = "Usage: dupes [options]"
  opts.separator ""
  opts.separator "Specific options:"
  opts.on("-s source-file", "--source-file", String,
          "Specify source file to base search on") do |s|
    options[:source_file] = s
  end
  opts.on("-t target-path", "--target-path", String,
          "Specify target path to conduct search in") do |t|
    options[:target_path] = t
  end
end
parser.parse!

# Both options are mandatory. Without this check a nil would travel into the
# image loader and surface as an obscure error far from its cause.
missing = []
missing << "--source-file" unless options[:source_file]
missing << "--target-path" unless options[:target_path]
abort "Missing required option(s): #{missing.join(', ')}\n\n#{parser}" unless missing.empty?

comparison = DupeMagick.new
comparison.find_duplicates(options[:source_file], options[:target_path])
@@ -0,0 +1,146 @@
1
+ #!/usr/bin/env ruby
2
+ require 'RMagick'
3
+ require 'progressbar'
4
+ require 'base64'
5
+
6
# Detects near-duplicate images.
#
# Each image is normalized, cropped, optionally quantized/blurred and resized
# to the requested geometry. Its pixels are then exported as a flat
# [R, G, B, R, G, B, ...] vector and every channel is bucketed into an
# eight-bin histogram (the "cube"). Two images are compared through the
# distance between their histograms.
#
# NOTE: earlier revisions of this class mutated the pixel vector while
# splitting it into channels (so the "blue" histogram was built from red
# samples) and cropped a rectangle that overran the image. Both are fixed
# here, which changes the numeric scores for non-identical images; any
# expected scores recorded against the old behaviour must be regenerated.
class DupeMagick

  # Width of one histogram bin, in channel units (65536 levels / 8 bins).
  BIN_WIDTH = 8192
  # Largest channel value that is assigned to a bin.
  MAX_CHANNEL = 65_535
  # Channel keys of a cube, in the order samples appear in the pixel vector.
  CHANNELS = [:r, :g, :b]

  attr_accessor :source_vector, :target_vector, :source_cube, :target_cube, :distance

  # Compares two image files and prints a plain-language verdict.
  #
  # source, target - paths of the images to compare.
  # params         - Hash of processing options: :geometry (required by the
  #                  resize step), :quantize, :blur and :blur_radius.
  #
  # Returns the distance truncated to an Integer (0 means identical).
  def compare_images(source, target, params)
    @source_vector, @source_cube = fingerprint(source, params)
    @target_vector, @target_cube = fingerprint(target, params)
    @distance = calculate_euclidian_distance(@source_cube, @target_cube)
    puts "Images are " + euclidian_plain_language(@distance) + ", score: " + @distance.to_i.to_s
    @distance.to_i
  end

  # Compares source_file against every file matched by target_path and prints
  # one verdict line per candidate, prefixed with the candidate's path.
  #
  # target_path may be a directory (its regular files are scanned) or a glob
  # pattern. Candidates that cannot be decoded as images are reported on
  # stderr and skipped instead of aborting the scan.
  #
  # Returns the Array of candidate paths that were considered.
  def find_duplicates(source_file, target_path)
    params = {:geometry => '8x8!'}
    @source_vector, @source_cube = fingerprint(source_file, params)
    files = files_in_directory(target_path)
    files.each do |target|
      begin
        @target_vector, @target_cube = fingerprint(target, params)
      rescue Magick::ImageMagickError => e
        warn "Skipping #{target}: #{e.message}"
        next
      end
      @distance = calculate_euclidian_distance(@source_cube, @target_cube)
      puts "#{target}: Images are " + euclidian_plain_language(@distance) + ", score: " + @distance.to_i.to_s
    end
  end

  # Encodes obj as Base64 text, e.g. for storing image data as a blob in
  # mysql etc.
  def serialize(obj)
    Base64.encode64(Marshal.dump(obj))
  end

  # Restores an object produced by #serialize: Base64-decode first, then
  # unmarshal (the exact inverse order of #serialize).
  #
  # SECURITY: Marshal.load can instantiate arbitrary objects. Only pass data
  # that this application wrote itself; never feed it untrusted input.
  def deserialize(obj)
    Marshal.load(Base64.decode64(obj))
  end

  private

  # Loads the first frame of the image stored in file.
  def read_image(file)
    Magick::Image.read(file).first
  end

  # Lists the regular files to scan. A directory is expanded to the files
  # directly inside it; anything else is treated as a glob pattern.
  def files_in_directory(path)
    pattern = File.directory?(path) ? File.join(path, '*') : path
    Dir[pattern].select { |entry| File.file?(entry) }.sort
  end

  def do_make_progress_bar(title, total)
    ProgressBar.new(title, total)
  end

  # Processes one image file and returns [pixel_vector, cube].
  def fingerprint(file, params)
    vector = create_vector_from_image(process_image(file, params))
    [vector, create_cube_from_vector(vector)]
  end

  # Reads the image at the given path and prepares it for comparison.
  # Returns the processed image object.
  def process_image(image, params)
    # read image
    image = read_image(image)

    # normalize image
    image = image.normalize

    # strip off 10% border on every side: the kept rectangle starts one margin
    # in and is two margins smaller than the image, so it stays inside it
    x_margin = image.columns * 10 / 100
    y_margin = image.rows * 10 / 100
    image = image.excerpt(x_margin, y_margin,
                          image.columns - 2 * x_margin, image.rows - 2 * y_margin)

    # optionally quantize image to 32 colours
    image = image.quantize(32, Magick::RGBColorspace) if params[:quantize]

    # optionally blur image
    image = image.gaussian_blur(0.0, params[:blur_radius]) if params[:blur]

    # change geometry of image
    image.change_geometry(params[:geometry]) { |cols, rows, img| img.resize!(cols, rows) }
  end

  # Exports every pixel as a flat [R, G, B, R, G, B, ...] Array.
  def create_vector_from_image(image)
    image.export_pixels(0, 0, image.columns, image.rows, "RGB")
  end

  # Builds one histogram per channel: {:r => {bin => count}, :g => ..., :b => ...}.
  # Bins that received no sample read as 0. The vector is left untouched.
  def create_cube_from_vector(vector)
    cube = {}
    CHANNELS.each_with_index do |channel, offset|
      histogram = Hash.new(0)
      array_mod(vector, CHANNELS.length, offset).each { |sample| histogram[which_bin(sample)] += 1 }
      cube[channel] = histogram
    end
    cube
  end

  # Maps a channel value in 0..65535 to a bin number 1..8.
  # Returns nil for anything outside that range.
  def which_bin(channel)
    return nil unless channel.is_a?(Numeric) && channel >= 0 && channel <= MAX_CHANNEL
    channel.to_i / BIN_WIDTH + 1
  end

  # Returns every mod-th element of arr, starting at index offset.
  # arr itself is not modified.
  def array_mod(arr, mod, offset = 0)
    picked = []
    arr.each_with_index do |val, idx|
      picked << val if idx >= offset && (idx - offset) % mod == 0
    end
    picked
  end

  # Distance between two cubes. Each per-bin squared difference is weighted
  # by 64 (the former triple loop visited every bin 8 * 8 times); the weight
  # is kept so scores stay on the scale #euclidian_plain_language expects.
  def calculate_euclidian_distance(source_cube, target_cube)
    squared = 0
    CHANNELS.each do |channel|
      1.upto(8) do |bin|
        squared += (target_cube[channel][bin] - source_cube[channel][bin])**2
      end
    end
    Math.sqrt(64 * squared)
  end

  # Translates a distance into words. The distance is a Float, so it is
  # floored before being matched against the integer bands; otherwise a
  # value such as 50.6 falls between two bands and is reported as "different".
  def euclidian_plain_language(distance)
    return "identical" if distance == 0
    case distance.floor
      when 0..50 then "similar"
      when 51..150 then "possibly similar"
      else "different"
    end
  end

end
145
+
146
+
@@ -0,0 +1,236 @@
1
+ #
2
+ # Ruby/ProgressBar - a text progress bar library
3
+ #
4
+ # Copyright (C) 2001-2005 Satoru Takabayashi <satoru@namazu.org>
5
+ # All rights reserved.
6
+ # This is free software with ABSOLUTELY NO WARRANTY.
7
+ #
8
+ # You can redistribute it and/or modify it under the terms
9
+ # of Ruby's license.
10
+ #
11
+
12
# Single-line text progress bar. It is redrawn in place (the line ends with a
# carriage return) until the bar is finished, at which point a newline is
# written.
class ProgressBar
  VERSION = "0.9"

  # title - label shown at the left of the bar (truncated when drawn).
  # total - value of the counter that corresponds to 100%.
  # out   - object the bar is written to; it must respond to print and flush.
  #
  # The bar is drawn once, at 0%, as part of construction.
  def initialize(title, total, out = STDERR)
    @title = title
    @total = total
    @out = out
    # Number of cells inside the bar itself; #show adjusts it so that the
    # whole line fits the output width.
    @terminal_width = 80
    @bar_mark = "o"
    @current = 0
    @previous = 0
    @finished_p = false
    @start_time = Time.now
    @previous_time = @start_time
    @title_width = 14
    @format = "%-#{@title_width}s %3d%% %s %s"
    @format_arguments = [:title, :percentage, :bar, :stat]
    clear
    show
  end
  attr_reader :title
  attr_reader :current
  attr_reader :total
  attr_accessor :start_time

  private

  # The fmt_* methods each render one field named in @format_arguments.

  def fmt_bar
    bar_width = do_percentage * @terminal_width / 100
    sprintf("|%s%s|",
            @bar_mark * bar_width,
            " " * (@terminal_width - bar_width))
  end

  def fmt_percentage
    do_percentage
  end

  # Elapsed time once finished, estimated time remaining before that.
  def fmt_stat
    if @finished_p then elapsed else eta end
  end

  # Like fmt_stat, preceded by the amount transferred and the transfer rate.
  def fmt_stat_for_file_transfer
    sprintf("%s %s %s", bytes, transfer_rate, @finished_p ? elapsed : eta)
  end

  def fmt_title
    @title[0, (@title_width - 1)] + ":"
  end

  # Formats a byte count using B, KB, MB or GB.
  def convert_bytes(bytes)
    if bytes < 1024
      sprintf("%6dB", bytes)
    elsif bytes < 1024 * 1000 # 1000kb
      sprintf("%5.1fKB", bytes.to_f / 1024)
    elsif bytes < 1024 * 1024 * 1000 # 1000mb
      sprintf("%5.1fMB", bytes.to_f / 1024 / 1024)
    else
      sprintf("%5.1fGB", bytes.to_f / 1024 / 1024 / 1024)
    end
  end

  # Average rate since start_time, treating the counter as a byte count.
  def transfer_rate
    bytes_per_second = @current.to_f / (Time.now - @start_time)
    sprintf("%s/s", convert_bytes(bytes_per_second))
  end

  def bytes
    convert_bytes(@current)
  end

  # Formats a duration in seconds as HH:MM:SS.
  def format_time(t)
    t = t.to_i
    sec = t % 60
    min = (t / 60) % 60
    hour = t / 3600
    sprintf("%02d:%02d:%02d", hour, min, sec)
  end

  # ETA stands for Estimated Time of Arrival. It is extrapolated linearly
  # from the progress made so far.
  def eta
    if @current == 0
      "ETA:  --:--:--"
    else
      spent = Time.now - @start_time
      remaining = spent * @total / @current - spent
      sprintf("ETA:  %s", format_time(remaining))
    end
  end

  def elapsed
    spent = Time.now - @start_time
    sprintf("Time: %s", format_time(spent))
  end

  # Line terminator: newline once finished, carriage return while running so
  # the next draw overwrites the current line.
  def eol
    if @finished_p then "\n" else "\r" end
  end

  # Progress as a percentage; a total of zero counts as complete.
  def do_percentage
    if @total.zero?
      100
    else
      @current * 100 / @total
    end
  end

  # Width of the output in columns, or 80 when it cannot be determined.
  def get_width
    # FIXME: I don't know how portable it is.
    default_width = 80
    begin
      tiocgwinsz = 0x5413
      data = [0, 0, 0, 0].pack("SSSS")
      if @out.ioctl(tiocgwinsz, data) >= 0 then
        rows, cols, xpixels, ypixels = data.unpack("SSSS")
        # A reported width of 0 is not usable: #clear would compute
        # " " * -1 and raise, so fall back to the default as well.
        if cols > 0 then cols else default_width end
      else
        default_width
      end
    rescue StandardError, NotImplementedError
      # StandardError covers outputs without ioctl and failing system calls;
      # NotImplementedError (not a StandardError) covers platforms lacking
      # ioctl. Interrupts and exit requests are deliberately not swallowed.
      default_width
    end
  end

  # Renders the line and writes it once it is exactly one column narrower
  # than the output; otherwise the bar is resized and the line re-rendered.
  def show
    arguments = @format_arguments.map {|method|
      method = sprintf("fmt_%s", method)
      send(method)
    }
    line = sprintf(@format, *arguments)

    width = get_width
    if line.length == width - 1
      @out.print(line + eol)
      @out.flush
    elsif line.length >= width
      @terminal_width = [@terminal_width - (line.length - width + 1), 0].max
      if @terminal_width == 0 then @out.print(line + eol) else show end
    else # line.length < width - 1
      @terminal_width += width - line.length + 1
      show
    end
    @previous_time = Time.now
  end

  # Redraws only when the whole-number percentage changed, at least a second
  # passed since the last draw, or the bar is finished.
  def show_if_needed
    if @total.zero?
      cur_percentage = 100
      prev_percentage = 0
    else
      cur_percentage  = (@current  * 100 / @total).to_i
      prev_percentage = (@previous * 100 / @total).to_i
    end

    # Use "!=" instead of ">" to support negative changes
    if cur_percentage != prev_percentage ||
        Time.now - @previous_time >= 1 || @finished_p
      show
    end
  end

  public

  # Blanks the current output line and returns the cursor to its start.
  def clear
    @out.print "\r"
    @out.print(" " * (get_width - 1))
    @out.print "\r"
  end

  # Jumps to 100%, marks the bar finished and draws it a final time.
  def finish
    @current = @total
    @finished_p = true
    show
  end

  def finished?
    @finished_p
  end

  # Switches the statistics field to bytes transferred, rate and time.
  def file_transfer_mode
    @format_arguments = [:title, :percentage, :bar, :stat_for_file_transfer]
  end

  def format=(format)
    @format = format
  end

  def format_arguments=(arguments)
    @format_arguments = arguments
  end

  # Marks the bar finished at its current position and draws it a final time.
  def halt
    @finished_p = true
    show
  end

  # Advances the counter by step, never beyond total.
  def inc(step = 1)
    @current += step
    @current = @total if @current > @total
    show_if_needed
    @previous = @current
  end

  # Sets the counter to count.
  # Raises RuntimeError when count is negative or greater than total.
  def set(count)
    if count < 0 || count > @total
      raise "invalid count: #{count} (total: #{@total})"
    end
    @current = count
    show_if_needed
    @previous = @current
  end

  def inspect
    "#<ProgressBar:#{@current}/#{@total}>"
  end
end
230
+
231
# Progress bar whose percentage runs the other way: it reports 100 minus the
# percentage computed by ProgressBar.
class ReversedProgressBar < ProgressBar
  # Returns the complement of the parent's percentage.
  def do_percentage
    completed = super
    100 - completed
  end
end
236
+
@@ -0,0 +1,10 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+
5
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
6
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
7
+ require 'dupe-magick'
8
+
9
+ class Test::Unit::TestCase
10
+ end
Binary file
Binary file
@@ -0,0 +1,41 @@
1
+ require 'helper'
2
+
3
# Exercises DupeMagick#compare_images against the fixture images stored in
# test/images, using an 8x8 thumbnail for every comparison.
class TestDupeMagick < Test::Unit::TestCase
  should "compare two identical images and return a euclidian distance of 0" do
    fixtures = File.expand_path(File.dirname(__FILE__))
    first    = "#{fixtures}/images/first_wife.png"
    second   = "#{fixtures}/images/second_wife.jpg"
    score    = DupeMagick.new.compare_images(first, second, {:geometry => '8x8!'})
    assert_equal 0, score
  end

  should "compare two possibly similar images and return a euclidian distance of 104" do
    fixtures = File.expand_path(File.dirname(__FILE__))
    first    = "#{fixtures}/images/first_wife.png"
    second   = "#{fixtures}/images/bright_wife.png"
    score    = DupeMagick.new.compare_images(first, second, {:geometry => '8x8!'})
    assert_equal 104, score
  end

  should "compare two different images and return a euclidian distance of 531" do
    fixtures = File.expand_path(File.dirname(__FILE__))
    first    = "#{fixtures}/images/first_wife.png"
    second   = "#{fixtures}/images/different.jpg"
    score    = DupeMagick.new.compare_images(first, second, {:geometry => '8x8!'})
    assert_equal 531, score
  end
end
metadata ADDED
@@ -0,0 +1,98 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: dupe-magick
3
+ version: !ruby/object:Gem::Version
4
+ hash: 23
5
+ prerelease: false
6
+ segments:
7
+ - 1
8
+ - 0
9
+ - 0
10
+ version: 1.0.0
11
+ platform: ruby
12
+ authors:
13
+ - Tim Koopmans
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-11-15 00:00:00 +11:00
19
+ default_executable: dupes
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: thoughtbot-shoulda
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 3
30
+ segments:
31
+ - 0
32
+ version: "0"
33
+ type: :development
34
+ version_requirements: *id001
35
+ description: If you need to look for duplicates which can't otherwise be found via md5 checksums e.g. images of different formats or slightly lossy, then this gem can compare or find duplicates based on euclidian distances between points in an 8x8x8 RGB vector cube.
36
+ email: tim.koops@gmail.com
37
+ executables:
38
+ - dupes
39
+ extensions: []
40
+
41
+ extra_rdoc_files:
42
+ - LICENSE
43
+ - README.rdoc
44
+ files:
45
+ - .document
46
+ - .gitignore
47
+ - LICENSE
48
+ - README.rdoc
49
+ - Rakefile
50
+ - VERSION
51
+ - bin/dupes
52
+ - lib/dupe-magick.rb
53
+ - lib/progressbar.rb
54
+ - test/helper.rb
55
+ - test/images/bright_wife.png
56
+ - test/images/dark_wife.png
57
+ - test/images/different.jpg
58
+ - test/images/first_wife.png
59
+ - test/images/second_wife.jpg
60
+ - test/images/third_wife.jpg
61
+ - test/test_dupe-magick.rb
62
+ has_rdoc: true
63
+ homepage: http://github.com/90kts/dupe-magick
64
+ licenses: []
65
+
66
+ post_install_message:
67
+ rdoc_options:
68
+ - --charset=UTF-8
69
+ require_paths:
70
+ - lib
71
+ required_ruby_version: !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ">="
75
+ - !ruby/object:Gem::Version
76
+ hash: 3
77
+ segments:
78
+ - 0
79
+ version: "0"
80
+ required_rubygems_version: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ hash: 3
86
+ segments:
87
+ - 0
88
+ version: "0"
89
+ requirements: []
90
+
91
+ rubyforge_project:
92
+ rubygems_version: 1.3.7
93
+ signing_key:
94
+ specification_version: 3
95
+ summary: Detect image duplicates with RMagick
96
+ test_files:
97
+ - test/helper.rb
98
+ - test/test_dupe-magick.rb