find_dupe_images 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +13 -0
- data/.rspec +2 -0
- data/.tags1 +13 -0
- data/.travis.yml +4 -0
- data/CODE_OF_CONDUCT.md +13 -0
- data/Gemfile +4 -0
- data/README.md +51 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/find_dupe_images +47 -0
- data/bin/setup +7 -0
- data/find_dupe_images.gemspec +32 -0
- data/lib/find_dupe_images.rb +45 -0
- data/lib/find_dupe_images/error/base.rb +16 -0
- data/lib/find_dupe_images/finder.rb +118 -0
- data/lib/find_dupe_images/image.rb +63 -0
- data/lib/find_dupe_images/image_mime_types.rb +49 -0
- data/lib/find_dupe_images/logger.rb +23 -0
- data/lib/find_dupe_images/option.rb +8 -0
- data/lib/find_dupe_images/processed_data.rb +10 -0
- data/lib/find_dupe_images/serializer.rb +51 -0
- data/lib/find_dupe_images/version.rb +4 -0
- metadata +152 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 7f9c49c230d090f068e5f52e0b5e19414eb97523
|
4
|
+
data.tar.gz: 26a5165b0ab5f1b5f09ec3f6db1378ece52db442
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: d58e65668d12883ae5e243f1d853495b4a82d7855758bc341bf166c30283d2a0e9d663aa12760da2381ab78b94bac291763b937fc4272cbbc9f9d38519052857
|
7
|
+
data.tar.gz: 6c1108cbd99a4e4b1b4c64105079e0833d50c646ccaeecf7720b8031018090c6e9c1e29445b9791c0d427c041fe26085a75cfe38aba03fe771a3efa6704be4a0
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.tags1
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
!_TAG_FILE_FORMAT 2 /extended format; --format=1 will not append ;" to lines/
|
2
|
+
!_TAG_FILE_SORTED 0 /0=unsorted, 1=sorted, 2=foldcase/
|
3
|
+
!_TAG_PROGRAM_AUTHOR Darren Hiebert /dhiebert@users.sourceforge.net/
|
4
|
+
!_TAG_PROGRAM_NAME Exuberant Ctags //
|
5
|
+
!_TAG_PROGRAM_URL http://ctags.sourceforge.net /official site/
|
6
|
+
!_TAG_PROGRAM_VERSION 5.8 //
|
7
|
+
FindDupeImages /Users/andwen/project/ruby/find_dupe_images/find_dupe_images/lib/find_dupe_images/serializer.rb /^module FindDupeImages$/;" module line:1
|
8
|
+
Serializer /Users/andwen/project/ruby/find_dupe_images/find_dupe_images/lib/find_dupe_images/serializer.rb /^ class Serializer$/;" class line:2 class:FindDupeImages
|
9
|
+
initialize /Users/andwen/project/ruby/find_dupe_images/find_dupe_images/lib/find_dupe_images/serializer.rb /^ def initialize(processed_image_data = nil)$/;" method line:6 class:FindDupeImages.Serializer
|
10
|
+
serialize /Users/andwen/project/ruby/find_dupe_images/find_dupe_images/lib/find_dupe_images/serializer.rb /^ def serialize$/;" method line:12 class:FindDupeImages.Serializer
|
11
|
+
deserialize /Users/andwen/project/ruby/find_dupe_images/find_dupe_images/lib/find_dupe_images/serializer.rb /^ def deserialize$/;" method line:19 class:FindDupeImages.Serializer
|
12
|
+
remove_marshal_file /Users/andwen/project/ruby/find_dupe_images/find_dupe_images/lib/find_dupe_images/serializer.rb /^ def remove_marshal_file$/;" method line:41 class:FindDupeImages.Serializer
|
13
|
+
serialize_to_file /Users/andwen/project/ruby/find_dupe_images/find_dupe_images/lib/find_dupe_images/serializer.rb /^ def serialize_to_file$/;" method line:47 class:FindDupeImages.Serializer
|
data/.travis.yml
ADDED
data/CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# Contributor Code of Conduct
|
2
|
+
|
3
|
+
As contributors and maintainers of this project, we pledge to respect all people who contribute through reporting issues, posting feature requests, updating documentation, submitting pull requests or patches, and other activities.
|
4
|
+
|
5
|
+
We are committed to making participation in this project a harassment-free experience for everyone, regardless of level of experience, gender, gender identity and expression, sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion.
|
6
|
+
|
7
|
+
Examples of unacceptable behavior by participants include the use of sexual language or imagery, derogatory comments or personal attacks, trolling, public or private harassment, insults, or other unprofessional conduct.
|
8
|
+
|
9
|
+
Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed from the project team.
|
10
|
+
|
11
|
+
Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by opening an issue or contacting one or more of the project maintainers.
|
12
|
+
|
13
|
+
This Code of Conduct is adapted from the [Contributor Covenant](http://contributor-covenant.org), version 1.0.0, available at [http://contributor-covenant.org/version/1/0/0/](http://contributor-covenant.org/version/1/0/0/)
|
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
# FindDupeImages
|
2
|
+
|
3
|
+
This is a simple gem to find duplicate images in a directory structure recursively. At the moment images greater than
|
4
|
+
8 MB will be ignored. This is because I observed high memory consumption that used up many
|
5
|
+
GB of RAM. You can easily change this to another value by changing `MAX_FILE_SIZE` in find_dupe_images.rb.
|
6
|
+
|
7
|
+
## Technical idea
|
8
|
+
|
9
|
+
The process of comparing the images is this:
|
10
|
+
|
11
|
+
* traverse through the directory
|
12
|
+
* check if the mime-type is the one for an image (defined in `ImageMimeTypes`)
|
13
|
+
* open the image and read the bytes
|
14
|
+
* create a `Digest::MD5.hexdigest` of the content of the image
|
15
|
+
* Marshal.dump the digest and further info to a file (serialized.marshal)
|
16
|
+
* when all images are scanned, open the marshal file, run through it and find duplicate digests
|
17
|
+
* show the result
|
18
|
+
|
19
|
+
## Installation
|
20
|
+
|
21
|
+
Add this line to your application's Gemfile:
|
22
|
+
|
23
|
+
```ruby
|
24
|
+
gem 'find_dupe_images'
|
25
|
+
```
|
26
|
+
|
27
|
+
And then execute:
|
28
|
+
|
29
|
+
$ bundle
|
30
|
+
|
31
|
+
Or install it yourself as:
|
32
|
+
|
33
|
+
$ gem install find_dupe_images
|
34
|
+
|
35
|
+
## Usage
|
36
|
+
|
37
|
+
It's as simple as this:
|
38
|
+
|
39
|
+
$ find_dupe_images /your/path/to/images
|
40
|
+
|
41
|
+
where the `images` directory may itself contain nested directories of images.
|
42
|
+
|
43
|
+
## Development
|
44
|
+
|
45
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
46
|
+
|
47
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
48
|
+
|
49
|
+
## Contributing
|
50
|
+
|
51
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/andywenk/find_dupe_images. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
|
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Development console: loads the gem and drops into IRB so the library can be
# explored interactively.
#
# "bundler/setup" is the file Bundler ships for putting the Gemfile's gems on
# the load path; it must be required before the library itself.
require "bundler/setup"
require "find_dupe_images"

# You can add fixtures and/or initialization code here to make experimenting
# with your gem easier. You can also use a different console, if you like.

# (If you use this, don't forget to add pry to your Gemfile!)
# require "pry"
# Pry.start

require "irb"
IRB.start
|
@@ -0,0 +1,47 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Command-line entry point: prints a short banner, runs the scan through
# FindDupeImages.execute and reports how long the run took.

require "logstash-logger"
require 'digest/md5'
require 'rmagick'
require 'pathname'
require 'yaml'
require 'filemagic'
require_relative "../lib/find_dupe_images"
require_relative "../lib/find_dupe_images/image_mime_types"
require_relative "../lib/find_dupe_images/logger"
require_relative "../lib/find_dupe_images/error/base"
require_relative "../lib/find_dupe_images/image"
require_relative "../lib/find_dupe_images/finder"
require_relative "../lib/find_dupe_images/option"
require_relative "../lib/find_dupe_images/serializer"
require_relative "../lib/find_dupe_images/processed_data"


#require 'find_dupe_images'

puts <<INFO

find_dupe_images #{FindDupeImages::VERSION}

Description:

find_dupe_images is a tool to find duplicate images in a directory structure. It will open
the images and create a MD5 hash of the content. That means, that the name of the file is
irrelevant. Due to this fact, it can take quite a long time to create the digests from the images.

All you have to do is to give a starting directory. All images in directories below will be processed.

If you want to stop the process, simply hit CTRL + c.

Usage:

find_dupe_images /some/directory/with/images

INFO

# Use the monotonic clock for the duration so that wall-clock adjustments
# during a long scan cannot distort (or negate) the measured time.
started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
FindDupeImages.execute
elapsed = (Process.clock_gettime(Process::CLOCK_MONOTONIC) - started).ceil

# Format as HH:MM:SS by hand; hours keep counting past 23 instead of wrapping
# around the way a time-of-day format would.
duration = format('%02d:%02d:%02d', elapsed / 3600, (elapsed / 60) % 60, elapsed % 60)

puts "\tIt took #{duration} to process all files.\n"
|
data/bin/setup
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for find_dupe_images.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'find_dupe_images/version'

Gem::Specification.new do |spec|
  spec.name          = "find_dupe_images"
  spec.version       = FindDupeImages::VERSION
  spec.authors       = ["Andy Wenk"]
  spec.email         = ["andy@nms.de"]

  spec.summary       = %q{Find duplicate images in a directory structure}
  spec.description   = %q{Find duplicate images in a directory structure}
  spec.homepage      = "https://github.com/andywenk/find_dupe_images"

  # Ship everything tracked by git except the test suites.
  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  spec.bindir        = "bin"
  # Only the command-line tool is a user-facing executable. bin/console and
  # bin/setup are development helpers; collecting every file under bin/ would
  # install them as global commands named "console" and "setup".
  spec.executables   = ["find_dupe_images"]
  spec.require_paths = ["lib"]

  spec.add_dependency "rmagick"
  spec.add_dependency "logstash-logger"

  # ruby-filemagic binds to libmagic, so refuse to continue when its manual
  # page cannot be found.
  # NOTE(review): this check runs wherever the gemspec itself is evaluated;
  # confirm that it also guards end-user installs as intended.
  if `man libmagic`.size > 0
    spec.add_dependency "ruby-filemagic"
  else
    raise "\n\nMISSING library - Please install libmagic!\n\n\tMac OS X: brew install libmagic\n\tLinux: see http://blackwinter.github.io/ruby-filemagic/#label-Installation\n\n"
  end

  spec.add_development_dependency "bundler", "~> 1.10"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec"
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require "logger"
|
2
|
+
require "logstash-logger"
|
3
|
+
require 'digest/md5'
|
4
|
+
require 'rmagick'
|
5
|
+
require 'pathname'
|
6
|
+
require 'yaml'
|
7
|
+
require 'filemagic'
|
8
|
+
require "find_dupe_images/logger"
|
9
|
+
require "find_dupe_images/version"
|
10
|
+
require "find_dupe_images/image_mime_types"
|
11
|
+
require "find_dupe_images/error/base"
|
12
|
+
require "find_dupe_images/image"
|
13
|
+
require "find_dupe_images/option"
|
14
|
+
require "find_dupe_images/finder"
|
15
|
+
require "find_dupe_images/serializer"
|
16
|
+
require "find_dupe_images/processed_data"
|
17
|
+
|
18
|
+
# FindDupeImages
|
19
|
+
#
|
20
|
+
# this is the main entry
|
21
|
+
module FindDupeImages

  # Shared settings and run-wide counters.
  # @TODO Should be refactored to a FindDupeImages::Configuration
  #
  # MAX_FILE_SIZE is expressed in kilobytes. The three globals are counters
  # that other parts of the gem increment while a scan is running.
  MAX_FILE_SIZE = 8000 #in kb
  $count = 0
  $too_big = 0
  $not_an_image = 0

  class << self
    # Runs the scan; this is the gem's single public entry point.
    #
    # A missing directory argument is not propagated: the raised
    # Error::DirectoryRequired is rescued here and nil is returned.
    #
    # @return whatever Finder.run returns, or nil when no directory was given
    def execute
      Finder.run
    rescue Error::DirectoryRequired
      nil
    end

    # Builds a logger wrapper with its default settings.
    #
    # @return [FindDupeImages::Logger]
    def logger
      FindDupeImages::Logger.new
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module FindDupeImages
  module Error
    # Common ancestor of the gem's errors. Constructing an instance also
    # writes the message to standard output, prefixed with "ERROR: ".
    class Base < StandardError
      # @param error_message [String] text stored as the exception message
      def initialize(error_message = '')
        super
        $stdout.puts "ERROR: #{error_message}"
      end
    end

    # Signals that no directory to scan was supplied.
    class DirectoryRequired < Base
      def initialize
        super('Please provide a directory')
      end
    end
  end
end
|
@@ -0,0 +1,118 @@
|
|
1
|
+
#
|
2
|
+
# FindDupeImages::Finder
|
3
|
+
#
|
4
|
+
# will be run to scan the images
|
5
|
+
#
|
6
|
+
module FindDupeImages
  # Walks a directory tree, fingerprints every file that Image accepts as an
  # image and prints a report of files whose fingerprints collide.
  #
  # All working state is held in instance variables of the class object
  # itself; Finder.run is the only public entry point.
  class Finder
    attr_reader :image_data, :hexdigest

    class << self

      # Start the scan and print the report to standard output.
      #
      # @return [nil]
      def run
        create_traversed_directories_array
        define_path
        define_directory
        traverse_directory
        # Build the report exactly once. Calling `result` a second time would
        # re-read the serialized data and print the whole report again.
        result
      end

      private

      def create_traversed_directories_array
        @traversed_directories = []
      end

      def define_path
        @directory_path = FindDupeImages::Option.directory_path
      end

      # Collects every regular file below the configured directory.
      #
      # The pattern is assembled with File.join so that the scan is recursive
      # whether or not the path ends in a separator; plain interpolation would
      # turn "/pics" into "/pics**/*", which only looks one level deep.
      def define_directory
        base = @directory_path.to_s
        pattern = base.empty? ? '**/*' : File.join(base, '**', '*')
        @directory = Dir.glob(pattern).reject { |fn| File.directory?(fn) }
      end

      # Processes each candidate file: read it, fingerprint it, log it and
      # append the fingerprint to the serialized data file.
      def traverse_directory
        # Start from a clean slate so results of an earlier run do not leak in.
        FindDupeImages::Serializer.new.remove_marshal_file

        puts "\nStarting to process #{@directory.size} files in the directory ...\n\n"

        # Install the handler once, before the first file is touched, so that
        # CTRL + c is handled from the very start of the scan.
        trap("SIGINT") do
          puts 'Interrupted! Closing.'
          exit(0)
        end

        @directory.each do |filename|
          @image = FindDupeImages::Image.new(filename)
          next unless @image.is_image?

          read_image_data(filename)
          $count += 1
          log_data(filename)
          serialize_data
        end
      end

      # Computes the digest of the current image and writes it to the log.
      def log_data(filename)
        create_hexdigest
        FindDupeImages.logger.log(processed_image_data.to_json)
      end

      def read_image_data(filename)
        @image_data = @image.image
      end

      # The digest is taken over the exported pixel data of the image.
      def create_hexdigest
        @hexdigest = Digest::MD5.hexdigest @image_data.export_pixels_to_str
      end

      def processed_image_data
        {image_data: @image_data.inspect, hexdigest: @hexdigest}
      end

      def serialize_data
        FindDupeImages::Serializer.new(FindDupeImages::ProcessedData.new({file_name: @image_data.filename, hexdigest: @hexdigest})).serialize
      end

      # Loads the duplicate hits from the serialized data and prints the report.
      def result
        results = FindDupeImages::Serializer.new.deserialize
        create_report(results)
      end

      # @param results [Hash] digest => human-readable duplicate message
      def create_report(results)
        results.empty? ? without_duplicates : with_duplicates(results)
        puts "\nReport:"
        files_too_big
        not_an_image
      end

      def with_duplicates(results)
        puts "\n\nThe following files have been detected as duplicates:\n\n"
        results.each_value { |message| puts message }
      end

      def without_duplicates
        puts "\tNo files have been detected as duplicates"
      end

      def files_too_big
        if $too_big > 0
          puts "\t#{$too_big} files have been ignored because they are too big!"
        end
      end

      def not_an_image
        if $not_an_image > 0
          puts "\t#{$not_an_image} files have been ignored because they are not recognized as an image!"
        end
      end
    end
  end
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
module FindDupeImages
  # Wraps one file from the scan. The file is only decoded when it is not
  # larger than FindDupeImages::MAX_FILE_SIZE and its MIME type is listed in
  # FindDupeImages::IMAGE_MIME_TYPES; otherwise #image stays nil.
  class Image

    attr_accessor :image

    # @param image [String] path of the file to inspect
    def initialize(image)
      @raw_image = image
      @image_types = %w(GIF JPEG PNG TIFF BMP ICO CUR PSD SVG WEBP)
      return unless is_image_and_not_too_big?
      read_image
    end

    # @return [Boolean] true when the file was successfully decoded
    def is_image?
      @image.is_a?(Magick::Image)
    end

    private

    # The size check runs first, so files that are too big are never handed
    # to the MIME detection.
    def is_image_and_not_too_big?
      !image_is_too_big? && mime_is_image?
    end

    # Counts and logs files that exceed the size limit.
    def image_is_too_big?
      return false unless is_too_big?

      $too_big += 1
      FindDupeImages.logger.log({error: "The image #{@raw_image} is too big!"}.to_json)
      true
    end

    # Asks libmagic for the MIME type and checks it against the known list.
    # Files that do not match are counted in $not_an_image.
    def mime_is_image?
      fm = ::FileMagic.new(::FileMagic::MAGIC_MIME)
      mime_type = begin
        fm.file(@raw_image).split(';').first
      ensure
        # One handle is opened per file; close it right away so handles do
        # not pile up over a long scan.
        fm.close
      end
      return true if FindDupeImages::IMAGE_MIME_TYPES.include?(mime_type)

      $not_an_image += 1
      false
    end

    # File.size? yields nil for missing or empty files; nil.to_f is 0.0, so
    # those never count as too big.
    def is_too_big?
      File.size?(@raw_image).to_f / (1024) > FindDupeImages::MAX_FILE_SIZE
    end

    def image_type
      @image.format
    end

    def read_image
      @image = read
    end

    # Decodes the file and returns its first image.
    #
    # @return [Magick::Image, nil] nil when ImageMagick cannot read the file
    def read
      Magick::Image.read(@raw_image).first
    rescue Magick::ImageMagickError => e
      FindDupeImages.logger.log({error: e.message}.to_json)
      # Be explicit instead of relying on whatever the logger call returns.
      nil
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module FindDupeImages
  # MIME types that are treated as images.
  # source http://www.sitepoint.com/web-foundations/mime-types-complete-list/
  #
  # The list is frozen so that it cannot be modified by accident at runtime.
  # NOTE(review): application/octet-stream is the generic binary type, so
  # files of unknown binary content also pass a check against this list —
  # confirm that this is intended.
  IMAGE_MIME_TYPES = %w[
    image/gif
    image/jpeg
    image/pjpeg
    image/png
    image/bmp
    image/tiff
    image/pict
    image/cmu-raster
    image/fif
    image/florian
    image/vnd.fpx
    image/vnd.net-fpx
    image/vnd.rn-realpix
    image/vnd.wap.wbmp
    image/vnd.rn-realflash
    image/vnd.dwg
    image/g3fax
    image/ief
    image/jutvision
    image/vasa
    image/naplps
    image/x-dwg
    image/x-jg
    image/x-niff
    image/x-portable-bitmap
    image/x-pict
    image/x-pcx
    image/x-portable-graymap
    image/x-jps
    image/x-xpixmap
    image/x-portable-anymap
    image/x-portable-pixmap
    image/x-quicktime
    image/x-tiff
    image/x-windows-bmp
    image/x-icon
    image/x-cmu-raster
    image/x-rgb
    image/x-xbitmap
    image/x-xbm
    image/x-xwd
    image/xbm
    image/x-xwindowdump
    application/octet-stream
  ].freeze
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module FindDupeImages
  # Thin wrapper around a file-based LogStashLogger that also prints a
  # progress indicator to standard output for every logged message.
  class Logger
    attr_accessor :log_level

    # @param log_level [Symbol] one of :debug, :info, :warn, :error, :fatal
    # @param log_file [String] path of the log file. The underlying logger is
    #   created once and shared by all instances, so only the value given on
    #   the first instantiation takes effect.
    # @raise [ArgumentError] when log_level is not one of the known levels
    def initialize(log_level: :debug, log_file: 'find_dupe_images.log')
      log_levels = %i(debug info warn error fatal)
      unless log_levels.include?(log_level)
        raise ArgumentError, "log_level must be one of #{log_levels.join(', ')}"
      end

      self.log_level = log_level
      @@logger ||= ::LogStashLogger.new(type: :file, path: log_file, sync: true)
    end

    # Writes the message to the log file and prints progress output.
    #
    # @param message [String] text to log; it is tagged as UTF-8 on a copy,
    #   so the caller's string is left untouched and may be frozen
    # @param log_level [Symbol] severity method to call on the logger
    def log(message, log_level: self.log_level)
      # public_send keeps the dispatch restricted to the logger's public API.
      @@logger.public_send(log_level, message.dup.force_encoding('UTF-8'))
      # Progress output: the running count on every tenth image, and one dot
      # per logged message.
      if $count && ($count % 10 == 0)
        puts "(#{$count}) "
      end
      if $count
        print '.'
      end
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module FindDupeImages
  # Persists one record per processed image to a marshal file and reads the
  # records back to find files that share a digest.
  #
  # Records are Marshal dumps, each followed by a fixed separator string.
  class Serializer

    attr_reader :processed_image_data

    # @param processed_image_data [#file_name, #hexdigest, nil] record to
    #   append with #serialize; not needed for reading or removing the file
    def initialize(processed_image_data = nil)
      @processed_image_data = processed_image_data
      @serialize_filename = 'serialized.marshal'
      @seperator = "---_---"
    end

    # Appends the record and a trailing separator to the marshal file.
    def serialize
      # Marshal output is binary, so the file is opened in binary mode.
      File.open(serialize_to_file, 'ab') do |file|
        file.write(Marshal.dump(@processed_image_data))
        file.write(@seperator)
      end
    end

    # Reads all records back and collects the duplicates.
    #
    # @return [Hash{String => String}] digest => message naming the last file
    #   seen with that digest and all files seen with it before; empty when
    #   the marshal file does not exist
    def deserialize
      hexdigests = {}
      hits = {}

      return hits unless File.exist?(serialize_to_file)

      # The separator is passed to each_line directly instead of reassigning
      # the global $/, which would change line handling for the whole process.
      # The block form closes the file handle when reading is done.
      File.open(serialize_to_file, 'rb') do |file|
        file.each_line(@seperator) do |record|
          # NOTE: Marshal.load must only ever see data written by #serialize;
          # it is not safe on untrusted input.
          o = Marshal.load(record.chomp(@seperator))
          seen = hexdigests[o.hexdigest]
          if seen
            hits[o.hexdigest] = "#{o.file_name} is a duplicate of #{seen.join(' and ')}"
            seen << o.file_name
          else
            hexdigests[o.hexdigest] = [o.file_name]
          end
        end
      end

      hits
    end

    # Deletes the marshal file if it exists.
    def remove_marshal_file
      File.unlink(serialize_to_file) if File.file?(serialize_to_file)
    end

    private

    # Location of the marshal file, two levels above this source file.
    #
    # File.expand_path drops the trailing slash, so the parts are combined
    # with File.join; plain concatenation would glue the file name onto the
    # directory name.
    #
    # @return [Pathname]
    def serialize_to_file
      Pathname.new(File.join(File.expand_path('../../', __FILE__), @serialize_filename))
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,152 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: find_dupe_images
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.4.2
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Andy Wenk
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-12-13 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rmagick
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: logstash-logger
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: ruby-filemagic
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: bundler
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '1.10'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '1.10'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rake
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '10.0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '10.0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rspec
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
description: Find duplicate images in a directory structure
|
98
|
+
email:
|
99
|
+
- andy@nms.de
|
100
|
+
executables:
|
101
|
+
- console
|
102
|
+
- find_dupe_images
|
103
|
+
- setup
|
104
|
+
extensions: []
|
105
|
+
extra_rdoc_files: []
|
106
|
+
files:
|
107
|
+
- ".gitignore"
|
108
|
+
- ".rspec"
|
109
|
+
- ".tags1"
|
110
|
+
- ".travis.yml"
|
111
|
+
- CODE_OF_CONDUCT.md
|
112
|
+
- Gemfile
|
113
|
+
- README.md
|
114
|
+
- Rakefile
|
115
|
+
- bin/console
|
116
|
+
- bin/find_dupe_images
|
117
|
+
- bin/setup
|
118
|
+
- find_dupe_images.gemspec
|
119
|
+
- lib/find_dupe_images.rb
|
120
|
+
- lib/find_dupe_images/error/base.rb
|
121
|
+
- lib/find_dupe_images/finder.rb
|
122
|
+
- lib/find_dupe_images/image.rb
|
123
|
+
- lib/find_dupe_images/image_mime_types.rb
|
124
|
+
- lib/find_dupe_images/logger.rb
|
125
|
+
- lib/find_dupe_images/option.rb
|
126
|
+
- lib/find_dupe_images/processed_data.rb
|
127
|
+
- lib/find_dupe_images/serializer.rb
|
128
|
+
- lib/find_dupe_images/version.rb
|
129
|
+
homepage: https://github.com/andywenk/find_dupe_images
|
130
|
+
licenses: []
|
131
|
+
metadata: {}
|
132
|
+
post_install_message:
|
133
|
+
rdoc_options: []
|
134
|
+
require_paths:
|
135
|
+
- lib
|
136
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
137
|
+
requirements:
|
138
|
+
- - ">="
|
139
|
+
- !ruby/object:Gem::Version
|
140
|
+
version: '0'
|
141
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - ">="
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: '0'
|
146
|
+
requirements: []
|
147
|
+
rubyforge_project:
|
148
|
+
rubygems_version: 2.4.5
|
149
|
+
signing_key:
|
150
|
+
specification_version: 4
|
151
|
+
summary: Find duplicate images in a directory structure
|
152
|
+
test_files: []
|