find-duplicates 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/find-duplicates +80 -0
- data/find-duplicates.gemspec +19 -0
- data/lib/find-duplicates.rb +80 -0
- metadata +82 -0
data/bin/find-duplicates
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
# Copyright 2013 Daneel S. Yaitskov
|
4
|
+
|
5
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6
|
+
# you may not use this file except in compliance with the License.
|
7
|
+
# You may obtain a copy of the License at
|
8
|
+
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
|
17
|
+
# This tool is written for exactly one use case.
|
18
|
+
# There are 2 folders containing relatively similar files
|
19
|
+
# but these files are at different paths.
|
20
|
+
# And you need to merge the two folder structures
|
21
|
+
# without space overhead.
|
22
|
+
|
23
|
+
# Solution. Merge file sets with cp or rsync.
|
24
|
+
# Start this script over 1 directory structure.
|
25
|
+
# Script builds hashes for all files.
|
26
|
+
# It groups the paths of files with the same hash.
|
27
|
+
|
28
|
+
require 'optparse'
|
29
|
+
require 'ruby-progressbar'
|
30
|
+
require 'find-duplicates'
|
31
|
+
include FindDuplicates
|
32
|
+
|
33
|
+
# Default settings: show a progress bar while hashing, only report the
# duplicates (dry run) and keep the file with the longest path in each group.
options = {
  # Called with the number of entries to process; must return an object
  # that responds to `increment`.
  :bar => lambda { |size| ProgressBar.create(:title => "Hashing",
                                             :format => '%t: %e %P |%b%i|',
                                             :total => size) },
  :dry => true,
  :remove_policy => PolicyLeaveLongest.new
}

paths = []
begin
  OptionParser.new do |opts|
    opts.banner = 'Usage: find-duplicates.rb [ options ] <path-to-dir>'
    # The description is passed as two lines so the help output stays aligned
    # instead of carrying a raw newline inside a single string.
    opts.on('-d', '--dry',
            'default. dry run. just show groups duplicated files',
            'and which one will be left') do
      options[:dry] = true
    end
    opts.on('-B', '--no-bar',
            'hide progress bar') do
      options[:bar] = lambda { |size| DummyBar.new }
    end
    opts.on('-r', '--run',
            'delete duplicates by the specified policy') do
      options[:dry] = false
    end
    opts.on('-l', '--longest',
            'policy leaves a file with longest name among duplicates') do
      options[:remove_policy] = PolicyLeaveLongest.new
    end
  end.parse!

  # parse! removes the recognised switches, so whatever is left in ARGV is
  # the list of directories to scan.
  raise "there isn't any path given" if ARGV.empty?

  paths = ARGV.map do |path|
    raise "path '#{ path }' isn't a directory" unless File.directory?(path)
    # Drop trailing slashes (and spaces after them) from the directory name.
    path.sub(/\/+ *$/, '')
  end
rescue => e
  # Covers both invalid switches and the argument errors raised above.
  $stderr.puts "Error #{ e }"
  exit 1
end

# The comma matters: without it Ruby parses the line as
# find_duplicates(options(paths)) and fails on the unknown method `options`.
find_duplicates(options, paths)
|
78
|
+
|
79
|
+
|
80
|
+
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# Packaging metadata for the find-duplicates command line tool.
Gem::Specification.new do |s|
  s.name = 'find-duplicates'
  s.version = '0.0.1'
  s.date = '2013-12-07'
  s.summary = 'Find duplicated files'
  s.description = 'Find and remove duplicated files'
  s.authors = [ 'Daneel S. Yaitskov' ]
  s.email = 'rtfm.rtfm.rtfm@gmail.com'
  s.files = [ 'bin/find-duplicates',
              'lib/find-duplicates.rb',
              'find-duplicates.gemspec'
            ]
  # Only lib/ contains a file that can be loaded with `require`; the script in
  # bin/ is published through `executable` below.
  s.require_paths = [ 'lib' ]
  s.executable = 'find-duplicates'
  # SPDX identifier of the Apache License, Version 2.0.
  s.license = 'Apache-2.0'
  s.homepage = 'https://github.com/yaitskov/find-duplicates'
  s.add_dependency "ruby-progressbar"
  # NOTE(review): none of the files listed in s.files requires bundler;
  # confirm whether this runtime dependency is still needed.
  s.add_dependency "bundler"
end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
# Copyright 2013 Daneel S. Yaitskov
|
2
|
+
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
require 'digest/md5'
|
16
|
+
|
17
|
+
# Finds files with identical content and optionally removes all but one
# file of every group.
module FindDuplicates
  # Removal policy that keeps the duplicate with the longest path.
  class PolicyLeaveLongest
    # Orders a group of duplicates so that the file to keep comes first.
    #
    # @param files [Array<String>] paths of files with identical content
    # @return [Array<String>] a new array, longest path first; the caller
    #   keeps the first element and removes the remaining ones
    def choose_to_die(files)
      # Equal-length paths are ordered by the path itself, so the survivor
      # does not depend on the order in which the files were listed.
      files.sort_by { |path| [-path.size, path] }
    end
  end

  # Stand-in for the progress bar when it is hidden: every tick is ignored.
  class DummyBar
    def increment
    end
  end

  # Collects file paths grouped by the MD5 digest of the file content.
  class Groups
    def initialize
      # raw MD5 digest => list of paths whose content has that digest
      @hashes = {}
    end

    # Hashes the file and files its path under the resulting digest.
    # Files are considered equal on the basis of the MD5 digest alone.
    #
    # @param path [String] path of the file to read
    # @return [Array<String>, nil] the paths known for this digest, or nil
    #   when the file could not be read (the problem is printed to stderr)
    def add(path)
      begin
        # Streams the file in binary chunks instead of splitting it on line
        # ends, which has no meaning for binary content.
        digest = Digest::MD5.file(path).digest
      rescue
        $stderr.puts "problem: #{ $! }"
        return
      end
      (@hashes[digest] ||= []) << path
    end

    # @return [Array<Array>] `[digest, paths]` pairs for every digest that
    #   was seen for more than one path
    def duplicates
      @hashes.select { |_digest, group| group.size > 1 }.to_a
    end
  end

  # Scans the directories for duplicated files and either prints the groups
  # or deletes everything except one file per group.
  #
  # @param options [Hash] settings read by this method:
  #   :bar           - callable taking the number of entries, returns an
  #                    object responding to `increment`
  #   :dry           - when truthy the groups are printed and nothing is
  #                    deleted
  #   :remove_policy - object responding to `choose_to_die(paths)`; the first
  #                    path of its result is kept
  # @param paths [Array<String>] directories to scan recursively
  # @return [Array<Array>] the `[digest, paths]` pairs that were processed
  def find_duplicates(options, paths)
    groups = Groups.new
    entries = paths.flat_map { |path| Dir.glob(path + '/**/*') }

    bar = options[:bar].call(entries.size)

    # A file reached twice (a directory given twice, overlapping directories
    # or a symlink) must be counted once. Otherwise it would be reported as
    # its own duplicate and the only copy would be deleted.
    seen = {}
    entries.each do |entry|
      bar.increment
      next if File.directory?(entry)
      identity = begin
        File.realpath(entry)
      rescue SystemCallError
        # e.g. a dangling symlink: fall back to the path as listed
        entry
      end
      next if seen.key?(identity)
      seen[identity] = true
      groups.add(entry)
    end

    groups.duplicates.each do |_digest, group|
      survivor, *death_queue = options[:remove_policy].choose_to_die(group)
      if options[:dry]
        puts [survivor, *death_queue.map { |doomed| " " + doomed }].join("\n")
      else
        death_queue.each do |doomed|
          begin
            File.delete(doomed)
          rescue SystemCallError => e
            # Report and carry on, as Groups#add does, so that one
            # undeletable file does not abort the remaining groups.
            $stderr.puts "problem: #{ e }"
          end
        end
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: find-duplicates
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Daneel S. Yaitskov
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-12-07 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: ruby-progressbar
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: bundler
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
description: Find and remove duplicated files
|
47
|
+
email: rtfm.rtfm.rtfm@gmail.com
|
48
|
+
executables:
|
49
|
+
- find-duplicates
|
50
|
+
extensions: []
|
51
|
+
extra_rdoc_files: []
|
52
|
+
files:
|
53
|
+
- bin/find-duplicates
|
54
|
+
- lib/find-duplicates.rb
|
55
|
+
- find-duplicates.gemspec
|
56
|
+
homepage: https://github.com/yaitskov/find-duplicates
|
57
|
+
licenses:
|
58
|
+
- Apache 2
|
59
|
+
post_install_message:
|
60
|
+
rdoc_options: []
|
61
|
+
require_paths:
|
62
|
+
- bin
|
63
|
+
- lib
|
64
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
71
|
+
none: false
|
72
|
+
requirements:
|
73
|
+
- - ! '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
requirements: []
|
77
|
+
rubyforge_project:
|
78
|
+
rubygems_version: 1.8.23
|
79
|
+
signing_key:
|
80
|
+
specification_version: 3
|
81
|
+
summary: Find duplicated files
|
82
|
+
test_files: []
|