find-duplicates 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,80 @@
1
+ #!/usr/bin/ruby
2
+
3
+ # Copyright 2013 Daneel S. Yaitskov
4
+
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ # This tool is written for one specific use case.
18
+ # There are 2 folders containing relatively similar files
19
+ # but these files are located at different paths.
20
+ # And you need to merge the two folder structures
21
+ # without space overhead.
22
+
23
+ # Solution. Merge file sets with cp or rsync.
24
+ # Start this script over 1 directory structure.
25
+ # Script builds hashes for all files.
26
+ # Groups paths of files with the same hash.
27
+
28
+ require 'optparse'
29
+ require 'ruby-progressbar'
30
+ require 'find-duplicates'
31
+ include FindDuplicates
32
+
33
# Factory for the visible progress bar; it receives the number of steps
# the bar has to cover.
hashing_bar = lambda do |size|
  ProgressBar.create(:title => "Hashing",
                     :format => '%t: %e %P |%b%i|',
                     :total => size)
end

# Settings in effect when no command-line switch overrides them:
# show a progress bar, only report duplicates (dry run), and use the
# PolicyLeaveLongest removal policy.
options = {
  :bar => hashing_bar,
  :dry => true,
  :remove_policy => PolicyLeaveLongest.new
}
40
+
41
# Parse the command line. Switches update `options`; the arguments left
# over after parsing are the directories to scan. Any problem (unknown
# switch, no directory given, argument that is not a directory) is
# reported on stderr and ends the process with exit status 1.
paths = []
begin
  OptionParser.new do |opts|
    opts.banner = 'Usage: find-duplicates.rb [ options ] <path-to-dir>'
    # Every description string is printed as one line of the help text,
    # so a long description is passed as several strings instead of one
    # literal with an embedded newline and source indentation.
    opts.on('-d', '--dry',
            'default. dry run. just show groups of duplicated files',
            'and which one will be left') do
      options[:dry] = true
    end
    opts.on('-B', '--no-bar',
            'hide progress bar') do
      options[:bar] = lambda { |size| DummyBar.new }
    end
    opts.on('-r', '--run',
            'delete duplicates by the specified policy') do
      options[:dry] = false
    end
    opts.on('-l', '--longest',
            'policy leaves a file with longest name among duplicates') do
      options[:remove_policy] = PolicyLeaveLongest.new
    end
  end.parse!

  raise "there isn't any path given" if ARGV.empty?

  paths = ARGV.map do |path|
    raise "path '#{ path }' isn't a directory" unless File.directory?(path)
    # Drop trailing slashes (and any spaces following them) so that a
    # sub-path can be appended with a single '/'.
    path.sub(/\/+ *$/, '')
  end
rescue => e
  $stderr.puts "Error #{ e }"
  exit 1
end
76
+
77
# Scan the given directories using the settings chosen above.
# Both values are passed as separate arguments (options first, then paths).
find_duplicates(options, paths)
78
+
79
+
80
+
@@ -0,0 +1,19 @@
1
# Gem specification for find-duplicates: one executable plus one library file.
Gem::Specification.new do |s|
  s.name        = 'find-duplicates'
  s.version     = '0.0.1'
  s.date        = '2013-12-07'
  s.summary     = 'Find duplicated files'
  s.description = 'Find and remove duplicated files'
  s.authors     = [ 'Daneel S. Yaitskov' ]
  s.email       = 'rtfm.rtfm.rtfm@gmail.com'
  s.files       = [ 'bin/find-duplicates',
                    'lib/find-duplicates.rb',
                    'find-duplicates.gemspec'
                  ]
  # Only lib/ contains code meant to be loaded with `require`; the script in
  # bin/ is published through `executable` (bindir defaults to "bin").
  s.require_paths = [ 'lib' ]
  s.executable  = 'find-duplicates'
  # SPDX license identifier, the form RubyGems expects for license values.
  s.license     = 'Apache-2.0'
  s.homepage    = 'https://github.com/yaitskov/find-duplicates'
  s.add_dependency "ruby-progressbar"
  # NOTE(review): nothing in the listed files appears to require bundler at
  # runtime -- confirm whether this dependency is really needed.
  s.add_dependency "bundler"
end
@@ -0,0 +1,80 @@
1
+ # Copyright 2013 Daneel S. Yaitskov
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ require 'digest/md5'
16
+
17
# Detection and removal of files that have identical content.
#
# Content is compared by MD5 digest only; files are not compared byte by
# byte before one of them is deleted.
module FindDuplicates
  # Removal policy under which the file with the longest path survives.
  class PolicyLeaveLongest
    # Orders a group of duplicate paths so that the one to keep comes first.
    #
    # @param files [Array<String>] paths of files with identical content
    # @return [Array<String>] a new array, longest path first. The first
    #   element is the survivor, every following element is to be removed.
    #   Paths of equal length are ordered alphabetically so that the
    #   survivor does not depend on the order of the input.
    def choose_to_die(files)
      files.sort_by { |path| [-path.size, path] }
    end
  end

  # Stand-in for a progress bar that displays nothing.
  class DummyBar
    # Accepts the progress notification and ignores it.
    def increment
    end
  end

  # Collects file paths and groups them by the MD5 digest of their content.
  class Groups
    def initialize
      # raw MD5 digest => paths of the files having that digest
      @hashes = {}
    end

    # Hashes the file at +path+ and records the path under its digest.
    # A file that cannot be read is reported on stderr and left out.
    #
    # @param path [String] file to hash
    # @return [Array<String>, nil] the group the path was added to,
    #   or nil when the file could not be read
    def add(path)
      digest = Digest::MD5.file(path).digest
      (@hashes[digest] ||= []) << path
    rescue StandardError => e
      $stderr.puts "problem: #{ e }"
      nil
    end

    # @return [Array<Array>] one [digest, paths] pair for every digest
    #   shared by more than one recorded path
    def duplicates
      @hashes.find_all { |_digest, paths| paths.size > 1 }
    end
  end

  # Finds files with identical content below the given directories and
  # either lists them or deletes all but one file of every group.
  #
  # Only regular files are considered, and each file is considered once
  # even when it is reachable through several of the given directories.
  # Otherwise a file could be grouped with itself (or with a symlink
  # pointing at it) and its only copy would be deleted.
  #
  # @param options [Hash] settings:
  #   :bar           - callable receiving the number of files to hash and
  #                    returning an object responding to +increment+
  #   :dry           - when true, print each group (survivor first, the
  #                    files to be removed below it prefixed with a space)
  #                    instead of deleting anything
  #   :remove_policy - object responding to +choose_to_die(paths)+, which
  #                    returns the paths with the survivor first
  # @param paths [Array<String>] directories to scan recursively
  # @return [Array<Array>] the [digest, paths] pairs of all duplicate groups
  def find_duplicates(options, paths)
    files = regular_files_under(paths)
    bar = options[:bar].call files.size

    groups = Groups.new
    files.each do |file|
      bar.increment
      groups.add file
    end

    groups.duplicates.each do |_digest, group|
      survivor, *doomed = options[:remove_policy].choose_to_die(group)
      if options[:dry]
        puts(([survivor] + doomed.map { |s| " " + s }).join("\n"))
      else
        doomed.each { |file| delete_duplicate(file) }
      end
    end
  end

  private

  # Lists the regular files below +paths+, each physical path once.
  # Entries whose name starts with a dot are not matched by the glob
  # pattern and are therefore not examined.
  def regular_files_under(paths)
    seen = {}
    candidates = paths.map { |path| Dir.glob(escape_glob(path) + '/**/*') }.flatten
    candidates.select do |file|
      # Symlinks are skipped: deleting their target would leave them dangling.
      next false unless File.file?(file) && !File.symlink?(file)
      begin
        real = File.realpath(file)
      rescue SystemCallError
        # The file disappeared after it was listed.
        next false
      end
      next false if seen.key?(real)
      seen[real] = true
    end
  end

  # Escapes glob metacharacters so that a directory name is taken literally.
  def escape_glob(path)
    path.gsub(/[\[\]{}*?\\]/) { |char| "\\" + char }
  end

  # Deletes one duplicate. A failure is reported on stderr, in the same way
  # as a hashing failure, and does not stop the remaining deletions.
  def delete_duplicate(file)
    File.delete file
  rescue SystemCallError => e
    $stderr.puts "problem: #{ e }"
  end
end
metadata ADDED
@@ -0,0 +1,82 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: find-duplicates
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Daneel S. Yaitskov
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-12-07 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: ruby-progressbar
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: bundler
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ description: Find and remove duplicated files
47
+ email: rtfm.rtfm.rtfm@gmail.com
48
+ executables:
49
+ - find-duplicates
50
+ extensions: []
51
+ extra_rdoc_files: []
52
+ files:
53
+ - bin/find-duplicates
54
+ - lib/find-duplicates.rb
55
+ - find-duplicates.gemspec
56
+ homepage: https://github.com/yaitskov/find-duplicates
57
+ licenses:
58
+ - Apache 2
59
+ post_install_message:
60
+ rdoc_options: []
61
+ require_paths:
62
+ - bin
63
+ - lib
64
+ required_ruby_version: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ required_rubygems_version: !ruby/object:Gem::Requirement
71
+ none: false
72
+ requirements:
73
+ - - ! '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ requirements: []
77
+ rubyforge_project:
78
+ rubygems_version: 1.8.23
79
+ signing_key:
80
+ specification_version: 3
81
+ summary: Find duplicated files
82
+ test_files: []