find-duplicates 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,80 @@
1
+ #!/usr/bin/ruby
2
+
3
+ # Copyright 2013 Daneel S. Yaitskov
4
+
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ # This tool is written for exactly one use case.
18
+ # There are 2 folders containing relatively similar files,
19
+ # but these files are located under different paths.
20
+ # And you need to merge the two folder structures
21
+ # without space overhead.
22
+
23
+ # Solution. Merge file sets with cp or rsync.
24
+ # Start this script over 1 directory structure.
25
+ # Script builds hashes for all files.
26
+ # Groups paths of files with the same hash.
27
+
28
require 'optparse'
require 'ruby-progressbar'
require 'find-duplicates'
include FindDuplicates

# Command-line front end: parses the options, validates the directories given
# on the command line and hands both to FindDuplicates#find_duplicates.

# Defaults: progress bar on, dry run, keep the file with the longest path.
options = {
  # A factory rather than a bar: the total is only known after the
  # directories have been scanned, so the library calls it with the count.
  :bar => lambda { |size| ProgressBar.create(:title => "Hashing",
                                             :format => '%t: %e %P |%b%i|',
                                             :total => size) },
  :dry => true,
  :remove_policy => PolicyLeaveLongest.new
}

paths = []
begin
  OptionParser.new do |opts|
    opts.banner = 'Usage: find-duplicates.rb [ options ] <path-to-dir>'
    # Each extra string is rendered as its own line of the option's help text.
    opts.on('-d', '--dry',
            'default. dry run. just show groups duplicated files',
            'and which one will be left') do
      options[:dry] = true
    end
    opts.on('-B', '--no-bar',
            'hide progress bar') do
      options[:bar] = lambda { |_size| DummyBar.new }
    end
    opts.on('-r', '--run',
            'delete duplicates by the specified policy') do
      options[:dry] = false
    end
    opts.on('-l', '--longest',
            'policy leaves a file with longest name among duplicates') do
      options[:remove_policy] = PolicyLeaveLongest.new
    end
  end.parse!

  # parse! removes the recognised switches, so whatever is left in ARGV
  # is the list of directories to scan.
  raise "there isn't any path given" if ARGV.empty?

  paths = ARGV.map do |path|
    raise "path '#{ path }' isn't a directory" unless File.directory?(path)
    # Drop trailing slashes (and spaces after them); the library appends
    # its own '/**/*' glob suffix to every path.
    path.sub(/\/+ *$/, '')
  end
rescue => e
  # Covers both option-parsing errors and the validation errors raised above.
  $stderr.puts "Error #{ e }"
  exit 1
end

find_duplicates(options, paths)
78
+
79
+
80
+
@@ -0,0 +1,19 @@
1
# Gem specification for find-duplicates: packages the command-line
# executable from bin/ together with the library from lib/.
Gem::Specification.new do |s|
  s.name        = 'find-duplicates'
  s.version     = '0.0.1'
  s.date        = '2013-12-07'
  s.summary     = 'Find duplicated files'
  s.description = 'Find and remove duplicated files'
  s.authors     = [ 'Daneel S. Yaitskov' ]
  s.email       = 'rtfm.rtfm.rtfm@gmail.com'
  s.files       = [ 'bin/find-duplicates',
                    'lib/find-duplicates.rb',
                    'find-duplicates.gemspec'
                  ]
  # Only lib/ contains code meant to be loaded with `require`; the script in
  # bin/ is published through `executable` and must not be on the load path.
  s.require_paths = [ 'lib' ]
  s.executable  = 'find-duplicates'
  # SPDX identifier of the Apache License, Version 2.0.
  s.license     = 'Apache-2.0'
  s.homepage    = 'https://github.com/yaitskov/find-duplicates'
  s.add_dependency "ruby-progressbar"
  # NOTE(review): the packaged files do not appear to use bundler at runtime;
  # confirm before turning this into a development dependency or dropping it.
  s.add_dependency "bundler"
end
@@ -0,0 +1,80 @@
1
+ # Copyright 2013 Daneel S. Yaitskov
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ require 'digest/md5'
16
+
17
# Finds files with identical content under a set of directories and either
# lists them (dry run) or deletes all but one file of every group.
module FindDuplicates
  # Removal policy that keeps the file with the longest path in each group.
  class PolicyLeaveLongest
    # Orders a group of duplicate paths so that the one to keep comes first.
    #
    # @param files [Array<String>] paths of files with identical content
    # @return [Array<String>] a new array, longest path first; the caller
    #   keeps the first element and removes the others
    def choose_to_die(files)
      # Paths of equal length are ordered by the path itself, so the survivor
      # does not depend on the order in which the files were discovered.
      files.sort_by { |path| [-path.size, path] }
    end
  end

  # Stand-in for a progress bar: accepts #increment and does nothing.
  class DummyBar
    def increment
    end
  end

  # Collects file paths grouped by the MD5 digest of the file content.
  class Groups
    def initialize
      # digest => [paths]; every digest gets its own array on first use.
      @hashes = Hash.new { |hash, digest| hash[digest] = [] }
    end

    # Hashes the file at +path+ and records the path under its digest.
    # A file that cannot be read is reported on stderr and skipped.
    #
    # @param path [String] file to hash
    # @return [Array<String>, nil] paths recorded so far for that digest,
    #   or nil when the file could not be read
    def add(path)
      begin
        # Digest::MD5.file streams the file in binary mode.
        digest = Digest::MD5.file(path).digest
      rescue StandardError => e
        $stderr.puts "problem: #{ e }"
        return
      end
      @hashes[digest] << path
    end

    # @return [Array<Array>] [digest, paths] pairs for every digest that was
    #   recorded for more than one path
    def duplicates
      @hashes.select { |_digest, paths| paths.size > 1 }.to_a
    end
  end

  # Scans +paths+ recursively, groups regular files by content and handles
  # every group of duplicates according to +options+.
  #
  # @param options [Hash]
  #   :bar           - callable taking the number of files to hash and
  #                    returning an object that responds to #increment
  #   :dry           - when truthy only print each group: the survivor first,
  #                    then the files that would be removed, indented by a space
  #   :remove_policy - object responding to #choose_to_die(paths); the first
  #                    path it returns is kept, the rest are removed
  # @param paths [Array<String>] directories to scan (no trailing slash)
  # @return [Array<Array>] the [digest, paths] pairs of all duplicate groups
  def find_duplicates(options, paths)
    groups = Groups.new
    files = paths.flat_map { |path| Dir.glob(path + '/**/*') }
    # Only regular files are candidates. A symlink has the same content as its
    # target, so treating it as a duplicate could delete the real file and
    # leave a dangling link behind.
    files = files.select { |file| File.file?(file) && !File.symlink?(file) }
    # The same file can be reached more than once (a directory given twice,
    # nested directories, different spellings of one path). Without this it
    # would be reported, and deleted, as a duplicate of itself.
    files = files.uniq { |file| File.realpath(file) }

    bar = options[:bar].call files.size

    files.each do |file|
      bar.increment
      groups.add file
    end

    groups.duplicates.each do |_digest, group|
      survivor, *death_queue = options[:remove_policy].choose_to_die(group)
      if options[:dry]
        puts(([survivor] + death_queue.map { |doomed| " " + doomed }).join("\n"))
      else
        death_queue.each do |file|
          begin
            File.delete file
          rescue SystemCallError => e
            # Report and continue, as for unreadable files during hashing, so
            # one undeletable file does not abort the remaining groups.
            $stderr.puts "problem: #{ e }"
          end
        end
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,82 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: find-duplicates
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Daneel S. Yaitskov
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-12-07 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: ruby-progressbar
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: bundler
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ description: Find and remove duplicated files
47
+ email: rtfm.rtfm.rtfm@gmail.com
48
+ executables:
49
+ - find-duplicates
50
+ extensions: []
51
+ extra_rdoc_files: []
52
+ files:
53
+ - bin/find-duplicates
54
+ - lib/find-duplicates.rb
55
+ - find-duplicates.gemspec
56
+ homepage: https://github.com/yaitskov/find-duplicates
57
+ licenses:
58
+ - Apache 2
59
+ post_install_message:
60
+ rdoc_options: []
61
+ require_paths:
62
+ - bin
63
+ - lib
64
+ required_ruby_version: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ required_rubygems_version: !ruby/object:Gem::Requirement
71
+ none: false
72
+ requirements:
73
+ - - ! '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ requirements: []
77
+ rubyforge_project:
78
+ rubygems_version: 1.8.23
79
+ signing_key:
80
+ specification_version: 3
81
+ summary: Find duplicated files
82
+ test_files: []