right_data 0.5.25 → 0.7.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4)
  1. data/README.rdoc +1 -0
  2. data/lib/main.rb +97 -48
  3. data/lib/right_data.rb +5 -0
  4. metadata +4 -4
data/README.rdoc CHANGED
@@ -22,6 +22,7 @@ sudo gem install pkg/right_data-0.5.4.gem
22
22
 
23
23
  echo "gem 'right_data'; require 'right_data'; RightData::prune_report('/Users/jonathan/Dropbox','/Users/jonathan/Desktop/Old')" | ruby -rrubygems
24
24
 
25
+ echo "gem 'right_data'; require 'right_data'; RightData::prune_image_report('/Users/jonathan/Dropbox','/Users/jonathan/Desktop/Old')" | ruby -rrubygems
25
26
 
26
27
  == Copyright
27
28
 
data/lib/main.rb CHANGED
@@ -6,44 +6,6 @@ require 'escape'
6
6
 
7
7
  $KCODE = 'UTF-8' # only used when encoding is not specified.
8
8
 
9
- # crawler = Index.new
10
- # crawler.crawl
11
-
12
- =begin
13
- #!/usr/bin/ruby
14
- # find_duplicates.rb
15
-
16
- require 'find'
17
- require 'digest/md5'
18
-
19
- def each_set_of_duplicates(*paths)
20
- sizes = {}
21
- Find.find(*paths) do |f|
22
- (sizes[File.size(f)] ||= []) << f if File.file? f
23
- end
24
- sizes.each do |size, files|
25
- next unless files.size > 1
26
- md5s = {}
27
- files.each do |f|
28
- digest = Digest::MD5.hexdigest(File.read(f))
29
- (md5s[digest] ||= []) << f
30
- end
31
- md5s.each { |sum, files| yield files if files.size > 1 }
32
- end
33
- end
34
-
35
- each_set_of_
36
- duplicates(*ARGV) do |f|
37
- puts "
38
- Duplicates: #{f.join(", ")}"
39
- end
40
- =end
41
-
42
- # http://codeidol.com/other/rubyckbk/System-Administration/Finding-Duplicate-Files/
43
-
44
- #!/usr/bin/ruby
45
- # find_duplicates2.rb
46
-
47
9
  require 'find'
48
10
 
49
11
  module RightData
@@ -58,6 +20,21 @@ module RightData
58
20
  File.basename(f).downcase =~ /\.swp$/
59
21
  end
60
22
 
23
+ # Is this a picture? If so, we'll be using imagemagick's compare feature later on
24
+ def self.is_visual_media?(f)
25
+ ext = File.basename(f).downcase.split(".").last
26
+ ["jpg","jpeg","gif","bmp","png"].include?(ext)
27
+ end
28
+
29
+ def self.identical_images?(a,b)
30
+ return false unless self.is_visual_media?(a) && self.is_visual_media?(b)
31
+ # rmagick1.signature <=> rmagick2.signature
32
+ # rmagick1.compare_channel(rmagick2, MeanAbsoluteErrorMetric).last == 0
33
+ cmd = "compare -metric AE \"#{a.gsub(/\"/,'\"')}\" \"#{b.gsub(/\"/,'\"')}\" /dev/null"
34
+ puts "Executing comparison: #{cmd}"
35
+ "0" == `#{cmd}`
36
+ end
37
+
61
38
  def self.each_set_of_duplicates(*paths, &block)
62
39
  sizes = Hash.new {|h, k| h[k] = [] }
63
40
  Find.find(*paths) { |f| sizes[File.size(f)] << f if File.file? f }
@@ -101,19 +78,29 @@ module RightData
101
78
  return possible_duplicates
102
79
  end
103
80
 
81
+ def self.index_by_name(*paths)
82
+ names = Hash.new {|h, k| h[k] = [] }
83
+ count = 0
84
+ Find.find(*paths) { |f|
85
+ names[File.basename(f).downcase] << f if File.file?(f) && !ignore_test(f)
86
+ count += 1
87
+ }
88
+ puts "# Indexed #{count} files by name."
89
+ names
90
+ end
91
+
104
92
  def self.index_by_size(*paths)
105
93
  sizes = Hash.new {|h, k| h[k] = [] }
106
94
  count = 0
107
95
  Find.find(*paths) { |f|
108
- sizes[File.size(f)] << f if File.file?(f) && !ignore_test(f)
96
+ sizes[File.size(f)] << f if File.file?(f) && !ignore_test(f)
109
97
  count += 1
110
98
  }
111
- puts "# Indexed #{count} files."
99
+ puts "# Indexed #{count} files by size."
112
100
  sizes
113
101
  end
114
102
 
115
- def self.cache_not_working_on_write(master)
116
- master_cache = File.join(master,".rightPruneCache")
103
+ def self.cache_not_working_on_write(master, master_cache, indexing_function)
117
104
  if File.exist?(master_cache)
118
105
  puts "# Master cache FOUND at #{master_cache}."
119
106
  master_index = File.open(master_cache) do |f|
@@ -121,7 +108,7 @@ module RightData
121
108
  end
122
109
  else
123
110
  puts "# Master cache not found at #{master_cache}."
124
- master_index = index_by_size(master)
111
+ master_index = indexing_function.call(master)
125
112
  puts "# Writing #{master_cache}."
126
113
  File.open(master_cache, "w") do |f|
127
114
  YAML.dump(master_index, f)
@@ -164,7 +151,16 @@ module RightData
164
151
  end
165
152
  end
166
153
 
167
- def self.check_file_in_index(master_index, file_to_check, &block)
154
+ def self.check_file_in_image_index(master_index, file_to_check)
155
+ size = File.size(file_to_check)
156
+ return [] if size == 0 # Ignore empty files
157
+ possible_master_dups = master_index[File.basename(file_to_check).downcase] || []
158
+ possible_master_dups.select { |master_file|
159
+ self.identical_images?(master_file,file_to_check)
160
+ }
161
+ end
162
+
163
+ def self.check_file_in_index(master_index, file_to_check)
168
164
  size = File.size(file_to_check)
169
165
  return [] if size == 0 # Ignore empty files
170
166
  possible_master_dups = master_index[size] || []
@@ -201,13 +197,27 @@ module RightData
201
197
  end
202
198
  end
203
199
 
200
+ def self.scan_for_prunable_images(master, prune, &block)
201
+ indexing_function = Proc.new { |a| self.index_by_name(a) }
202
+ check_index_function = Proc.new { |a,b| self.check_file_in_image_index(a,b) }
203
+ self.scan_for_prunable_base(master, prune, indexing_function, check_index_function, "image", &block)
204
+ end
205
+
206
+ def self.scan_for_prunable(master, prune, &block)
207
+ indexing_function = Proc.new { |a| self.index_by_size(a) }
208
+ check_index_function = Proc.new { |a,b| self.check_file_in_index(a,b) }
209
+ scan_for_prunable_base(master, prune, indexing_function, check_index_function, "size", &block)
210
+ end
211
+
204
212
  # tree = scan_for_prunable(master,prune) { |a,b| puts "#{b.size} : #{a}" }; nil
205
- def self.scan_for_prunable(master,prune, &block)
213
+ def self.scan_for_prunable_base(master, prune, indexing_function, check_index_function, kind, &block)
206
214
  puts "# Ignoring: #{IGNORE_FILES.inspect}"
207
215
 
208
- master_index = cache_not_working_on_write(master)
216
+ master_cache = File.join(master,".rightPruneCache-#{kind}")
217
+ master_index = cache_not_working_on_write(master, master_cache, indexing_function)
218
+
209
219
  # master_index = index_by_size(master)
210
- puts "# Found #{master_index.size} unique sizes."
220
+ puts "# Found #{master_index.size} unique #{kind}s."
211
221
 
212
222
  # dups = check_file_in_index(master_index, "/Users/jonathan/Dropbox/2261093437_fac9fa9008_b.jpg")
213
223
 
@@ -240,7 +250,7 @@ module RightData
240
250
  n.parent.increment_ignorable_children
241
251
  else
242
252
  # puts n.path
243
- duplicates = check_file_in_index(master_index, n.path)
253
+ duplicates = check_index_function.call(master_index, n.path)
244
254
  if(!duplicates.empty?)
245
255
  n.duplicates = duplicates
246
256
  n.parent.increment_duplicate_children
@@ -330,3 +340,42 @@ module RightData
330
340
  # Found 37765 unique sizes.
331
341
  # After check. Found 1240 / 1940 dups in master.
332
342
  end
343
+
344
+ # crawler = Index.new
345
+ # crawler.crawl
346
+
347
+ =begin
348
+ #!/usr/bin/ruby
349
+ # find_duplicates.rb
350
+
351
+ require 'find'
352
+ require 'digest/md5'
353
+
354
+ def each_set_of_duplicates(*paths)
355
+ sizes = {}
356
+ Find.find(*paths) do |f|
357
+ (sizes[File.size(f)] ||= []) << f if File.file? f
358
+ end
359
+ sizes.each do |size, files|
360
+ next unless files.size > 1
361
+ md5s = {}
362
+ files.each do |f|
363
+ digest = Digest::MD5.hexdigest(File.read(f))
364
+ (md5s[digest] ||= []) << f
365
+ end
366
+ md5s.each { |sum, files| yield files if files.size > 1 }
367
+ end
368
+ end
369
+
370
+ each_set_of_
371
+ duplicates(*ARGV) do |f|
372
+ puts "
373
+ Duplicates: #{f.join(", ")}"
374
+ end
375
+ =end
376
+
377
+ # http://codeidol.com/other/rubyckbk/System-Administration/Finding-Duplicate-Files/
378
+
379
+ #!/usr/bin/ruby
380
+ # find_duplicates2.rb
381
+
data/lib/right_data.rb CHANGED
@@ -10,6 +10,11 @@ module RightData
10
10
  tree.report('rm -rf'); nil
11
11
  end
12
12
 
13
+ def self.prune_image_report(master,prunable)
14
+ tree = RightData::scan_for_prunable_images(master,prunable)
15
+ tree.report('rm -rf'); nil
16
+ end
17
+
13
18
  # Run this in a directory that is suspected of containing self-duplicate files.
14
19
  # Compare to: fdupes -r -n prunable
15
20
  def self.dup_report(prunable)
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: right_data
3
3
  version: !ruby/object:Gem::Version
4
- hash: 57
4
+ hash: 9
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
+ - 7
8
9
  - 5
9
- - 25
10
- version: 0.5.25
10
+ version: 0.7.5
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jonathan Siegel
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-08-31 00:00:00 +01:00
18
+ date: 2010-10-08 00:00:00 +01:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency