right_data 0.5.25 → 0.7.5

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
Files changed (4)
  1. data/README.rdoc +1 -0
  2. data/lib/main.rb +97 -48
  3. data/lib/right_data.rb +5 -0
  4. metadata +4 -4
data/README.rdoc CHANGED
@@ -22,6 +22,7 @@ sudo gem install pkg/right_data-0.5.4.gem
22
22
 
23
23
  echo "gem 'right_data'; require 'right_data'; RightData::prune_report('/Users/jonathan/Dropbox','/Users/jonathan/Desktop/Old')" | ruby -rrubygems
24
24
 
25
+ echo "gem 'right_data'; require 'right_data'; RightData::prune_image_report('/Users/jonathan/Dropbox','/Users/jonathan/Desktop/Old')" | ruby -rrubygems
25
26
 
26
27
  == Copyright
27
28
 
data/lib/main.rb CHANGED
@@ -6,44 +6,6 @@ require 'escape'
6
6
 
7
7
  $KCODE = 'UTF-8' # only used when encoding is not specified.
8
8
 
9
- # crawler = Index.new
10
- # crawler.crawl
11
-
12
- =begin
13
- #!/usr/bin/ruby
14
- # find_duplicates.rb
15
-
16
- require 'find'
17
- require 'digest/md5'
18
-
19
- def each_set_of_duplicates(*paths)
20
- sizes = {}
21
- Find.find(*paths) do |f|
22
- (sizes[File.size(f)] ||= []) << f if File.file? f
23
- end
24
- sizes.each do |size, files|
25
- next unless files.size > 1
26
- md5s = {}
27
- files.each do |f|
28
- digest = Digest::MD5.hexdigest(File.read(f))
29
- (md5s[digest] ||= []) << f
30
- end
31
- md5s.each { |sum, files| yield files if files.size > 1 }
32
- end
33
- end
34
-
35
- each_set_of_
36
- duplicates(*ARGV) do |f|
37
- puts "
38
- Duplicates: #{f.join(", ")}"
39
- end
40
- =end
41
-
42
- # http://codeidol.com/other/rubyckbk/System-Administration/Finding-Duplicate-Files/
43
-
44
- #!/usr/bin/ruby
45
- # find_duplicates2.rb
46
-
47
9
  require 'find'
48
10
 
49
11
  module RightData
@@ -58,6 +20,21 @@ module RightData
58
20
  File.basename(f).downcase =~ /\.swp$/
59
21
  end
60
22
 
23
+ # Is this a picture? If so, we'll be using imagemagick's compare feature later on
24
+ def self.is_visual_media?(f)
25
+ ext = File.basename(f).downcase.split(".").last
26
+ ["jpg","jpeg","gif","bmp","png"].include?(ext)
27
+ end
28
+
29
+ def self.identical_images?(a,b)
30
+ return false unless self.is_visual_media?(a) && self.is_visual_media?(b)
31
+ # rmagick1.signature <=> rmagick2.signature
32
+ # rmagick1.compare_channel(rmagick2, MeanAbsoluteErrorMetric).last == 0
33
+ cmd = "compare -metric AE \"#{a.gsub(/\"/,'\"')}\" \"#{b.gsub(/\"/,'\"')}\" /dev/null"
34
+ puts "Executing comparison: #{cmd}"
35
+ "0" == `#{cmd}`
36
+ end
37
+
61
38
  def self.each_set_of_duplicates(*paths, &block)
62
39
  sizes = Hash.new {|h, k| h[k] = [] }
63
40
  Find.find(*paths) { |f| sizes[File.size(f)] << f if File.file? f }
@@ -101,19 +78,29 @@ module RightData
101
78
  return possible_duplicates
102
79
  end
103
80
 
81
+ def self.index_by_name(*paths)
82
+ names = Hash.new {|h, k| h[k] = [] }
83
+ count = 0
84
+ Find.find(*paths) { |f|
85
+ names[File.basename(f).downcase] << f if File.file?(f) && !ignore_test(f)
86
+ count += 1
87
+ }
88
+ puts "# Indexed #{count} files by name."
89
+ names
90
+ end
91
+
104
92
  def self.index_by_size(*paths)
105
93
  sizes = Hash.new {|h, k| h[k] = [] }
106
94
  count = 0
107
95
  Find.find(*paths) { |f|
108
- sizes[File.size(f)] << f if File.file?(f) && !ignore_test(f)
96
+ sizes[File.size(f)] << f if File.file?(f) && !ignore_test(f)
109
97
  count += 1
110
98
  }
111
- puts "# Indexed #{count} files."
99
+ puts "# Indexed #{count} files by size."
112
100
  sizes
113
101
  end
114
102
 
115
- def self.cache_not_working_on_write(master)
116
- master_cache = File.join(master,".rightPruneCache")
103
+ def self.cache_not_working_on_write(master, master_cache, indexing_function)
117
104
  if File.exist?(master_cache)
118
105
  puts "# Master cache FOUND at #{master_cache}."
119
106
  master_index = File.open(master_cache) do |f|
@@ -121,7 +108,7 @@ module RightData
121
108
  end
122
109
  else
123
110
  puts "# Master cache not found at #{master_cache}."
124
- master_index = index_by_size(master)
111
+ master_index = indexing_function.call(master)
125
112
  puts "# Writing #{master_cache}."
126
113
  File.open(master_cache, "w") do |f|
127
114
  YAML.dump(master_index, f)
@@ -164,7 +151,16 @@ module RightData
164
151
  end
165
152
  end
166
153
 
167
- def self.check_file_in_index(master_index, file_to_check, &block)
154
+ def self.check_file_in_image_index(master_index, file_to_check)
155
+ size = File.size(file_to_check)
156
+ return [] if size == 0 # Ignore empty files
157
+ possible_master_dups = master_index[File.basename(file_to_check).downcase] || []
158
+ possible_master_dups.select { |master_file|
159
+ self.identical_images?(master_file,file_to_check)
160
+ }
161
+ end
162
+
163
+ def self.check_file_in_index(master_index, file_to_check)
168
164
  size = File.size(file_to_check)
169
165
  return [] if size == 0 # Ignore empty files
170
166
  possible_master_dups = master_index[size] || []
@@ -201,13 +197,27 @@ module RightData
201
197
  end
202
198
  end
203
199
 
200
+ def self.scan_for_prunable_images(master, prune, &block)
201
+ indexing_function = Proc.new { |a| self.index_by_name(a) }
202
+ check_index_function = Proc.new { |a,b| self.check_file_in_image_index(a,b) }
203
+ self.scan_for_prunable_base(master, prune, indexing_function, check_index_function, "image", &block)
204
+ end
205
+
206
+ def self.scan_for_prunable(master, prune, &block)
207
+ indexing_function = Proc.new { |a| self.index_by_size(a) }
208
+ check_index_function = Proc.new { |a,b| self.check_file_in_index(a,b) }
209
+ scan_for_prunable_base(master, prune, indexing_function, check_index_function, "size", &block)
210
+ end
211
+
204
212
  # tree = scan_for_prunable(master,prune) { |a,b| puts "#{b.size} : #{a}" }; nil
205
- def self.scan_for_prunable(master,prune, &block)
213
+ def self.scan_for_prunable_base(master, prune, indexing_function, check_index_function, kind, &block)
206
214
  puts "# Ignoring: #{IGNORE_FILES.inspect}"
207
215
 
208
- master_index = cache_not_working_on_write(master)
216
+ master_cache = File.join(master,".rightPruneCache-#{kind}")
217
+ master_index = cache_not_working_on_write(master, master_cache, indexing_function)
218
+
209
219
  # master_index = index_by_size(master)
210
- puts "# Found #{master_index.size} unique sizes."
220
+ puts "# Found #{master_index.size} unique #{kind}s."
211
221
 
212
222
  # dups = check_file_in_index(master_index, "/Users/jonathan/Dropbox/2261093437_fac9fa9008_b.jpg")
213
223
 
@@ -240,7 +250,7 @@ module RightData
240
250
  n.parent.increment_ignorable_children
241
251
  else
242
252
  # puts n.path
243
- duplicates = check_file_in_index(master_index, n.path)
253
+ duplicates = check_index_function.call(master_index, n.path)
244
254
  if(!duplicates.empty?)
245
255
  n.duplicates = duplicates
246
256
  n.parent.increment_duplicate_children
@@ -330,3 +340,42 @@ module RightData
330
340
  # Found 37765 unique sizes.
331
341
  # After check. Found 1240 / 1940 dups in master.
332
342
  end
343
+
344
+ # crawler = Index.new
345
+ # crawler.crawl
346
+
347
+ =begin
348
+ #!/usr/bin/ruby
349
+ # find_duplicates.rb
350
+
351
+ require 'find'
352
+ require 'digest/md5'
353
+
354
+ def each_set_of_duplicates(*paths)
355
+ sizes = {}
356
+ Find.find(*paths) do |f|
357
+ (sizes[File.size(f)] ||= []) << f if File.file? f
358
+ end
359
+ sizes.each do |size, files|
360
+ next unless files.size > 1
361
+ md5s = {}
362
+ files.each do |f|
363
+ digest = Digest::MD5.hexdigest(File.read(f))
364
+ (md5s[digest] ||= []) << f
365
+ end
366
+ md5s.each { |sum, files| yield files if files.size > 1 }
367
+ end
368
+ end
369
+
370
+ each_set_of_
371
+ duplicates(*ARGV) do |f|
372
+ puts "
373
+ Duplicates: #{f.join(", ")}"
374
+ end
375
+ =end
376
+
377
+ # http://codeidol.com/other/rubyckbk/System-Administration/Finding-Duplicate-Files/
378
+
379
+ #!/usr/bin/ruby
380
+ # find_duplicates2.rb
381
+
data/lib/right_data.rb CHANGED
@@ -10,6 +10,11 @@ module RightData
10
10
  tree.report('rm -rf'); nil
11
11
  end
12
12
 
13
+ def self.prune_image_report(master,prunable)
14
+ tree = RightData::scan_for_prunable_images(master,prunable)
15
+ tree.report('rm -rf'); nil
16
+ end
17
+
13
18
  # Run this in a directory that is suspected of containing self-duplicate files.
14
19
  # Compare to: fdupes -r -n prunable
15
20
  def self.dup_report(prunable)
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: right_data
3
3
  version: !ruby/object:Gem::Version
4
- hash: 57
4
+ hash: 9
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
+ - 7
8
9
  - 5
9
- - 25
10
- version: 0.5.25
10
+ version: 0.7.5
11
11
  platform: ruby
12
12
  authors:
13
13
  - Jonathan Siegel
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-08-31 00:00:00 +01:00
18
+ date: 2010-10-08 00:00:00 +01:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency