right_data 0.5.25 → 0.7.5
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +1 -0
- data/lib/main.rb +97 -48
- data/lib/right_data.rb +5 -0
- metadata +4 -4
data/README.rdoc
CHANGED
@@ -22,6 +22,7 @@ sudo gem install pkg/right_data-0.5.4.gem
|
|
22
22
|
|
23
23
|
echo "gem 'right_data'; require 'right_data'; RightData::prune_report('/Users/jonathan/Dropbox','/Users/jonathan/Desktop/Old')" | ruby -rrubygems
|
24
24
|
|
25
|
+
echo "gem 'right_data'; require 'right_data'; RightData::prune_image_report('/Users/jonathan/Dropbox','/Users/jonathan/Desktop/Old')" | ruby -rrubygems
|
25
26
|
|
26
27
|
== Copyright
|
27
28
|
|
data/lib/main.rb
CHANGED
@@ -6,44 +6,6 @@ require 'escape'
|
|
6
6
|
|
7
7
|
$KCODE = 'UTF-8' # only used when encoding is not specified.
|
8
8
|
|
9
|
-
# crawler = Index.new
|
10
|
-
# crawler.crawl
|
11
|
-
|
12
|
-
=begin
|
13
|
-
#!/usr/bin/ruby
|
14
|
-
# find_duplicates.rb
|
15
|
-
|
16
|
-
require 'find'
|
17
|
-
require 'digest/md5'
|
18
|
-
|
19
|
-
def each_set_of_duplicates(*paths)
|
20
|
-
sizes = {}
|
21
|
-
Find.find(*paths) do |f|
|
22
|
-
(sizes[File.size(f)] ||= []) << f if File.file? f
|
23
|
-
end
|
24
|
-
sizes.each do |size, files|
|
25
|
-
next unless files.size > 1
|
26
|
-
md5s = {}
|
27
|
-
files.each do |f|
|
28
|
-
digest = Digest::MD5.hexdigest(File.read(f))
|
29
|
-
(md5s[digest] ||= []) << f
|
30
|
-
end
|
31
|
-
md5s.each { |sum, files| yield files if files.size > 1 }
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
each_set_of_
|
36
|
-
duplicates(*ARGV) do |f|
|
37
|
-
puts "
|
38
|
-
Duplicates: #{f.join(", ")}"
|
39
|
-
end
|
40
|
-
=end
|
41
|
-
|
42
|
-
# http://codeidol.com/other/rubyckbk/System-Administration/Finding-Duplicate-Files/
|
43
|
-
|
44
|
-
#!/usr/bin/ruby
|
45
|
-
# find_duplicates2.rb
|
46
|
-
|
47
9
|
require 'find'
|
48
10
|
|
49
11
|
module RightData
|
@@ -58,6 +20,21 @@ module RightData
|
|
58
20
|
File.basename(f).downcase =~ /\.swp$/
|
59
21
|
end
|
60
22
|
|
23
|
+
# Is this a picture? Decided purely from the file extension, ignoring case.
# Pictures are later compared with ImageMagick's compare tool.
#
# Only a genuine extension counts: a file that is merely *named* "jpg", or a
# dotfile such as ".png", is not treated as an image.
#
# f - path (String) of the file to classify.
#
# Returns true for jpg, jpeg, gif, bmp and png files, false otherwise.
def self.is_visual_media?(f)
  # File.extname yields "" when there is no extension, whereas
  # split(".").last would hand back the whole basename.
  ext = File.extname(f).downcase.sub(/\A\./, "")
  ["jpg", "jpeg", "gif", "bmp", "png"].include?(ext)
end
|
28
|
+
|
29
|
+
require 'open3'

# Reports whether two image files contain identical pixels, using
# ImageMagick's `compare -metric AE` (absolute error = number of differing
# pixels).
#
# a, b - paths (String) of the two files to compare.
#
# Returns false straight away unless both paths look like images
# (is_visual_media?), false when compare cannot be run, and true only when
# compare reports an error count of 0.
#
# An in-process alternative would be RMagick, e.g.
#   rmagick1.signature <=> rmagick2.signature
#   rmagick1.compare_channel(rmagick2, MeanAbsoluteErrorMetric).last == 0
def self.identical_images?(a,b)
  return false unless self.is_visual_media?(a) && self.is_visual_media?(b)

  # Pass the arguments as an array so no shell is involved: file names with
  # quotes, $(...) or backticks can neither break nor hijack the command.
  argv = ["compare", "-metric", "AE", a, b, "/dev/null"]
  puts "Executing comparison: #{argv.join(' ')}" # display only

  begin
    # compare prints the metric on stderr, not stdout, so stderr is the
    # stream that has to be captured.
    _stdout, metric, _status = Open3.capture3(*argv)
  rescue SystemCallError => e
    # compare is missing or not executable: treat the pair as different
    # rather than aborting the whole scan.
    warn "# Could not run ImageMagick compare: #{e.message}"
    return false
  end

  # Some ImageMagick versions append a normalised value ("0 (0)"), so only
  # the first token is inspected.
  metric.to_s.split.first == "0"
end
|
37
|
+
|
61
38
|
def self.each_set_of_duplicates(*paths, &block)
|
62
39
|
sizes = Hash.new {|h, k| h[k] = [] }
|
63
40
|
Find.find(*paths) { |f| sizes[File.size(f)] << f if File.file? f }
|
@@ -101,19 +78,29 @@ module RightData
|
|
101
78
|
return possible_duplicates
|
102
79
|
end
|
103
80
|
|
81
|
+
# Walks every path in +paths+ and groups the regular files that are not
# rejected by ignore_test under their lower-cased basename.
#
# paths - one or more directories/files handed to Find.find.
#
# Prints a one-line summary. Note that the number printed counts every path
# Find visits (directories and ignored files included), not only the files
# that ended up in the index.
#
# Returns a Hash mapping lower-cased basename => Array of full paths.
def self.index_by_name(*paths)
  visited = 0
  by_name = Hash.new { |hash, key| hash[key] = [] }
  Find.find(*paths) do |path|
    if File.file?(path) && !ignore_test(path)
      by_name[File.basename(path).downcase] << path
    end
    visited += 1
  end
  puts "# Indexed #{visited} files by name."
  by_name
end
|
91
|
+
|
104
92
|
def self.index_by_size(*paths)
|
105
93
|
sizes = Hash.new {|h, k| h[k] = [] }
|
106
94
|
count = 0
|
107
95
|
Find.find(*paths) { |f|
|
108
|
-
|
96
|
+
sizes[File.size(f)] << f if File.file?(f) && !ignore_test(f)
|
109
97
|
count += 1
|
110
98
|
}
|
111
|
-
puts "# Indexed #{count} files."
|
99
|
+
puts "# Indexed #{count} files by size."
|
112
100
|
sizes
|
113
101
|
end
|
114
102
|
|
115
|
-
def self.cache_not_working_on_write(master)
|
116
|
-
master_cache = File.join(master,".rightPruneCache")
|
103
|
+
def self.cache_not_working_on_write(master, master_cache, indexing_function)
|
117
104
|
if File.exist?(master_cache)
|
118
105
|
puts "# Master cache FOUND at #{master_cache}."
|
119
106
|
master_index = File.open(master_cache) do |f|
|
@@ -121,7 +108,7 @@ module RightData
|
|
121
108
|
end
|
122
109
|
else
|
123
110
|
puts "# Master cache not found at #{master_cache}."
|
124
|
-
master_index =
|
111
|
+
master_index = indexing_function.call(master)
|
125
112
|
puts "# Writing #{master_cache}."
|
126
113
|
File.open(master_cache, "w") do |f|
|
127
114
|
YAML.dump(master_index, f)
|
@@ -164,7 +151,16 @@ module RightData
|
|
164
151
|
end
|
165
152
|
end
|
166
153
|
|
167
|
-
def self.
|
154
|
+
# Finds the master files that are the same picture as +file_to_check+.
#
# master_index  - Hash mapping lower-cased basename => Array of master paths.
# file_to_check - path (String) of the candidate file.
#
# Candidates are the master files sharing the candidate's lower-cased
# basename; each one is confirmed with identical_images?.
#
# Returns an Array of matching master paths; empty for zero-byte files and
# for names that are absent from the index.
def self.check_file_in_image_index(master_index, file_to_check)
  return [] if File.size(file_to_check) == 0 # Ignore empty files

  key = File.basename(file_to_check).downcase
  # fetch instead of []: a freshly built index has a default block, so a
  # plain lookup would insert an empty entry for every miss and grow the
  # index while it is only being queried.
  possible_master_dups = master_index.fetch(key, nil) || []
  possible_master_dups.select { |master_file|
    self.identical_images?(master_file, file_to_check)
  }
end
|
162
|
+
|
163
|
+
def self.check_file_in_index(master_index, file_to_check)
|
168
164
|
size = File.size(file_to_check)
|
169
165
|
return [] if size == 0 # Ignore empty files
|
170
166
|
possible_master_dups = master_index[size] || []
|
@@ -201,13 +197,27 @@ module RightData
|
|
201
197
|
end
|
202
198
|
end
|
203
199
|
|
200
|
+
# Image flavour of the prune scan: the master tree is indexed by file name
# (index_by_name) and candidates are matched with check_file_in_image_index.
# Both strategies, the label "image" and the caller's block are handed on to
# scan_for_prunable_base, whose result is returned.
def self.scan_for_prunable_images(master, prune, &block)
  build_index = Proc.new do |root|
    index_by_name(root)
  end
  find_duplicates = Proc.new do |index, candidate|
    check_file_in_image_index(index, candidate)
  end
  scan_for_prunable_base(master, prune, build_index, find_duplicates, "image", &block)
end
|
205
|
+
|
206
|
+
# Size flavour of the prune scan: the master tree is indexed by file size
# (index_by_size) and candidates are matched with check_file_in_index.
# Both strategies, the label "size" and the caller's block are handed on to
# scan_for_prunable_base, whose result is returned.
def self.scan_for_prunable(master, prune, &block)
  build_index = Proc.new do |root|
    index_by_size(root)
  end
  find_duplicates = Proc.new do |index, candidate|
    check_file_in_index(index, candidate)
  end
  scan_for_prunable_base(master, prune, build_index, find_duplicates, "size", &block)
end
|
211
|
+
|
204
212
|
# tree = scan_for_prunable(master,prune) { |a,b| puts "#{b.size} : #{a}" }; nil
|
205
|
-
def self.
|
213
|
+
def self.scan_for_prunable_base(master, prune, indexing_function, check_index_function, kind, &block)
|
206
214
|
puts "# Ignoring: #{IGNORE_FILES.inspect}"
|
207
215
|
|
208
|
-
|
216
|
+
master_cache = File.join(master,".rightPruneCache-#{kind}")
|
217
|
+
master_index = cache_not_working_on_write(master, master_cache, indexing_function)
|
218
|
+
|
209
219
|
# master_index = index_by_size(master)
|
210
|
-
puts "# Found #{master_index.size} unique
|
220
|
+
puts "# Found #{master_index.size} unique #{kind}s."
|
211
221
|
|
212
222
|
# dups = check_file_in_index(master_index, "/Users/jonathan/Dropbox/2261093437_fac9fa9008_b.jpg")
|
213
223
|
|
@@ -240,7 +250,7 @@ module RightData
|
|
240
250
|
n.parent.increment_ignorable_children
|
241
251
|
else
|
242
252
|
# puts n.path
|
243
|
-
duplicates =
|
253
|
+
duplicates = check_index_function.call(master_index, n.path)
|
244
254
|
if(!duplicates.empty?)
|
245
255
|
n.duplicates = duplicates
|
246
256
|
n.parent.increment_duplicate_children
|
@@ -330,3 +340,42 @@ module RightData
|
|
330
340
|
# Found 37765 unique sizes.
|
331
341
|
# After check. Found 1240 / 1940 dups in master.
|
332
342
|
end
|
343
|
+
|
344
|
+
# crawler = Index.new
|
345
|
+
# crawler.crawl
|
346
|
+
|
347
|
+
=begin
|
348
|
+
#!/usr/bin/ruby
|
349
|
+
# find_duplicates.rb
|
350
|
+
|
351
|
+
require 'find'
|
352
|
+
require 'digest/md5'
|
353
|
+
|
354
|
+
def each_set_of_duplicates(*paths)
|
355
|
+
sizes = {}
|
356
|
+
Find.find(*paths) do |f|
|
357
|
+
(sizes[File.size(f)] ||= []) << f if File.file? f
|
358
|
+
end
|
359
|
+
sizes.each do |size, files|
|
360
|
+
next unless files.size > 1
|
361
|
+
md5s = {}
|
362
|
+
files.each do |f|
|
363
|
+
digest = Digest::MD5.hexdigest(File.read(f))
|
364
|
+
(md5s[digest] ||= []) << f
|
365
|
+
end
|
366
|
+
md5s.each { |sum, files| yield files if files.size > 1 }
|
367
|
+
end
|
368
|
+
end
|
369
|
+
|
370
|
+
each_set_of_
|
371
|
+
duplicates(*ARGV) do |f|
|
372
|
+
puts "
|
373
|
+
Duplicates: #{f.join(", ")}"
|
374
|
+
end
|
375
|
+
=end
|
376
|
+
|
377
|
+
# http://codeidol.com/other/rubyckbk/System-Administration/Finding-Duplicate-Files/
|
378
|
+
|
379
|
+
#!/usr/bin/ruby
|
380
|
+
# find_duplicates2.rb
|
381
|
+
|
data/lib/right_data.rb
CHANGED
@@ -10,6 +10,11 @@ module RightData
|
|
10
10
|
tree.report('rm -rf'); nil
|
11
11
|
end
|
12
12
|
|
13
|
+
# Runs the image prune scan of +prunable+ against +master+ and asks the
# resulting tree to report itself with the 'rm -rf' command string.
# Always returns nil.
def self.prune_image_report(master,prunable)
  RightData.scan_for_prunable_images(master, prunable).report('rm -rf')
  nil
end
|
17
|
+
|
13
18
|
# Run this in a directory that is suspected of containing self-duplicate files.
|
14
19
|
# Compare to: fdupes -r -n prunable
|
15
20
|
def self.dup_report(prunable)
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: right_data
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 9
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
+
- 7
|
8
9
|
- 5
|
9
|
-
|
10
|
-
version: 0.5.25
|
10
|
+
version: 0.7.5
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jonathan Siegel
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-08
|
18
|
+
date: 2010-10-08 00:00:00 +01:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|