right_data 0.5.25 → 0.7.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +1 -0
- data/lib/main.rb +97 -48
- data/lib/right_data.rb +5 -0
- metadata +4 -4
data/README.rdoc
CHANGED
@@ -22,6 +22,7 @@ sudo gem install pkg/right_data-0.5.4.gem
|
|
22
22
|
|
23
23
|
echo "gem 'right_data'; require 'right_data'; RightData::prune_report('/Users/jonathan/Dropbox','/Users/jonathan/Desktop/Old')" | ruby -rrubygems
|
24
24
|
|
25
|
+
echo "gem 'right_data'; require 'right_data'; RightData::prune_image_report('/Users/jonathan/Dropbox','/Users/jonathan/Desktop/Old')" | ruby -rrubygems
|
25
26
|
|
26
27
|
== Copyright
|
27
28
|
|
data/lib/main.rb
CHANGED
@@ -6,44 +6,6 @@ require 'escape'
|
|
6
6
|
|
7
7
|
$KCODE = 'UTF-8' # only used when encoding is not specified.
|
8
8
|
|
9
|
-
# crawler = Index.new
|
10
|
-
# crawler.crawl
|
11
|
-
|
12
|
-
=begin
|
13
|
-
#!/usr/bin/ruby
|
14
|
-
# find_duplicates.rb
|
15
|
-
|
16
|
-
require 'find'
|
17
|
-
require 'digest/md5'
|
18
|
-
|
19
|
-
def each_set_of_duplicates(*paths)
|
20
|
-
sizes = {}
|
21
|
-
Find.find(*paths) do |f|
|
22
|
-
(sizes[File.size(f)] ||= []) << f if File.file? f
|
23
|
-
end
|
24
|
-
sizes.each do |size, files|
|
25
|
-
next unless files.size > 1
|
26
|
-
md5s = {}
|
27
|
-
files.each do |f|
|
28
|
-
digest = Digest::MD5.hexdigest(File.read(f))
|
29
|
-
(md5s[digest] ||= []) << f
|
30
|
-
end
|
31
|
-
md5s.each { |sum, files| yield files if files.size > 1 }
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
each_set_of_
|
36
|
-
duplicates(*ARGV) do |f|
|
37
|
-
puts "
|
38
|
-
Duplicates: #{f.join(", ")}"
|
39
|
-
end
|
40
|
-
=end
|
41
|
-
|
42
|
-
# http://codeidol.com/other/rubyckbk/System-Administration/Finding-Duplicate-Files/
|
43
|
-
|
44
|
-
#!/usr/bin/ruby
|
45
|
-
# find_duplicates2.rb
|
46
|
-
|
47
9
|
require 'find'
|
48
10
|
|
49
11
|
module RightData
|
@@ -58,6 +20,21 @@ module RightData
|
|
58
20
|
File.basename(f).downcase =~ /\.swp$/
|
59
21
|
end
|
60
22
|
|
23
|
+
# Is this a picture? If so, we'll be using imagemagick's compare feature later on
|
24
|
+
# Tell whether a path names a picture, judged purely by its file extension.
#
# f:: path (or bare file name) to inspect; the file itself is never opened.
#
# Returns true when the extension, compared case-insensitively, is one of
# jpg, jpeg, gif, bmp or png; false otherwise.
def self.is_visual_media?(f)
  # File.extname yields "" when there is no extension, so a file that is
  # merely *named* "png" or "jpg" is no longer mistaken for an image.
  ext = File.extname(f).downcase.delete(".")
  %w[jpg jpeg gif bmp png].include?(ext)
end
|
28
|
+
|
29
|
+
# Decide whether two image files hold the same picture by shelling out to
# ImageMagick's +compare+ tool with the AE (absolute error) metric.
#
# a, b:: paths of the two files to compare.
#
# Returns false immediately unless both paths pass is_visual_media?;
# otherwise returns true only when the first token printed by +compare+
# is "0".
def self.identical_images?(a,b)
  return false unless self.is_visual_media?(a) && self.is_visual_media?(b)
  # rmagick1.signature <=> rmagick2.signature
  # rmagick1.compare_channel(rmagick2, MeanAbsoluteErrorMetric).last == 0

  # Standard library; required here so this method does not depend on the
  # file's top-level require list.
  require 'shellwords'

  # Shellwords.escape quotes every shell metacharacter (quotes, $, backticks,
  # backslashes, spaces), so a hostile file name cannot inject commands.
  # "2>&1" is needed because compare reports the metric on stderr, which
  # backticks alone do not capture.
  cmd = "compare -metric AE #{Shellwords.escape(a)} #{Shellwords.escape(b)} /dev/null 2>&1"
  puts "Executing comparison: #{cmd}"
  output = `#{cmd}`

  # Look only at the first whitespace-separated token so a trailing newline
  # or extra text after the count does not defeat the comparison.
  output.to_s.split.first == "0"
end
|
37
|
+
|
61
38
|
def self.each_set_of_duplicates(*paths, &block)
|
62
39
|
sizes = Hash.new {|h, k| h[k] = [] }
|
63
40
|
Find.find(*paths) { |f| sizes[File.size(f)] << f if File.file? f }
|
@@ -101,19 +78,29 @@ module RightData
|
|
101
78
|
return possible_duplicates
|
102
79
|
end
|
103
80
|
|
81
|
+
# Build a lookup of the files found beneath +paths+, keyed by lower-cased
# basename.
#
# paths:: one or more roots handed straight to Find.find.
#
# Only regular files that ignore_test does not reject are stored. The counter
# is bumped for every entry Find yields, so the number printed also covers
# directories and ignored entries.
#
# Returns a Hash of basename => [paths]; the Hash creates an empty Array for
# any key it is asked about.
def self.index_by_name(*paths)
  by_name = Hash.new { |hash, key| hash[key] = [] }
  visited = 0
  Find.find(*paths) do |entry|
    if File.file?(entry) && !ignore_test(entry)
      by_name[File.basename(entry).downcase] << entry
    end
    visited += 1
  end
  puts "# Indexed #{visited} files by name."
  by_name
end
|
91
|
+
|
104
92
|
# Build a lookup of the files found beneath +paths+, keyed by size in bytes.
#
# paths:: one or more roots handed straight to Find.find.
#
# Only regular files that ignore_test does not reject are stored. The counter
# is bumped for every entry Find yields, so the number printed also covers
# directories and ignored entries.
#
# Returns a Hash of size => [paths]; the Hash creates an empty Array for any
# key it is asked about.
def self.index_by_size(*paths)
  by_size = Hash.new { |hash, key| hash[key] = [] }
  visited = 0
  Find.find(*paths) do |entry|
    keep = File.file?(entry) && !ignore_test(entry)
    by_size[File.size(entry)] << entry if keep
    visited += 1
  end
  puts "# Indexed #{visited} files by size."
  by_size
end
|
114
102
|
|
115
|
-
def self.cache_not_working_on_write(master)
|
116
|
-
master_cache = File.join(master,".rightPruneCache")
|
103
|
+
def self.cache_not_working_on_write(master, master_cache, indexing_function)
|
117
104
|
if File.exist?(master_cache)
|
118
105
|
puts "# Master cache FOUND at #{master_cache}."
|
119
106
|
master_index = File.open(master_cache) do |f|
|
@@ -121,7 +108,7 @@ module RightData
|
|
121
108
|
end
|
122
109
|
else
|
123
110
|
puts "# Master cache not found at #{master_cache}."
|
124
|
-
master_index =
|
111
|
+
master_index = indexing_function.call(master)
|
125
112
|
puts "# Writing #{master_cache}."
|
126
113
|
File.open(master_cache, "w") do |f|
|
127
114
|
YAML.dump(master_index, f)
|
@@ -164,7 +151,16 @@ module RightData
|
|
164
151
|
end
|
165
152
|
end
|
166
153
|
|
167
|
-
def self.
|
154
|
+
# Find the master files that are the same image as +file_to_check+.
#
# master_index:: Hash of lower-cased basename => [master paths], as produced
#                by index_by_name.
# file_to_check:: path of the candidate file.
#
# Returns an Array of master paths for which identical_images? is true;
# empty when the candidate is a zero-byte file or no master file shares its
# basename.
def self.check_file_in_image_index(master_index, file_to_check)
  return [] if File.size(file_to_check) == 0 # Ignore empty files

  # fetch rather than []: an index built by index_by_name carries a default
  # block that stores a new Array for every key it is asked about, so a plain
  # lookup would add an empty entry to the index on every miss.
  name = File.basename(file_to_check).downcase
  candidates = master_index.fetch(name, nil) || []
  candidates.select { |master_file|
    self.identical_images?(master_file, file_to_check)
  }
end
|
162
|
+
|
163
|
+
def self.check_file_in_index(master_index, file_to_check)
|
168
164
|
size = File.size(file_to_check)
|
169
165
|
return [] if size == 0 # Ignore empty files
|
170
166
|
possible_master_dups = master_index[size] || []
|
@@ -201,13 +197,27 @@ module RightData
|
|
201
197
|
end
|
202
198
|
end
|
203
199
|
|
200
|
+
# Image flavour of the prune scan: the master tree is indexed by file name
# and candidates are matched with check_file_in_image_index.
#
# master:: root of the tree treated as the reference copy.
# prune::  root of the tree to search for duplicates of master files.
# block::  optional block, forwarded untouched to scan_for_prunable_base.
#
# Returns whatever scan_for_prunable_base returns.
def self.scan_for_prunable_images(master, prune, &block)
  name_indexer = Proc.new do |root|
    self.index_by_name(root)
  end
  image_matcher = Proc.new do |index, candidate|
    self.check_file_in_image_index(index, candidate)
  end
  self.scan_for_prunable_base(master, prune, name_indexer, image_matcher, "image", &block)
end
|
205
|
+
|
206
|
+
# Size flavour of the prune scan: the master tree is indexed by file size
# and candidates are matched with check_file_in_index.
#
# master:: root of the tree treated as the reference copy.
# prune::  root of the tree to search for duplicates of master files.
# block::  optional block, forwarded untouched to scan_for_prunable_base.
#
# Returns whatever scan_for_prunable_base returns.
def self.scan_for_prunable(master, prune, &block)
  size_indexer = Proc.new do |root|
    self.index_by_size(root)
  end
  content_matcher = Proc.new do |index, candidate|
    self.check_file_in_index(index, candidate)
  end
  scan_for_prunable_base(master, prune, size_indexer, content_matcher, "size", &block)
end
|
211
|
+
|
204
212
|
# tree = scan_for_prunable(master,prune) { |a,b| puts "#{b.size} : #{a}" }; nil
|
205
|
-
def self.
|
213
|
+
def self.scan_for_prunable_base(master, prune, indexing_function, check_index_function, kind, &block)
|
206
214
|
puts "# Ignoring: #{IGNORE_FILES.inspect}"
|
207
215
|
|
208
|
-
|
216
|
+
master_cache = File.join(master,".rightPruneCache-#{kind}")
|
217
|
+
master_index = cache_not_working_on_write(master, master_cache, indexing_function)
|
218
|
+
|
209
219
|
# master_index = index_by_size(master)
|
210
|
-
puts "# Found #{master_index.size} unique
|
220
|
+
puts "# Found #{master_index.size} unique #{kind}s."
|
211
221
|
|
212
222
|
# dups = check_file_in_index(master_index, "/Users/jonathan/Dropbox/2261093437_fac9fa9008_b.jpg")
|
213
223
|
|
@@ -240,7 +250,7 @@ module RightData
|
|
240
250
|
n.parent.increment_ignorable_children
|
241
251
|
else
|
242
252
|
# puts n.path
|
243
|
-
duplicates =
|
253
|
+
duplicates = check_index_function.call(master_index, n.path)
|
244
254
|
if(!duplicates.empty?)
|
245
255
|
n.duplicates = duplicates
|
246
256
|
n.parent.increment_duplicate_children
|
@@ -330,3 +340,42 @@ module RightData
|
|
330
340
|
# Found 37765 unique sizes.
|
331
341
|
# After check. Found 1240 / 1940 dups in master.
|
332
342
|
end
|
343
|
+
|
344
|
+
# crawler = Index.new
|
345
|
+
# crawler.crawl
|
346
|
+
|
347
|
+
=begin
|
348
|
+
#!/usr/bin/ruby
|
349
|
+
# find_duplicates.rb
|
350
|
+
|
351
|
+
require 'find'
|
352
|
+
require 'digest/md5'
|
353
|
+
|
354
|
+
def each_set_of_duplicates(*paths)
|
355
|
+
sizes = {}
|
356
|
+
Find.find(*paths) do |f|
|
357
|
+
(sizes[File.size(f)] ||= []) << f if File.file? f
|
358
|
+
end
|
359
|
+
sizes.each do |size, files|
|
360
|
+
next unless files.size > 1
|
361
|
+
md5s = {}
|
362
|
+
files.each do |f|
|
363
|
+
digest = Digest::MD5.hexdigest(File.read(f))
|
364
|
+
(md5s[digest] ||= []) << f
|
365
|
+
end
|
366
|
+
md5s.each { |sum, files| yield files if files.size > 1 }
|
367
|
+
end
|
368
|
+
end
|
369
|
+
|
370
|
+
each_set_of_
|
371
|
+
duplicates(*ARGV) do |f|
|
372
|
+
puts "
|
373
|
+
Duplicates: #{f.join(", ")}"
|
374
|
+
end
|
375
|
+
=end
|
376
|
+
|
377
|
+
# http://codeidol.com/other/rubyckbk/System-Administration/Finding-Duplicate-Files/
|
378
|
+
|
379
|
+
#!/usr/bin/ruby
|
380
|
+
# find_duplicates2.rb
|
381
|
+
|
data/lib/right_data.rb
CHANGED
@@ -10,6 +10,11 @@ module RightData
|
|
10
10
|
tree.report('rm -rf'); nil
|
11
11
|
end
|
12
12
|
|
13
|
+
# Scan +prunable+ for images that already exist under +master+ and have the
# resulting tree print its report, passing 'rm -rf' as the command string.
#
# master::   root of the tree treated as the reference copy.
# prunable:: root of the tree to search for duplicate images.
#
# Always returns nil.
def self.prune_image_report(master, prunable)
  RightData.scan_for_prunable_images(master, prunable).report('rm -rf')
  nil
end
|
17
|
+
|
13
18
|
# Run this in a directory that is suspected of containing self-duplicate files.
|
14
19
|
# Compare to: fdupes -r -n prunable
|
15
20
|
def self.dup_report(prunable)
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: right_data
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 9
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
+
- 7
|
8
9
|
- 5
|
9
|
-
|
10
|
-
version: 0.5.25
|
10
|
+
version: 0.7.5
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Jonathan Siegel
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-08
|
18
|
+
date: 2010-10-08 00:00:00 +01:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|