right_data 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Jonathan Siegel
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,21 @@
1
+ = right_data
2
+
3
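+ RightData helpers: utilities for finding duplicate files and for reporting which files under a directory already exist in a master directory and can be pruned.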
4
+
5
+ == Note on Patches/Pull Requests
6
+
7
+ * Fork the project.
8
+ * Make your feature addition or bug fix.
9
+ * Add tests for it. This is important so I don't break it in a
10
+ future version unintentionally.
11
+ * Commit; do not mess with the Rakefile, version, or history.
12
+ (If you want to have your own version, that is fine, but bump the version in a commit by itself so that I can ignore it when I pull.)
13
+ * Send me a pull request. Bonus points for topic branches.
14
+
15
+ rake version:bump:minor
16
+ rake build
17
+ gem install pkg/...
18
+
19
+ == Copyright
20
+
21
+ Copyright (c) 2010 Jonathan Siegel. See LICENSE for details.
@@ -0,0 +1,153 @@
1
module RightData
  # A node in a lazily built tree that mirrors a directory hierarchy on disk.
  #
  # Every node stores its parent and its own path component; children are read
  # from the file system the first time they are asked for. Nodes also carry the
  # bookkeeping used when files are marked as duplicates or as ignorable:
  # per-node state (+duplicates+, +ignorable+) and per-directory counters
  # (+duplicate_children+, +ignore_children+) that propagate towards the root.
  class FileSystemItem

    attr_reader :relativePath
    attr_reader :parent

    attr_reader :ignore_children
    attr_reader :duplicate_children
    attr_accessor :duplicates
    attr_accessor :ignorable

    # path - for a root node the path as given; for a child only its basename
    #        is kept.
    # args - options hash; :parent is the owning FileSystemItem (nil or absent
    #        for a root node).
    def initialize(path, args = {})
      if args[:parent]
        @relativePath = File.basename(path)
        @parent = args[:parent]
      else
        @relativePath = path
        @parent = nil
      end
      @ignorable = false
      @duplicates = []        # duplicates recorded for this node
      @duplicate_children = 0 # number of children counted as duplicates
      @ignore_children = 0    # number of children counted as ignorable
      @children = nil
      @children_loaded = false
    end

    # Number of files at or below this node. A directory without children
    # counts as 0, any other childless node counts as 1.
    def files
      return 0 if leaf? && File.directory?(fullPath)
      return 1 if leaf?
      children.inject(0) { |sum, child| sum + child.files }
    end

    # Number of files at or below this node that are flagged ignorable.
    def ignore_files
      return 0 if leaf? && File.directory?(fullPath)
      return ignorable? ? 1 : 0 if leaf?
      children.inject(0) { |sum, child| sum + child.ignore_files }
    end

    # Number of files at or below this node that have recorded duplicates.
    def duplicate_files
      return 0 if leaf? && File.directory?(fullPath)
      return duplicate? ? 1 : 0 if leaf?
      children.inject(0) { |sum, child| sum + child.duplicate_files }
    end

    def basename; @relativePath; end

    # Memoized node for the file-system root ('/').
    def self.rootItem
      @rootItem ||= new('/', :parent => nil)
    end

    # Child nodes of a readable directory (excluding '.' and '..'), or nil for
    # anything else. The answer is computed once and then reused, including the
    # nil answer, so plain files are not re-examined on every call.
    def children
      unless @children_loaded
        dir = fullPath
        if File.directory?(dir) && File.readable?(dir)
          names = Dir.entries(dir).reject { |entry| entry == '.' || entry == '..' }
          @children = names.map { |entry| FileSystemItem.new(entry, :parent => self) }
        else
          @children = nil
        end
        @children_loaded = true
      end
      @children
    end

    def path; fullPath; end

    # Path of this node, built by joining the path components up to the root.
    def fullPath
      @parent ? File.join(@parent.fullPath, @relativePath) : @relativePath
    end

    def childAtIndex(n)
      children[n]
    end

    # Number of children, or -1 when this node has no child list at all.
    def numberOfChildren
      children.nil? ? -1 : children.size
    end

    def children?; !children.nil? && !children.empty?; end

    # A leaf is a duplicate when duplicates were recorded for it; a node with
    # children is a duplicate when every child was counted as duplicate or
    # ignorable.
    def duplicate?
      if leaf?
        !duplicates.empty?
      else
        (@ignore_children + @duplicate_children) == numberOfChildren
      end
    end

    def ignorable?; ignorable; end

    def increment_ignorable_children
      @ignore_children += 1
      update_duplicate_ignorable_status
    end

    # Once every child is accounted for, report this node to its parent as a
    # duplicate child. The root has no parent, so propagation stops there
    # (previously this raised NoMethodError on nil).
    def update_duplicate_ignorable_status
      return unless (@ignore_children + @duplicate_children) == numberOfChildren
      parent.increment_duplicate_children if parent
    end

    def increment_duplicate_children
      @duplicate_children += 1
      update_duplicate_ignorable_status
    end

    def leaf?; !children?; end

    # Depth-first walk. The block is called with each node and decides, through
    # its return value, whether the walk descends into that node's children.
    def traverse(&block)
      if block.call(self) && children?
        children.each { |child| child.traverse(&block) }
      end
    end

    # Children that are neither counted as ignorable nor as duplicates.
    # Returns 0 for nodes without a child list instead of raising.
    def other_children
      return 0 if children.nil?
      children.size - ignore_children - duplicate_children
    end

    def to_param; to_s; end
    def to_s
      "<Tree :path => #{path}, :files => #{files}>"
    end

    # Print one line per reportable node, then a summary line.
    # pre - text placed in front of every duplicate/ignorable path; a space is
    #       appended when it is not empty.
    def report(pre="")
      pre += " " if !pre.empty?
      traverse do |n|
        if n.leaf?
          msg = nil
          msg = "# dup(#{n.duplicates.count})" if n.duplicate?
          msg = "# ign" if n.ignorable?
          if msg
            puts "#{pre}#{n.path} #{msg}" # Remove the dups/igns!
          else
            puts "# #{n.path} unique"
          end
          false # Nothing below a leaf.
        elsif n.duplicate_children + n.ignore_children == n.children.size
          puts "#{pre}#{n.path} # #{n.duplicate_children} dups / #{n.ignore_children} ignores"
          false # The whole subtree is covered; no need to list its contents.
        elsif n.children.size == 0
          # Not reached at present: a node without children is a leaf.
          puts "#{pre}#{n.path} # Empty... "
          false
        else
          puts "# #{n.path} # Not #{n.duplicate_children} dup/ #{n.ignore_children} ign / #{n.other_children} other "
          true
        end
      end
      puts "# #{ignore_files} ignores, #{duplicate_files} dups of #{files} files"
    end

  end
end
data/lib/main.rb ADDED
@@ -0,0 +1,270 @@
1
+ #!/usr/bin/env ruby
2
+ require 'FileSystemItem'
3
+ # require 'unicode'
4
+
5
+ $KCODE = 'UTF-8' # only used when encoding is not specified.
6
+
7
+ # crawler = Index.new
8
+ # crawler.crawl
9
+
10
+ =begin
11
+ #!/usr/bin/ruby
12
+ # find_duplicates.rb
13
+
14
+ require 'find'
15
+ require 'digest/md5'
16
+
17
+ def each_set_of_duplicates(*paths)
18
+ sizes = {}
19
+ Find.find(*paths) do |f|
20
+ (sizes[File.size(f)] ||= []) << f if File.file? f
21
+ end
22
+ sizes.each do |size, files|
23
+ next unless files.size > 1
24
+ md5s = {}
25
+ files.each do |f|
26
+ digest = Digest::MD5.hexdigest(File.read(f))
27
+ (md5s[digest] ||= []) << f
28
+ end
29
+ md5s.each { |sum, files| yield files if files.size > 1 }
30
+ end
31
+ end
32
+
33
+ each_set_of_
34
+ duplicates(*ARGV) do |f|
35
+ puts "
36
+ Duplicates: #{f.join(", ")}"
37
+ end
38
+ =end
39
+
40
+ # http://codeidol.com/other/rubyckbk/System-Administration/Finding-Duplicate-Files/
41
+
42
+ #!/usr/bin/ruby
43
+ # find_duplicates2.rb
44
+
45
require 'find'
require 'yaml'
46
+
47
module RightData

  # Number of bytes compared per step when files are checked block by block.
  BLOCK_SIZE = 1024 * 8

  # Base names that are never indexed and are treated as ignorable when scanning.
  IGNORE_FILES = [".DS_Store", ".typeAttributes.dict", "empty-file"]

  # Find groups of byte-identical files below the given paths.
  #
  # Files are first grouped by size; groups with more than one member are then
  # narrowed down by comparing one BLOCK_SIZE chunk at a time. The block is
  # called once per confirmed group with the array of file names.
  def self.each_set_of_duplicates(*paths, &block)
    sizes = Hash.new { |h, k| h[k] = [] }
    Find.find(*paths) { |f| sizes[File.size(f)] << f if File.file?(f) }

    sizes.each_pair do |size, files|
      next unless files.size > 1
      offset = 0
      partition = [files]
      while !partition.empty? && offset <= size
        partition = eliminate_non_duplicates(partition, size, offset, &block)
        offset += BLOCK_SIZE
      end
    end
  end

  # Split every candidate set by the content of the block at +offset+.
  #
  # Sub-sets that still agree are yielded when this was the last block of the
  # files, otherwise they are returned for comparison at the next offset.
  #
  # partition - array of arrays of file names, all of length +size+ bytes.
  # Returns the array of sets that need further comparison.
  def self.eliminate_non_duplicates(partition, size, offset)
    possible_duplicates = []
    partition.each do |possible_duplicate_set|
      blocks = Hash.new { |h, k| h[k] = [] }
      possible_duplicate_set.each do |f|
        # File.open rather than Kernel#open: a file name beginning with "|"
        # would otherwise be executed as a command.
        chunk = File.open(f, 'rb') do |file|
          file.seek(offset)
          file.read(BLOCK_SIZE)
        end
        blocks[chunk || ''] << f
      end
      blocks.each_value do |files|
        next unless files.size > 1
        if offset + BLOCK_SIZE >= size
          # Last block compared: these are duplicates.
          yield files
        else
          # Still equal so far; more blocks have to be compared.
          possible_duplicates << files
        end
      end
    end
    possible_duplicates
  end

  # Build an index { size in bytes => [file names] } for all regular files
  # below the given paths, skipping IGNORE_FILES and empty files.
  # Prints how many files were put into the index.
  def self.index_by_size(*paths)
    sizes = Hash.new { |h, k| h[k] = [] }
    count = 0
    Find.find(*paths) do |f|
      next unless File.file?(f)
      next if IGNORE_FILES.include?(File.basename(f))
      size = File.size(f)
      next if size == 0 # Ignore empty files
      sizes[size] << f
      count += 1
    end
    puts "Indexed #{count} files."
    sizes
  end

  # Return the size index for +master+, using a YAML cache file
  # (".rightPruneCache" inside +master+) when it exists and creating it
  # otherwise. The cache is never invalidated.
  # NOTE(review): YAML loading of a file found on disk - only use this on
  # directories whose contents you trust.
  def self.cache_not_working_on_write(master)
    master_cache = File.join(master, ".rightPruneCache")
    if File.exist?(master_cache)
      puts "Master cache FOUND at #{master_cache}."
      master_index = File.open(master_cache) { |f| YAML::load(f) }
    else
      puts "Master cache not found at #{master_cache}."
      master_index = index_by_size(master)
      puts "Writing #{master_cache}."
      File.open(master_cache, "w") { |f| YAML.dump(master_index, f) }
      puts "Wrote #{master_cache}."
    end
    master_index
  end

  # Same contract as cache_not_working_on_write, but the cache file holds a
  # sequence of Marshal records, one [size, files] pair each.
  # NOTE(review): Marshal.load of a file found on disk - only use this on
  # directories whose contents you trust.
  def self.cache_serializing_on_write(master)
    master_cache = File.join(master, ".rightPruneCache")
    if File.exist?(master_cache)
      puts "Master cache FOUND at #{master_cache}."
      master_index = {}
      File.open(master_cache, "rb") do |f|
        # Marshal data is binary and may contain newlines, so records are read
        # one object at a time rather than line by line.
        until f.eof?
          size, files = Marshal.load(f)
          master_index[size] = files
        end
      end
    else
      puts "Master cache not found at #{master_cache}."
      master_index = index_by_size(master)
      puts "Writing #{master_cache}."
      File.open(master_cache, "wb") do |f|
        master_index.each_pair { |size, files| Marshal.dump([size, files], f) }
      end
      puts "Wrote #{master_cache}."
    end
    master_index
  end

  # Read up to BLOCK_SIZE bytes of +file+ starting at +offset+.
  # Returns nil when +offset+ is at or past the end of the file.
  def self.get_block(file, offset)
    File.open(file, 'rb') do |f|
      f.seek(offset)
      f.read(BLOCK_SIZE)
    end
  end

  # Return the files from +master_index+ whose content equals +file_to_check+.
  #
  # master_index  - hash as produced by index_by_size.
  # file_to_check - path of the file to look up; empty files never match.
  # The block parameter is accepted for compatibility and is not used.
  def self.check_file_in_index(master_index, file_to_check, &block)
    size = File.size(file_to_check)
    return [] if size == 0 # Ignore empty files
    # fetch, not []: the index may have a default proc and a plain lookup would
    # add an empty entry for every size that is probed.
    # The index can come from a cache file, so drop entries that have vanished
    # or changed size since it was written.
    candidates = master_index.fetch(size, []).select do |master|
      File.file?(master) && File.size(master) == size
    end
    offset = 0
    while !candidates.empty? && offset < size
      wanted = get_block(file_to_check, offset)
      candidates = candidates.select { |master| get_block(master, offset) == wanted }
      offset += BLOCK_SIZE
    end
    candidates
  end

  # Ad-hoc manual run against the author's directories.
  def self.test
    master = "/Users/jonathan/Dropbox"
    prune = "/Users/jonathan/Desktop/Old"
    scan_for_prunable(master, prune) { |a, b| puts "#{b.size} : #{a}" }
  end

  # Build a FileSystemItem tree for +prune+ and mark every file in it that is
  # ignorable or that has identical content somewhere in +master+.
  #
  # When all children of a directory are duplicates or ignorable, the directory
  # itself is counted as a duplicate child of its parent, and so on upwards;
  # FileSystemItem#report can then list the topmost fully covered nodes.
  #
  # If a block is given it is called with (path, duplicates) for every file for
  # which duplicates were found.
  # Returns the root FileSystemItem.
  def self.scan_for_prunable(master, prune, &block)
    puts "Ignoring: #{IGNORE_FILES.inspect}"

    master_index = cache_not_working_on_write(master)
    puts "Found #{master_index.size} unique sizes."

    count = 0
    tree = FileSystemItem.new(prune, :parent => nil)
    # Mark the nodes:
    tree.traverse do |n|
      next true if File.directory?(n.path)
      count += 1
      if IGNORE_FILES.include?(n.basename)
        n.ignorable = true
        # The root has no parent (e.g. when +prune+ is a single file).
        n.parent.increment_ignorable_children if n.parent
      else
        duplicates = check_file_in_index(master_index, n.path)
        unless duplicates.empty?
          n.duplicates = duplicates
          n.parent.increment_duplicate_children if n.parent
          block.call(n.path, duplicates) if block
        end
      end
      true
    end
    puts "We counted #{count} files. Tree thinks it has #{tree.files}."
    tree
  end

  # With YAML cache:
  # Master cache FOUND at /Users/jonathan/Dropbox/.rightPruneCache.
  # Found 37765 unique sizes.
  # After check. Found 1240 / 1940 dups in master.
end
data/lib/right_data.rb ADDED
@@ -0,0 +1,5 @@
1
+ require 'main'
2
+
3
module RightData
  # Fixed greeting string.
  def self.hello
    "Hi!"
  end
end
data/test/helper.rb ADDED
@@ -0,0 +1,10 @@
1
# Shared bootstrap for the test files: loads the test libraries, puts the
# gem's lib directory and this directory on the load path, and loads the gem.
require 'rubygems'
require 'test/unit'
require 'shoulda'

# Absolute entries, so the load path stays valid whatever the working directory
# is when the tests run (or if it changes afterwards).
$LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib')))
$LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__)))
require 'right_data'

# Reopened so that helpers shared by all test cases can be added here.
class Test::Unit::TestCase
end
@@ -0,0 +1,7 @@
1
+ require 'helper'
2
+
3
# Smoke test for the gem: checks that the library loads and answers.
# Replaces the generated placeholder, which failed unconditionally.
class TestRightData < Test::Unit::TestCase
  should "return the greeting from RightData.hello" do
    assert_equal "Hi!", RightData.hello
  end
end
+ end
metadata ADDED
@@ -0,0 +1,88 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: right_data
3
+ version: !ruby/object:Gem::Version
4
+ hash: 11
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 5
9
+ - 0
10
+ version: 0.5.0
11
+ platform: ruby
12
+ authors:
13
+ - Jonathan Siegel
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-08-30 00:00:00 +01:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: thoughtbot-shoulda
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 3
30
+ segments:
31
+ - 0
32
+ version: "0"
33
+ type: :development
34
+ version_requirements: *id001
35
+ description: RightData helpers
36
+ email: usiegj00@gmail.com
37
+ executables: []
38
+
39
+ extensions: []
40
+
41
+ extra_rdoc_files:
42
+ - LICENSE
43
+ - README.rdoc
44
+ files:
45
+ - lib/FileSystemItem.rb
46
+ - lib/main.rb
47
+ - lib/right_data.rb
48
+ - LICENSE
49
+ - README.rdoc
50
+ - test/helper.rb
51
+ - test/test_right_data.rb
52
+ has_rdoc: true
53
+ homepage: http://github.com/usiegj00/right_data
54
+ licenses: []
55
+
56
+ post_install_message:
57
+ rdoc_options:
58
+ - --charset=UTF-8
59
+ require_paths:
60
+ - lib
61
+ required_ruby_version: !ruby/object:Gem::Requirement
62
+ none: false
63
+ requirements:
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ hash: 3
67
+ segments:
68
+ - 0
69
+ version: "0"
70
+ required_rubygems_version: !ruby/object:Gem::Requirement
71
+ none: false
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ hash: 3
76
+ segments:
77
+ - 0
78
+ version: "0"
79
+ requirements: []
80
+
81
+ rubyforge_project:
82
+ rubygems_version: 1.3.7
83
+ signing_key:
84
+ specification_version: 3
85
+ summary: RightData helpers
86
+ test_files:
87
+ - test/helper.rb
88
+ - test/test_right_data.rb