right_data 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Jonathan Siegel
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,21 @@
1
+ = right_data
2
+
3
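+ RightData helpers: index a "master" directory tree by file size, compare the files of a second tree against it block by block, and report which files and directories of that second tree are duplicates (or ignorable) and can therefore be pruned.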
4
+
5
+ == Note on Patches/Pull Requests
6
+
7
+ * Fork the project.
8
+ * Make your feature addition or bug fix.
9
+ * Add tests for it. This is important so I don't break it in a
10
+ future version unintentionally.
11
+ * Commit, do not mess with rakefile, version, or history.
12
+ (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
13
+ * Send me a pull request. Bonus points for topic branches.
14
+
15
+ rake version:bump:minor
16
+ rake build
17
+ gem install pkg/...
18
+
19
+ == Copyright
20
+
21
+ Copyright (c) 2010 Jonathan Siegel. See LICENSE for details.
@@ -0,0 +1,153 @@
1
module RightData
  # A node in a lazily built tree that mirrors a directory on disk.
  #
  # A node is created from a path; its children are discovered on first use
  # via Dir.entries. Callers mark leaves as ignorable or as duplicates and
  # notify the parent with increment_ignorable_children /
  # increment_duplicate_children. When every child of a directory is
  # accounted for, the directory reports itself to its own parent as a
  # duplicate, so the status propagates towards the root.
  class FileSystemItem

    attr_reader :relativePath
    attr_reader :parent

    attr_reader :ignore_children     # number of children flagged ignorable
    attr_reader :duplicate_children  # number of children flagged duplicate
    attr_accessor :duplicates        # duplicates recorded for this node
    attr_accessor :ignorable

    # path - full path for a root node, any path for a child (only its
    #        basename is kept).
    # args - options hash; :parent is the owning FileSystemItem, or nil /
    #        absent for a root node.
    def initialize(path, args = {})
      if args[:parent]
        @relativePath = File.basename(path)
        @parent = args[:parent]
      else
        @relativePath = path
        @parent = nil
      end
      @ignorable = false
      @duplicates = []         # for this node
      @duplicate_children = 0  # counts for children
      @ignore_children = 0
    end

    # Number of files below (or at) this node. Directories without visible
    # children count as 0.
    def files
      return 0 if leaf? && File.directory?(fullPath)
      return 1 if leaf?
      children.inject(0) { |sum, child| sum + child.files }
    end

    # Number of files below (or at) this node that are flagged ignorable.
    def ignore_files
      return 0 if leaf? && File.directory?(fullPath)
      return ignorable? ? 1 : 0 if leaf?
      children.inject(0) { |sum, child| sum + child.ignore_files }
    end

    # Number of files below (or at) this node that have duplicates recorded.
    def duplicate_files
      return 0 if leaf? && File.directory?(fullPath)
      return duplicate? ? 1 : 0 if leaf?
      children.inject(0) { |sum, child| sum + child.duplicate_files }
    end

    def basename; @relativePath; end

    # Memoized item for the filesystem root ('/').
    def self.rootItem
      @rootItem ||= self.new '/', :parent => nil
    end

    # Child items of a readable directory (without '.' and '..'), or nil for
    # anything else. Only a non-nil result is cached.
    def children
      unless @children
        if File.directory?(fullPath) && File.readable?(fullPath)
          names = Dir.entries(fullPath).reject { |entry| entry == '.' || entry == '..' }
          @children = names.map { |entry| FileSystemItem.new(entry, :parent => self) }
        else
          @children = nil
        end
      end
      @children
    end

    def path; fullPath; end

    # Path of this node, built by joining the relative paths up to the root.
    def fullPath
      @parent ? File.join(@parent.fullPath, @relativePath) : @relativePath
    end

    def childAtIndex(n)
      children[n]
    end

    # Number of children, or -1 when this node has no child list at all
    # (not a readable directory).
    def numberOfChildren
      children.nil? ? -1 : children.size
    end

    def children?; !children.nil? && !children.empty?; end

    # A leaf is a duplicate when duplicates were recorded for it; a directory
    # is a duplicate when all of its children are ignorable or duplicates.
    def duplicate?
      if leaf?
        !duplicates.empty?
      else # Dup if all ignored / dup children
        ((@ignore_children + @duplicate_children) == numberOfChildren)
      end
    end

    def ignorable?; ignorable; end

    # Record that one more child is ignorable and propagate if complete.
    def increment_ignorable_children
      @ignore_children += 1
      update_duplicate_ignorable_status
    end

    # Once every child is accounted for, report this node to its parent as a
    # duplicate. The root has no parent, so propagation simply stops there
    # (previously this raised NoMethodError on nil).
    def update_duplicate_ignorable_status
      return unless (@ignore_children + @duplicate_children) == numberOfChildren
      parent.increment_duplicate_children if parent
    end

    # Record that one more child is a duplicate and propagate if complete.
    def increment_duplicate_children
      @duplicate_children += 1
      update_duplicate_ignorable_status
    end

    def leaf?; !children?; end

    # Pre-order traversal. The block is called with each node and decides,
    # through its return value, whether that node's children are visited.
    def traverse(&block) # Allow proc to decide if we traverse
      if block.call(self) && children?
        children.each { |c| c.traverse(&block) }
      end
    end

    # Children that are neither ignorable nor duplicates. Nodes without a
    # child list are treated as having zero children instead of raising.
    def other_children
      (children || []).size - ignore_children - duplicate_children
    end

    def to_param; to_s; end
    def to_s
      "<Tree :path => #{self.path}, :files => #{self.files}>"
    end

    # Inspect the nodes: print one line per node to stdout, not descending
    # into directories that are fully duplicate/ignorable or empty, followed
    # by a summary line. Lines for prunable nodes are prefixed with +pre+
    # (plus a space when +pre+ is not empty); all other lines start with '#'.
    def report(pre="")
      pre += " " if !pre.empty?
      self.traverse do |n|
        # Is this a leaf (e.g. a file)?
        if n.leaf?
          msg = nil
          msg = "# dup(#{n.duplicates.count})" if n.duplicate?
          msg = "# ign" if n.ignorable?
          if msg
            puts "#{pre}#{n.path} #{msg}" # Remove the dups/igns!
          else
            puts "# #{n.path} unique"
          end
          false # Don't traverse deeper!
        else
          if n.duplicate_children + n.ignore_children == n.children.size
            puts "#{pre}#{n.path} # #{n.duplicate_children} dups / #{n.ignore_children} ignores"
            false # Don't traverse deeper!
          elsif n.children.size == 0
            puts "#{pre}#{n.path} # Empty... "
            false
          else
            puts "# #{n.path} # Not #{n.duplicate_children} dup/ #{n.ignore_children} ign / #{n.other_children} other "
            true
          end
        end
      end
      puts "# #{self.ignore_files} ignores, #{self.duplicate_files} dups of #{self.files} files"
    end

  end
end
data/lib/main.rb ADDED
@@ -0,0 +1,270 @@
1
+ #!/usr/bin/env ruby
2
+ require 'FileSystemItem'
3
+ # require 'unicode'
4
+
5
+ $KCODE = 'UTF-8' # only used when encoding is not specified.
6
+
7
+ # crawler = Index.new
8
+ # crawler.crawl
9
+
10
+ =begin
11
+ #!/usr/bin/ruby
12
+ # find_duplicates.rb
13
+
14
+ require 'find'
15
+ require 'digest/md5'
16
+
17
+ def each_set_of_duplicates(*paths)
18
+ sizes = {}
19
+ Find.find(*paths) do |f|
20
+ (sizes[File.size(f)] ||= []) << f if File.file? f
21
+ end
22
+ sizes.each do |size, files|
23
+ next unless files.size > 1
24
+ md5s = {}
25
+ files.each do |f|
26
+ digest = Digest::MD5.hexdigest(File.read(f))
27
+ (md5s[digest] ||= []) << f
28
+ end
29
+ md5s.each { |sum, files| yield files if files.size > 1 }
30
+ end
31
+ end
32
+
33
+ each_set_of_
34
+ duplicates(*ARGV) do |f|
35
+ puts "
36
+ Duplicates: #{f.join(", ")}"
37
+ end
38
+ =end
39
+
40
+ # http://codeidol.com/other/rubyckbk/System-Administration/Finding-Duplicate-Files/
41
+
42
+ #!/usr/bin/ruby
43
+ # find_duplicates2.rb
44
+
45
+ require 'find'
46
+
47
module RightData

  # Number of bytes compared per step when checking files for equality.
  BLOCK_SIZE = 1024*8

  # Yields each set (Array of paths) of byte-identical files found below
  # +paths+. Files are first grouped by size, then compared block by block.
  def self.each_set_of_duplicates(*paths, &block)
    sizes = Hash.new { |h, k| h[k] = [] }
    Find.find(*paths) { |f| sizes[File.size(f)] << f if File.file?(f) }

    sizes.each_pair do |size, files|
      next unless files.size > 1
      offset = 0
      candidates = [files]
      while !candidates.empty? && offset <= size
        candidates = eliminate_non_duplicates(candidates, size, offset, &block)
        offset += BLOCK_SIZE
      end
    end
  end

  # Splits every candidate set in +partition+ by the content of the block at
  # +offset+. Sets that still have more than one member are yielded when
  # this was the last block of the file, otherwise they are returned for the
  # next round.
  def self.eliminate_non_duplicates(partition, size, offset)
    possible_duplicates = []
    partition.each do |possible_duplicate_set|
      blocks = Hash.new { |h, k| h[k] = [] }
      possible_duplicate_set.each do |f|
        blocks[get_block(f, offset) || ''] << f
      end
      blocks.each_value do |files|
        next unless files.size > 1
        if offset + BLOCK_SIZE >= size
          # We know these are duplicates.
          yield files
        else
          # We suspect these are duplicates, but we need to compare
          # more blocks of data.
          possible_duplicates << files
        end
      end
    end
    possible_duplicates
  end

  # Basenames that are never indexed and are flagged ignorable when scanning.
  IGNORE_FILES = [".DS_Store", ".typeAttributes.dict", "empty-file"]

  # Builds a Hash of file size => Array of paths for all non-empty regular
  # files below +paths+ whose basename is not in IGNORE_FILES, and prints how
  # many files were indexed (directories and skipped files are not counted).
  def self.index_by_size(*paths)
    sizes = Hash.new { |h, k| h[k] = [] }
    count = 0
    Find.find(*paths) do |f|
      next unless File.file?(f)
      next if IGNORE_FILES.include?(File.basename(f))
      size = File.size(f)
      next if size == 0 # Ignore empty files
      sizes[size] << f
      count += 1
    end
    puts "Indexed #{count} files."
    sizes
  end

  # Returns the size index of +master+, using a YAML cache stored in
  # <master>/.rightPruneCache. The cache is written on the first call and is
  # never invalidated here; delete the file to force a re-index.
  def self.cache_not_working_on_write(master)
    require 'yaml' # only this cache variant needs YAML
    master_cache = File.join(master, ".rightPruneCache")
    if File.exist?(master_cache)
      puts "Master cache FOUND at #{master_cache}."
      # NOTE(review): the cache is trusted as written by this tool; YAML
      # loading of a file someone else can write to should be reconsidered.
      master_index = File.open(master_cache) { |f| YAML::load(f) }
    else
      puts "Master cache not found at #{master_cache}."
      master_index = index_by_size(master)
      puts "Writing #{master_cache}."
      File.open(master_cache, "w") { |f| YAML.dump(master_index, f) }
      puts "Wrote #{master_cache}."
    end
    master_index
  end

  # Same contract as cache_not_working_on_write, but the cache is a sequence
  # of Marshal-dumped [size, paths] pairs. Note that both variants use the
  # same cache file name with incompatible formats.
  def self.cache_serializing_on_write(master)
    master_cache = File.join(master, ".rightPruneCache")
    if File.exist?(master_cache)
      puts "Master cache FOUND at #{master_cache}."
      # Marshal data is binary and may contain newline bytes, so the records
      # are read back one object at a time instead of line by line.
      # NOTE(review): Marshal.load must only be used on caches this tool
      # wrote itself.
      master_index = File.open(master_cache, "rb") do |f|
        rval = {}
        until f.eof?
          key, value = Marshal.load(f)
          rval[key] = value
        end
        rval
      end
    else
      puts "Master cache not found at #{master_cache}."
      master_index = index_by_size(master)
      puts "Writing #{master_cache}."
      File.open(master_cache, "wb") do |f|
        master_index.each_pair { |k, v| Marshal.dump([k, v], f) }
      end
      puts "Wrote #{master_cache}."
    end
    master_index
  end

  # Reads up to BLOCK_SIZE bytes of +file+ starting at +offset+, in binary
  # mode. Returns nil when +offset+ is at or past the end of the file.
  def self.get_block(file, offset)
    # File.open instead of Kernel#open: paths come from directory listings
    # and must never be interpreted as a command.
    File.open(file, 'rb') do |f|
      f.seek(offset)
      f.read(BLOCK_SIZE)
    end
  end

  # Returns the paths from +master_index+ (size => paths) whose content is
  # identical to +file_to_check+; an empty Array for empty files or when
  # nothing matches. The block parameter is accepted for compatibility and
  # not used.
  def self.check_file_in_index(master_index, file_to_check, &block)
    size = File.size(file_to_check)
    return [] if size == 0 # Ignore empty files
    # fetch, so that a miss does not add an entry to a default-proc index.
    possible_master_dups = master_index.fetch(size, []) || []
    offset = 0
    while !possible_master_dups.empty? && offset <= size
      file_to_check_block = get_block(file_to_check, offset)
      possible_master_dups = possible_master_dups.select do |master|
        get_block(master, offset) == file_to_check_block
      end
      offset += BLOCK_SIZE
    end
    possible_master_dups
  end

  # Manual smoke run. The defaults are the author's original hard-coded
  # directories; pass other directories to try it elsewhere.
  def self.test(master = "/Users/jonathan/Dropbox", prune = "/Users/jonathan/Desktop/Old")
    scan_for_prunable(master, prune) { |a, b| puts "#{b.size} : #{a}" }
  end

  # Compares every file below +prune+ against the size index of +master+ and
  # returns the marked FileSystemItem tree for +prune+.
  #
  # Recursively compare the files in the filesystem.
  # When a parent node gets a response from all its children
  # that they are dups OR ignorable, that NODE becomes dup_or_ignorable too.
  # This propagates.
  # Then, there is a traversal (FileSystemItem#report) that grabs all base
  # nodes that are non_dup like:
  #   rm -rf /a_path_duped/here # 14 dups / 9 ignores
  #   rm -rf /b_path_duped/way/here # 1 dup
  #
  # If a block is given it is called with (path, duplicates) for every file
  # that has duplicates in master, e.g.
  #   tree = scan_for_prunable(master,prune) { |a,b| puts "#{b.size} : #{a}" }; nil
  def self.scan_for_prunable(master, prune, &block)
    puts "Ignoring: #{IGNORE_FILES.inspect}"

    master_index = cache_not_working_on_write(master)
    puts "Found #{master_index.size} unique sizes."

    count = 0

    tree = FileSystemItem.new(prune, :parent => nil)
    # Mark the nodes:
    tree.traverse do |n|
      next true if File.directory?(n.path)
      count += 1
      if IGNORE_FILES.include?(n.basename)
        n.ignorable = true
        # n.parent is nil when +prune+ itself is a single file.
        n.parent.increment_ignorable_children if n.parent
      else
        duplicates = check_file_in_index(master_index, n.path)
        unless duplicates.empty?
          n.duplicates = duplicates
          n.parent.increment_duplicate_children if n.parent
          block.call(n.path, duplicates) if block
        end
      end
      true
    end
    puts "We counted #{count} files. Tree thinks it has #{tree.files}."
    tree
  end

  # each_set_of_duplicates(dirs) do |f|
  #   puts "Duplicates: #{f.join(", ")}"
  # end

  # With YAML cache:
  # Master cache FOUND at /Users/jonathan/Dropbox/.rightPruneCache.
  # Found 37765 unique sizes.
  # After check. Found 1240 / 1940 dups in master.
end
data/lib/right_data.rb ADDED
@@ -0,0 +1,5 @@
1
+ require 'main'
2
+
3
module RightData
  # Returns the fixed greeting "Hi!".
  def self.hello
    "Hi!"
  end
end
data/test/helper.rb ADDED
@@ -0,0 +1,10 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+
5
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
6
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
7
+ require 'right_data'
8
+
9
# Reopens Test::Unit::TestCase without adding anything to it; presumably the
# place where helpers shared by all test cases are meant to go.
class Test::Unit::TestCase; end
@@ -0,0 +1,7 @@
1
+ require 'helper'
2
+
3
# Smoke test for the library entry point. Replaces the generated placeholder
# that failed unconditionally and therefore verified nothing.
class TestRightData < Test::Unit::TestCase
  should "load the library and answer hello with a greeting" do
    assert_equal "Hi!", RightData.hello
  end
end
metadata ADDED
@@ -0,0 +1,88 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: right_data
3
+ version: !ruby/object:Gem::Version
4
+ hash: 11
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 5
9
+ - 0
10
+ version: 0.5.0
11
+ platform: ruby
12
+ authors:
13
+ - Jonathan Siegel
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-08-30 00:00:00 +01:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: thoughtbot-shoulda
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 3
30
+ segments:
31
+ - 0
32
+ version: "0"
33
+ type: :development
34
+ version_requirements: *id001
35
+ description: RightData helpers
36
+ email: usiegj00@gmail.com
37
+ executables: []
38
+
39
+ extensions: []
40
+
41
+ extra_rdoc_files:
42
+ - LICENSE
43
+ - README.rdoc
44
+ files:
45
+ - lib/FileSystemItem.rb
46
+ - lib/main.rb
47
+ - lib/right_data.rb
48
+ - LICENSE
49
+ - README.rdoc
50
+ - test/helper.rb
51
+ - test/test_right_data.rb
52
+ has_rdoc: true
53
+ homepage: http://github.com/usiegj00/right_data
54
+ licenses: []
55
+
56
+ post_install_message:
57
+ rdoc_options:
58
+ - --charset=UTF-8
59
+ require_paths:
60
+ - lib
61
+ required_ruby_version: !ruby/object:Gem::Requirement
62
+ none: false
63
+ requirements:
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ hash: 3
67
+ segments:
68
+ - 0
69
+ version: "0"
70
+ required_rubygems_version: !ruby/object:Gem::Requirement
71
+ none: false
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ hash: 3
76
+ segments:
77
+ - 0
78
+ version: "0"
79
+ requirements: []
80
+
81
+ rubyforge_project:
82
+ rubygems_version: 1.3.7
83
+ signing_key:
84
+ specification_version: 3
85
+ summary: RightData helpers
86
+ test_files:
87
+ - test/helper.rb
88
+ - test/test_right_data.rb