imw 0.1.0 → 0.1.1

Files changed (40)
  1. data/README.rdoc +194 -31
  2. data/VERSION +1 -1
  3. data/bin/imw +5 -0
  4. data/lib/imw/boot.rb +0 -15
  5. data/lib/imw/dataset/paths.rb +38 -0
  6. data/lib/imw/dataset/task.rb +21 -18
  7. data/lib/imw/dataset/workflow.rb +126 -65
  8. data/lib/imw/dataset.rb +56 -82
  9. data/lib/imw/files/basicfile.rb +3 -3
  10. data/lib/imw/files/compressed_files_and_archives.rb +23 -37
  11. data/lib/imw/files/csv.rb +2 -1
  12. data/lib/imw/files/directory.rb +62 -0
  13. data/lib/imw/files/excel.rb +84 -0
  14. data/lib/imw/files/sgml.rb +4 -23
  15. data/lib/imw/files.rb +62 -47
  16. data/lib/imw/packagers/archiver.rb +19 -1
  17. data/lib/imw/packagers/s3_mover.rb +8 -0
  18. data/lib/imw/parsers/html_parser/matchers.rb +251 -268
  19. data/lib/imw/parsers/html_parser.rb +181 -176
  20. data/lib/imw/parsers.rb +1 -1
  21. data/lib/imw/repository.rb +35 -0
  22. data/lib/imw/runner.rb +114 -0
  23. data/lib/imw/utils/extensions/core.rb +0 -16
  24. data/lib/imw/utils/paths.rb +0 -28
  25. data/lib/imw.rb +21 -32
  26. metadata +11 -19
  27. data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +0 -37
  28. data/lib/imw/dataset/datamapper.rb +0 -66
  29. data/lib/imw/dataset/loaddump.rb +0 -50
  30. data/lib/imw/dataset/old/file_collection.rb +0 -88
  31. data/lib/imw/dataset/old/file_collection_utils.rb +0 -71
  32. data/lib/imw/dataset/scaffold.rb +0 -132
  33. data/lib/imw/dataset/scraped_uri.rb +0 -305
  34. data/lib/imw/dataset/scrub/old_working_scrubber.rb +0 -87
  35. data/lib/imw/dataset/scrub/scrub.rb +0 -147
  36. data/lib/imw/dataset/scrub/scrub_simple_url.rb +0 -38
  37. data/lib/imw/dataset/scrub/scrub_test.rb +0 -60
  38. data/lib/imw/dataset/scrub/slug.rb +0 -101
  39. data/lib/imw/dataset/stats/counter.rb +0 -23
  40. data/lib/imw/dataset/stats.rb +0 -73
data/lib/imw/dataset/scraped_uri.rb
@@ -1,305 +0,0 @@
- # -*- coding: utf-8 -*-
- module Linkish
- def self.included base
- base.class_eval do
- include DataMapper::Resource
- include Infochimps::Resource
- property :id, Integer, :serial => true
- property :full_url, String, :length => 255, :nullable => false, :unique_index => true
- has_handle
- alias_method :handle_generator, :full_url
- has_time_and_user_stamps
- #
- property :name, String, :length => 255, :nullable => false, :default => ''
- #
- property :file_path, String, :length => 1024
- property :file_time, DateTime
- property :file_size, Integer
- property :file_sha1, String, :length => 40
- property :tried_fetch, DataMapper::Resource::Boolean
- property :fetched, DataMapper::Resource::Boolean
- #
- before :create, :make_uuid_and_handle
- before :create, :update_from_file!
- end
- base.extend ClassMethods
- end
-
- # ===========================================================================
- #
- # Delegate methods to uri
- #
- def uri
- @uri ||= Addressable::URI.parse(self.full_url)
- end
- # Dispatch anything else to the aggregated uri object
- def method_missing method, *args
- if self.uri.respond_to?(method)
- self.uri.send(method, *args)
- else
- super method, *args
- end
- end
-
- def to_s
- "<a href='#{self.uri.to_s}'>#{self.name}</a>" # <-- !! not escaped !!
- end
-
- # ===========================================================================
- #
- # ID, naming, etc
- #
- def normalize_url!
- u = Addressable::URI.parse(self.full_url).normalize
- self.full_url = u.to_s
- end
-
- # ===========================================================================
- #
- # Properly belongs in FileStore module
- #
- #
- # Refresh cached properties from our copy of the asset.
- #
- def update_from_file!
- self.make_uuid_and_handle # make sure this happened
- # Set the file path
- self.file_path = self.to_file_path if self.file_path.blank?
- # FIXME -- kludge to ripd_root
- if ! File.exist?(actual_path)
- self.fetched = false
- else
- self.fetched = self.tried_fetch = true
- self.file_size = File.size( actual_path)
- self.file_time = File.mtime(actual_path)
- end
- self.fetched
- end
- def actual_path
- path_to(:ripd_root, self.file_path)
- end
-
- # ===========================================================================
- #
- # Properly belongs in own module
- #
-
- IMW_WGET_OPTIONS = {
- :root => :ripd_root,
- :wait => 2,
- :noretry => true,
- :log_level => Logger::DEBUG,
- :clobber => false,
- }
- #
- # Fetch from the web
- #
- def wget options={}
- options.reverse_merge! IMW_WGET_OPTIONS
- cd path_to(options[:root]) do
- if (not options[:clobber]) && File.file?(file_path) then
- IMW.log.add options[:log_level], "Skipping #{file_path}"; return
- end
- # Do the fetch
- mkdir_p File.dirname(actual_path)
- # defaults are --connect-timeout=infinity --read-timeout=900 --tries=20 acc. to man page
- cmd = %Q{wget -nv "#{full_url}" -O"#{actual_path}" --connect-timeout=5 --read-timeout=10 --tries=1 &}
- IMW.log.add(options[:log_level], cmd)
- IMW.log.add(options[:log_level], `#{cmd}`)
- self.tried_fetch = true
- sleep options[:wait] # please hammer don't hurt em
- update_from_file!
- self.save
- return self.fetched
- end
- end
-
- #
- #
- #
- def contents options={}
- wget options
- if fetched
- File.open actual_path
- end
- end
-
- # ===========================================================================
- #
- # Properly belongs in FileStore
- #
-
- protected
- #
- # The standard file path for this url's ripped cache
- #
- # * leading directory from reverse.dotted.host_scheme:port:user@password
- # * normalized path/file?query#fragment
- # * uuid formed from the
- #
- def to_file_path
- file_path_str = ""
- file_path_str << to_file_path_root_part
- file_path_str << to_file_path_path_part
- file_path_str << to_file_path_file_part
- file_path_str = self.class.path_str_encode(file_path_str)
- self.class.validate_roundtrip(file_path_str)
- file_path_str
- end
- def file_timestamp
- file_time.strftime("%Y%m%d-%H%M%S")
- end
- def to_file_path_with_timestamp
- to_file_path + file_timestamp
- end
- #
- # revhost_scheme:port:user@password -- omitting _scheme if it's http, and
- # omitting :port:user@password if all three are blank.
- #
- def to_file_path_root_part
- root_part_str = ""
- tld_host_frag = self.class.tier_path_segment(revhost, /^([^\.]+)\.([^\.]{1,2})/)
- root_part_str << revhost
- root_part_str << "_#{uri.scheme}" unless uri.scheme == 'http'
- root_part_str << ":#{uri.port}:#{uri.user}@#{uri.password}" unless uri.simple?
- root_part_str
- end
- def to_file_path_path_part
- uri.path.to_s
- end
- def to_file_path_file_part
- file_path_str = ""
- file_path_str << "?#{uri.query}" unless uri.query.nil?
- file_path_str << "##{uri.fragment}" unless uri.fragment.nil?
- file_path_str << "-#{self.uuid}"
- end
- public
-
-
- module ClassMethods
- #
- # find_or_creates from url
- #
- # url is heuristic_parse'd and normalized by Addressable before lookup:
- # "Converts an input to a URI. The input does not have to be a valid URI —
- # the method will use heuristics to guess what URI was intended. This is not
- # standards compliant, merely user-friendly.
- #
- def find_or_create_from_url url_str
- link = self.find_or_new_from_url url_str
- link.save
- link
- end
- def find_or_new_from_url url_str # :nodoc:
- url_str = Addressable::URI.heuristic_parse(url_str).normalize.to_s
- link = self.first( :full_url => url_str ) || self.new( :full_url => url_str )
- link.make_uuid_and_handle
- link.update_from_file!
- link
- end
- def find_or_create_from_file_path ripd_file
- url_str = Link.url_from_file_path(ripd_file)
- link = self.first( :full_url => url_str.to_s ) || self.new( :full_url => url_str.to_s )
- link.file_path = ripd_file
- link.make_uuid_and_handle
- link.update_from_file!
- link.save
- link
- end
- #
- # Decode url from its file_path
- #
- def url_from_file_path fp
- fp = path_str_decode(fp)
- m = (%r{\A
- (#{Addressable::URI::HOST_TLD}) # tld tier
- /(..?) # revhost tier
- /([^/\:_]+) # revhost
- (?:_([^/\:]+))? # _scheme
- (?::(\d*):([^/]*)@([^@/]*?))? # :port:user@password
- /(?:(.*?)/)? # /dirs/
- ([^/]*) # file
- -([a-f0-9]{32}) # -uuid
- \z}x.match(fp))
- raise "Can't extract url from file path #{fp}" if !m
- fp_host, fp_scheme, fp_port, fp_user, fp_pass, fp_path, fp_file, fp_uuid = m.captures
- fp_host = fp_host.split('.').reverse.join('.')
- fp_scheme ||= 'http'
- fp_pass = ":#{fp_pass}" unless fp_pass.blank?
- fp_userpass = "#{fp_user}#{fp_user}@" unless fp_user.blank?
- fp_port = ":#{fp_port}" unless fp_port.blank?
- fp_path = File.join(*[fp_path, fp_file].compact)
- "#{fp_scheme}://#{fp_userpass}#{fp_host}#{fp_port}/#{fp_path}"
- end
- #
- # to control files-per-directory madness, take a path segment like "foobar" in
- # blah.com/top/foobar/directory
- # and transform into
- # blah.com/top/fo/foobar/directory
- #
- # Ex.
- # self.class.tier_path_segment('a_username')
- # # => 'a_/a_username'
- # self.class.tier_path_segment('1')
- # # => '1/1'
- # self.class.tier_path_segment('com.twitter', /^([^\.]+)\.([^\.]{1,2})/)
- # # => 'com/tw/com.twitter'
- #
- def self.tier_path_segment(path_seg, re=/(..?)/)
- frag_seg = re.match(path_seg).captures
- raise "Can't tier path_seg #{path_seg} using #{re}" if frag_seg.blank?
- File.join(* [frag_seg, path_seg].flatten )
- end
- #
- #
- # It's really bad if you can't roundtrip --
- # since saving is the rare case (only done once!) we insist on checking.
- #
- def self.validate_roundtrip file_path_str
- # uu = self.class.url_from_file_path(file_path_str)
- # puts "*"*75, uri.to_hash.inspect, ['path str', file_path_str, 'uri', uri.to_s, 'rt', uu.to_s].inspect
- return_trip_url = Addressable::URI.parse(self.class.url_from_file_path(file_path_str))
- raise "crapsticks: uri doesn't roundtrip #{file_path_str} to #{uri.to_s}: #{return_trip_url}" if return_trip_url != uri
- end
- #
- # Uses a similar scheme as the 'Quoted Printable' encoding, but more strict
- # and without linebreaking or anything. The intent is to reversibly and
- # recognizably store URLs to disk with names that (apart from path) do not
- # need to be further escaped in filesystem, URL, database or HTML.
- #
- # The only characters in a path_encoded string are alpha-numeric /_-.=
- #
- # Rules:
- # * Any character that is not alphanumeric, and is not /_-. is encoded as an
- # equals sign = followed by its upper-case hex encoding.
- #
- # * Furthermore, in any sequence of repeated '.' characters, all after the
- # first are hex encoded; same with '/'.
- #
- # Ex.
- # path_encode("www.measuringworth.com/datasets/consumer/result.php?use[]=VCB&use[]=CU&use[]=SZ&year_source=1900&year_result=2007"
- # # => www.measuringworth.com/datasets/consumer/result.php=3Fuse=5B=5D=3DVCB=26use=5B=5D=3DCU=26use=5B=5D=3DSZ=26year_source=3D1900=26year_result=3D2007
- #
- # Code inspired by "Glenn Parker's response to ruby quiz #23"http://www.rubyquiz.com/quiz23.html
- #
- def path_str_encode(str)
- str.gsub(%r{\.(\.+)}){|chars| '.'+path_encode_chars(chars) }
- str.gsub(%r{\/(\/+)}){|chars| '/'+path_encode_chars(chars) }
- str.gsub(%r{[^A-Za-z0-9/_\-\.]+}){|chars| path_encode_chars(chars) }
- end
- #
- # See the notes in path_encode
- #
- def path_str_decode(str)
- str.gsub(/\+([\dA-F]{2})/){ $1.hex.chr }
- end
- protected
- def path_encode_chars(chars) # :nodoc:
- # send each character to an equals sign followed by its uppercase hex encoding
- encoded = "";
- chars.each_byte{|c| encoded << "+%02X" % c }
- encoded
- end
- public
- end
- end
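The removed Linkish code above documents a reversible URL-to-cache-path scheme: the host is dot-reversed and "tiered" into a short prefix directory to keep directory sizes down, and any character outside the filesystem-safe whitelist A-Za-z0-9/_-. is escaped as '+' followed by its uppercase hex code (as path_encode_chars does, despite the older '=3F'-style examples in the comments). A minimal standalone sketch of that scheme follows; the helper names and the plain-string input are illustrative, not the gem's API.

# Sketch of the removed to_file_path / path_str_encode idea, stdlib only.
def tier_path_segment(seg, re = /(..?)/)
  frag = re.match(seg).captures
  File.join(*[frag, seg].flatten)
end

def path_str_encode(str)
  # escape runs of non-whitelisted characters as +HEX, like path_encode_chars above
  str.gsub(%r{[^A-Za-z0-9/_\-\.]+}) { |chars| chars.each_byte.map { |c| '+%02X' % c }.join }
end

revhost = 'www.measuringworth.com'.split('.').reverse.join('.')
tiered  = tier_path_segment(revhost, /^([^\.]+)\.([^\.]{1,2})/)
puts File.join(tiered, path_str_encode('/datasets/consumer/result.php?use=VCB&year=1900'))
# => com/me/com.measuringworth.www/datasets/consumer/result.php+3Fuse+3DVCB+26year+3D1900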
data/lib/imw/dataset/scrub/old_working_scrubber.rb
@@ -1,87 +0,0 @@
-
- def self.url_from_file_path fp
- # FIXME -- doesn't work with extension preservation
- unless m = (%r{\A([^/_]+)(_[^/]+)?/(?:(.*?)/)?([^/]*)-([a-f0-9]{30,32})\z}.match(fp)) then
- # m1 = %r{\A([^/_]+)(_[^/]+)?/(?:(.*?))-([a-f0-9]{28,})}i.match(fp);
- raise "Bad match to #{fp}"
- end
- fp_host, fp_scheme, fp_path, fp_file, fp_uuid, fp_ext = m.captures
- fp_host = fp_host.split('.').reverse.join('.')
- fp_scheme ||= 'http'
- fp_path = File.join(*[fp_path, fp_file].compact) # FIXME -- no ext
- url = Addressable::URI.new(fp_scheme, nil, nil, fp_host, nil, fp_path, nil, nil)
- unless m = (%r{\A([^/_]+)(_[^/]+)?/(?:(.*?)/)?([^/]*)-([a-f0-9]{32})\z}.match(fp)) then
- # warn "Bad luck!!! #{url.path} hash is #{fp_uuid} vs #{UUID.sha1_create(UUID_INFOCHIMPS_LINKS_NAMESPACE, url.to_s).hexdigest}"
- end
- url
- end
-
- #
- # returns [dirname, basename, ext] for the file_path
- # ext is determined by basename_ext_splitter
- #
- def path_split
- path_split_str path
- end
-
- # lowercase; only a-z, num, . -
- def scrubbed_revhost
- return unless revhost
- revhost.downcase.gsub(/[^a-z0-9\.\-]+/i, '') # note: no _
- end
-
- cattr_accessor :basename_ext_splitter
- BASENAME_EXT_SPLIT_SMART = /(.+?)\.(tar\.gz|tar\.bz2|[^\.]+)/
- BASENAME_EXT_NO_SPLIT = /(.+?)()/
- self.basename_ext_splitter = BASENAME_EXT_NO_SPLIT
-
- #
- # Like File.split but heuristically handles things like .tar.bz2:
- #
- # foo. => ['foo.', '']
- # foo.tar.gz => ['foo.', '']
- # foo.tar.bz2 => ['foo.', '']
- # foo.yaml => ['foo', '']
- #
- def path_split_str str
- if str =~ %r{/.+\z}
- dirname, basename = %r{\A(.*)/([^/]+)\z}.match(str).captures
- else
- dirname, basename = ['', str]
- end
- # Get basename, extension (as given by capture groups in basename_ext_splitter)
- if basename_ext_splitter && (m = /\A#{basename_ext_splitter}\z/i.match(basename))
- basename, ext = m.captures
- else
- basename, ext = [basename, '']
- end
- [dirname, basename, ext]
- end
-
- # remove all blank components, join the rest with separator
- def join_non_blank separator, *strs
- strs.reject(&:blank?).join(separator)
- end
-
- # only a-z A-Z, num, .-_/
- def scrubbed_path
- path_part = path
- # colons into /
- path_part = path_part.gsub(%r{\:+}, '/')
- # Kill weird chars
- path_part = path_part.gsub(%r{[^a-zA-Z0-9\.\-_/]+}, '_')
- # Compact (killing foo/../bar, etc)
- path_part = path_part.gsub(%r{/[^a-zA-Z0-9]+/}, '/').gsub(%r{/\.\.+/}, '.')
- # Kill leading & trailing non-alnum
- path_part = path_part.gsub(%r{^[^a-zA-Z0-9]+}, '').gsub(%r{[^a-zA-Z0-9]+$}, '')
- end
-
- #
- # name for this URL regarded as a file (instance)
- #
- def to_file_path
- dirname, basename, ext = path_split_str(scrubbed_path)
- basename = join_non_blank '-', basename, uuid
- basename = join_non_blank '.', basename, ext
- join_non_blank '/', root_path, dirname, basename
- end
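The path_split_str helper above separates a basename from its extension with a pattern that treats compound extensions like .tar.gz and .tar.bz2 as a single unit. A small self-contained sketch of that heuristic, reusing the BASENAME_EXT_SPLIT_SMART pattern from the removed code (the helper name is illustrative):

SPLIT_SMART = /(.+?)\.(tar\.gz|tar\.bz2|[^\.]+)/

def basename_and_ext(basename)
  # capture [basename, ext]; fall back to no extension when there is no dot
  m = /\A#{SPLIT_SMART}\z/i.match(basename)
  m ? m.captures : [basename, '']
end

p basename_and_ext('foo.tar.gz')   # => ["foo", "tar.gz"]
p basename_and_ext('foo.yaml')     # => ["foo", "yaml"]
p basename_and_ext('no_extension') # => ["no_extension", ""]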
data/lib/imw/dataset/scrub/scrub.rb
@@ -1,147 +0,0 @@
- # -*- coding: utf-8 -*-
- require 'rubygems'
- require 'active_support'
- require 'uuidtools'
-
- module Scrub
- class Generic
- # A regular expression character group
- # (a bunch of characters ready to drop into /[#{validator}]*/)
- # whitelisting allowed characters
- #
- # Must be overridden in child class
- class_inheritable_accessor :validator
-
- # Sentence fragment for error message on failed validation.
- class_inheritable_accessor :complaint
- self.complaint = "has characters I can't understand"
-
- # Proc or string or anything that can be 2nd arg to gsub
- # to sanitize
- class_inheritable_accessor :replacer
- self.replacer = '-'
-
- # A regular expression to sanitize objects
- # if unset or nil, the validator char group
- class_inheritable_accessor :sanitizer
-
- # unless overridden or set expressly, just use the
- # validator
- def sanitizer
- @sanitizer || self.validator
- end
-
- def sanitize str
- str = str.to_s
- str.gsub(%r{([^#{validator.to_s}]+)}u, replacer)
- end
-
- def valid? str
- %r{\A([#{validator.to_s}]*)\z}u.match(str)
- end
- end
-
- #
- # A permissive, ASCII-only name string - no control chars, newlines, backslash
- # or <> angle brackets
- #
- class Title < Scrub::Generic
- self.complaint = "should only contain basic keyboard characters (and should not use \\ &lt; or &gt;)."
- self.validator = %r{a-zA-Z0-9_ ~\!@#\$%\^&\*\(\)\-\+=;\:'"`\[\]\{\}\|,\?\.\/}u
- end
-
- #
- # A permissive, ASCII-only name string - no control chars, newlines, backslash
- # or <> angle brackets
- #
- class UnicodeTitle < Scrub::Title
- self.complaint = "should only contain keyboard characters (and should not use \\ &lt; or &gt;)."
- self.validator = %r{[:alpha:][:digit:]#{Scrub::Title.validator}}u
- end
-
- #
- # Visible characters and spaces (i.e. anything except control characters, etc.)
- #
- class FreeText < Scrub::Generic
- self.complaint = "should not contain control characters or that kind of junk."
- self.validator = %r{[:print:]\n\t}u
- end
-
- module BeginsWithAlpha
- mattr_accessor :slug
- self.slug = 'x'
- # prepend #{slug}#{replacer} to the string if it starts with non-alpha.
- # so, for instance '23jumpstreet' => 'x_23jumpstreet'
- def sanitize_with_begins_with_alpha str
- str = sanitize_without_begins_with_alpha str
- str = 'x' + replacer + str if (str !~ /^[a-z]/i) # call at end of chain!
- str
- end
- def valid_with_begins_with_alpha? str
- (str =~ /^[a-z]/i) && valid_without_begins_with_alpha?(str)
- end
- def self.included base
- base.alias_method_chain :sanitize, :begins_with_alpha # unless defined?(base.sanitize_without_begins_with_alpha)
- base.alias_method_chain :valid?, :begins_with_alpha # unless defined?(base.valid_without_begins_with_alpha?)
- end
- end
-
- #
- # insist that a string be lowercased.
- #
- module Lowercased
- def sanitize_with_lowercased str
- str = sanitize_without_lowercased str
- str.downcase # call at end of chain!
- end
- def valid_with_lowercase? str
- (str !~ /[[:upper:]]/u) && valid_without_lowercase?(str)
- end
- def self.included base
- base.alias_method_chain :sanitize, :lowercased # unless defined?(base.sanitize_without_lowercased)
- base.alias_method_chain :valid?, :lowercase # unless defined?(base.valid_without_lowercase?)
- end
- end
-
- #
- # start with a letter, and contain only A-Za-z0-9_
- #
- class Identifier < Scrub::Generic
- self.complaint = "should be an identifier: it should start with a letter, and contain only a-z, 0-9 and '_'."
- self.validator = %r{a-z0-9_}u
- self.replacer = '_'
- include Scrub::BeginsWithAlpha
- include Scrub::Lowercased
- end
-
- #
- # start with a letter, and contain only A-Za-z0-9_
- #
- class Handle < Scrub::Generic
- self.complaint = "should be an identifier: it should start with a letter, and contain only a-z, 0-9 and '_'."
- self.validator = %r{a-z0-9_}u
- self.replacer = '_'
- include Scrub::BeginsWithAlpha
- include Scrub::Lowercased
- end
-
- # HANDLE_RE = %r{\A[a-z][]*\z}i # ascii, not :alpha: etc.
- # HANDLE_MSG = "should start with a letter, and contain only characters like a-z0-9_-."
- #
- # # "Domain names are restricted to the ASCII letters a through z
- # # (case-insensitive), the digits 0 through 9, and the hyphen, with some other
- # # restrictions in terms of name length and position of hyphens."
- # # (http://en.wikipedia.org/wiki/Domain_name#Overview)
- # # http://tools.ietf.org/html/rfc1034
- # DOMAIN_RE = %r{\A[a-z][a-z0-9\-][a-z0-9]\z}i # case insensitive
- # DOMAIN_MSG = "should look like a domain name."
- # DOMAIN_MORE = "only letters, digits or hyphens (-), start with a letter and end with a letter or number."
- MSG_EMAIL_BAD = "should look like an email address (you@somethingsomething.com) and include only letters, numbers and .&nbsp;+&nbsp;-&nbsp;&#37; please."
- RE_EMAIL_NAME = '[\w\.%\+\-]+' # what you actually see in practice
- RE_EMAIL_N_RFC2822 = '0-9A-Z!#\$%\&\'\*\+_/=\?^\-`\{|\}~\.' # technically allowed by RFC-2822
- RE_DOMAIN_HEAD = '(?:[A-Z0-9\-]+\.)+'
- RE_DOMAIN_TLD = '(?:[A-Z]{2}|com|org|net|edu|gov|mil|biz|info|mobi|name|aero|jobs|museum)'
- RE_EMAIL_OK = /\A#{RE_EMAIL_NAME}@#{RE_DOMAIN_HEAD}#{RE_DOMAIN_TLD}\z/i
- RE_EMAIL_RFC2822 = /\A#{RE_EMAIL_N_RFC2822}@#{RE_DOMAIN_HEAD}#{RE_DOMAIN_TLD}\z/i
-
- end
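The removed Scrub classes all share one pattern: a whitelist character group (validator), a replacement applied to anything outside it (replacer), and a valid? check that the string already consists only of whitelisted characters, with the BeginsWithAlpha and Lowercased mixins layered on via alias_method_chain. A dependency-free sketch of that pattern in the spirit of Scrub::Identifier, written as a single class rather than the Rails 2-era mixin chain; names and structure here are illustrative only:

# Whitelist-based scrubber in the style of Scrub::Generic + BeginsWithAlpha + Lowercased.
class IdentifierScrubber
  VALIDATOR = 'a-z0-9_'   # character group dropped into a regexp, as in the removed code
  REPLACER  = '_'

  def sanitize(str)
    out = str.to_s.gsub(/[^#{VALIDATOR}]+/i, REPLACER).downcase
    out = "x#{REPLACER}#{out}" unless out =~ /\A[a-z]/   # BeginsWithAlpha behaviour
    out
  end

  def valid?(str)
    str.to_s =~ /\A[a-z][#{VALIDATOR}]*\z/
  end
end

s = IdentifierScrubber.new
p s.sanitize('23 Jump Street!')  # => "x_23_jump_street_"
p s.valid?('x_23_jump_street_')  # => 0 (truthy)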
data/lib/imw/dataset/scrub/scrub_simple_url.rb
@@ -1,38 +0,0 @@
-
-
- module IMW
- module URIScrubber
-
- def scrubbed
- to_dirpath
- end
- end
- end
-
- module Scrub
- #
- # start with a letter, and contain only A-Za-z0-9_
- #
- class SimplifiedURL < Scrub::Generic
- self.complaint = "should follow our zany simplified URL rules: com.host.dot-reversed:schemeifnothttp/path/seg_men-ts/stuff.ext-SHA1ifweird"
- self.validator = %r{#{Addressable::URI::SAFE_CHARS}#{Addressable::URI::RESERVED_CHARS}}u
- self.replacer = ''
- include Scrub::Lowercased
- attr_accessor :uri
-
- def valid? str
- str.to_s.downcase == sanitize(str)
- end
-
- def sanitize str
- # if this fails just normalize once, or don't set $KCODE: http://bit.ly/1664vp
- uri = Addressable::URI.heuristic_parse(str.to_s).normalize
- # print [uri.host, uri.host_valid?, uri.path, uri.path_valid?].inspect
- if uri.host_valid?
- uri.scrubbed
- else
- uri.uuid_path
- end
- end
- end
- end
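SimplifiedURL#sanitize above leans on Addressable to make sense of sloppy input before scrubbing it. A quick illustration of just that normalization step, assuming the addressable gem is installed (the removed scrub_test.rb in the next hunk exercises the full scrubbers):

require 'addressable/uri'

uri = Addressable::URI.heuristic_parse('HTTP://FOO.BAR.com').normalize
p uri.to_s    # => "http://foo.bar.com/"
p uri.host    # => "foo.bar.com"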
data/lib/imw/dataset/scrub/scrub_test.rb
@@ -1,60 +0,0 @@
- #!/usr/bin/env ruby
- # -*- coding: utf-8 -*-
- require 'scrub'
- require 'scrub_simple_url'
-
- test_strings = [
- nil, '', '12', '123', 'simple', 'UPPER', 'CamelCased', 'iden_tifier_23_',
- 'twentyfouralphacharslong', 'twentyfiveatozonlyletters', 'hello.-_there@funnychar.com',
- "tab\t", "newline\n",
- "Iñtërnâtiônàlizætiøn",
- 'semicolon;', 'quote"', 'tick\'', 'backtick`', 'percent%', 'plus+', 'space ',
- 'leftanglebracket<', 'ampersand&',
- "control char-bel\x07",
- "http://foo.bar.com/",
- "HTTP://FOO.BAR.com",
- ".com/zazz",
- "scheme://user_name@user_acct:passwd@host-name.museum:9047/path;pathquery/p!a-th~2/path?query=param&amp;query=pa%20ram#fragment",
- "http://web.site.com/path/path/file.ext",
- "ftp://ftp.site.com/path/path/file.ext",
- "/absolute/pathname/file.ext",
- "http://foo.bar.com/.hidden_file_with.ext",
- "http://foo.bar.com/.hidden_file",
- "dir/--/non_alpha_path_segment.ext",
- "http://foo.bar.com/dir/../two_dots_in_path",
-
- ]
-
-
- scrubbers = {
- # :unicode_title => Scrub::UnicodeTitle.new,
- # :title => Scrub::Title.new,
- # :identifier => Scrub::Identifier.new,
- # :free_text => Scrub::FreeText.new,
- :handle => Scrub::Handle.new,
- :simplified_url => Scrub::SimplifiedURL.new,
- # :domain => Scrub::Domain.new,
- # :email => Scrub::Email.new,
- }
-
- scrubbers.each do |scrubber_name, scrubber|
- puts scrubber_name
- results = test_strings.map do |test_string|
- [!!scrubber.valid?(test_string), scrubber.sanitize(test_string).inspect, test_string.inspect ]
- end
- results.sort_by{|val,san,orig| val ? 1 : -1 }.each do |val,san,orig|
- puts " %-5s %-30s %-30s" % [val,san,orig]
- end
- end
-
-
-
- # 'foo@bar.com', 'foo@newskool-tld.museum', 'foo@twoletter-tld.de', 'foo@nonexistant-tld.qq',
- # 'r@a.wk', '1234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890@gmail.com',
- # 'hello.-_there@funnychar.com', 'uucp%addr@gmail.com', 'hello+routing-str@gmail.com',
- # 'domain@can.haz.many.sub.doma.in',],
- # :invalid => [nil, '', '!!@nobadchars.com', 'foo@no-rep-dots..com', 'foo@badtld.xxx', 'foo@toolongtld.abcdefg',
- # 'Iñtërnâtiônàlizætiøn@hasnt.happened.to.email', 'need.domain.and.tld@de', "tab\t", "newline\n",
- # 'r@.wk', '1234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890@gmail2.com',
- # # these are technically allowed but not seen in practice:
- # 'uucp!addr@gmail.com', 'semicolon;@gmail.com', 'quote"@gmail.com', 'tick\'@gmail.com', 'backtick`@gmail.com', 'space @gmail.com', 'bracket<@gmail.com', 'bracket>@gmail.com'