imw 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. data/README.rdoc +194 -31
  2. data/VERSION +1 -1
  3. data/bin/imw +5 -0
  4. data/lib/imw/boot.rb +0 -15
  5. data/lib/imw/dataset/paths.rb +38 -0
  6. data/lib/imw/dataset/task.rb +21 -18
  7. data/lib/imw/dataset/workflow.rb +126 -65
  8. data/lib/imw/dataset.rb +56 -82
  9. data/lib/imw/files/basicfile.rb +3 -3
  10. data/lib/imw/files/compressed_files_and_archives.rb +23 -37
  11. data/lib/imw/files/csv.rb +2 -1
  12. data/lib/imw/files/directory.rb +62 -0
  13. data/lib/imw/files/excel.rb +84 -0
  14. data/lib/imw/files/sgml.rb +4 -23
  15. data/lib/imw/files.rb +62 -47
  16. data/lib/imw/packagers/archiver.rb +19 -1
  17. data/lib/imw/packagers/s3_mover.rb +8 -0
  18. data/lib/imw/parsers/html_parser/matchers.rb +251 -268
  19. data/lib/imw/parsers/html_parser.rb +181 -176
  20. data/lib/imw/parsers.rb +1 -1
  21. data/lib/imw/repository.rb +35 -0
  22. data/lib/imw/runner.rb +114 -0
  23. data/lib/imw/utils/extensions/core.rb +0 -16
  24. data/lib/imw/utils/paths.rb +0 -28
  25. data/lib/imw.rb +21 -32
  26. metadata +11 -19
  27. data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +0 -37
  28. data/lib/imw/dataset/datamapper.rb +0 -66
  29. data/lib/imw/dataset/loaddump.rb +0 -50
  30. data/lib/imw/dataset/old/file_collection.rb +0 -88
  31. data/lib/imw/dataset/old/file_collection_utils.rb +0 -71
  32. data/lib/imw/dataset/scaffold.rb +0 -132
  33. data/lib/imw/dataset/scraped_uri.rb +0 -305
  34. data/lib/imw/dataset/scrub/old_working_scrubber.rb +0 -87
  35. data/lib/imw/dataset/scrub/scrub.rb +0 -147
  36. data/lib/imw/dataset/scrub/scrub_simple_url.rb +0 -38
  37. data/lib/imw/dataset/scrub/scrub_test.rb +0 -60
  38. data/lib/imw/dataset/scrub/slug.rb +0 -101
  39. data/lib/imw/dataset/stats/counter.rb +0 -23
  40. data/lib/imw/dataset/stats.rb +0 -73
@@ -1,305 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- module Linkish
3
- def self.included base
4
- base.class_eval do
5
- include DataMapper::Resource
6
- include Infochimps::Resource
7
- property :id, Integer, :serial => true
8
- property :full_url, String, :length => 255, :nullable => false, :unique_index => true
9
- has_handle
10
- alias_method :handle_generator, :full_url
11
- has_time_and_user_stamps
12
- #
13
- property :name, String, :length => 255, :nullable => false, :default => ''
14
- #
15
- property :file_path, String, :length => 1024
16
- property :file_time, DateTime
17
- property :file_size, Integer
18
- property :file_sha1, String, :length => 40
19
- property :tried_fetch, DataMapper::Resource::Boolean
20
- property :fetched, DataMapper::Resource::Boolean
21
- #
22
- before :create, :make_uuid_and_handle
23
- before :create, :update_from_file!
24
- end
25
- base.extend ClassMethods
26
- end
27
-
28
- # ===========================================================================
29
- #
30
- # Delegate methods to uri
31
- #
32
- def uri
33
- @uri ||= Addressable::URI.parse(self.full_url)
34
- end
35
- # Dispatch anything else to the aggregated uri object
36
- def method_missing method, *args
37
- if self.uri.respond_to?(method)
38
- self.uri.send(method, *args)
39
- else
40
- super method, *args
41
- end
42
- end
43
-
44
- def to_s
45
- "<a href='#{self.uri.to_s}'>#{self.name}</a>" # <-- !! not escaped !!
46
- end
47
-
48
- # ===========================================================================
49
- #
50
- # ID, naming, etc
51
- #
52
- def normalize_url!
53
- u = Addressable::URI.parse(self.full_url).normalize
54
- self.full_url = u.to_s
55
- end
56
-
57
- # ===========================================================================
58
- #
59
- # Properly belongs in FileStore module
60
- #
61
- #
62
- # Refresh cached properties from our copy of the asset.
63
- #
64
- def update_from_file!
65
- self.make_uuid_and_handle # make sure this happened
66
- # Set the file path
67
- self.file_path = self.to_file_path if self.file_path.blank?
68
- # FIXME -- kludge to ripd_root
69
- if ! File.exist?(actual_path)
70
- self.fetched = false
71
- else
72
- self.fetched = self.tried_fetch = true
73
- self.file_size = File.size( actual_path)
74
- self.file_time = File.mtime(actual_path)
75
- end
76
- self.fetched
77
- end
78
- def actual_path
79
- path_to(:ripd_root, self.file_path)
80
- end
81
-
82
- # ===========================================================================
83
- #
84
- # Properly belongs in own module
85
- #
86
-
87
- IMW_WGET_OPTIONS = {
88
- :root => :ripd_root,
89
- :wait => 2,
90
- :noretry => true,
91
- :log_level => Logger::DEBUG,
92
- :clobber => false,
93
- }
94
#
# Fetch from the web
#
# Downloads full_url to actual_path by shelling out to wget.
#
# options is merged over IMW_WGET_OPTIONS; the keys read here are :root,
# :clobber, :log_level and :wait. The caller's hash is not modified.
#
# Returns nil (after logging "Skipping ...") when file_path already exists
# and :clobber is false; otherwise returns the value of +fetched+ as
# refreshed by update_from_file!.
#
def wget options={}
  require 'shellwords'
  # Non-destructive merge: reverse_merge! would write our defaults into the
  # hash the caller handed us.
  options = options.reverse_merge IMW_WGET_OPTIONS
  cd path_to(options[:root]) do
    if (not options[:clobber]) && File.file?(file_path) then
      IMW.log.add options[:log_level], "Skipping #{file_path}"; return
    end
    # Do the fetch
    mkdir_p File.dirname(actual_path)
    # defaults are --connect-timeout=infinity --read-timeout=900 --tries=20 acc. to man page
    #
    # The command goes through a shell (backticks), so the URL and the
    # destination path are shell-escaped rather than merely double-quoted:
    # inside double quotes the shell still expands $(...) and backquotes.
    url_arg  = Shellwords.escape(full_url.to_s)
    dest_arg = Shellwords.escape(actual_path.to_s)
    cmd = %Q{wget -nv #{url_arg} -O#{dest_arg} --connect-timeout=5 --read-timeout=10 --tries=1 &}
    IMW.log.add(options[:log_level], cmd)
    IMW.log.add(options[:log_level], `#{cmd}`)
    self.tried_fetch = true
    sleep options[:wait] # please hammer don't hurt em
    update_from_file!
    self.save
    return self.fetched
  end
end
116
-
117
- #
118
- #
119
- #
120
- def contents options={}
121
- wget options
122
- if fetched
123
- File.open actual_path
124
- end
125
- end
126
-
127
- # ===========================================================================
128
- #
129
- # Properly belongs in FileStore
130
- #
131
-
132
- protected
133
- #
134
- # The standard file path for this url's ripped cache
135
- #
136
- # * leading directory from reverse.dotted.host_scheme:port:user@password
137
- # * normalized path/file?query#fragment
138
- # * uuid formed from the
139
- #
140
- def to_file_path
141
- file_path_str = ""
142
- file_path_str << to_file_path_root_part
143
- file_path_str << to_file_path_path_part
144
- file_path_str << to_file_path_file_part
145
- file_path_str = self.class.path_str_encode(file_path_str)
146
- self.class.validate_roundtrip(file_path_str)
147
- file_path_str
148
- end
149
- def file_timestamp
150
- file_time.strftime("%Y%m%d-%H%M%S")
151
- end
152
- def to_file_path_with_timestamp
153
- to_file_path + file_timestamp
154
- end
155
- #
156
- # revhost_scheme:port:user@password -- omitting _scheme if it's http, and
157
- # omitting :port:user@password if all three are blank.
158
- #
159
- def to_file_path_root_part
160
- root_part_str = ""
161
- tld_host_frag = self.class.tier_path_segment(revhost, /^([^\.]+)\.([^\.]{1,2})/)
162
- root_part_str << revhost
163
- root_part_str << "_#{uri.scheme}" unless uri.scheme == 'http'
164
- root_part_str << ":#{uri.port}:#{uri.user}@#{uri.password}" unless uri.simple?
165
- root_part_str
166
- end
167
- def to_file_path_path_part
168
- uri.path.to_s
169
- end
170
- def to_file_path_file_part
171
- file_path_str = ""
172
- file_path_str << "?#{uri.query}" unless uri.query.nil?
173
- file_path_str << "##{uri.fragment}" unless uri.fragment.nil?
174
- file_path_str << "-#{self.uuid}"
175
- end
176
- public
177
-
178
-
179
- module ClassMethods
180
- #
181
- # find_or_creates from url
182
- #
183
- # url is heuristic_parse'd and normalized by Addressable before lookup:
184
- # "Converts an input to a URI. The input does not have to be a valid URI —
185
- # the method will use heuristics to guess what URI was intended. This is not
186
- # standards compliant, merely user-friendly."
187
- #
188
- def find_or_create_from_url url_str
189
- link = self.find_or_new_from_url url_str
190
- link.save
191
- link
192
- end
193
- def find_or_new_from_url url_str # :nodoc:
194
- url_str = Addressable::URI.heuristic_parse(url_str).normalize.to_s
195
- link = self.first( :full_url => url_str ) || self.new( :full_url => url_str )
196
- link.make_uuid_and_handle
197
- link.update_from_file!
198
- link
199
- end
200
- def find_or_create_from_file_path ripd_file
201
- url_str = Link.url_from_file_path(ripd_file)
202
- link = self.first( :full_url => url_str.to_s ) || self.new( :full_url => url_str.to_s )
203
- link.file_path = ripd_file
204
- link.make_uuid_and_handle
205
- link.update_from_file!
206
- link.save
207
- link
208
- end
209
#
# Decode url from its file_path
#
# fp is first run through path_str_decode and must then look like
#
#   tld/rh/revhost[_scheme][:port:user@password]/[dirs/]file-uuid
#
# where tld and rh are the two directory tiers, revhost is the dot-reversed
# host name and uuid is 32 lowercase hex digits.
#
# Returns the url as a String; the scheme defaults to 'http' when the path
# carries none. Raises RuntimeError when fp does not match that layout.
#
def url_from_file_path fp
  fp = path_str_decode(fp)
  m = (%r{\A
        (#{Addressable::URI::HOST_TLD})     # tld tier
        /(..?)                              # revhost tier
        /([^/\:_]+)                         # revhost
        (?:_([^/\:]+))?                     # _scheme
        (?::(\d*):([^/]*)@([^@/]*?))?       # :port:user@password
        /(?:(.*?)/)?                        # /dirs/
        ([^/]*)                             # file
        -([a-f0-9]{32})                     # -uuid
      \z}x.match(fp))
  raise "Can't extract url from file path #{fp}" if !m
  # The pattern has ten capture groups. The two tier directories and the
  # uuid are not needed to rebuild the url, but they must still be consumed
  # here so the remaining fields line up with their groups.
  _tld_tier, _revhost_tier, fp_host, fp_scheme, fp_port, fp_user, fp_pass, fp_path, fp_file, _fp_uuid = m.captures
  fp_host     = fp_host.split('.').reverse.join('.')
  fp_scheme ||= 'http'
  fp_pass     = ":#{fp_pass}"             unless fp_pass.blank?
  # user[:password]@ -- fp_pass already carries its leading colon (or is blank)
  fp_userpass = "#{fp_user}#{fp_pass}@"   unless fp_user.blank?
  fp_port     = ":#{fp_port}"             unless fp_port.blank?
  fp_path     = File.join(*[fp_path, fp_file].compact)
  "#{fp_scheme}://#{fp_userpass}#{fp_host}#{fp_port}/#{fp_path}"
end
234
#
# to control files-per-directory madness, take a path segment like "foobar" in
#   blah.com/top/foobar/directory
# and transform into
#   blah.com/top/fo/foobar/directory
#
# path_seg is the segment to tier; re is matched against it and each of its
# capture groups becomes one leading directory tier.
#
# Ex.
#   self.class.tier_path_segment('a_username')
#   # => 'a_/a_username'
#   self.class.tier_path_segment('1')
#   # => '1/1'
#   self.class.tier_path_segment('com.twitter', /^([^\.]+)\.([^\.]{1,2})/)
#   # => 'com/tw/com.twitter'
#
# Raises RuntimeError when re does not match path_seg or captures nothing.
#
# NOTE(review): this is declared with `self.` inside ClassMethods, so it is a
# method of that module itself -- confirm that `base.extend ClassMethods`
# callers can actually reach it as the examples above assume.
#
def self.tier_path_segment(path_seg, re=/(..?)/)
  # A failed match returns nil; check it before asking for captures so the
  # descriptive error below is raised instead of a NoMethodError on nil.
  match    = re.match(path_seg)
  frag_seg = match ? match.captures : []
  raise "Can't tier path_seg #{path_seg} using #{re}" if frag_seg.empty?
  File.join(* [frag_seg, path_seg].flatten )
end
253
- #
254
- #
255
- # It's really bad if you can't roundtrip --
256
- # since saving is the rare case (only done once!) we insist on checking.
257
- #
258
- def self.validate_roundtrip file_path_str
259
- # uu = self.class.url_from_file_path(file_path_str)
260
- # puts "*"*75, uri.to_hash.inspect, ['path str', file_path_str, 'uri', uri.to_s, 'rt', uu.to_s].inspect
261
- return_trip_url = Addressable::URI.parse(self.class.url_from_file_path(file_path_str))
262
- raise "crapsticks: uri doesn't roundtrip #{file_path_str} to #{uri.to_s}: #{return_trip_url}" if return_trip_url != uri
263
- end
264
#
# Uses a similar scheme as the 'Quoted Printable' encoding, but more strict
# and without linebreaking or anything. The intent is to reversibly and
# recognizably store URLs to disk with names that (apart from path) do not
# need to be further escaped in filesystem, URL, database or HTML.
#
# The only characters in a path_encoded string are alpha-numeric /_-.+
#
# Rules:
# * Any character that is not alphanumeric, and is not /_-. is replaced by
#   whatever path_encode_chars returns for it (a plus sign + followed by the
#   byte's upper-case hex encoding, which is what path_str_decode reverses).
#
# * Furthermore, in any sequence of repeated '.' characters, all after the
#   first are hex encoded; same with '/'.
#
# Ex.
#   path_str_encode("result.php?use[]=VCB")
#   # => "result.php+3Fuse+5B+5D+3DVCB"
#   path_str_encode("a..b//c")
#   # => "a.+2Eb/+2Fc"
#
# Code inspired by Glenn Parker's response to Ruby Quiz #23:
# http://www.rubyquiz.com/quiz23.html
#
def path_str_encode(str)
  # All three rules are applied in ONE pass. Chaining separate gsubs would
  # re-encode the '+' markers produced by an earlier step, and calling gsub
  # without using its return value (it does not modify str) drops the work.
  str.gsub(%r{\.\.+|//+|[^A-Za-z0-9/_\-\.]+}) do |chars|
    head = chars[0, 1]
    if head == '.' || head == '/'
      # run of repeated separators: keep the first, encode the rest
      head + path_encode_chars(chars[1..-1])
    else
      path_encode_chars(chars)
    end
  end
end
290
- #
291
- # See the notes in path_encode
292
- #
293
- def path_str_decode(str)
294
- str.gsub(/\+([\dA-F]{2})/){ $1.hex.chr }
295
- end
296
- protected
297
- def path_encode_chars(chars) # :nodoc:
298
- # send each character to an equals sign followed by its uppercase hex encoding
299
- encoded = "";
300
- chars.each_byte{|c| encoded << "+%02X" % c }
301
- encoded
302
- end
303
- public
304
- end
305
- end
@@ -1,87 +0,0 @@
1
-
2
- def self.url_from_file_path fp
3
- # FIXME -- doesn't work with extension preservation
4
- unless m = (%r{\A([^/_]+)(_[^/]+)?/(?:(.*?)/)?([^/]*)-([a-f0-9]{30,32})\z}.match(fp)) then
5
- # m1 = %r{\A([^/_]+)(_[^/]+)?/(?:(.*?))-([a-f0-9]{28,})}i.match(fp);
6
- raise "Bad match to #{fp}"
7
- end
8
- fp_host, fp_scheme, fp_path, fp_file, fp_uuid, fp_ext = m.captures
9
- fp_host = fp_host.split('.').reverse.join('.')
10
- fp_scheme ||= 'http'
11
- fp_path = File.join(*[fp_path, fp_file].compact) # FIXME -- no ext
12
- url = Addressable::URI.new(fp_scheme, nil, nil, fp_host, nil, fp_path, nil, nil)
13
- unless m = (%r{\A([^/_]+)(_[^/]+)?/(?:(.*?)/)?([^/]*)-([a-f0-9]{32})\z}.match(fp)) then
14
- # warn "Bad luck!!! #{url.path} hash is #{fp_uuid} vs #{UUID.sha1_create(UUID_INFOCHIMPS_LINKS_NAMESPACE, url.to_s).hexdigest}"
15
- end
16
- url
17
- end
18
-
19
- #
20
- # returns [dirname, basename, ext] for the file_path
21
- # ext is determined by basename_ext_splitter
22
- #
23
- def path_split
24
- path_split_str path
25
- end
26
-
27
- # lowercase; only a-z, num, . -
28
- def scrubbed_revhost
29
- return unless revhost
30
- revhost.downcase.gsub(/[^a-z0-9\.\-]+/i, '') # note: no _
31
- end
32
-
33
- cattr_accessor :basename_ext_splitter
34
- BASENAME_EXT_SPLIT_SMART = /(.+?)\.(tar\.gz|tar\.bz2|[^\.]+)/
35
- BASENAME_EXT_NO_SPLIT = /(.+?)()/
36
- self.basename_ext_splitter = BASENAME_EXT_NO_SPLIT
37
-
38
#
# Like File.split, but additionally splits the basename into name and
# extension using the capture groups of basename_ext_splitter.
#
# Returns [dirname, basename, ext]. dirname is '' when str has no directory
# part; ext is '' when the splitter is unset or does not match.
#
# With BASENAME_EXT_SPLIT_SMART as the splitter:
#   a/foo.tar.gz  => ['a', 'foo', 'tar.gz']
#   foo.yaml      => ['',  'foo', 'yaml']
# With BASENAME_EXT_NO_SPLIT as the splitter:
#   a/foo.tar.gz  => ['a', 'foo.tar.gz', '']
#
# A str ending in '/' (e.g. 'a/b/') has no basename to split off and is
# returned whole as the basename, with dirname ''.
#
def path_split_str str
  # The first test decides whether there is a directory part at all; the
  # second does the actual split. The second can still fail (trailing
  # slash), so keep its result and fall back instead of calling .captures
  # on nil.
  dir_match = (str =~ %r{/.+\z}) && %r{\A(.*)/([^/]+)\z}.match(str)
  if dir_match
    dirname, basename = dir_match.captures
  else
    dirname, basename = ['', str]
  end
  # Get basename, extension (as given by capture groups in basename_ext_splitter)
  if basename_ext_splitter && (m = /\A#{basename_ext_splitter}\z/i.match(basename))
    basename, ext = m.captures
  else
    basename, ext = [basename, '']
  end
  [dirname, basename, ext]
end
60
-
61
- # remove all blank components, join the rest with separator
62
- def join_non_blank separator, *strs
63
- strs.reject(&:blank?).join(separator)
64
- end
65
-
66
- # only a-z A-Z, num, .-_/
67
- def scrubbed_path
68
- path_part = path
69
- # colons into /
70
- path_part = path_part.gsub(%r{\:+}, '/')
71
- # Kill weird chars
72
- path_part = path_part.gsub(%r{[^a-zA-Z0-9\.\-_/]+}, '_')
73
- # Compact (killing foo/../bar, etc)
74
- path_part = path_part.gsub(%r{/[^a-zA-Z0-9]+/}, '/').gsub(%r{/\.\.+/}, '.')
75
- # Kill leading & trailing non-alnum
76
- path_part = path_part.gsub(%r{^[^a-zA-Z0-9]+}, '').gsub(%r{[^a-zA-Z0-9]+$}, '')
77
- end
78
-
79
- #
80
- # name for this URL regarded as a file (instance)
81
- #
82
- def to_file_path
83
- dirname, basename, ext = path_split_str(scrubbed_path)
84
- basename = join_non_blank '-', basename, uuid
85
- basename = join_non_blank '.', basename, ext
86
- join_non_blank '/', root_path, dirname, basename
87
- end
@@ -1,147 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- require 'rubygems'
3
- require 'active_support'
4
- require 'uuidtools'
5
-
6
- module Scrub
7
- class Generic
8
- # A regular expression character group
9
- # (a bunch of characters ready to drop into /[#{validator}]*/)
10
- # whitelisting allowed characters
11
- #
12
- # Must be overridden in child class
13
- class_inheritable_accessor :validator
14
-
15
- # Sentence fragment for error message on failed validation.
16
- class_inheritable_accessor :complaint
17
- self.complaint = "has characters I can't understand"
18
-
19
- # Proc or string or anything that can be 2nd arg to gsub
20
- # to sanitize
21
- class_inheritable_accessor :replacer
22
- self.replacer = '-'
23
-
24
- # A regular expression to sanitize objects
25
- # if unset or nil, the validator char group
26
- class_inheritable_accessor :sanitizer
27
-
28
- # unless overridden or set expressly, just use the
29
- # validator
30
- def sanitizer
31
- @sanitizer || self.validator
32
- end
33
-
34
# Replace every run of characters outside the validator's character group
# with +replacer+.
#
# str is converted with to_s first, so nil yields "". Returns a new String.
def sanitize str
  str = str.to_s
  # validator is declared as a Regexp literal in the subclasses. Regexp#to_s
  # renders as "(?-mix:...)", and dropping that into [...] would add
  # ( ) : i x and the range ?-m (which covers every uppercase letter) to the
  # whitelist. Regexp#source gives just the character group as written.
  allowed = validator.respond_to?(:source) ? validator.source : validator.to_s
  str.gsub(%r{([^#{allowed}]+)}u, replacer)
end
38
-
39
# True-ish (a MatchData) when str consists solely of characters from the
# validator's character group -- the empty string included -- and nil
# otherwise.
def valid? str
  # Use Regexp#source, not #to_s: to_s renders as "(?-mix:...)", which would
  # leak ( ) : i x and the range ?-m into the character class.
  allowed = validator.respond_to?(:source) ? validator.source : validator.to_s
  %r{\A([#{allowed}]*)\z}u.match(str)
end
42
- end
43
-
44
- #
45
- # A permissive, ASCII-only name string - no control chars, newlines, backslash
46
- # or <> angle brackets
47
- #
48
- class Title < Scrub::Generic
49
- self.complaint = "should only contain basic keyboard characters (and should not use \\ &lt; or &gt;)."
50
- self.validator = %r{a-zA-Z0-9_ ~\!@#\$%\^&\*\(\)\-\+=;\:'"`\[\]\{\}\|,\?\.\/}u
51
- end
52
-
53
- #
54
- # A permissive, ASCII-only name string - no control chars, newlines, backslash
55
- # or <> angle brackets
56
- #
57
- class UnicodeTitle < Scrub::Title
58
- self.complaint = "should only contain keyboard characters (and should not use \\ &lt; or &gt;)."
59
- self.validator = %r{[:alpha:][:digit:]#{Scrub::Title.validator}}u
60
- end
61
-
62
- #
63
- # Visible characters and spaces (i.e. anything except control characters, etc.)
64
- #
65
- class FreeText < Scrub::Generic
66
- self.complaint = "should not contain control characters or that kind of junk."
67
- self.validator = %r{[:print:]\n\t}u
68
- end
69
-
70
- module BeginsWithAlpha
71
- mattr_accessor :slug
72
- self.slug = 'x'
73
# prepend #{slug}#{replacer} to the string if it starts with non-alpha.
# so, for instance, with slug 'x' and replacer '_':
#   '23jumpstreet' => 'x_23jumpstreet'
#
# Runs the underlying sanitize first, then checks the very first character
# of the result.
def sanitize_with_begins_with_alpha str
  str = sanitize_without_begins_with_alpha str
  # \A rather than ^: ^ also matches after an embedded newline, which would
  # let a string through whose first character is not a letter.
  # slug (default 'x') is used instead of a literal so the accessor declared
  # for it actually takes effect.
  str = slug + replacer + str if (str !~ /\A[a-z]/i) # call at end of chain!
  str
end
80
- def valid_with_begins_with_alpha? str
81
- (str =~ /^[a-z]/i) && valid_without_begins_with_alpha?(str)
82
- end
83
- def self.included base
84
- base.alias_method_chain :sanitize, :begins_with_alpha # unless defined?(base.sanitize_without_begins_with_alpha)
85
- base.alias_method_chain :valid?, :begins_with_alpha # unless defined?(base.valid_without_begins_with_alpha?)
86
- end
87
- end
88
-
89
- #
90
- # insist that a string be lowercased.
91
- #
92
- module Lowercased
93
- def sanitize_with_lowercased str
94
- str = sanitize_without_lowercased str
95
- str.downcase # call at end of chain!
96
- end
97
- def valid_with_lowercase? str
98
- (str !~ /[[:upper:]]/u) && valid_without_lowercase?(str)
99
- end
100
- def self.included base
101
- base.alias_method_chain :sanitize, :lowercased # unless defined?(base.sanitize_without_lowercased)
102
- base.alias_method_chain :valid?, :lowercase # unless defined?(base.valid_without_lowercase?)
103
- end
104
- end
105
-
106
- #
107
- # start with a letter, and contain only A-Za-z0-9_
108
- #
109
- class Identifier < Scrub::Generic
110
- self.complaint = "should be an identifier: it should start with a letter, and contain only a-z, 0-9 and '_'."
111
- self.validator = %r{a-z0-9_}u
112
- self.replacer = '_'
113
- include Scrub::BeginsWithAlpha
114
- include Scrub::Lowercased
115
- end
116
-
117
- #
118
- # start with a letter, and contain only A-Za-z0-9_
119
- #
120
- class Handle < Scrub::Generic
121
- self.complaint = "should be an identifier: it should start with a letter, and contain only a-z, 0-9 and '_'."
122
- self.validator = %r{a-z0-9_}u
123
- self.replacer = '_'
124
- include Scrub::BeginsWithAlpha
125
- include Scrub::Lowercased
126
- end
127
-
128
- # HANDLE_RE = %r{\A[a-z][]*\z}i # ascii, not :alpha: etc.
129
- # HANDLE_MSG = "should start with a letter, and contain only characters like a-z0-9_-."
130
- #
131
- # # "Domain names are restricted to the ASCII letters a through z
132
- # # (case-insensitive), the digits 0 through 9, and the hyphen, with some other
133
- # # restrictions in terms of name length and position of hyphens."
134
- # # (http://en.wikipedia.org/wiki/Domain_name#Overview)
135
- # # http://tools.ietf.org/html/rfc1034
136
- # DOMAIN_RE = %r{\A[a-z][a-z0-9\-][a-z0-9]\z}i # case insensitive
137
- # DOMAIN_MSG = "should look like a domain name."
138
- # DOMAIN_MORE = "only letters, digits or hyphens (-), start with a letter and end with a letter or number."
139
# Message shown when an address fails validation (HTML entities are intentional).
MSG_EMAIL_BAD      = "should look like an email address (you@somethingsomething.com) and include only letters, numbers and .&nbsp;+&nbsp;-&nbsp;&#37; please."
# Local part, as a complete sub-pattern.
RE_EMAIL_NAME      = '[\w\.%\+\-]+'   # what you actually see in practice
# Local part, as the BODY of a character class only (no brackets, no quantifier).
RE_EMAIL_N_RFC2822 = '0-9A-Z!#\$%\&\'\*\+_/=\?^\-`\{|\}~\.' # technically allowed by RFC-2822
# One or more dot-terminated domain labels.
RE_DOMAIN_HEAD     = '(?:[A-Z0-9\-]+\.)+'
# Any two-letter TLD, or one of the listed longer ones.
RE_DOMAIN_TLD      = '(?:[A-Z]{2}|com|org|net|edu|gov|mil|biz|info|mobi|name|aero|jobs|museum)'
RE_EMAIL_OK        = /\A#{RE_EMAIL_NAME}@#{RE_DOMAIN_HEAD}#{RE_DOMAIN_TLD}\z/i
# RE_EMAIL_N_RFC2822 is only a character-class body, so it has to be wrapped
# in [...]+ here; interpolated bare, its | and ^ act as regex operators and
# the pattern cannot match a real address.
RE_EMAIL_RFC2822   = /\A[#{RE_EMAIL_N_RFC2822}]+@#{RE_DOMAIN_HEAD}#{RE_DOMAIN_TLD}\z/i
146
-
147
- end
@@ -1,38 +0,0 @@
1
-
2
-
3
- module IMW
4
- module URIScrubber
5
-
6
- def scrubbed
7
- to_dirpath
8
- end
9
- end
10
- end
11
-
12
- module Scrub
13
- #
14
- # a URL reduced to the simplified form described in the complaint message below
15
- #
16
- class SimplifiedURL < Scrub::Generic
17
- self.complaint = "should follow our zany simplified URL rules: com.host.dot-reversed:schemeifnothttp/path/seg_men-ts/stuff.ext-SHA1ifweird"
18
- self.validator = %r{#{Addressable::URI::SAFE_CHARS}#{Addressable::URI::RESERVED_CHARS}}u
19
- self.replacer = ''
20
- include Scrub::Lowercased
21
- attr_accessor :uri
22
-
23
- def valid? str
24
- str.to_s.downcase == sanitize(str)
25
- end
26
-
27
- def sanitize str
28
- # if this fails just normalize once, or don't set $KCODE: http://bit.ly/1664vp
29
- uri = Addressable::URI.heuristic_parse(str.to_s).normalize
30
- # print [uri.host, uri.host_valid?, uri.path, uri.path_valid?].inspect
31
- if uri.host_valid?
32
- uri.scrubbed
33
- else
34
- uri.uuid_path
35
- end
36
- end
37
- end
38
- end
@@ -1,60 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # -*- coding: utf-8 -*-
3
- require 'scrub'
4
- require 'scrub_simple_url'
5
-
6
- test_strings = [
7
- nil, '', '12', '123', 'simple', 'UPPER', 'CamelCased', 'iden_tifier_23_',
8
- 'twentyfouralphacharslong', 'twentyfiveatozonlyletters', 'hello.-_there@funnychar.com',
9
- "tab\t", "newline\n",
10
- "Iñtërnâtiônàlizætiøn",
11
- 'semicolon;', 'quote"', 'tick\'', 'backtick`', 'percent%', 'plus+', 'space ',
12
- 'leftanglebracket<', 'ampersand&',
13
- "control char-bel\x07",
14
- "http://foo.bar.com/",
15
- "HTTP://FOO.BAR.com",
16
- ".com/zazz",
17
- "scheme://user_name@user_acct:passwd@host-name.museum:9047/path;pathquery/p!a-th~2/path?query=param&amp;query=pa%20ram#fragment",
18
- "http://web.site.com/path/path/file.ext",
19
- "ftp://ftp.site.com/path/path/file.ext",
20
- "/absolute/pathname/file.ext",
21
- "http://foo.bar.com/.hidden_file_with.ext",
22
- "http://foo.bar.com/.hidden_file",
23
- "dir/--/non_alpha_path_segment.ext",
24
- "http://foo.bar.com/dir/../two_dots_in_path",
25
-
26
- ]
27
-
28
-
29
- scrubbers = {
30
- # :unicode_title => Scrub::UnicodeTitle.new,
31
- # :title => Scrub::Title.new,
32
- # :identifier => Scrub::Identifier.new,
33
- # :free_text => Scrub::FreeText.new,
34
- :handle => Scrub::Handle.new,
35
- :simplified_url => Scrub::SimplifiedURL.new,
36
- # :domain => Scrub::Domain.new,
37
- # :email => Scrub::Email.new,
38
- }
39
-
40
- scrubbers.each do |scrubber_name, scrubber|
41
- puts scrubber_name
42
- results = test_strings.map do |test_string|
43
- [!!scrubber.valid?(test_string), scrubber.sanitize(test_string).inspect, test_string.inspect ]
44
- end
45
- results.sort_by{|val,san,orig| val ? 1 : -1 }.each do |val,san,orig|
46
- puts " %-5s %-30s %-30s" % [val,san,orig]
47
- end
48
- end
49
-
50
-
51
-
52
- # 'foo@bar.com', 'foo@newskool-tld.museum', 'foo@twoletter-tld.de', 'foo@nonexistant-tld.qq',
53
- # 'r@a.wk', '1234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890@gmail.com',
54
- # 'hello.-_there@funnychar.com', 'uucp%addr@gmail.com', 'hello+routing-str@gmail.com',
55
- # 'domain@can.haz.many.sub.doma.in',],
56
- # :invalid => [nil, '', '!!@nobadchars.com', 'foo@no-rep-dots..com', 'foo@badtld.xxx', 'foo@toolongtld.abcdefg',
57
- # 'Iñtërnâtiônàlizætiøn@hasnt.happened.to.email', 'need.domain.and.tld@de', "tab\t", "newline\n",
58
- # 'r@.wk', '1234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890@gmail2.com',
59
- # # these are technically allowed but not seen in practice:
60
- # 'uucp!addr@gmail.com', 'semicolon;@gmail.com', 'quote"@gmail.com', 'tick\'@gmail.com', 'backtick`@gmail.com', 'space @gmail.com', 'bracket<@gmail.com', 'bracket>@gmail.com'