imw 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (111) hide show
  1. data/.gitignore +15 -0
  2. data/CHANGELOG +0 -0
  3. data/LICENSE +674 -0
  4. data/README.rdoc +101 -0
  5. data/Rakefile +20 -0
  6. data/VERSION +1 -0
  7. data/etc/imwrc.rb +76 -0
  8. data/lib/imw.rb +42 -0
  9. data/lib/imw/boot.rb +58 -0
  10. data/lib/imw/dataset.rb +233 -0
  11. data/lib/imw/dataset/datamapper.rb +66 -0
  12. data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
  13. data/lib/imw/dataset/loaddump.rb +50 -0
  14. data/lib/imw/dataset/old/file_collection.rb +88 -0
  15. data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
  16. data/lib/imw/dataset/scaffold.rb +132 -0
  17. data/lib/imw/dataset/scraped_uri.rb +305 -0
  18. data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
  19. data/lib/imw/dataset/scrub/scrub.rb +147 -0
  20. data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
  21. data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
  22. data/lib/imw/dataset/scrub/slug.rb +101 -0
  23. data/lib/imw/dataset/stats.rb +73 -0
  24. data/lib/imw/dataset/stats/counter.rb +23 -0
  25. data/lib/imw/dataset/task.rb +38 -0
  26. data/lib/imw/dataset/workflow.rb +81 -0
  27. data/lib/imw/files.rb +110 -0
  28. data/lib/imw/files/archive.rb +113 -0
  29. data/lib/imw/files/basicfile.rb +122 -0
  30. data/lib/imw/files/binary.rb +28 -0
  31. data/lib/imw/files/compressed_file.rb +93 -0
  32. data/lib/imw/files/compressed_files_and_archives.rb +348 -0
  33. data/lib/imw/files/compressible.rb +103 -0
  34. data/lib/imw/files/csv.rb +112 -0
  35. data/lib/imw/files/json.rb +41 -0
  36. data/lib/imw/files/sgml.rb +65 -0
  37. data/lib/imw/files/text.rb +68 -0
  38. data/lib/imw/files/yaml.rb +46 -0
  39. data/lib/imw/packagers.rb +8 -0
  40. data/lib/imw/packagers/archiver.rb +108 -0
  41. data/lib/imw/packagers/s3_mover.rb +28 -0
  42. data/lib/imw/parsers.rb +7 -0
  43. data/lib/imw/parsers/html_parser.rb +382 -0
  44. data/lib/imw/parsers/html_parser/matchers.rb +306 -0
  45. data/lib/imw/parsers/line_parser.rb +87 -0
  46. data/lib/imw/parsers/regexp_parser.rb +72 -0
  47. data/lib/imw/utils.rb +24 -0
  48. data/lib/imw/utils/components.rb +61 -0
  49. data/lib/imw/utils/config.rb +46 -0
  50. data/lib/imw/utils/error.rb +54 -0
  51. data/lib/imw/utils/extensions/array.rb +125 -0
  52. data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
  53. data/lib/imw/utils/extensions/core.rb +43 -0
  54. data/lib/imw/utils/extensions/dir.rb +24 -0
  55. data/lib/imw/utils/extensions/file_core.rb +64 -0
  56. data/lib/imw/utils/extensions/hash.rb +218 -0
  57. data/lib/imw/utils/extensions/hpricot.rb +48 -0
  58. data/lib/imw/utils/extensions/string.rb +49 -0
  59. data/lib/imw/utils/extensions/struct.rb +42 -0
  60. data/lib/imw/utils/extensions/symbol.rb +28 -0
  61. data/lib/imw/utils/extensions/typed_struct.rb +22 -0
  62. data/lib/imw/utils/extensions/uri.rb +59 -0
  63. data/lib/imw/utils/log.rb +67 -0
  64. data/lib/imw/utils/misc.rb +63 -0
  65. data/lib/imw/utils/paths.rb +115 -0
  66. data/lib/imw/utils/uri.rb +59 -0
  67. data/lib/imw/utils/uuid.rb +33 -0
  68. data/lib/imw/utils/validate.rb +38 -0
  69. data/lib/imw/utils/version.rb +12 -0
  70. data/lib/imw/utils/view.rb +113 -0
  71. data/lib/imw/utils/view/dump_csv.rb +112 -0
  72. data/lib/imw/utils/view/dump_csv_older.rb +117 -0
  73. data/spec/data/sample.csv +131 -0
  74. data/spec/data/sample.tsv +131 -0
  75. data/spec/data/sample.txt +131 -0
  76. data/spec/data/sample.xml +653 -0
  77. data/spec/data/sample.yaml +652 -0
  78. data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
  79. data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
  80. data/spec/imw/files/archive_spec.rb +118 -0
  81. data/spec/imw/files/basicfile_spec.rb +121 -0
  82. data/spec/imw/files/bz2_spec.rb +32 -0
  83. data/spec/imw/files/compressed_file_spec.rb +96 -0
  84. data/spec/imw/files/compressible_spec.rb +100 -0
  85. data/spec/imw/files/file_spec.rb +144 -0
  86. data/spec/imw/files/gz_spec.rb +32 -0
  87. data/spec/imw/files/rar_spec.rb +33 -0
  88. data/spec/imw/files/tar_spec.rb +31 -0
  89. data/spec/imw/files/text_spec.rb +23 -0
  90. data/spec/imw/files/zip_spec.rb +31 -0
  91. data/spec/imw/files_spec.rb +38 -0
  92. data/spec/imw/packagers/archiver_spec.rb +125 -0
  93. data/spec/imw/packagers/s3_mover_spec.rb +7 -0
  94. data/spec/imw/parsers/line_parser_spec.rb +96 -0
  95. data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
  96. data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
  97. data/spec/imw/utils/extensions/find_spec.rb +113 -0
  98. data/spec/imw/utils/paths_spec.rb +38 -0
  99. data/spec/imw/workflow/rip/local_spec.rb +89 -0
  100. data/spec/imw/workflow/rip_spec.rb +27 -0
  101. data/spec/rcov.opts +1 -0
  102. data/spec/spec.opts +4 -0
  103. data/spec/spec_helper.rb +32 -0
  104. data/spec/support/archive_contents_matcher.rb +94 -0
  105. data/spec/support/custom_matchers.rb +21 -0
  106. data/spec/support/directory_contents_matcher.rb +61 -0
  107. data/spec/support/extensions.rb +18 -0
  108. data/spec/support/file_contents_matcher.rb +50 -0
  109. data/spec/support/random.rb +210 -0
  110. data/spec/support/without_regard_to_order_matcher.rb +58 -0
  111. metadata +196 -0
@@ -0,0 +1,305 @@
1
+ # -*- coding: utf-8 -*-
2
+ module Linkish
3
# Hook run when Linkish is mixed into a class: declares the persisted
# properties and the create callbacks on +base+, then extends +base+ with
# the finder/encoding helpers in ClassMethods.
def self.included(base)
  base.class_eval do
    include DataMapper::Resource
    include Infochimps::Resource

    # Identity: the url is the unique key and also feeds the handle.
    property :id,          Integer, :serial => true
    property :full_url,    String,  :length => 255, :nullable => false, :unique_index => true
    has_handle
    alias_method :handle_generator, :full_url
    has_time_and_user_stamps

    # Display name used by #to_s.
    property :name,        String,  :length => 255, :nullable => false, :default => ''

    # Cached facts about the local copy (see #update_from_file!).
    property :file_path,   String,  :length => 1024
    property :file_time,   DateTime
    property :file_size,   Integer
    property :file_sha1,   String,  :length => 40
    property :tried_fetch, DataMapper::Resource::Boolean
    property :fetched,     DataMapper::Resource::Boolean

    # Both hooks run before the record is first created, in this order.
    before :create, :make_uuid_and_handle
    before :create, :update_from_file!
  end
  base.extend(ClassMethods)
end
27
+
28
+ # ===========================================================================
29
+ #
30
+ # Delegate methods to uri
31
+ #
32
# The Addressable::URI parsed from full_url. The parsed object is kept in
# @uri, so full_url is only parsed on the first call.
def uri
  return @uri if @uri
  @uri = Addressable::URI.parse(full_url)
end
35
+ # Dispatch anything else to the aggregated uri object
36
# Dispatch anything this object does not define to the aggregated uri object.
# The caller's block is forwarded too, so delegated iterators keep working.
# Anything the uri does not answer either goes up to super unchanged.
def method_missing(method, *args, &block)
  if uri.respond_to?(method)
    uri.send(method, *args, &block)
  else
    super
  end
end

# Keep respond_to? truthful about the methods method_missing delegates.
def respond_to_missing?(method, include_private = false)
  uri.respond_to?(method) || super
end
43
+
44
# Renders the link as an HTML anchor: the url as href, the name as the text.
# Both values are HTML-escaped, so markup or quotes in a scraped url or name
# cannot break out of the attribute or inject tags.
def to_s
  require 'cgi' # stdlib; provides CGI.escapeHTML
  # Old CGI.escapeHTML versions leave ' alone; the href is single-quoted, so
  # escape it explicitly (a no-op where CGI already did).
  escape = lambda { |text| CGI.escapeHTML(text.to_s).gsub("'", '&#39;') }
  "<a href='#{escape.call(uri)}'>#{escape.call(name)}</a>"
end
47
+
48
+ # ===========================================================================
49
+ #
50
+ # ID, naming, etc
51
+ #
52
# Replaces full_url with its Addressable-normalized form and returns the new
# url string. The memoized @uri is dropped so that #uri re-parses the
# normalized url instead of returning the stale pre-normalization object.
def normalize_url!
  normalized = Addressable::URI.parse(full_url).normalize
  @uri = nil
  self.full_url = normalized.to_s
end
56
+
57
+ # ===========================================================================
58
+ #
59
+ # Properly belongs in FileStore module
60
+ #
61
+ #
62
+ # Refresh cached properties from our copy of the asset.
63
+ #
64
# Refresh the cached file properties from our copy of the asset.
#
# Fills in file_path when it is blank, then looks at actual_path: when the
# file is there, fetched and tried_fetch become true and file_size/file_time
# are read from disk; otherwise only fetched is set to false.
#
# Returns the resulting value of fetched.
def update_from_file!
  make_uuid_and_handle # make sure this happened
  self.file_path = to_file_path if file_path.blank?
  # FIXME -- kludge to ripd_root
  on_disk = actual_path
  if File.exist?(on_disk)
    self.tried_fetch = true
    self.fetched     = true
    self.file_size   = File.size(on_disk)
    self.file_time   = File.mtime(on_disk)
  else
    self.fetched = false
  end
  fetched
end
78
# Location of the local copy: file_path resolved against :ripd_root by path_to.
def actual_path
  relative = file_path
  path_to(:ripd_root, relative)
end
81
+
82
+ # ===========================================================================
83
+ #
84
+ # Properly belongs in own module
85
+ #
86
+
87
# Defaults merged underneath whatever options are handed to #wget.
IMW_WGET_OPTIONS = {
  :root      => :ripd_root,     # path_to key for the directory #wget runs in
  :wait      => 2,              # seconds #wget sleeps after each fetch
  :noretry   => true,
  :log_level => Logger::DEBUG,  # severity used for #wget's log lines
  :clobber   => false           # when false, #wget skips files already present
}
94
+ #
95
+ # Fetch from the web
96
+ #
97
#
# Fetch from the web by shelling out to wget.
#
# options are filled in from IMW_WGET_OPTIONS (:root, :wait, :log_level,
# :clobber). Unless :clobber is set, an existing file_path under :root is
# left alone and nil is returned. Otherwise the url is fetched to
# actual_path, the cached file properties are refreshed, the record is saved,
# and the resulting value of fetched is returned.
#
# The url and the target path are shell-escaped: urls come from scraped
# pages, so they must never be interpreted by the shell.
#
def wget options={}
  require 'shellwords' # stdlib; provides Shellwords.escape
  options.reverse_merge! IMW_WGET_OPTIONS
  cd path_to(options[:root]) do
    if (not options[:clobber]) && File.file?(file_path)
      IMW.log.add options[:log_level], "Skipping #{file_path}"
      return
    end
    # Do the fetch
    mkdir_p File.dirname(actual_path)
    # defaults are --connect-timeout=infinity --read-timeout=900 --tries=20 acc. to man page
    # NOTE(review): the trailing & backgrounds wget -- confirm that the sleep
    # below is enough for update_from_file! to see the finished download.
    cmd = "wget -nv #{Shellwords.escape(full_url.to_s)} -O#{Shellwords.escape(actual_path.to_s)} " \
          "--connect-timeout=5 --read-timeout=10 --tries=1 &"
    IMW.log.add(options[:log_level], cmd)
    IMW.log.add(options[:log_level], `#{cmd}`)
    self.tried_fetch = true
    sleep options[:wait] # please hammer don't hurt em
    update_from_file!
    self.save
    return self.fetched
  end
end
116
+
117
+ #
118
+ #
119
+ #
120
# Runs #wget with +options+, then returns an open File on actual_path when
# fetched is set, or nil when it is not. The caller owns (and should close)
# the returned handle.
def contents options={}
  wget options
  File.open(actual_path) if fetched
end
126
+
127
+ # ===========================================================================
128
+ #
129
+ # Properly belongs in FileStore
130
+ #
131
+
132
+ protected
133
+ #
134
+ # The standard file path for this url's ripped cache
135
+ #
136
+ # * leading directory from reverse.dotted.host_scheme:port:user@password
137
+ # * normalized path/file?query#fragment
138
+ # * uuid formed from the
139
+ #
140
# Builds the cache file path for this url: root part, path part and file part
# are concatenated, encoded with the class's path_str_encode, and handed to
# the class's validate_roundtrip before being returned.
def to_file_path
  raw_path = to_file_path_root_part + to_file_path_path_part + to_file_path_file_part
  encoded  = self.class.path_str_encode(raw_path)
  self.class.validate_roundtrip(encoded)
  encoded
end
149
# file_time formatted as YYYYMMDD-HHMMSS.
def file_timestamp
  stamp_format = "%Y%m%d-%H%M%S"
  file_time.strftime(stamp_format)
end
152
# The cache file path with the file timestamp appended directly to it.
def to_file_path_with_timestamp
  base_path = to_file_path
  stamp     = file_timestamp
  base_path + stamp
end
155
+ #
156
+ # revhost_scheme:port:user@password -- omitting _scheme if it's http, and
157
+ # omitting :port:user@password if all three are blank.
158
+ #
159
#
# tld/ho/revhost_scheme:port:user@password -- omitting _scheme if it's http,
# and omitting :port:user@password when uri.simple? is true.
#
# The reversed host is prefixed with its tier directories (for example
# 'com.twitter' becomes 'com/tw/com.twitter'). This is the layout that
# url_from_file_path parses back; previously the tiered value was computed
# and then thrown away in favour of the bare revhost.
#
def to_file_path_root_part
  root_part_str = self.class.tier_path_segment(revhost, /^([^\.]+)\.([^\.]{1,2})/)
  root_part_str << "_#{uri.scheme}" unless uri.scheme == 'http'
  root_part_str << ":#{uri.port}:#{uri.user}@#{uri.password}" unless uri.simple?
  root_part_str
end
167
# The uri's path as a string (an absent path becomes '').
def to_file_path_path_part
  url_path = uri.path
  url_path.to_s
end
170
# Tail of the cache file name: '?query' and '#fragment' when the uri has
# them, always followed by '-' and this record's uuid.
def to_file_path_file_part
  pieces = []
  pieces << "?#{uri.query}"    unless uri.query.nil?
  pieces << "##{uri.fragment}" unless uri.fragment.nil?
  pieces << "-#{uuid}"
  pieces.join
end
176
+ public
177
+
178
+
179
# Class-level helpers added to every class that includes Linkish (see
# Linkish.included, which extends the class with this module).
#
# Every helper is an ordinary instance method of this module: only those are
# picked up by +extend+, so a helper written as +def self.name+ here would
# never be reachable from the including class.
module ClassMethods
  #
  # Finds the record whose full_url matches +url_str+, or builds a new one;
  # either way the record is saved and returned.
  #
  # url is heuristic_parse'd and normalized by Addressable before lookup:
  # "Converts an input to a URI. The input does not have to be a valid URI --
  # the method will use heuristics to guess what URI was intended. This is not
  # standards compliant, merely user-friendly."
  #
  def find_or_create_from_url url_str
    link = find_or_new_from_url(url_str)
    link.save
    link
  end

  # Same lookup as find_or_create_from_url, but the record is not saved.
  def find_or_new_from_url url_str # :nodoc:
    url_str = Addressable::URI.heuristic_parse(url_str).normalize.to_s
    link = first(:full_url => url_str) || new(:full_url => url_str)
    link.make_uuid_and_handle
    link.update_from_file!
    link
  end

  #
  # Finds or builds the record for the url encoded in +ripd_file+, points its
  # file_path at +ripd_file+, refreshes it from disk, saves and returns it.
  # The url is decoded by this class's own url_from_file_path rather than by
  # a hard-coded model name.
  #
  def find_or_create_from_file_path ripd_file
    url_str = url_from_file_path(ripd_file).to_s
    link = first(:full_url => url_str) || new(:full_url => url_str)
    link.file_path = ripd_file
    link.make_uuid_and_handle
    link.update_from_file!
    link.save
    link
  end

  #
  # Decode url from its file_path
  #
  # +fp+ is first run through path_str_decode and must then look like
  #   tld/ho/revhost[_scheme][:port:user@password]/[dirs/]file-uuid
  # Returns the url as a String; raises RuntimeError when +fp+ does not match.
  #
  def url_from_file_path fp
    fp = path_str_decode(fp)
    m = (%r{\A
          (#{Addressable::URI::HOST_TLD})       # tld tier
          /(..?)                                # revhost tier
          /([^/\:_]+)                           # revhost
          (?:_([^/\:]+))?                       # _scheme
          (?::(\d*):([^/]*)@([^@/]*?))?         # :port:user@password
          /(?:(.*?)/)?                          # /dirs/
          ([^/]*)                               # file
          -([a-f0-9]{32})                       # -uuid
          \z}x.match(fp))
    raise "Can't extract url from file path #{fp}" if !m
    # Ten groups are captured; the two tier directories and the uuid are only
    # there to anchor the match and are not part of the url.
    _tld_tier, _host_tier, fp_host, fp_scheme, fp_port, fp_user, fp_pass, fp_path, fp_file, _fp_uuid = m.captures
    fp_host     = fp_host.split('.').reverse.join('.')
    fp_scheme ||= 'http'
    fp_pass     = ":#{fp_pass}"            unless fp_pass.blank?
    fp_userpass = "#{fp_user}#{fp_pass}@"  unless fp_user.blank?
    fp_port     = ":#{fp_port}"            unless fp_port.blank?
    fp_path     = File.join(*[fp_path, fp_file].compact)
    "#{fp_scheme}://#{fp_userpass}#{fp_host}#{fp_port}/#{fp_path}"
  end

  #
  # to control files-per-directory madness, take a path segment like "foobar" in
  #   blah.com/top/foobar/directory
  # and transform into
  #   blah.com/top/fo/foobar/directory
  #
  # The tier directories are the capture groups of +re+ matched against
  # +path_seg+. Raises RuntimeError when +re+ does not match or captures
  # nothing.
  #
  # Ex.
  #   self.class.tier_path_segment('a_username')
  #   # => 'a_/a_username'
  #   self.class.tier_path_segment('1')
  #   # => '1/1'
  #   self.class.tier_path_segment('com.twitter', /^([^\.]+)\.([^\.]{1,2})/)
  #   # => 'com/tw/com.twitter'
  #
  def tier_path_segment(path_seg, re=/(..?)/)
    matched  = re.match(path_seg)
    frag_seg = matched ? matched.captures.compact : []
    raise "Can't tier path_seg #{path_seg} using #{re}" if frag_seg.empty?
    File.join(*(frag_seg + [path_seg]))
  end

  #
  # It's really bad if you can't roundtrip --
  # since saving is the rare case (only done once!) we insist on checking.
  #
  # Decodes +file_path_str+ back into a url (raising if that is impossible)
  # and returns it as an Addressable::URI. When +expected_uri+ is given, also
  # raises RuntimeError unless the decoded url equals it.
  #
  def validate_roundtrip file_path_str, expected_uri=nil
    return_trip_url = Addressable::URI.parse(url_from_file_path(file_path_str))
    if expected_uri && (return_trip_url != Addressable::URI.parse(expected_uri))
      raise "crapsticks: uri doesn't roundtrip #{file_path_str} to #{expected_uri}: #{return_trip_url}"
    end
    return_trip_url
  end

  #
  # Uses a similar scheme as the 'Quoted Printable' encoding, but more strict
  # and without linebreaking or anything. The intent is to reversibly and
  # recognizably store URLs to disk with names that (apart from path) do not
  # need to be further escaped in filesystem, URL, database or HTML.
  #
  # The only characters in a path_encoded string are alpha-numeric /_-.+
  #
  # Rules:
  # * Any character that is not alphanumeric, and is not /_-. is encoded as a
  #   plus sign + followed by its upper-case hex encoding (one +XX per byte).
  #
  # * Furthermore, in any sequence of repeated '.' characters, all after the
  #   first are hex encoded; same with '/'.
  #
  # Ex.
  #   path_str_encode("www.measuringworth.com/datasets/consumer/result.php?use[]=VCB&year_source=1900")
  #   # => "www.measuringworth.com/datasets/consumer/result.php+3Fuse+5B+5D+3DVCB+26year_source+3D1900"
  #   path_str_encode("a..b//c")
  #   # => "a.+2Eb/+2Fc"
  #
  # Code inspired by "Glenn Parker's response to ruby quiz #23"http://www.rubyquiz.com/quiz23.html
  #
  def path_str_encode(str)
    # Disallowed characters go first: the two passes below introduce '+'
    # escapes of their own, which must not be encoded a second time.
    encoded = str.gsub(%r{[^A-Za-z0-9/_\-\.]+}) { |chars| path_encode_chars(chars) }
    # Keep the first '.' or '/' of a run and hex-encode only the repeats.
    encoded = encoded.gsub(%r{\.(\.+)}) { '.' + path_encode_chars($1) }
    encoded.gsub(%r{\/(\/+)}) { '/' + path_encode_chars($1) }
  end

  #
  # Inverse of path_str_encode: every +XX (two upper-case hex digits) becomes
  # the byte it stands for.
  #
  def path_str_decode(str)
    str.gsub(/\+([\dA-F]{2})/) { $1.hex.chr }
  end

  protected
  def path_encode_chars(chars) # :nodoc:
    # send each byte to a plus sign followed by its uppercase hex encoding
    encoded = ""
    chars.each_byte { |c| encoded << "+%02X" % c }
    encoded
  end
  public
end
305
+ end
@@ -0,0 +1,87 @@
1
+
2
# Rebuilds the url for a ripped file path of the form
#   revhost[_scheme]/[dirs/]file-uuid
# and returns it as an Addressable::URI. Raises RuntimeError when +fp+ does
# not have that shape.
#
# The scheme is captured without its leading underscore, so a path such as
# 'com.x_https/...' yields the scheme 'https' rather than '_https'.
def self.url_from_file_path fp
  # FIXME -- doesn't work with extension preservation
  m = %r{\A([^/_]+)(?:_([^/]+))?/(?:(.*?)/)?([^/]*)-([a-f0-9]{30,32})\z}.match(fp)
  raise "Bad match to #{fp}" unless m
  fp_host, fp_scheme, fp_path, fp_file, _fp_uuid = m.captures
  fp_host     = fp_host.split('.').reverse.join('.')
  fp_scheme ||= 'http'
  fp_path     = File.join(*[fp_path, fp_file].compact) # FIXME -- no ext
  Addressable::URI.new(fp_scheme, nil, nil, fp_host, nil, fp_path, nil, nil)
end
18
+
19
+ #
20
+ # returns [dirname, basename, ext] for the file_path
21
+ # ext is determined by basename_ext_splitter
22
+ #
23
# [dirname, basename, ext] for this object's path, as split by path_split_str.
def path_split
  path_split_str(path)
end
26
+
27
+ # lowercase; only a-z, num, . -
28
# revhost lowercased and stripped of everything but a-z, 0-9, '.' and '-'
# (note: no _). Returns nil when there is no revhost.
def scrubbed_revhost
  if revhost
    revhost.downcase.gsub(/[^a-z0-9\.\-]+/i, '')
  else
    nil
  end
end
32
+
33
# Splitters for path_split_str: each has two capture groups, (basename) and
# (extension).
# SMART treats tar.gz / tar.bz2 as one extension, otherwise whatever follows
# the first dot that lets the whole name match.
BASENAME_EXT_SPLIT_SMART = /(.+?)\.(tar\.gz|tar\.bz2|[^\.]+)/
# NO_SPLIT keeps the whole name as the basename and leaves the extension ''.
BASENAME_EXT_NO_SPLIT    = /(.+?)()/

# Class-wide choice of splitter; the default does not split extensions off.
cattr_accessor :basename_ext_splitter
self.basename_ext_splitter = BASENAME_EXT_NO_SPLIT
37
+
38
+ #
39
+ # Like File.split but heuristically handles things like .tar.bz2:
40
+ #
41
+ # foo. => ['foo.', '']
42
+ # foo.tar.gz => ['foo.', '']
43
+ # foo.tar.bz2 => ['foo.', '']
44
+ # foo.yaml => ['foo', '']
45
+ #
46
#
# Like File.split, but returns [dirname, basename, ext], where basename and
# ext are the two capture groups of basename_ext_splitter matched against the
# whole final path segment.
#
# With a splitter of /(.+?)\.(tar\.gz|tar\.bz2|[^\.]+)/ :
#   dir/foo.tar.gz  => ['dir', 'foo', 'tar.gz']
#   foo.yaml        => ['',    'foo', 'yaml']
#   foo.            => ['',    'foo.', '']
# With a splitter of /(.+?)()/ the extension is always '':
#   dir/foo.tar.gz  => ['dir', 'foo.tar.gz', '']
#
# A string with no usable final segment (no slash at all, or one that ends in
# a slash) comes back whole as the basename with an empty dirname, instead of
# failing on a nil match.
#
def path_split_str str
  dir_and_base = %r{\A(.*)/([^/\n]+)\z}.match(str)
  dirname, basename = dir_and_base ? dir_and_base.captures : ['', str]
  # Get basename, extension (as given by capture groups in basename_ext_splitter)
  if basename_ext_splitter && (m = /\A#{basename_ext_splitter}\z/i.match(basename))
    basename, ext = m.captures
  else
    ext = ''
  end
  [dirname, basename, ext]
end
60
+
61
+ # remove all blank components, join the rest with separator
62
# Joins +strs+ with +separator+, leaving out every component that is blank?.
def join_non_blank separator, *strs
  kept = strs.reject { |component| component.blank? }
  kept.join(separator)
end
65
+
66
+ # only a-z A-Z, num, .-_/
67
# path reduced to a-z A-Z 0-9 . - _ / in four passes, applied in this order.
def scrubbed_path
  cleaned = path.gsub(%r{\:+}, '/')                                   # colons into /
  cleaned = cleaned.gsub(%r{[^a-zA-Z0-9\.\-_/]+}, '_')                # kill weird chars
  cleaned = cleaned.gsub(%r{/[^a-zA-Z0-9]+/}, '/')                    # compact (killing foo/../bar, etc)
  cleaned = cleaned.gsub(%r{/\.\.+/}, '.')
  cleaned = cleaned.gsub(%r{^[^a-zA-Z0-9]+}, '')                      # kill leading non-alnum
  cleaned.gsub(%r{[^a-zA-Z0-9]+$}, '')                                # kill trailing non-alnum
end
78
+
79
+ #
80
+ # name for this URL regarded as a file (instance)
81
+ #
82
# File name for this url: the scrubbed path is split into dir, basename and
# extension; the uuid is attached to the basename with '-', the extension
# with '.', and the result is joined under root_path with '/'. Blank pieces
# are left out at every step.
def to_file_path
  dir, base, extension = path_split_str(scrubbed_path)
  base_with_uuid = join_non_blank('-', base, uuid)
  file_name      = join_non_blank('.', base_with_uuid, extension)
  join_non_blank('/', root_path, dir, file_name)
end
@@ -0,0 +1,147 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'rubygems'
3
+ require 'active_support'
4
+ require 'uuidtools'
5
+
6
# Whitelist-based validators and sanitizers for user-supplied strings.
#
# Each scrubber class carries a +validator+: a Regexp whose *source* is the
# inside of a character class listing the allowed characters.
module Scrub
  class Generic
    # A regular expression character group
    # (a bunch of characters ready to drop into /[#{validator.source}]*/)
    # whitelisting allowed characters
    #
    # Must be overridden in child class
    class_inheritable_accessor :validator

    # Sentence fragment for error message on failed validation.
    class_inheritable_accessor :complaint
    self.complaint = "has characters I can't understand"

    # Proc or string or anything that can be 2nd arg to gsub
    # to sanitize
    class_inheritable_accessor :replacer
    self.replacer = '-'

    # A regular expression to sanitize objects
    # if unset or nil, the validator char group
    class_inheritable_accessor :sanitizer

    # unless overridden or set expressly, just use the
    # validator
    # NOTE(review): #sanitize below whitelists with the validator and never
    # consults this method -- confirm whether that is intended.
    def sanitizer
      @sanitizer || self.validator
    end

    # Returns +str+ (as a String) with every run of characters outside the
    # validator's whitelist replaced by +replacer+.
    def sanitize str
      str = str.to_s
      str.gsub(%r{([^#{validator_chars}]+)}u, replacer)
    end

    # Truthy (a MatchData) when +str+ consists only of whitelisted
    # characters, nil otherwise.
    def valid? str
      %r{\A([#{validator_chars}]*)\z}u.match(str)
    end

    protected
    # The whitelist as bare character-class content. Regexp#to_s would wrap
    # it as "(?-mix:...)", and inside [...] that wrapper adds '(', ')', ':'
    # and the range ?-m (which contains A-Z, '@', '[', '\', ']', '^' and the
    # backtick) to the whitelist -- so the Regexp's source is used instead.
    def validator_chars
      chars = validator
      chars.respond_to?(:source) ? chars.source : chars.to_s
    end
    public
  end

  #
  # A permissive, ASCII-only name string - no control chars, newlines, backslash
  # or <> angle brackets
  #
  class Title < Scrub::Generic
    self.complaint = "should only contain basic keyboard characters (and should not use \\ &lt; or &gt;)."
    self.validator = %r{a-zA-Z0-9_ ~\!@#\$%\^&\*\(\)\-\+=;\:'"`\[\]\{\}\|,\?\.\/}u
  end

  #
  # Like Title, but letters and digits are matched with the [:alpha:] and
  # [:digit:] classes as well -- still no control chars, newlines, backslash
  # or <> angle brackets
  #
  class UnicodeTitle < Scrub::Title
    self.complaint = "should only contain keyboard characters (and should not use \\ &lt; or &gt;)."
    # Title's whitelist is spliced in by source, not as a nested (?-mix:...) group.
    self.validator = %r{[:alpha:][:digit:]#{Scrub::Title.validator.source}}u
  end

  #
  # Visible characters and spaces (i.e. anything except control characters, etc.)
  #
  class FreeText < Scrub::Generic
    self.complaint = "should not contain control characters or that kind of junk."
    self.validator = %r{[:print:]\n\t}u
  end

  #
  # insist that a string start with an ASCII letter.
  #
  module BeginsWithAlpha
    mattr_accessor :slug
    self.slug = 'x'
    # prepend #{slug}#{replacer} to the string if it starts with non-alpha.
    # so, for instance '23jumpstreet' => 'x_23jumpstreet'
    def sanitize_with_begins_with_alpha str
      str = sanitize_without_begins_with_alpha str
      # \A, not ^: only the very first character of the string counts.
      str = Scrub::BeginsWithAlpha.slug.to_s + replacer + str if (str !~ /\A[a-z]/i) # call at end of chain!
      str
    end
    def valid_with_begins_with_alpha? str
      (str =~ /\A[a-z]/i) && valid_without_begins_with_alpha?(str)
    end
    def self.included base
      base.alias_method_chain :sanitize, :begins_with_alpha # unless defined?(base.sanitize_without_begins_with_alpha)
      base.alias_method_chain :valid?,   :begins_with_alpha # unless defined?(base.valid_without_begins_with_alpha?)
    end
  end

  #
  # insist that a string be lowercased.
  #
  module Lowercased
    # Downcases before the rest of the chain runs, so that upper-case letters
    # survive a lower-case-only whitelist as their lower-case form instead of
    # being replaced; downcases once more at the end of the chain.
    def sanitize_with_lowercased str
      str = sanitize_without_lowercased(str.to_s.downcase)
      str.downcase # call at end of chain!
    end
    def valid_with_lowercase? str
      (str !~ /[[:upper:]]/u) && valid_without_lowercase?(str)
    end
    def self.included base
      base.alias_method_chain :sanitize, :lowercased  # unless defined?(base.sanitize_without_lowercased)
      base.alias_method_chain :valid?,   :lowercase   # unless defined?(base.valid_without_lowercase?)
    end
  end

  #
  # start with a letter, and contain only a-z0-9_
  #
  class Identifier < Scrub::Generic
    self.complaint = "should be an identifier: it should start with a letter, and contain only a-z, 0-9 and '_'."
    self.validator = %r{a-z0-9_}u
    self.replacer  = '_'
    include Scrub::BeginsWithAlpha
    include Scrub::Lowercased
  end

  #
  # start with a letter, and contain only a-z0-9_
  #
  class Handle < Scrub::Generic
    self.complaint = "should be an identifier: it should start with a letter, and contain only a-z, 0-9 and '_'."
    self.validator = %r{a-z0-9_}u
    self.replacer  = '_'
    include Scrub::BeginsWithAlpha
    include Scrub::Lowercased
  end

  # HANDLE_RE = %r{\A[a-z][]*\z}i # ascii, not :alpha: etc.
  # HANDLE_MSG = "should start with a letter, and contain only characters like a-z0-9_-."
  #
  # # "Domain names are restricted to the ASCII letters a through z
  # # (case-insensitive), the digits 0 through 9, and the hyphen, with some other
  # # restrictions in terms of name length and position of hyphens."
  # # (http://en.wikipedia.org/wiki/Domain_name#Overview)
  # # http://tools.ietf.org/html/rfc1034
  # DOMAIN_RE = %r{\A[a-z][a-z0-9\-][a-z0-9]\z}i # case insensitive
  # DOMAIN_MSG = "should look like a domain name."
  # DOMAIN_MORE = "only letters, digits or hyphens (-), start with a letter and end with a letter or number."
  MSG_EMAIL_BAD      = "should look like an email address (you@somethingsomething.com) and include only letters, numbers and .&nbsp;+&nbsp;-&nbsp;&#37; please."
  RE_EMAIL_NAME      = '[\w\.%\+\-]+'                          # what you actually see in practice
  RE_EMAIL_N_RFC2822 = '0-9A-Z!#\$%\&\'\*\+_/=\?^\-`\{|\}~\.' # technically allowed by RFC-2822 (character-class content)
  RE_DOMAIN_HEAD     = '(?:[A-Z0-9\-]+\.)+'
  RE_DOMAIN_TLD      = '(?:[A-Z]{2}|com|org|net|edu|gov|mil|biz|info|mobi|name|aero|jobs|museum)'
  RE_EMAIL_OK        = /\A#{RE_EMAIL_NAME}@#{RE_DOMAIN_HEAD}#{RE_DOMAIN_TLD}\z/i
  # RE_EMAIL_N_RFC2822 is only the inside of a character class, so it is
  # wrapped in [...]+ here; used bare it matched that literal character
  # sequence (split in two by its own '|').
  RE_EMAIL_RFC2822   = /\A[#{RE_EMAIL_N_RFC2822}]+@#{RE_DOMAIN_HEAD}#{RE_DOMAIN_TLD}\z/i

end