imw 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +194 -31
- data/VERSION +1 -1
- data/bin/imw +5 -0
- data/lib/imw/boot.rb +0 -15
- data/lib/imw/dataset/paths.rb +38 -0
- data/lib/imw/dataset/task.rb +21 -18
- data/lib/imw/dataset/workflow.rb +126 -65
- data/lib/imw/dataset.rb +56 -82
- data/lib/imw/files/basicfile.rb +3 -3
- data/lib/imw/files/compressed_files_and_archives.rb +23 -37
- data/lib/imw/files/csv.rb +2 -1
- data/lib/imw/files/directory.rb +62 -0
- data/lib/imw/files/excel.rb +84 -0
- data/lib/imw/files/sgml.rb +4 -23
- data/lib/imw/files.rb +62 -47
- data/lib/imw/packagers/archiver.rb +19 -1
- data/lib/imw/packagers/s3_mover.rb +8 -0
- data/lib/imw/parsers/html_parser/matchers.rb +251 -268
- data/lib/imw/parsers/html_parser.rb +181 -176
- data/lib/imw/parsers.rb +1 -1
- data/lib/imw/repository.rb +35 -0
- data/lib/imw/runner.rb +114 -0
- data/lib/imw/utils/extensions/core.rb +0 -16
- data/lib/imw/utils/paths.rb +0 -28
- data/lib/imw.rb +21 -32
- metadata +11 -19
- data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +0 -37
- data/lib/imw/dataset/datamapper.rb +0 -66
- data/lib/imw/dataset/loaddump.rb +0 -50
- data/lib/imw/dataset/old/file_collection.rb +0 -88
- data/lib/imw/dataset/old/file_collection_utils.rb +0 -71
- data/lib/imw/dataset/scaffold.rb +0 -132
- data/lib/imw/dataset/scraped_uri.rb +0 -305
- data/lib/imw/dataset/scrub/old_working_scrubber.rb +0 -87
- data/lib/imw/dataset/scrub/scrub.rb +0 -147
- data/lib/imw/dataset/scrub/scrub_simple_url.rb +0 -38
- data/lib/imw/dataset/scrub/scrub_test.rb +0 -60
- data/lib/imw/dataset/scrub/slug.rb +0 -101
- data/lib/imw/dataset/stats/counter.rb +0 -23
- data/lib/imw/dataset/stats.rb +0 -73
data/lib/imw/dataset/scraped_uri.rb
@@ -1,305 +0,0 @@
-# -*- coding: utf-8 -*-
-module Linkish
-  def self.included base
-    base.class_eval do
-      include DataMapper::Resource
-      include Infochimps::Resource
-      property :id, Integer, :serial => true
-      property :full_url, String, :length => 255, :nullable => false, :unique_index => true
-      has_handle
-      alias_method :handle_generator, :full_url
-      has_time_and_user_stamps
-      #
-      property :name, String, :length => 255, :nullable => false, :default => ''
-      #
-      property :file_path, String, :length => 1024
-      property :file_time, DateTime
-      property :file_size, Integer
-      property :file_sha1, String, :length => 40
-      property :tried_fetch, DataMapper::Resource::Boolean
-      property :fetched, DataMapper::Resource::Boolean
-      #
-      before :create, :make_uuid_and_handle
-      before :create, :update_from_file!
-    end
-    base.extend ClassMethods
-  end
-
-  # ===========================================================================
-  #
-  # Delegate methods to uri
-  #
-  def uri
-    @uri ||= Addressable::URI.parse(self.full_url)
-  end
-  # Dispatch anything else to the aggregated uri object
-  def method_missing method, *args
-    if self.uri.respond_to?(method)
-      self.uri.send(method, *args)
-    else
-      super method, *args
-    end
-  end
-
-  def to_s
-    "<a href='#{self.uri.to_s}'>#{self.name}</a>" # <-- !! not escaped !!
-  end
-
-  # ===========================================================================
-  #
-  # ID, naming, etc
-  #
-  def normalize_url!
-    u = Addressable::URI.parse(self.full_url).normalize
-    self.full_url = u.to_s
-  end
-
-  # ===========================================================================
-  #
-  # Properly belongs in FileStore module
-  #
-  #
-  # Refresh cached properties from our copy of the asset.
-  #
-  def update_from_file!
-    self.make_uuid_and_handle # make sure this happened
-    # Set the file path
-    self.file_path = self.to_file_path if self.file_path.blank?
-    # FIXME -- kludge to ripd_root
-    if ! File.exist?(actual_path)
-      self.fetched = false
-    else
-      self.fetched = self.tried_fetch = true
-      self.file_size = File.size( actual_path)
-      self.file_time = File.mtime(actual_path)
-    end
-    self.fetched
-  end
-  def actual_path
-    path_to(:ripd_root, self.file_path)
-  end
-
-  # ===========================================================================
-  #
-  # Properly belongs in own module
-  #
-
-  IMW_WGET_OPTIONS = {
-    :root => :ripd_root,
-    :wait => 2,
-    :noretry => true,
-    :log_level => Logger::DEBUG,
-    :clobber => false,
-  }
-  #
-  # Fetch from the web
-  #
-  def wget options={}
-    options.reverse_merge! IMW_WGET_OPTIONS
-    cd path_to(options[:root]) do
-      if (not options[:clobber]) && File.file?(file_path) then
-        IMW.log.add options[:log_level], "Skipping #{file_path}"; return
-      end
-      # Do the fetch
-      mkdir_p File.dirname(actual_path)
-      # defaults are --connect-timeout=infinity --read-timeout=900 --tries=20 acc. to man page
-      cmd = %Q{wget -nv "#{full_url}" -O"#{actual_path}" --connect-timeout=5 --read-timeout=10 --tries=1 &}
-      IMW.log.add(options[:log_level], cmd)
-      IMW.log.add(options[:log_level], `#{cmd}`)
-      self.tried_fetch = true
-      sleep options[:wait] # please hammer don't hurt em
-      update_from_file!
-      self.save
-      return self.fetched
-    end
-  end
-
-  #
-  #
-  #
-  def contents options={}
-    wget options
-    if fetched
-      File.open actual_path
-    end
-  end
-
-  # ===========================================================================
-  #
-  # Properly belongs in FileStore
-  #
-
-  protected
-  #
-  # The standard file path for this url's ripped cache
-  #
-  # * leading directory from reverse.dotted.host_scheme:port:user@password
-  # * normalized path/file?query#fragment
-  # * uuid formed from the
-  #
-  def to_file_path
-    file_path_str = ""
-    file_path_str << to_file_path_root_part
-    file_path_str << to_file_path_path_part
-    file_path_str << to_file_path_file_part
-    file_path_str = self.class.path_str_encode(file_path_str)
-    self.class.validate_roundtrip(file_path_str)
-    file_path_str
-  end
-  def file_timestamp
-    file_time.strftime("%Y%m%d-%H%M%S")
-  end
-  def to_file_path_with_timestamp
-    to_file_path + file_timestamp
-  end
-  #
-  # revhost_scheme:port:user@password -- omitting _scheme if it's http, and
-  # omitting :port:user@password if all three are blank.
-  #
-  def to_file_path_root_part
-    root_part_str = ""
-    tld_host_frag = self.class.tier_path_segment(revhost, /^([^\.]+)\.([^\.]{1,2})/)
-    root_part_str << revhost
-    root_part_str << "_#{uri.scheme}" unless uri.scheme == 'http'
-    root_part_str << ":#{uri.port}:#{uri.user}@#{uri.password}" unless uri.simple?
-    root_part_str
-  end
-  def to_file_path_path_part
-    uri.path.to_s
-  end
-  def to_file_path_file_part
-    file_path_str = ""
-    file_path_str << "?#{uri.query}" unless uri.query.nil?
-    file_path_str << "##{uri.fragment}" unless uri.fragment.nil?
-    file_path_str << "-#{self.uuid}"
-  end
-  public
-
-
-  module ClassMethods
-    #
-    # find_or_creates from url
-    #
-    # url is heuristic_parse'd and normalized by Addressable before lookup:
-    # "Converts an input to a URI. The input does not have to be a valid URI —
-    # the method will use heuristics to guess what URI was intended. This is not
-    # standards compliant, merely user-friendly.
-    #
-    def find_or_create_from_url url_str
-      link = self.find_or_new_from_url url_str
-      link.save
-      link
-    end
-    def find_or_new_from_url url_str # :nodoc:
-      url_str = Addressable::URI.heuristic_parse(url_str).normalize.to_s
-      link = self.first( :full_url => url_str ) || self.new( :full_url => url_str )
-      link.make_uuid_and_handle
-      link.update_from_file!
-      link
-    end
-    def find_or_create_from_file_path ripd_file
-      url_str = Link.url_from_file_path(ripd_file)
-      link = self.first( :full_url => url_str.to_s ) || self.new( :full_url => url_str.to_s )
-      link.file_path = ripd_file
-      link.make_uuid_and_handle
-      link.update_from_file!
-      link.save
-      link
-    end
-    #
-    # Decode url from its file_path
-    #
-    def url_from_file_path fp
-      fp = path_str_decode(fp)
-      m = (%r{\A
-        (#{Addressable::URI::HOST_TLD}) # tld tier
-        /(..?) # revhost tier
-        /([^/\:_]+) # revhost
-        (?:_([^/\:]+))? # _scheme
-        (?::(\d*):([^/]*)@([^@/]*?))? # :port:user@password
-        /(?:(.*?)/)? # /dirs/
-        ([^/]*) # file
-        -([a-f0-9]{32}) # -uuid
-        \z}x.match(fp))
-      raise "Can't extract url from file path #{fp}" if !m
-      fp_host, fp_scheme, fp_port, fp_user, fp_pass, fp_path, fp_file, fp_uuid = m.captures
-      fp_host = fp_host.split('.').reverse.join('.')
-      fp_scheme ||= 'http'
-      fp_pass = ":#{fp_pass}" unless fp_pass.blank?
-      fp_userpass = "#{fp_user}#{fp_user}@" unless fp_user.blank?
-      fp_port = ":#{fp_port}" unless fp_port.blank?
-      fp_path = File.join(*[fp_path, fp_file].compact)
-      "#{fp_scheme}://#{fp_userpass}#{fp_host}#{fp_port}/#{fp_path}"
-    end
-    #
-    # to control files-per-directory madness, take a path segment like "foobar" in
-    # blah.com/top/foobar/directory
-    # and transform into
-    # blah.com/top/fo/foobar/directory
-    #
-    # Ex.
-    # self.class.tier_path_segment('a_username')
-    # # => 'a_/a_username'
-    # self.class.tier_path_segment('1')
-    # # => '1/1'
-    # self.class.tier_path_segment('com.twitter', /^([^\.]+)\.([^\.]{1,2})/)
-    # # => 'com/tw/com.twitter'
-    #
-    def self.tier_path_segment(path_seg, re=/(..?)/)
-      frag_seg = re.match(path_seg).captures
-      raise "Can't tier path_seg #{path_seg} using #{re}" if frag_seg.blank?
-      File.join(* [frag_seg, path_seg].flatten )
-    end
-    #
-    #
-    # It's really bad if you can't roundtrip --
-    # since saving is the rare case (only done once!) we insist on checking.
-    #
-    def self.validate_roundtrip file_path_str
-      # uu = self.class.url_from_file_path(file_path_str)
-      # puts "*"*75, uri.to_hash.inspect, ['path str', file_path_str, 'uri', uri.to_s, 'rt', uu.to_s].inspect
-      return_trip_url = Addressable::URI.parse(self.class.url_from_file_path(file_path_str))
-      raise "crapsticks: uri doesn't roundtrip #{file_path_str} to #{uri.to_s}: #{return_trip_url}" if return_trip_url != uri
-    end
-    #
-    # Uses a similar scheme as the 'Quoted Printable' encoding, but more strict
-    # and without linebreaking or anything. The intent is to reversibly and
-    # recognizably store URLs to disk with names that (apart from path) do not
-    # need to be further escaped in filesystem, URL, database or HTML.
-    #
-    # The only characters in a path_encoded string are alpha-numeric /_-.=
-    #
-    # Rules:
-    # * Any character that is not alphanumeric, and is not /_-. is encoded as an
-    # equals sign = followed by its upper-case hex encoding.
-    #
-    # * Furthermore, in any sequence of repeated '.' characters, all after the
-    # first are hex encoded; same with '/'.
-    #
-    # Ex.
-    # path_encode("www.measuringworth.com/datasets/consumer/result.php?use[]=VCB&use[]=CU&use[]=SZ&year_source=1900&year_result=2007"
-    # # => www.measuringworth.com/datasets/consumer/result.php=3Fuse=5B=5D=3DVCB=26use=5B=5D=3DCU=26use=5B=5D=3DSZ=26year_source=3D1900=26year_result=3D2007
-    #
-    # Code inspired by "Glenn Parker's response to ruby quiz #23"http://www.rubyquiz.com/quiz23.html
-    #
-    def path_str_encode(str)
-      str.gsub(%r{\.(\.+)}){|chars| '.'+path_encode_chars(chars) }
-      str.gsub(%r{\/(\/+)}){|chars| '/'+path_encode_chars(chars) }
-      str.gsub(%r{[^A-Za-z0-9/_\-\.]+}){|chars| path_encode_chars(chars) }
-    end
-    #
-    # See the notes in path_encode
-    #
-    def path_str_decode(str)
-      str.gsub(/\+([\dA-F]{2})/){ $1.hex.chr }
-    end
-    protected
-    def path_encode_chars(chars) # :nodoc:
-      # send each character to an equals sign followed by its uppercase hex encoding
-      encoded = "";
-      chars.each_byte{|c| encoded << "+%02X" % c }
-      encoded
-    end
-    public
-  end
-end
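For reference, the encode/decode pair in the removed Linkish code above is what makes cached file names reversible back into URLs (see `url_from_file_path` and `validate_roundtrip`). Two quirks are worth noting: the doc comment describes an `=XX` escape while `path_encode_chars` actually emits `+XX`, and the first two `gsub` calls in `path_str_encode` discard their results (non-bang `gsub` returns a new string), so only the final whitelist rule takes effect. A minimal standalone sketch of that effective behavior, with hypothetical method names rather than the gem's API:

```ruby
# Sketch only: any run of characters outside [A-Za-z0-9/_-.] becomes
# '+' followed by the byte's uppercase hex; decoding reverses it.
def path_str_encode(str)
  str.gsub(%r{[^A-Za-z0-9/_\-\.]+}) { |chars| chars.bytes.map { |c| '+%02X' % c }.join }
end

def path_str_decode(str)
  str.gsub(/\+([\dA-F]{2})/) { $1.hex.chr }
end

url = 'www.example.com/result.php?use[]=VCB&year=1900'
enc = path_str_encode(url)
puts enc                          # => www.example.com/result.php+3Fuse+5B+5D+3DVCB+26year+3D1900
puts path_str_decode(enc) == url  # => true
```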
data/lib/imw/dataset/scrub/old_working_scrubber.rb
@@ -1,87 +0,0 @@
-
-def self.url_from_file_path fp
-  # FIXME -- doesn't work with extension preservation
-  unless m = (%r{\A([^/_]+)(_[^/]+)?/(?:(.*?)/)?([^/]*)-([a-f0-9]{30,32})\z}.match(fp)) then
-    # m1 = %r{\A([^/_]+)(_[^/]+)?/(?:(.*?))-([a-f0-9]{28,})}i.match(fp);
-    raise "Bad match to #{fp}"
-  end
-  fp_host, fp_scheme, fp_path, fp_file, fp_uuid, fp_ext = m.captures
-  fp_host = fp_host.split('.').reverse.join('.')
-  fp_scheme ||= 'http'
-  fp_path = File.join(*[fp_path, fp_file].compact) # FIXME -- no ext
-  url = Addressable::URI.new(fp_scheme, nil, nil, fp_host, nil, fp_path, nil, nil)
-  unless m = (%r{\A([^/_]+)(_[^/]+)?/(?:(.*?)/)?([^/]*)-([a-f0-9]{32})\z}.match(fp)) then
-    # warn "Bad luck!!! #{url.path} hash is #{fp_uuid} vs #{UUID.sha1_create(UUID_INFOCHIMPS_LINKS_NAMESPACE, url.to_s).hexdigest}"
-  end
-  url
-end
-
-#
-# returns [dirname, basename, ext] for the file_path
-# ext is determined by basename_ext_splitter
-#
-def path_split
-  path_split_str path
-end
-
-# lowercase; only a-z, num, . -
-def scrubbed_revhost
-  return unless revhost
-  revhost.downcase.gsub(/[^a-z0-9\.\-]+/i, '') # note: no _
-end
-
-cattr_accessor :basename_ext_splitter
-BASENAME_EXT_SPLIT_SMART = /(.+?)\.(tar\.gz|tar\.bz2|[^\.]+)/
-BASENAME_EXT_NO_SPLIT = /(.+?)()/
-self.basename_ext_splitter = BASENAME_EXT_NO_SPLIT
-
-#
-# Like File.split but heuristically handles things like .tar.bz2:
-#
-# foo. => ['foo.', '']
-# foo.tar.gz => ['foo.', '']
-# foo.tar.bz2 => ['foo.', '']
-# foo.yaml => ['foo', '']
-#
-def path_split_str str
-  if str =~ %r{/.+\z}
-    dirname, basename = %r{\A(.*)/([^/]+)\z}.match(str).captures
-  else
-    dirname, basename = ['', str]
-  end
-  # Get basename, extension (as given by capture groups in basename_ext_splitter)
-  if basename_ext_splitter && (m = /\A#{basename_ext_splitter}\z/i.match(basename))
-    basename, ext = m.captures
-  else
-    basename, ext = [basename, '']
-  end
-  [dirname, basename, ext]
-end
-
-# remove all blank components, join the rest with separator
-def join_non_blank separator, *strs
-  strs.reject(&:blank?).join(separator)
-end
-
-# only a-z A-Z, num, .-_/
-def scrubbed_path
-  path_part = path
-  # colons into /
-  path_part = path_part.gsub(%r{\:+}, '/')
-  # Kill weird chars
-  path_part = path_part.gsub(%r{[^a-zA-Z0-9\.\-_/]+}, '_')
-  # Compact (killing foo/../bar, etc)
-  path_part = path_part.gsub(%r{/[^a-zA-Z0-9]+/}, '/').gsub(%r{/\.\.+/}, '.')
-  # Kill leading & trailing non-alnum
-  path_part = path_part.gsub(%r{^[^a-zA-Z0-9]+}, '').gsub(%r{[^a-zA-Z0-9]+$}, '')
-end
-
-#
-# name for this URL regarded as a file (instance)
-#
-def to_file_path
-  dirname, basename, ext = path_split_str(scrubbed_path)
-  basename = join_non_blank '-', basename, uuid
-  basename = join_non_blank '.', basename, ext
-  join_non_blank '/', root_path, dirname, basename
-end
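The `BASENAME_EXT_SPLIT_SMART` pattern in the removed scrubber above keeps compound archive extensions such as `.tar.gz` and `.tar.bz2` whole instead of splitting at the last dot (the module itself defaults `basename_ext_splitter` to `BASENAME_EXT_NO_SPLIT`, so the smart split is opt-in, and its doc-comment examples appear garbled in the released source). A rough standalone sketch of the split, using a hypothetical helper rather than the gem's API:

```ruby
# Sketch only: dirname/basename/ext split that keeps tar.gz and tar.bz2 intact.
SMART_EXT = /\A(.+?)\.(tar\.gz|tar\.bz2|[^.]+)\z/i

def split_path(str)
  if (m = %r{\A(.*)/([^/]+)\z}.match(str))
    dirname, basename = m.captures
  else
    dirname, basename = '', str
  end
  ext_match = SMART_EXT.match(basename)
  ext_match ? [dirname, ext_match[1], ext_match[2]] : [dirname, basename, '']
end

p split_path('data/ripd/foo.tar.gz')  # => ["data/ripd", "foo", "tar.gz"]
p split_path('data/ripd/foo.yaml')    # => ["data/ripd", "foo", "yaml"]
p split_path('README')                # => ["", "README", ""]
```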
data/lib/imw/dataset/scrub/scrub.rb
@@ -1,147 +0,0 @@
-# -*- coding: utf-8 -*-
-require 'rubygems'
-require 'active_support'
-require 'uuidtools'
-
-module Scrub
-  class Generic
-    # A regular expression character group
-    # (a bunch of characters ready to drop into /[#{validator}]*/)
-    # whitelisting allowed characters
-    #
-    # Must be overridden in child class
-    class_inheritable_accessor :validator
-
-    # Sentence fragment for error message on failed validation.
-    class_inheritable_accessor :complaint
-    self.complaint = "has characters I can't understand"
-
-    # Proc or string or anything that can be 2nd arg to gsub
-    # to sanitize
-    class_inheritable_accessor :replacer
-    self.replacer = '-'
-
-    # A regular expression to sanitize objects
-    # if unset or nil, the validator char group
-    class_inheritable_accessor :sanitizer
-
-    # unless overridden or set expressly, just use the
-    # validator
-    def sanitizer
-      @sanitizer || self.validator
-    end
-
-    def sanitize str
-      str = str.to_s
-      str.gsub(%r{([^#{validator.to_s}]+)}u, replacer)
-    end
-
-    def valid? str
-      %r{\A([#{validator.to_s}]*)\z}u.match(str)
-    end
-  end
-
-  #
-  # A permissive, ASCII-only name string - no control chars, newlines, backslash
-  # or <> angle brackets
-  #
-  class Title < Scrub::Generic
-    self.complaint = "should only contain basic keyboard characters (and should not use \\ < or >)."
-    self.validator = %r{a-zA-Z0-9_ ~\!@#\$%\^&\*\(\)\-\+=;\:'"`\[\]\{\}\|,\?\.\/}u
-  end
-
-  #
-  # A permissive, ASCII-only name string - no control chars, newlines, backslash
-  # or <> angle brackets
-  #
-  class UnicodeTitle < Scrub::Title
-    self.complaint = "should only contain keyboard characters (and should not use \\ < or >)."
-    self.validator = %r{[:alpha:][:digit:]#{Scrub::Title.validator}}u
-  end
-
-  #
-  # Visible characters and spaces (i.e. anything except control characters, etc.)
-  #
-  class FreeText < Scrub::Generic
-    self.complaint = "should not contain control characters or that kind of junk."
-    self.validator = %r{[:print:]\n\t}u
-  end
-
-  module BeginsWithAlpha
-    mattr_accessor :slug
-    self.slug = 'x'
-    # prepend #{slug}#{replacer} to the string if it starts with non-alpha.
-    # so, for instance '23jumpstreet' => 'x_23jumpstreet'
-    def sanitize_with_begins_with_alpha str
-      str = sanitize_without_begins_with_alpha str
-      str = 'x' + replacer + str if (str !~ /^[a-z]/i) # call at end of chain!
-      str
-    end
-    def valid_with_begins_with_alpha? str
-      (str =~ /^[a-z]/i) && valid_without_begins_with_alpha?(str)
-    end
-    def self.included base
-      base.alias_method_chain :sanitize, :begins_with_alpha # unless defined?(base.sanitize_without_begins_with_alpha)
-      base.alias_method_chain :valid?, :begins_with_alpha # unless defined?(base.valid_without_begins_with_alpha?)
-    end
-  end
-
-  #
-  # insist that a string be lowercased.
-  #
-  module Lowercased
-    def sanitize_with_lowercased str
-      str = sanitize_without_lowercased str
-      str.downcase # call at end of chain!
-    end
-    def valid_with_lowercase? str
-      (str !~ /[[:upper:]]/u) && valid_without_lowercase?(str)
-    end
-    def self.included base
-      base.alias_method_chain :sanitize, :lowercased # unless defined?(base.sanitize_without_lowercased)
-      base.alias_method_chain :valid?, :lowercase # unless defined?(base.valid_without_lowercase?)
-    end
-  end
-
-  #
-  # start with a letter, and contain only A-Za-z0-9_
-  #
-  class Identifier < Scrub::Generic
-    self.complaint = "should be an identifier: it should start with a letter, and contain only a-z, 0-9 and '_'."
-    self.validator = %r{a-z0-9_}u
-    self.replacer = '_'
-    include Scrub::BeginsWithAlpha
-    include Scrub::Lowercased
-  end
-
-  #
-  # start with a letter, and contain only A-Za-z0-9_
-  #
-  class Handle < Scrub::Generic
-    self.complaint = "should be an identifier: it should start with a letter, and contain only a-z, 0-9 and '_'."
-    self.validator = %r{a-z0-9_}u
-    self.replacer = '_'
-    include Scrub::BeginsWithAlpha
-    include Scrub::Lowercased
-  end
-
-  # HANDLE_RE = %r{\A[a-z][]*\z}i # ascii, not :alpha: etc.
-  # HANDLE_MSG = "should start with a letter, and contain only characters like a-z0-9_-."
-  #
-  # # "Domain names are restricted to the ASCII letters a through z
-  # # (case-insensitive), the digits 0 through 9, and the hyphen, with some other
-  # # restrictions in terms of name length and position of hyphens."
-  # # (http://en.wikipedia.org/wiki/Domain_name#Overview)
-  # # http://tools.ietf.org/html/rfc1034
-  # DOMAIN_RE = %r{\A[a-z][a-z0-9\-][a-z0-9]\z}i # case insensitive
-  # DOMAIN_MSG = "should look like a domain name."
-  # DOMAIN_MORE = "only letters, digits or hyphens (-), start with a letter and end with a letter or number."
-  MSG_EMAIL_BAD = "should look like an email address (you@somethingsomething.com) and include only letters, numbers and . + - % please."
-  RE_EMAIL_NAME = '[\w\.%\+\-]+' # what you actually see in practice
-  RE_EMAIL_N_RFC2822 = '0-9A-Z!#\$%\&\'\*\+_/=\?^\-`\{|\}~\.' # technically allowed by RFC-2822
-  RE_DOMAIN_HEAD = '(?:[A-Z0-9\-]+\.)+'
-  RE_DOMAIN_TLD = '(?:[A-Z]{2}|com|org|net|edu|gov|mil|biz|info|mobi|name|aero|jobs|museum)'
-  RE_EMAIL_OK = /\A#{RE_EMAIL_NAME}@#{RE_DOMAIN_HEAD}#{RE_DOMAIN_TLD}\z/i
-  RE_EMAIL_RFC2822 = /\A#{RE_EMAIL_N_RFC2822}@#{RE_DOMAIN_HEAD}#{RE_DOMAIN_TLD}\z/i
-
-end
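The removed Scrub classes above all follow one pattern: a whitelist character-class fragment is interpolated into a regexp for both validation and sanitization, and the `BeginsWithAlpha` and `Lowercased` mixins are chained on via `alias_method_chain`. Since `class_inheritable_accessor` and `alias_method_chain` come from the ActiveSupport of that era, here is a rough dependency-free approximation of the combined `Scrub::Handle` behavior (an illustration, not the gem's API; the real mixin-based chaining differs in detail). The `scrub_test.rb` script removed further below exercises the real classes.

```ruby
# Sketch only: whitelist a-z0-9_, replace everything else, lowercase,
# and prefix 'x_' when the result does not begin with a letter.
class HandleScrubber
  ALLOWED  = 'a-z0-9_'.freeze   # character-class fragment, as in Scrub::Handle
  REPLACER = '_'.freeze

  def sanitize(str)
    out = str.to_s.downcase.gsub(/[^#{ALLOWED}]+/, REPLACER)
    out = "x#{REPLACER}#{out}" unless out =~ /\A[a-z]/
    out
  end

  def valid?(str)
    str.to_s == sanitize(str)
  end
end

scrubber = HandleScrubber.new
p scrubber.sanitize('23 Jump Street!')  # => "x_23_jump_street_"
p scrubber.valid?('a_username')         # => true
p scrubber.valid?('CamelCased')         # => false
```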
data/lib/imw/dataset/scrub/scrub_simple_url.rb
@@ -1,38 +0,0 @@
-
-
-module IMW
-  module URIScrubber
-
-    def scrubbed
-      to_dirpath
-    end
-  end
-end
-
-module Scrub
-  #
-  # start with a letter, and contain only A-Za-z0-9_
-  #
-  class SimplifiedURL < Scrub::Generic
-    self.complaint = "should follow our zany simplified URL rules: com.host.dot-reversed:schemeifnothttp/path/seg_men-ts/stuff.ext-SHA1ifweird"
-    self.validator = %r{#{Addressable::URI::SAFE_CHARS}#{Addressable::URI::RESERVED_CHARS}}u
-    self.replacer = ''
-    include Scrub::Lowercased
-    attr_accessor :uri
-
-    def valid? str
-      str.to_s.downcase == sanitize(str)
-    end
-
-    def sanitize str
-      # if this fails just normalize once, or don't set $KCODE: http://bit.ly/1664vp
-      uri = Addressable::URI.heuristic_parse(str.to_s).normalize
-      # print [uri.host, uri.host_valid?, uri.path, uri.path_valid?].inspect
-      if uri.host_valid?
-        uri.scrubbed
-      else
-        uri.uuid_path
-      end
-    end
-  end
-end
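Both removed files lean on the `addressable` gem: `SimplifiedURL#sanitize` runs its input through `Addressable::URI.heuristic_parse(...).normalize` before scrubbing, which is what turns sloppy input into a canonical URL. A small illustration of just that step (the output shown is what normalization is expected to produce; `scrubbed` and `uuid_path` above are this gem's own extensions, not Addressable's):

```ruby
require 'addressable/uri'

# heuristic_parse guesses a scheme for schemeless input; normalize then
# lowercases the scheme and host and canonicalizes the encoding.
uri = Addressable::URI.heuristic_parse('FOO.bar.com/Some/Path').normalize
p uri.to_s  # expected => "http://foo.bar.com/Some/Path"
```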
data/lib/imw/dataset/scrub/scrub_test.rb
@@ -1,60 +0,0 @@
-#!/usr/bin/env ruby
-# -*- coding: utf-8 -*-
-require 'scrub'
-require 'scrub_simple_url'
-
-test_strings = [
-  nil, '', '12', '123', 'simple', 'UPPER', 'CamelCased', 'iden_tifier_23_',
-  'twentyfouralphacharslong', 'twentyfiveatozonlyletters', 'hello.-_there@funnychar.com',
-  "tab\t", "newline\n",
-  "Iñtërnâtiônàlizætiøn",
-  'semicolon;', 'quote"', 'tick\'', 'backtick`', 'percent%', 'plus+', 'space ',
-  'leftanglebracket<', 'ampersand&',
-  "control char-bel\x07",
-  "http://foo.bar.com/",
-  "HTTP://FOO.BAR.com",
-  ".com/zazz",
-  "scheme://user_name@user_acct:passwd@host-name.museum:9047/path;pathquery/p!a-th~2/path?query=param&query=pa%20ram#fragment",
-  "http://web.site.com/path/path/file.ext",
-  "ftp://ftp.site.com/path/path/file.ext",
-  "/absolute/pathname/file.ext",
-  "http://foo.bar.com/.hidden_file_with.ext",
-  "http://foo.bar.com/.hidden_file",
-  "dir/--/non_alpha_path_segment.ext",
-  "http://foo.bar.com/dir/../two_dots_in_path",
-
-]
-
-
-scrubbers = {
-  # :unicode_title => Scrub::UnicodeTitle.new,
-  # :title => Scrub::Title.new,
-  # :identifier => Scrub::Identifier.new,
-  # :free_text => Scrub::FreeText.new,
-  :handle => Scrub::Handle.new,
-  :simplified_url => Scrub::SimplifiedURL.new,
-  # :domain => Scrub::Domain.new,
-  # :email => Scrub::Email.new,
-}
-
-scrubbers.each do |scrubber_name, scrubber|
-  puts scrubber_name
-  results = test_strings.map do |test_string|
-    [!!scrubber.valid?(test_string), scrubber.sanitize(test_string).inspect, test_string.inspect ]
-  end
-  results.sort_by{|val,san,orig| val ? 1 : -1 }.each do |val,san,orig|
-    puts " %-5s %-30s %-30s" % [val,san,orig]
-  end
-end
-
-
-
-# 'foo@bar.com', 'foo@newskool-tld.museum', 'foo@twoletter-tld.de', 'foo@nonexistant-tld.qq',
-# 'r@a.wk', '1234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890@gmail.com',
-# 'hello.-_there@funnychar.com', 'uucp%addr@gmail.com', 'hello+routing-str@gmail.com',
-# 'domain@can.haz.many.sub.doma.in',],
-# :invalid => [nil, '', '!!@nobadchars.com', 'foo@no-rep-dots..com', 'foo@badtld.xxx', 'foo@toolongtld.abcdefg',
-# 'Iñtërnâtiônàlizætiøn@hasnt.happened.to.email', 'need.domain.and.tld@de', "tab\t", "newline\n",
-# 'r@.wk', '1234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890@gmail2.com',
-# # these are technically allowed but not seen in practice:
-# 'uucp!addr@gmail.com', 'semicolon;@gmail.com', 'quote"@gmail.com', 'tick\'@gmail.com', 'backtick`@gmail.com', 'space @gmail.com', 'bracket<@gmail.com', 'bracket>@gmail.com'
|