imw 0.1.0 → 0.1.1
- data/README.rdoc +194 -31
- data/VERSION +1 -1
- data/bin/imw +5 -0
- data/lib/imw/boot.rb +0 -15
- data/lib/imw/dataset/paths.rb +38 -0
- data/lib/imw/dataset/task.rb +21 -18
- data/lib/imw/dataset/workflow.rb +126 -65
- data/lib/imw/dataset.rb +56 -82
- data/lib/imw/files/basicfile.rb +3 -3
- data/lib/imw/files/compressed_files_and_archives.rb +23 -37
- data/lib/imw/files/csv.rb +2 -1
- data/lib/imw/files/directory.rb +62 -0
- data/lib/imw/files/excel.rb +84 -0
- data/lib/imw/files/sgml.rb +4 -23
- data/lib/imw/files.rb +62 -47
- data/lib/imw/packagers/archiver.rb +19 -1
- data/lib/imw/packagers/s3_mover.rb +8 -0
- data/lib/imw/parsers/html_parser/matchers.rb +251 -268
- data/lib/imw/parsers/html_parser.rb +181 -176
- data/lib/imw/parsers.rb +1 -1
- data/lib/imw/repository.rb +35 -0
- data/lib/imw/runner.rb +114 -0
- data/lib/imw/utils/extensions/core.rb +0 -16
- data/lib/imw/utils/paths.rb +0 -28
- data/lib/imw.rb +21 -32
- metadata +11 -19
- data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +0 -37
- data/lib/imw/dataset/datamapper.rb +0 -66
- data/lib/imw/dataset/loaddump.rb +0 -50
- data/lib/imw/dataset/old/file_collection.rb +0 -88
- data/lib/imw/dataset/old/file_collection_utils.rb +0 -71
- data/lib/imw/dataset/scaffold.rb +0 -132
- data/lib/imw/dataset/scraped_uri.rb +0 -305
- data/lib/imw/dataset/scrub/old_working_scrubber.rb +0 -87
- data/lib/imw/dataset/scrub/scrub.rb +0 -147
- data/lib/imw/dataset/scrub/scrub_simple_url.rb +0 -38
- data/lib/imw/dataset/scrub/scrub_test.rb +0 -60
- data/lib/imw/dataset/scrub/slug.rb +0 -101
- data/lib/imw/dataset/stats/counter.rb +0 -23
- data/lib/imw/dataset/stats.rb +0 -73
data/lib/imw/dataset/scraped_uri.rb +0 -305
@@ -1,305 +0,0 @@
-# -*- coding: utf-8 -*-
-module Linkish
-  def self.included base
-    base.class_eval do
-      include DataMapper::Resource
-      include Infochimps::Resource
-      property :id, Integer, :serial => true
-      property :full_url, String, :length => 255, :nullable => false, :unique_index => true
-      has_handle
-      alias_method :handle_generator, :full_url
-      has_time_and_user_stamps
-      #
-      property :name, String, :length => 255, :nullable => false, :default => ''
-      #
-      property :file_path, String, :length => 1024
-      property :file_time, DateTime
-      property :file_size, Integer
-      property :file_sha1, String, :length => 40
-      property :tried_fetch, DataMapper::Resource::Boolean
-      property :fetched, DataMapper::Resource::Boolean
-      #
-      before :create, :make_uuid_and_handle
-      before :create, :update_from_file!
-    end
-    base.extend ClassMethods
-  end
-
-  # ===========================================================================
-  #
-  # Delegate methods to uri
-  #
-  def uri
-    @uri ||= Addressable::URI.parse(self.full_url)
-  end
-  # Dispatch anything else to the aggregated uri object
-  def method_missing method, *args
-    if self.uri.respond_to?(method)
-      self.uri.send(method, *args)
-    else
-      super method, *args
-    end
-  end
-
-  def to_s
-    "<a href='#{self.uri.to_s}'>#{self.name}</a>" # <-- !! not escaped !!
-  end
-
-  # ===========================================================================
-  #
-  # ID, naming, etc
-  #
-  def normalize_url!
-    u = Addressable::URI.parse(self.full_url).normalize
-    self.full_url = u.to_s
-  end
-
-  # ===========================================================================
-  #
-  # Properly belongs in FileStore module
-  #
-  #
-  # Refresh cached properties from our copy of the asset.
-  #
-  def update_from_file!
-    self.make_uuid_and_handle # make sure this happened
-    # Set the file path
-    self.file_path = self.to_file_path if self.file_path.blank?
-    # FIXME -- kludge to ripd_root
-    if ! File.exist?(actual_path)
-      self.fetched = false
-    else
-      self.fetched = self.tried_fetch = true
-      self.file_size = File.size( actual_path)
-      self.file_time = File.mtime(actual_path)
-    end
-    self.fetched
-  end
-  def actual_path
-    path_to(:ripd_root, self.file_path)
-  end
-
-  # ===========================================================================
-  #
-  # Properly belongs in own module
-  #
-
-  IMW_WGET_OPTIONS = {
-    :root => :ripd_root,
-    :wait => 2,
-    :noretry => true,
-    :log_level => Logger::DEBUG,
-    :clobber => false,
-  }
-  #
-  # Fetch from the web
-  #
-  def wget options={}
-    options.reverse_merge! IMW_WGET_OPTIONS
-    cd path_to(options[:root]) do
-      if (not options[:clobber]) && File.file?(file_path) then
-        IMW.log.add options[:log_level], "Skipping #{file_path}"; return
-      end
-      # Do the fetch
-      mkdir_p File.dirname(actual_path)
-      # defaults are --connect-timeout=infinity --read-timeout=900 --tries=20 acc. to man page
-      cmd = %Q{wget -nv "#{full_url}" -O"#{actual_path}" --connect-timeout=5 --read-timeout=10 --tries=1 &}
-      IMW.log.add(options[:log_level], cmd)
-      IMW.log.add(options[:log_level], `#{cmd}`)
-      self.tried_fetch = true
-      sleep options[:wait] # please hammer don't hurt em
-      update_from_file!
-      self.save
-      return self.fetched
-    end
-  end
-
-  #
-  #
-  #
-  def contents options={}
-    wget options
-    if fetched
-      File.open actual_path
-    end
-  end
-
-  # ===========================================================================
-  #
-  # Properly belongs in FileStore
-  #
-
-  protected
-  #
-  # The standard file path for this url's ripped cache
-  #
-  # * leading directory from reverse.dotted.host_scheme:port:user@password
-  # * normalized path/file?query#fragment
-  # * uuid formed from the
-  #
-  def to_file_path
-    file_path_str = ""
-    file_path_str << to_file_path_root_part
-    file_path_str << to_file_path_path_part
-    file_path_str << to_file_path_file_part
-    file_path_str = self.class.path_str_encode(file_path_str)
-    self.class.validate_roundtrip(file_path_str)
-    file_path_str
-  end
-  def file_timestamp
-    file_time.strftime("%Y%m%d-%H%M%S")
-  end
-  def to_file_path_with_timestamp
-    to_file_path + file_timestamp
-  end
-  #
-  # revhost_scheme:port:user@password -- omitting _scheme if it's http, and
-  # omitting :port:user@password if all three are blank.
-  #
-  def to_file_path_root_part
-    root_part_str = ""
-    tld_host_frag = self.class.tier_path_segment(revhost, /^([^\.]+)\.([^\.]{1,2})/)
-    root_part_str << revhost
-    root_part_str << "_#{uri.scheme}" unless uri.scheme == 'http'
-    root_part_str << ":#{uri.port}:#{uri.user}@#{uri.password}" unless uri.simple?
-    root_part_str
-  end
-  def to_file_path_path_part
-    uri.path.to_s
-  end
-  def to_file_path_file_part
-    file_path_str = ""
-    file_path_str << "?#{uri.query}" unless uri.query.nil?
-    file_path_str << "##{uri.fragment}" unless uri.fragment.nil?
-    file_path_str << "-#{self.uuid}"
-  end
-  public
-
-
-  module ClassMethods
-    #
-    # find_or_creates from url
-    #
-    # url is heuristic_parse'd and normalized by Addressable before lookup:
-    # "Converts an input to a URI. The input does not have to be a valid URI —
-    # the method will use heuristics to guess what URI was intended. This is not
-    # standards compliant, merely user-friendly.
-    #
-    def find_or_create_from_url url_str
-      link = self.find_or_new_from_url url_str
-      link.save
-      link
-    end
-    def find_or_new_from_url url_str # :nodoc:
-      url_str = Addressable::URI.heuristic_parse(url_str).normalize.to_s
-      link = self.first( :full_url => url_str ) || self.new( :full_url => url_str )
-      link.make_uuid_and_handle
-      link.update_from_file!
-      link
-    end
-    def find_or_create_from_file_path ripd_file
-      url_str = Link.url_from_file_path(ripd_file)
-      link = self.first( :full_url => url_str.to_s ) || self.new( :full_url => url_str.to_s )
-      link.file_path = ripd_file
-      link.make_uuid_and_handle
-      link.update_from_file!
-      link.save
-      link
-    end
-    #
-    # Decode url from its file_path
-    #
-    def url_from_file_path fp
-      fp = path_str_decode(fp)
-      m = (%r{\A
-        (#{Addressable::URI::HOST_TLD}) # tld tier
-        /(..?) # revhost tier
-        /([^/\:_]+) # revhost
-        (?:_([^/\:]+))? # _scheme
-        (?::(\d*):([^/]*)@([^@/]*?))? # :port:user@password
-        /(?:(.*?)/)? # /dirs/
-        ([^/]*) # file
-        -([a-f0-9]{32}) # -uuid
-        \z}x.match(fp))
-      raise "Can't extract url from file path #{fp}" if !m
-      fp_host, fp_scheme, fp_port, fp_user, fp_pass, fp_path, fp_file, fp_uuid = m.captures
-      fp_host = fp_host.split('.').reverse.join('.')
-      fp_scheme ||= 'http'
-      fp_pass = ":#{fp_pass}" unless fp_pass.blank?
-      fp_userpass = "#{fp_user}#{fp_user}@" unless fp_user.blank?
-      fp_port = ":#{fp_port}" unless fp_port.blank?
-      fp_path = File.join(*[fp_path, fp_file].compact)
-      "#{fp_scheme}://#{fp_userpass}#{fp_host}#{fp_port}/#{fp_path}"
-    end
-    #
-    # to control files-per-directory madness, take a path segment like "foobar" in
-    #   blah.com/top/foobar/directory
-    # and transform into
-    #   blah.com/top/fo/foobar/directory
-    #
-    # Ex.
-    #   self.class.tier_path_segment('a_username')
-    #   # => 'a_/a_username'
-    #   self.class.tier_path_segment('1')
-    #   # => '1/1'
-    #   self.class.tier_path_segment('com.twitter', /^([^\.]+)\.([^\.]{1,2})/)
-    #   # => 'com/tw/com.twitter'
-    #
-    def self.tier_path_segment(path_seg, re=/(..?)/)
-      frag_seg = re.match(path_seg).captures
-      raise "Can't tier path_seg #{path_seg} using #{re}" if frag_seg.blank?
-      File.join(* [frag_seg, path_seg].flatten )
-    end
-    #
-    #
-    # It's really bad if you can't roundtrip --
-    # since saving is the rare case (only done once!) we insist on checking.
-    #
-    def self.validate_roundtrip file_path_str
-      # uu = self.class.url_from_file_path(file_path_str)
-      # puts "*"*75, uri.to_hash.inspect, ['path str', file_path_str, 'uri', uri.to_s, 'rt', uu.to_s].inspect
-      return_trip_url = Addressable::URI.parse(self.class.url_from_file_path(file_path_str))
-      raise "crapsticks: uri doesn't roundtrip #{file_path_str} to #{uri.to_s}: #{return_trip_url}" if return_trip_url != uri
-    end
-    #
-    # Uses a similar scheme as the 'Quoted Printable' encoding, but more strict
-    # and without linebreaking or anything. The intent is to reversibly and
-    # recognizably store URLs to disk with names that (apart from path) do not
-    # need to be further escaped in filesystem, URL, database or HTML.
-    #
-    # The only characters in a path_encoded string are alpha-numeric /_-.=
-    #
-    # Rules:
-    # * Any character that is not alphanumeric, and is not /_-. is encoded as an
-    #   equals sign = followed by its upper-case hex encoding.
-    #
-    # * Furthermore, in any sequence of repeated '.' characters, all after the
-    #   first are hex encoded; same with '/'.
-    #
-    # Ex.
-    #   path_encode("www.measuringworth.com/datasets/consumer/result.php?use[]=VCB&use[]=CU&use[]=SZ&year_source=1900&year_result=2007"
-    #   # => www.measuringworth.com/datasets/consumer/result.php=3Fuse=5B=5D=3DVCB=26use=5B=5D=3DCU=26use=5B=5D=3DSZ=26year_source=3D1900=26year_result=3D2007
-    #
-    # Code inspired by "Glenn Parker's response to ruby quiz #23"http://www.rubyquiz.com/quiz23.html
-    #
-    def path_str_encode(str)
-      str.gsub(%r{\.(\.+)}){|chars| '.'+path_encode_chars(chars) }
-      str.gsub(%r{\/(\/+)}){|chars| '/'+path_encode_chars(chars) }
-      str.gsub(%r{[^A-Za-z0-9/_\-\.]+}){|chars| path_encode_chars(chars) }
-    end
-    #
-    # See the notes in path_encode
-    #
-    def path_str_decode(str)
-      str.gsub(/\+([\dA-F]{2})/){ $1.hex.chr }
-    end
-    protected
-    def path_encode_chars(chars) # :nodoc:
-      # send each character to an equals sign followed by its uppercase hex encoding
-      encoded = "";
-      chars.each_byte{|c| encoded << "+%02X" % c }
-      encoded
-    end
-    public
-  end
-end
data/lib/imw/dataset/scrub/old_working_scrubber.rb +0 -87
@@ -1,87 +0,0 @@
-
-def self.url_from_file_path fp
-  # FIXME -- doesn't work with extension preservation
-  unless m = (%r{\A([^/_]+)(_[^/]+)?/(?:(.*?)/)?([^/]*)-([a-f0-9]{30,32})\z}.match(fp)) then
-    # m1 = %r{\A([^/_]+)(_[^/]+)?/(?:(.*?))-([a-f0-9]{28,})}i.match(fp);
-    raise "Bad match to #{fp}"
-  end
-  fp_host, fp_scheme, fp_path, fp_file, fp_uuid, fp_ext = m.captures
-  fp_host = fp_host.split('.').reverse.join('.')
-  fp_scheme ||= 'http'
-  fp_path = File.join(*[fp_path, fp_file].compact) # FIXME -- no ext
-  url = Addressable::URI.new(fp_scheme, nil, nil, fp_host, nil, fp_path, nil, nil)
-  unless m = (%r{\A([^/_]+)(_[^/]+)?/(?:(.*?)/)?([^/]*)-([a-f0-9]{32})\z}.match(fp)) then
-    # warn "Bad luck!!! #{url.path} hash is #{fp_uuid} vs #{UUID.sha1_create(UUID_INFOCHIMPS_LINKS_NAMESPACE, url.to_s).hexdigest}"
-  end
-  url
-end
-
-#
-# returns [dirname, basename, ext] for the file_path
-# ext is determined by basename_ext_splitter
-#
-def path_split
-  path_split_str path
-end
-
-# lowercase; only a-z, num, . -
-def scrubbed_revhost
-  return unless revhost
-  revhost.downcase.gsub(/[^a-z0-9\.\-]+/i, '') # note: no _
-end
-
-cattr_accessor :basename_ext_splitter
-BASENAME_EXT_SPLIT_SMART = /(.+?)\.(tar\.gz|tar\.bz2|[^\.]+)/
-BASENAME_EXT_NO_SPLIT = /(.+?)()/
-self.basename_ext_splitter = BASENAME_EXT_NO_SPLIT
-
-#
-# Like File.split but heuristically handles things like .tar.bz2:
-#
-# foo. => ['foo.', '']
-# foo.tar.gz => ['foo.', '']
-# foo.tar.bz2 => ['foo.', '']
-# foo.yaml => ['foo', '']
-#
-def path_split_str str
-  if str =~ %r{/.+\z}
-    dirname, basename = %r{\A(.*)/([^/]+)\z}.match(str).captures
-  else
-    dirname, basename = ['', str]
-  end
-  # Get basename, extension (as given by capture groups in basename_ext_splitter)
-  if basename_ext_splitter && (m = /\A#{basename_ext_splitter}\z/i.match(basename))
-    basename, ext = m.captures
-  else
-    basename, ext = [basename, '']
-  end
-  [dirname, basename, ext]
-end
-
-# remove all blank components, join the rest with separator
-def join_non_blank separator, *strs
-  strs.reject(&:blank?).join(separator)
-end
-
-# only a-z A-Z, num, .-_/
-def scrubbed_path
-  path_part = path
-  # colons into /
-  path_part = path_part.gsub(%r{\:+}, '/')
-  # Kill weird chars
-  path_part = path_part.gsub(%r{[^a-zA-Z0-9\.\-_/]+}, '_')
-  # Compact (killing foo/../bar, etc)
-  path_part = path_part.gsub(%r{/[^a-zA-Z0-9]+/}, '/').gsub(%r{/\.\.+/}, '.')
-  # Kill leading & trailing non-alnum
-  path_part = path_part.gsub(%r{^[^a-zA-Z0-9]+}, '').gsub(%r{[^a-zA-Z0-9]+$}, '')
-end
-
-#
-# name for this URL regarded as a file (instance)
-#
-def to_file_path
-  dirname, basename, ext = path_split_str(scrubbed_path)
-  basename = join_non_blank '-', basename, uuid
-  basename = join_non_blank '.', basename, ext
-  join_non_blank '/', root_path, dirname, basename
-end
data/lib/imw/dataset/scrub/scrub.rb +0 -147
@@ -1,147 +0,0 @@
-# -*- coding: utf-8 -*-
-require 'rubygems'
-require 'active_support'
-require 'uuidtools'
-
-module Scrub
-  class Generic
-    # A regular expression character group
-    # (a bunch of characters ready to drop into /[#{validator}]*/)
-    # whitelisting allowed characters
-    #
-    # Must be overridden in child class
-    class_inheritable_accessor :validator
-
-    # Sentence fragment for error message on failed validation.
-    class_inheritable_accessor :complaint
-    self.complaint = "has characters I can't understand"
-
-    # Proc or string or anything that can be 2nd arg to gsub
-    # to sanitize
-    class_inheritable_accessor :replacer
-    self.replacer = '-'
-
-    # A regular expression to sanitize objects
-    # if unset or nil, the validator char group
-    class_inheritable_accessor :sanitizer
-
-    # unless overridden or set expressly, just use the
-    # validator
-    def sanitizer
-      @sanitizer || self.validator
-    end
-
-    def sanitize str
-      str = str.to_s
-      str.gsub(%r{([^#{validator.to_s}]+)}u, replacer)
-    end
-
-    def valid? str
-      %r{\A([#{validator.to_s}]*)\z}u.match(str)
-    end
-  end
-
-  #
-  # A permissive, ASCII-only name string - no control chars, newlines, backslash
-  # or <> angle brackets
-  #
-  class Title < Scrub::Generic
-    self.complaint = "should only contain basic keyboard characters (and should not use \\ < or >)."
-    self.validator = %r{a-zA-Z0-9_ ~\!@#\$%\^&\*\(\)\-\+=;\:'"`\[\]\{\}\|,\?\.\/}u
-  end
-
-  #
-  # A permissive, ASCII-only name string - no control chars, newlines, backslash
-  # or <> angle brackets
-  #
-  class UnicodeTitle < Scrub::Title
-    self.complaint = "should only contain keyboard characters (and should not use \\ < or >)."
-    self.validator = %r{[:alpha:][:digit:]#{Scrub::Title.validator}}u
-  end
-
-  #
-  # Visible characters and spaces (i.e. anything except control characters, etc.)
-  #
-  class FreeText < Scrub::Generic
-    self.complaint = "should not contain control characters or that kind of junk."
-    self.validator = %r{[:print:]\n\t}u
-  end
-
-  module BeginsWithAlpha
-    mattr_accessor :slug
-    self.slug = 'x'
-    # prepend #{slug}#{replacer} to the string if it starts with non-alpha.
-    # so, for instance '23jumpstreet' => 'x_23jumpstreet'
-    def sanitize_with_begins_with_alpha str
-      str = sanitize_without_begins_with_alpha str
-      str = 'x' + replacer + str if (str !~ /^[a-z]/i) # call at end of chain!
-      str
-    end
-    def valid_with_begins_with_alpha? str
-      (str =~ /^[a-z]/i) && valid_without_begins_with_alpha?(str)
-    end
-    def self.included base
-      base.alias_method_chain :sanitize, :begins_with_alpha # unless defined?(base.sanitize_without_begins_with_alpha)
-      base.alias_method_chain :valid?, :begins_with_alpha # unless defined?(base.valid_without_begins_with_alpha?)
-    end
-  end
-
-  #
-  # insist that a string be lowercased.
-  #
-  module Lowercased
-    def sanitize_with_lowercased str
-      str = sanitize_without_lowercased str
-      str.downcase # call at end of chain!
-    end
-    def valid_with_lowercase? str
-      (str !~ /[[:upper:]]/u) && valid_without_lowercase?(str)
-    end
-    def self.included base
-      base.alias_method_chain :sanitize, :lowercased # unless defined?(base.sanitize_without_lowercased)
-      base.alias_method_chain :valid?, :lowercase # unless defined?(base.valid_without_lowercase?)
-    end
-  end
-
-  #
-  # start with a letter, and contain only A-Za-z0-9_
-  #
-  class Identifier < Scrub::Generic
-    self.complaint = "should be an identifier: it should start with a letter, and contain only a-z, 0-9 and '_'."
-    self.validator = %r{a-z0-9_}u
-    self.replacer = '_'
-    include Scrub::BeginsWithAlpha
-    include Scrub::Lowercased
-  end
-
-  #
-  # start with a letter, and contain only A-Za-z0-9_
-  #
-  class Handle < Scrub::Generic
-    self.complaint = "should be an identifier: it should start with a letter, and contain only a-z, 0-9 and '_'."
-    self.validator = %r{a-z0-9_}u
-    self.replacer = '_'
-    include Scrub::BeginsWithAlpha
-    include Scrub::Lowercased
-  end
-
-  # HANDLE_RE = %r{\A[a-z][]*\z}i # ascii, not :alpha: etc.
-  # HANDLE_MSG = "should start with a letter, and contain only characters like a-z0-9_-."
-  #
-  # # "Domain names are restricted to the ASCII letters a through z
-  # # (case-insensitive), the digits 0 through 9, and the hyphen, with some other
-  # # restrictions in terms of name length and position of hyphens."
-  # # (http://en.wikipedia.org/wiki/Domain_name#Overview)
-  # # http://tools.ietf.org/html/rfc1034
-  # DOMAIN_RE = %r{\A[a-z][a-z0-9\-][a-z0-9]\z}i # case insensitive
-  # DOMAIN_MSG = "should look like a domain name."
-  # DOMAIN_MORE = "only letters, digits or hyphens (-), start with a letter and end with a letter or number."
-  MSG_EMAIL_BAD = "should look like an email address (you@somethingsomething.com) and include only letters, numbers and . + - % please."
-  RE_EMAIL_NAME = '[\w\.%\+\-]+' # what you actually see in practice
-  RE_EMAIL_N_RFC2822 = '0-9A-Z!#\$%\&\'\*\+_/=\?^\-`\{|\}~\.' # technically allowed by RFC-2822
-  RE_DOMAIN_HEAD = '(?:[A-Z0-9\-]+\.)+'
-  RE_DOMAIN_TLD = '(?:[A-Z]{2}|com|org|net|edu|gov|mil|biz|info|mobi|name|aero|jobs|museum)'
-  RE_EMAIL_OK = /\A#{RE_EMAIL_NAME}@#{RE_DOMAIN_HEAD}#{RE_DOMAIN_TLD}\z/i
-  RE_EMAIL_RFC2822 = /\A#{RE_EMAIL_N_RFC2822}@#{RE_DOMAIN_HEAD}#{RE_DOMAIN_TLD}\z/i
-
-end
data/lib/imw/dataset/scrub/scrub_simple_url.rb +0 -38
@@ -1,38 +0,0 @@
-
-
-module IMW
-  module URIScrubber
-
-    def scrubbed
-      to_dirpath
-    end
-  end
-end
-
-module Scrub
-  #
-  # start with a letter, and contain only A-Za-z0-9_
-  #
-  class SimplifiedURL < Scrub::Generic
-    self.complaint = "should follow our zany simplified URL rules: com.host.dot-reversed:schemeifnothttp/path/seg_men-ts/stuff.ext-SHA1ifweird"
-    self.validator = %r{#{Addressable::URI::SAFE_CHARS}#{Addressable::URI::RESERVED_CHARS}}u
-    self.replacer = ''
-    include Scrub::Lowercased
-    attr_accessor :uri
-
-    def valid? str
-      str.to_s.downcase == sanitize(str)
-    end
-
-    def sanitize str
-      # if this fails just normalize once, or don't set $KCODE: http://bit.ly/1664vp
-      uri = Addressable::URI.heuristic_parse(str.to_s).normalize
-      # print [uri.host, uri.host_valid?, uri.path, uri.path_valid?].inspect
-      if uri.host_valid?
-        uri.scrubbed
-      else
-        uri.uuid_path
-      end
-    end
-  end
-end
data/lib/imw/dataset/scrub/scrub_test.rb +0 -60
@@ -1,60 +0,0 @@
-#!/usr/bin/env ruby
-# -*- coding: utf-8 -*-
-require 'scrub'
-require 'scrub_simple_url'
-
-test_strings = [
-  nil, '', '12', '123', 'simple', 'UPPER', 'CamelCased', 'iden_tifier_23_',
-  'twentyfouralphacharslong', 'twentyfiveatozonlyletters', 'hello.-_there@funnychar.com',
-  "tab\t", "newline\n",
-  "Iñtërnâtiônàlizætiøn",
-  'semicolon;', 'quote"', 'tick\'', 'backtick`', 'percent%', 'plus+', 'space ',
-  'leftanglebracket<', 'ampersand&',
-  "control char-bel\x07",
-  "http://foo.bar.com/",
-  "HTTP://FOO.BAR.com",
-  ".com/zazz",
-  "scheme://user_name@user_acct:passwd@host-name.museum:9047/path;pathquery/p!a-th~2/path?query=param&query=pa%20ram#fragment",
-  "http://web.site.com/path/path/file.ext",
-  "ftp://ftp.site.com/path/path/file.ext",
-  "/absolute/pathname/file.ext",
-  "http://foo.bar.com/.hidden_file_with.ext",
-  "http://foo.bar.com/.hidden_file",
-  "dir/--/non_alpha_path_segment.ext",
-  "http://foo.bar.com/dir/../two_dots_in_path",
-
-]
-
-
-scrubbers = {
-  # :unicode_title => Scrub::UnicodeTitle.new,
-  # :title => Scrub::Title.new,
-  # :identifier => Scrub::Identifier.new,
-  # :free_text => Scrub::FreeText.new,
-  :handle => Scrub::Handle.new,
-  :simplified_url => Scrub::SimplifiedURL.new,
-  # :domain => Scrub::Domain.new,
-  # :email => Scrub::Email.new,
-}
-
-scrubbers.each do |scrubber_name, scrubber|
-  puts scrubber_name
-  results = test_strings.map do |test_string|
-    [!!scrubber.valid?(test_string), scrubber.sanitize(test_string).inspect, test_string.inspect ]
-  end
-  results.sort_by{|val,san,orig| val ? 1 : -1 }.each do |val,san,orig|
-    puts " %-5s %-30s %-30s" % [val,san,orig]
-  end
-end
-
-
-
-# 'foo@bar.com', 'foo@newskool-tld.museum', 'foo@twoletter-tld.de', 'foo@nonexistant-tld.qq',
-# 'r@a.wk', '1234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890@gmail.com',
-# 'hello.-_there@funnychar.com', 'uucp%addr@gmail.com', 'hello+routing-str@gmail.com',
-# 'domain@can.haz.many.sub.doma.in',],
-# :invalid => [nil, '', '!!@nobadchars.com', 'foo@no-rep-dots..com', 'foo@badtld.xxx', 'foo@toolongtld.abcdefg',
-# 'Iñtërnâtiônàlizætiøn@hasnt.happened.to.email', 'need.domain.and.tld@de', "tab\t", "newline\n",
-# 'r@.wk', '1234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890@gmail2.com',
-# # these are technically allowed but not seen in practice:
-# 'uucp!addr@gmail.com', 'semicolon;@gmail.com', 'quote"@gmail.com', 'tick\'@gmail.com', 'backtick`@gmail.com', 'space @gmail.com', 'bracket<@gmail.com', 'bracket>@gmail.com'