imw 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +15 -0
- data/CHANGELOG +0 -0
- data/LICENSE +674 -0
- data/README.rdoc +101 -0
- data/Rakefile +20 -0
- data/VERSION +1 -0
- data/etc/imwrc.rb +76 -0
- data/lib/imw.rb +42 -0
- data/lib/imw/boot.rb +58 -0
- data/lib/imw/dataset.rb +233 -0
- data/lib/imw/dataset/datamapper.rb +66 -0
- data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
- data/lib/imw/dataset/loaddump.rb +50 -0
- data/lib/imw/dataset/old/file_collection.rb +88 -0
- data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
- data/lib/imw/dataset/scaffold.rb +132 -0
- data/lib/imw/dataset/scraped_uri.rb +305 -0
- data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
- data/lib/imw/dataset/scrub/scrub.rb +147 -0
- data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
- data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
- data/lib/imw/dataset/scrub/slug.rb +101 -0
- data/lib/imw/dataset/stats.rb +73 -0
- data/lib/imw/dataset/stats/counter.rb +23 -0
- data/lib/imw/dataset/task.rb +38 -0
- data/lib/imw/dataset/workflow.rb +81 -0
- data/lib/imw/files.rb +110 -0
- data/lib/imw/files/archive.rb +113 -0
- data/lib/imw/files/basicfile.rb +122 -0
- data/lib/imw/files/binary.rb +28 -0
- data/lib/imw/files/compressed_file.rb +93 -0
- data/lib/imw/files/compressed_files_and_archives.rb +348 -0
- data/lib/imw/files/compressible.rb +103 -0
- data/lib/imw/files/csv.rb +112 -0
- data/lib/imw/files/json.rb +41 -0
- data/lib/imw/files/sgml.rb +65 -0
- data/lib/imw/files/text.rb +68 -0
- data/lib/imw/files/yaml.rb +46 -0
- data/lib/imw/packagers.rb +8 -0
- data/lib/imw/packagers/archiver.rb +108 -0
- data/lib/imw/packagers/s3_mover.rb +28 -0
- data/lib/imw/parsers.rb +7 -0
- data/lib/imw/parsers/html_parser.rb +382 -0
- data/lib/imw/parsers/html_parser/matchers.rb +306 -0
- data/lib/imw/parsers/line_parser.rb +87 -0
- data/lib/imw/parsers/regexp_parser.rb +72 -0
- data/lib/imw/utils.rb +24 -0
- data/lib/imw/utils/components.rb +61 -0
- data/lib/imw/utils/config.rb +46 -0
- data/lib/imw/utils/error.rb +54 -0
- data/lib/imw/utils/extensions/array.rb +125 -0
- data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
- data/lib/imw/utils/extensions/core.rb +43 -0
- data/lib/imw/utils/extensions/dir.rb +24 -0
- data/lib/imw/utils/extensions/file_core.rb +64 -0
- data/lib/imw/utils/extensions/hash.rb +218 -0
- data/lib/imw/utils/extensions/hpricot.rb +48 -0
- data/lib/imw/utils/extensions/string.rb +49 -0
- data/lib/imw/utils/extensions/struct.rb +42 -0
- data/lib/imw/utils/extensions/symbol.rb +28 -0
- data/lib/imw/utils/extensions/typed_struct.rb +22 -0
- data/lib/imw/utils/extensions/uri.rb +59 -0
- data/lib/imw/utils/log.rb +67 -0
- data/lib/imw/utils/misc.rb +63 -0
- data/lib/imw/utils/paths.rb +115 -0
- data/lib/imw/utils/uri.rb +59 -0
- data/lib/imw/utils/uuid.rb +33 -0
- data/lib/imw/utils/validate.rb +38 -0
- data/lib/imw/utils/version.rb +12 -0
- data/lib/imw/utils/view.rb +113 -0
- data/lib/imw/utils/view/dump_csv.rb +112 -0
- data/lib/imw/utils/view/dump_csv_older.rb +117 -0
- data/spec/data/sample.csv +131 -0
- data/spec/data/sample.tsv +131 -0
- data/spec/data/sample.txt +131 -0
- data/spec/data/sample.xml +653 -0
- data/spec/data/sample.yaml +652 -0
- data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
- data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
- data/spec/imw/files/archive_spec.rb +118 -0
- data/spec/imw/files/basicfile_spec.rb +121 -0
- data/spec/imw/files/bz2_spec.rb +32 -0
- data/spec/imw/files/compressed_file_spec.rb +96 -0
- data/spec/imw/files/compressible_spec.rb +100 -0
- data/spec/imw/files/file_spec.rb +144 -0
- data/spec/imw/files/gz_spec.rb +32 -0
- data/spec/imw/files/rar_spec.rb +33 -0
- data/spec/imw/files/tar_spec.rb +31 -0
- data/spec/imw/files/text_spec.rb +23 -0
- data/spec/imw/files/zip_spec.rb +31 -0
- data/spec/imw/files_spec.rb +38 -0
- data/spec/imw/packagers/archiver_spec.rb +125 -0
- data/spec/imw/packagers/s3_mover_spec.rb +7 -0
- data/spec/imw/parsers/line_parser_spec.rb +96 -0
- data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
- data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
- data/spec/imw/utils/extensions/find_spec.rb +113 -0
- data/spec/imw/utils/paths_spec.rb +38 -0
- data/spec/imw/workflow/rip/local_spec.rb +89 -0
- data/spec/imw/workflow/rip_spec.rb +27 -0
- data/spec/rcov.opts +1 -0
- data/spec/spec.opts +4 -0
- data/spec/spec_helper.rb +32 -0
- data/spec/support/archive_contents_matcher.rb +94 -0
- data/spec/support/custom_matchers.rb +21 -0
- data/spec/support/directory_contents_matcher.rb +61 -0
- data/spec/support/extensions.rb +18 -0
- data/spec/support/file_contents_matcher.rb +50 -0
- data/spec/support/random.rb +210 -0
- data/spec/support/without_regard_to_order_matcher.rb +58 -0
- metadata +196 -0
@@ -0,0 +1,305 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
module Linkish
|
3
|
+
# Hook run when Linkish is mixed into a class: declares the DataMapper
# properties and create-callbacks on the including class, then extends it
# with the ClassMethods finders.
def self.included(base)
  base.class_eval do
    include(DataMapper::Resource)
    include(Infochimps::Resource)

    # identity: serial key plus the unique url the handle is generated from
    property(:id,       Integer, :serial => true)
    property(:full_url, String,  :length => 255, :nullable => false, :unique_index => true)
    has_handle
    alias_method(:handle_generator, :full_url)
    has_time_and_user_stamps

    # display name
    property(:name, String, :length => 255, :nullable => false, :default => '')

    # cached facts about the local copy of the asset
    property(:file_path,   String,  :length => 1024)
    property(:file_time,   DateTime)
    property(:file_size,   Integer)
    property(:file_sha1,   String,  :length => 40)
    property(:tried_fetch, DataMapper::Resource::Boolean)
    property(:fetched,     DataMapper::Resource::Boolean)

    # both hooks run before a record is first created
    before(:create, :make_uuid_and_handle)
    before(:create, :update_from_file!)
  end
  base.extend(ClassMethods)
end
|
27
|
+
|
28
|
+
# ===========================================================================
|
29
|
+
#
|
30
|
+
# Delegate methods to uri
|
31
|
+
#
|
32
|
+
# The parsed Addressable::URI for full_url. Parsed on first use and cached
# in @uri for later calls.
def uri
  return @uri if @uri
  @uri = Addressable::URI.parse(self.full_url)
end
|
35
|
+
# Dispatch anything else to the aggregated uri object
|
36
|
+
# Dispatch anything this object does not define itself to the aggregated
# uri object. Messages the uri does not answer either fall through to the
# default method_missing (and so raise NoMethodError as usual).
#
# The caller's block, if any, is forwarded along with the arguments.
def method_missing(method, *args, &block)
  if uri.respond_to?(method)
    uri.send(method, *args, &block)
  else
    super
  end
end

# Keep respond_to? consistent with the delegation done in method_missing.
def respond_to_missing?(method, include_private = false)
  uri.respond_to?(method) || super
end
|
43
|
+
|
44
|
+
require 'cgi'

# Render this link as an HTML anchor: the url as href, the name as text.
#
# Both values are HTML-escaped so that a url or name containing quotes,
# angle brackets or ampersands cannot break out of the markup.
def to_s
  href = CGI.escapeHTML(uri.to_s)
  text = CGI.escapeHTML(name.to_s)
  "<a href='#{href}'>#{text}</a>"
end
|
47
|
+
|
48
|
+
# ===========================================================================
|
49
|
+
#
|
50
|
+
# ID, naming, etc
|
51
|
+
#
|
52
|
+
# Replace full_url with its Addressable-normalized string form.
# NOTE(review): the @uri cached by #uri is not reset here -- confirm that is intended.
def normalize_url!
  self.full_url = Addressable::URI.parse(self.full_url).normalize.to_s
end
|
56
|
+
|
57
|
+
# ===========================================================================
|
58
|
+
#
|
59
|
+
# Properly belongs in FileStore module
|
60
|
+
#
|
61
|
+
#
|
62
|
+
# Refresh cached properties from our copy of the asset.
|
63
|
+
#
|
64
|
+
# Refresh the cached file properties from the local copy of the asset.
#
# Ensures the uuid/handle exist, fills in file_path when it is blank, then
# records whether the file is present at actual_path (and, if so, its size
# and mtime). Returns the resulting value of fetched.
def update_from_file!
  self.make_uuid_and_handle # make sure this happened
  self.file_path = self.to_file_path if self.file_path.blank?
  # FIXME -- kludge to ripd_root
  if File.exist?(actual_path)
    self.tried_fetch = true
    self.fetched     = true
    self.file_size   = File.size(actual_path)
    self.file_time   = File.mtime(actual_path)
  else
    self.fetched = false
  end
  self.fetched
end
|
78
|
+
# Location of the local copy: file_path resolved under the :ripd_root path.
def actual_path
  path_to(:ripd_root, file_path)
end
|
81
|
+
|
82
|
+
# ===========================================================================
|
83
|
+
#
|
84
|
+
# Properly belongs in own module
|
85
|
+
#
|
86
|
+
|
87
|
+
# Defaults merged under whatever options the caller passes to #wget.
IMW_WGET_OPTIONS = {
  :root      => :ripd_root,     # path_to key of the directory fetched into
  :wait      => 2,              # seconds slept after each fetch
  :noretry   => true,
  :log_level => Logger::DEBUG,  # level used for the fetch log lines
  :clobber   => false           # when false an existing file is not re-fetched
}
|
94
|
+
#
|
95
|
+
# Fetch from the web
|
96
|
+
#
|
97
|
+
require 'shellwords'

#
# Fetch full_url from the web into actual_path using the wget binary.
#
# options are merged over IMW_WGET_OPTIONS (:root, :wait, :log_level,
# :clobber are read here; :noretry is in the defaults but not consulted).
# The caller's hash is left untouched.
#
# Returns nil when the file already exists and :clobber is off; otherwise
# the value of fetched after refreshing from the file and saving.
#
def wget options={}
  options = IMW_WGET_OPTIONS.merge(options)
  cd path_to(options[:root]) do
    if (not options[:clobber]) && File.file?(file_path) then
      IMW.log.add options[:log_level], "Skipping #{file_path}"; return
    end
    # Do the fetch
    mkdir_p File.dirname(actual_path)
    # defaults are --connect-timeout=infinity --read-timeout=900 --tries=20 acc. to man page
    # The url and path are shell-escaped: urls come from outside and may
    # contain quotes, backticks or $(...).
    cmd = %Q{wget -nv #{Shellwords.escape(full_url)} -O#{Shellwords.escape(actual_path)} --connect-timeout=5 --read-timeout=10 --tries=1 &}
    IMW.log.add(options[:log_level], cmd)
    IMW.log.add(options[:log_level], `#{cmd}`)
    self.tried_fetch = true
    sleep options[:wait] # please hammer don't hurt em
    update_from_file!
    self.save
    return self.fetched
  end
end
|
116
|
+
|
117
|
+
#
|
118
|
+
#
|
119
|
+
#
|
120
|
+
# Run #wget with the given options, then return an open File on
# actual_path when fetched is set; nil otherwise.
# NOTE(review): the File is returned open -- closing it is left to the caller.
def contents(options={})
  wget(options)
  return unless fetched
  File.open(actual_path)
end
|
126
|
+
|
127
|
+
# ===========================================================================
|
128
|
+
#
|
129
|
+
# Properly belongs in FileStore
|
130
|
+
#
|
131
|
+
|
132
|
+
protected
|
133
|
+
#
|
134
|
+
# The standard file path for this url's ripped cache
|
135
|
+
#
|
136
|
+
# * leading directory from reverse.dotted.host_scheme:port:user@password
|
137
|
+
# * normalized path/file?query#fragment
|
138
|
+
# * uuid formed from the
|
139
|
+
#
|
140
|
+
# Build the cache file path for this url: root part + path part + file
# part, run through the class-level path_str_encode. The encoded string is
# passed to validate_roundtrip before being returned.
def to_file_path
  raw_path = [
    to_file_path_root_part,
    to_file_path_path_part,
    to_file_path_file_part
  ].join
  encoded_path = self.class.path_str_encode(raw_path)
  self.class.validate_roundtrip(encoded_path)
  encoded_path
end
|
149
|
+
# file_time formatted as YYYYMMDD-HHMMSS.
def file_timestamp
  stamp_format = "%Y%m%d-%H%M%S"
  file_time.strftime(stamp_format)
end
|
152
|
+
# The cache file path with file_timestamp appended directly (no separator).
def to_file_path_with_timestamp
  base_path = to_file_path
  base_path + file_timestamp
end
|
155
|
+
#
|
156
|
+
# revhost_scheme:port:user@password -- omitting _scheme if it's http, and
|
157
|
+
# omitting :port:user@password if all three are blank.
|
158
|
+
#
|
159
|
+
# Leading part of the cache path: revhost, then "_scheme" unless the scheme
# is http, then ":port:user@password" unless uri.simple? is true.
def to_file_path_root_part
  # NOTE(review): the tiered fragment is computed but never used below,
  # while url_from_file_path expects tier directories -- confirm intent.
  _tld_host_frag = self.class.tier_path_segment(revhost, /^([^\.]+)\.([^\.]{1,2})/)
  host_part = revhost
  scheme_part =
    if uri.scheme == 'http'
      ""
    else
      "_#{uri.scheme}"
    end
  authority_part = uri.simple? ? "" : ":#{uri.port}:#{uri.user}@#{uri.password}"
  "#{host_part}#{scheme_part}#{authority_part}"
end
|
167
|
+
# Middle part of the cache path: the uri's path as a string ("" when nil).
def to_file_path_path_part
  url_path = uri.path
  url_path.to_s
end
|
170
|
+
# Trailing part of the cache path: "?query" and "#fragment" when present,
# always followed by "-" and this record's uuid.
def to_file_path_file_part
  query_part    = uri.query.nil?    ? "" : "?#{uri.query}"
  fragment_part = uri.fragment.nil? ? "" : "##{uri.fragment}"
  "#{query_part}#{fragment_part}-#{self.uuid}"
end
|
176
|
+
public
|
177
|
+
|
178
|
+
|
179
|
+
module ClassMethods
|
180
|
+
#
|
181
|
+
# find_or_creates from url
|
182
|
+
#
|
183
|
+
# url is heuristic_parse'd and normalized by Addressable before lookup:
|
184
|
+
# "Converts an input to a URI. The input does not have to be a valid URI —
|
185
|
+
# the method will use heuristics to guess what URI was intended. This is not
|
186
|
+
# standards compliant, merely user-friendly."
|
187
|
+
#
|
188
|
+
# Look up (or build) the link for url_str via find_or_new_from_url, save
# it, and return it.
def find_or_create_from_url(url_str)
  find_or_new_from_url(url_str).tap { |link| link.save }
end
|
193
|
+
# Heuristically parse and normalize url_str, then return the first record
# with that full_url or a new unsaved one. Either way the uuid/handle are
# made and the file properties refreshed before returning. :nodoc:
def find_or_new_from_url(url_str) # :nodoc:
  normalized_url = Addressable::URI.heuristic_parse(url_str).normalize.to_s
  link = self.first(:full_url => normalized_url)
  link ||= self.new(:full_url => normalized_url)
  link.make_uuid_and_handle
  link.update_from_file!
  link
end
|
200
|
+
# Find or create the link whose url is encoded in the ripped file's path.
#
# The url is decoded with this class's own url_from_file_path (rather than
# a hard-coded Link), so the finder works for any class extended with
# ClassMethods. The record's file_path is set to ripd_file, its uuid/handle
# and file properties are refreshed, and it is saved before being returned.
def find_or_create_from_file_path ripd_file
  url_str = url_from_file_path(ripd_file).to_s
  link = self.first( :full_url => url_str ) || self.new( :full_url => url_str )
  link.file_path = ripd_file
  link.make_uuid_and_handle
  link.update_from_file!
  link.save
  link
end
|
209
|
+
#
|
210
|
+
# Decode url from its file_path
|
211
|
+
#
|
212
|
+
#
# Decode a url string from a cache file path.
#
# fp is first run through path_str_decode, then matched against
#   tld/tier/revhost[_scheme][:port:user@password]/[dirs/]file-uuid
# Raises RuntimeError when the path does not have that shape.
#
# The scheme defaults to 'http'; the host is un-reversed; userinfo is
# rebuilt as "user[:password]@".
#
def url_from_file_path fp
  fp = path_str_decode(fp)
  m = (%r{\A
        (#{Addressable::URI::HOST_TLD})           # tld tier
        /(..?)                                    # revhost tier
        /([^/\:_]+)                               # revhost
        (?:_([^/\:]+))?                           # _scheme
        (?::(\d*):([^/]*)@([^@/]*?))?             # :port:user@password
        /(?:(.*?)/)?                              # /dirs/
        ([^/]*)                                   # file
        -([a-f0-9]{32})                           # -uuid
        \z}x.match(fp))
  raise "Can't extract url from file path #{fp}" if !m
  # The two tier captures come first and are not part of the url; taking
  # the last eight captures skips them (and any groups inside HOST_TLD).
  fp_host, fp_scheme, fp_port, fp_user, fp_pass, fp_path, fp_file, _fp_uuid = m.captures.last(8)
  fp_host     = fp_host.split('.').reverse.join('.')
  fp_scheme ||= 'http'
  fp_pass     = ":#{fp_pass}"             unless fp_pass.blank?
  fp_userpass = "#{fp_user}#{fp_pass}@"   unless fp_user.blank?
  fp_port     = ":#{fp_port}"             unless fp_port.blank?
  fp_path     = File.join(*[fp_path, fp_file].compact)
  "#{fp_scheme}://#{fp_userpass}#{fp_host}#{fp_port}/#{fp_path}"
end
|
234
|
+
#
|
235
|
+
# to control files-per-directory madness, take a path segment like "foobar" in
|
236
|
+
# blah.com/top/foobar/directory
|
237
|
+
# and transform into
|
238
|
+
# blah.com/top/fo/foobar/directory
|
239
|
+
#
|
240
|
+
# Ex.
|
241
|
+
# self.class.tier_path_segment('a_username')
|
242
|
+
# # => 'a_/a_username'
|
243
|
+
# self.class.tier_path_segment('1')
|
244
|
+
# # => '1/1'
|
245
|
+
# self.class.tier_path_segment('com.twitter', /^([^\.]+)\.([^\.]{1,2})/)
|
246
|
+
# # => 'com/tw/com.twitter'
|
247
|
+
#
|
248
|
+
# Prefix path_seg with the fragment(s) captured from it by re, joined as
# directories: ('a_username') => 'a_/a_username'.
#
# Raises RuntimeError when re does not match path_seg or captures nothing.
#
# NOTE(review): this is defined with `def self.` inside ClassMethods, so it
# lives on the ClassMethods module itself rather than on extending classes
# -- confirm how callers are expected to reach it.
def self.tier_path_segment(path_seg, re=/(..?)/)
  match    = re.match(path_seg)
  frag_seg = match ? match.captures : []
  raise "Can't tier path_seg #{path_seg} using #{re}" if frag_seg.empty?
  File.join(* [frag_seg, path_seg].flatten )
end
|
253
|
+
#
|
254
|
+
#
|
255
|
+
# It's really bad if you can't roundtrip --
|
256
|
+
# since saving is the rare case (only done once!) we insist on checking.
|
257
|
+
#
|
258
|
+
# Decode file_path_str back into a url and raise if it differs from uri.
# Returns nil when the round trip matches.
#
# NOTE(review): this is declared with `def self.` inside ClassMethods, so
# `self` here is the ClassMethods module: `self.class.url_from_file_path`
# and the bare `uri` both look unresolvable from this scope -- confirm how
# this is meant to be called (the instance-side caller passes only the path).
def self.validate_roundtrip file_path_str
|
259
|
+
# uu = self.class.url_from_file_path(file_path_str)
|
260
|
+
# puts "*"*75, uri.to_hash.inspect, ['path str', file_path_str, 'uri', uri.to_s, 'rt', uu.to_s].inspect
|
261
|
+
# parse the decoded url so the comparison below is between uri objects, not strings
return_trip_url = Addressable::URI.parse(self.class.url_from_file_path(file_path_str))
|
262
|
+
raise "crapsticks: uri doesn't roundtrip #{file_path_str} to #{uri.to_s}: #{return_trip_url}" if return_trip_url != uri
|
263
|
+
end
|
264
|
+
#
|
265
|
+
# Uses a similar scheme as the 'Quoted Printable' encoding, but more strict
|
266
|
+
# and without linebreaking or anything. The intent is to reversibly and
|
267
|
+
# recognizably store URLs to disk with names that (apart from path) do not
|
268
|
+
# need to be further escaped in filesystem, URL, database or HTML.
|
269
|
+
#
|
270
|
+
# The only characters in a path_encoded string are alpha-numeric /_-.+
|
271
|
+
#
|
272
|
+
# Rules:
|
273
|
+
# * Any character that is not alphanumeric, and is not /_-. is encoded as a
|
274
|
+
# plus sign + followed by its upper-case hex encoding.
|
275
|
+
#
|
276
|
+
# * Furthermore, in any sequence of repeated '.' characters, all after the
|
277
|
+
# first are hex encoded; same with '/'.
|
278
|
+
#
|
279
|
+
# Ex.
|
280
|
+
# path_encode("www.measuringworth.com/datasets/consumer/result.php?use[]=VCB&use[]=CU&use[]=SZ&year_source=1900&year_result=2007"
|
281
|
+
# # => www.measuringworth.com/datasets/consumer/result.php+3Fuse+5B+5D+3DVCB+26use+5B+5D+3DCU+26use+5B+5D+3DSZ+26year_source+3D1900+26year_result+3D2007
|
282
|
+
#
|
283
|
+
# Code inspired by "Glenn Parker's response to ruby quiz #23"http://www.rubyquiz.com/quiz23.html
|
284
|
+
#
|
285
|
+
# Encode str so that it contains only alphanumerics and /_-. plus "+XX"
# escapes (see path_encode_chars), and never a run of repeated '.' or '/'.
#
# Each step feeds the next -- the intermediate results must not be dropped:
#   1. every run of characters outside A-Za-z0-9/_-. is escaped (this
#      includes a literal '+', so every '+' in the output starts an escape);
#   2. in a run of '.', all after the first are escaped;
#   3. in a run of '/', all after the first are escaped.
# Steps 2 and 3 run after step 1 so the '+' they emit is not re-escaped.
def path_str_encode(str)
  encoded = str.gsub(%r{[^A-Za-z0-9/_\-\.]+}) { |chars| path_encode_chars(chars) }
  encoded = encoded.gsub(%r{\.(\.+)}) { '.' + path_encode_chars($1) }
  encoded.gsub(%r{/(/+)}) { '/' + path_encode_chars($1) }
end
|
290
|
+
#
|
291
|
+
# See the notes in path_str_encode
|
292
|
+
#
|
293
|
+
# Reverse path_str_encode: every '+' followed by two upper-case hex digits
# becomes the character with that byte value. Anything else is left alone.
def path_str_decode(str)
  str.gsub(/\+([\dA-F]{2})/) { |escape| escape[1, 2].hex.chr }
end
|
296
|
+
protected
|
297
|
+
# Turn every byte of chars into a plus sign followed by its two-digit
# upper-case hex value. :nodoc:
def path_encode_chars(chars) # :nodoc:
  chars.unpack('C*').map { |byte| "+%02X" % byte }.join
end
|
303
|
+
public
|
304
|
+
end
|
305
|
+
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
|
2
|
+
# Rebuild the url from a cache file path of the form
#   revhost[_scheme]/[dirs/]file-uuid
# where uuid is 30 to 32 lower-case hex digits. Raises RuntimeError when fp
# does not have that shape.
#
# The scheme is captured without its leading underscore and defaults to
# 'http'; the host is un-reversed. Returns an Addressable::URI.
def self.url_from_file_path fp
  # FIXME -- doesn't work with extension preservation
  m = %r{\A([^/_]+)(?:_([^/]+))?/(?:(.*?)/)?([^/]*)-([a-f0-9]{30,32})\z}.match(fp)
  raise "Bad match to #{fp}" unless m
  fp_host, fp_scheme, fp_path, fp_file, _fp_uuid = m.captures
  fp_host     = fp_host.split('.').reverse.join('.')
  fp_scheme ||= 'http'
  fp_path     = File.join(*[fp_path, fp_file].compact) # FIXME -- no ext
  Addressable::URI.new(fp_scheme, nil, nil, fp_host, nil, fp_path, nil, nil)
end
|
18
|
+
|
19
|
+
#
|
20
|
+
# returns [dirname, basename, ext] for the file_path
|
21
|
+
# ext is determined by basename_ext_splitter
|
22
|
+
#
|
23
|
+
# [dirname, basename, ext] for this object's path, as split by path_split_str.
def path_split
  path_split_str(path)
end
|
26
|
+
|
27
|
+
# lowercase; only a-z, num, . -
|
28
|
+
# revhost lower-cased with everything except a-z, 0-9, '.' and '-'
# removed (note: no '_'). nil when revhost is not set.
def scrubbed_revhost
  if revhost
    revhost.downcase.gsub(/[^a-z0-9\.\-]+/i, '') # note: no _
  end
end
|
32
|
+
|
33
|
+
# Regexp with two capture groups (basename, ext) that path_split_str uses
# to split a file name.
cattr_accessor(:basename_ext_splitter)
# keeps the whole name as the basename and captures an empty ext
BASENAME_EXT_NO_SPLIT    = /(.+?)()/
# treats tar.gz / tar.bz2 as a single extension, else the text after a dot
BASENAME_EXT_SPLIT_SMART = /(.+?)\.(tar\.gz|tar\.bz2|[^\.]+)/
# default: do not split
self.basename_ext_splitter = BASENAME_EXT_NO_SPLIT
|
37
|
+
|
38
|
+
#
|
39
|
+
# Like File.split but heuristically handles things like .tar.bz2:
|
40
|
+
#
|
41
|
+
# foo. => ['foo.', '']
|
42
|
+
# foo.tar.gz => ['foo', 'tar.gz'] (basename, ext with BASENAME_EXT_SPLIT_SMART)
|
43
|
+
# foo.tar.bz2 => ['foo', 'tar.bz2'] (basename, ext with BASENAME_EXT_SPLIT_SMART)
|
44
|
+
# foo.yaml => ['foo', 'yaml'] (basename, ext with BASENAME_EXT_SPLIT_SMART)
|
45
|
+
#
|
46
|
+
# Split str into [dirname, basename, ext].
#
# dirname is everything before the last '/' when that slash is followed by
# a non-empty name; otherwise dirname is '' and the whole string is the
# basename (so "foo/" and "a/b/" are both treated as bare names instead of
# the latter raising on a failed match).
#
# basename and ext come from the two capture groups of
# basename_ext_splitter; when it is unset or does not match, ext is ''.
def path_split_str str
  if (dir_match = %r{\A(.*)/([^/]+)\z}.match(str))
    dirname, basename = dir_match.captures
  else
    dirname, basename = ['', str]
  end
  # Get basename, extension (as given by capture groups in basename_ext_splitter)
  if basename_ext_splitter && (m = /\A#{basename_ext_splitter}\z/i.match(basename))
    basename, ext = m.captures
  else
    basename, ext = [basename, '']
  end
  [dirname, basename, ext]
end
|
60
|
+
|
61
|
+
# remove all blank components, join the rest with separator
|
62
|
+
# Join strs with separator, leaving out every component that is blank?.
def join_non_blank(separator, *strs)
  present = strs.select { |component| !component.blank? }
  present.join(separator)
end
|
65
|
+
|
66
|
+
# only a-z A-Z, num, .-_/
|
67
|
+
# path reduced to a-z A-Z 0-9 . - _ / :
#   * runs of ':' become '/'
#   * runs of any other disallowed character become '_'
#   * segments made only of non-alphanumerics (foo/../bar, etc) are compacted
#   * leading and trailing non-alphanumerics are removed
def scrubbed_path
  path.
    gsub(%r{\:+}, '/').
    gsub(%r{[^a-zA-Z0-9\.\-_/]+}, '_').
    gsub(%r{/[^a-zA-Z0-9]+/}, '/').
    gsub(%r{/\.\.+/}, '.').
    gsub(%r{^[^a-zA-Z0-9]+}, '').
    gsub(%r{[^a-zA-Z0-9]+$}, '')
end
|
78
|
+
|
79
|
+
#
|
80
|
+
# name for this URL regarded as a file (instance)
|
81
|
+
#
|
82
|
+
# File path for this url: root_path / dirname / basename-uuid.ext, built
# from the scrubbed path with blank components left out.
def to_file_path
  dirname, stem, ext = path_split_str(scrubbed_path)
  stem_with_uuid = join_non_blank('-', stem, uuid)
  filename       = join_non_blank('.', stem_with_uuid, ext)
  join_non_blank('/', root_path, dirname, filename)
end
|
@@ -0,0 +1,147 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require 'rubygems'
|
3
|
+
require 'active_support'
|
4
|
+
require 'uuidtools'
|
5
|
+
|
6
|
+
module Scrub
|
7
|
+
# Base scrubber: validates and sanitizes strings against a whitelist of
# characters supplied by the subclass.
class Generic
  # A regular expression character group
  # (a bunch of characters ready to drop into /[#{validator}]*/)
  # whitelisting allowed characters
  #
  # Must be overridden in child class
  class_inheritable_accessor :validator

  # Sentence fragment for error message on failed validation.
  class_inheritable_accessor :complaint
  self.complaint = "has characters I can't understand"

  # Proc or string or anything that can be 2nd arg to gsub
  # to sanitize
  class_inheritable_accessor :replacer
  self.replacer = '-'

  # A regular expression to sanitize objects
  # if unset or nil, the validator char group
  class_inheritable_accessor :sanitizer

  # unless overridden or set expressly, just use the
  # validator
  # NOTE(review): #sanitize below reads validator, not this -- confirm intent.
  def sanitizer
    @sanitizer || self.validator
  end

  # Replace every run of characters outside the validator group with replacer.
  def sanitize str
    str = str.to_s
    str.gsub(%r{([^#{validator_chars}]+)}u, replacer)
  end

  # MatchData when str consists only of characters in the validator group
  # (the empty string included), nil otherwise.
  def valid? str
    %r{\A([#{validator_chars}]*)\z}u.match(str)
  end

  private

  # The validator as bare character-group text. A Regexp's #to_s is wrapped
  # as "(?-mix:...)", which inside [...] would add '(', ')', ':' and the
  # range ?-m to the whitelist, so its #source is used instead.
  def validator_chars
    validator.respond_to?(:source) ? validator.source : validator.to_s
  end
end
|
43
|
+
|
44
|
+
#
|
45
|
+
# A permissive, ASCII-only name string - no control chars, newlines, backslash
|
46
|
+
# or <> angle brackets
|
47
|
+
#
|
48
|
+
# Scrubber whose whitelist is ASCII letters, digits, space and the
# punctuation listed in validator (backslash, < and > are not in it).
class Title < Scrub::Generic
  self.validator = %r{a-zA-Z0-9_ ~\!@#\$%\^&\*\(\)\-\+=;\:'"`\[\]\{\}\|,\?\.\/}u
  self.complaint = "should only contain basic keyboard characters (and should not use \\ < or >)."
end
|
52
|
+
|
53
|
+
#
|
54
|
+
# A permissive name string that also admits [:alpha:] and [:digit:] characters - no control chars, newlines, backslash
|
55
|
+
# or <> angle brackets
|
56
|
+
#
|
57
|
+
# Title's whitelist extended with the [:alpha:] and [:digit:] classes.
class UnicodeTitle < Scrub::Title
  self.complaint = "should only contain keyboard characters (and should not use \\ < or >)."
  # Title's group is spliced in via #source: interpolating the Regexp itself
  # would embed its "(?-mix:...)" wrapper into the character group.
  self.validator = %r{[:alpha:][:digit:]#{Scrub::Title.validator.source}}u
end
|
61
|
+
|
62
|
+
#
|
63
|
+
# Visible characters and spaces (i.e. anything except control characters, etc.)
|
64
|
+
#
|
65
|
+
# Scrubber whose whitelist is the [:print:] class plus newline and tab.
class FreeText < Scrub::Generic
  self.validator = %r{[:print:]\n\t}u
  self.complaint = "should not contain control characters or that kind of junk."
end
|
69
|
+
|
70
|
+
# Mixin for scrubbers whose strings must begin with an ASCII letter.
# Including it chains onto the host's sanitize and valid?.
module BeginsWithAlpha
  # Text prepended (followed by replacer) when a string starts with non-alpha.
  mattr_accessor :slug
  self.slug = 'x'

  # prepend #{slug}#{replacer} to the string if it starts with non-alpha.
  # so, for instance, with replacer '_': '23jumpstreet' => 'x_23jumpstreet'
  def sanitize_with_begins_with_alpha str
    str = sanitize_without_begins_with_alpha str
    # \A anchors at the start of the string; ^ would also accept a letter
    # at the start of any later line.
    str = BeginsWithAlpha.slug + replacer + str if (str !~ /\A[a-z]/i) # call at end of chain!
    str
  end

  # True-ish only when str starts with a letter and passes the rest of the chain.
  def valid_with_begins_with_alpha? str
    (str =~ /\A[a-z]/i) && valid_without_begins_with_alpha?(str)
  end

  def self.included base
    base.alias_method_chain :sanitize, :begins_with_alpha # unless defined?(base.sanitize_without_begins_with_alpha)
    base.alias_method_chain :valid?,   :begins_with_alpha # unless defined?(base.valid_without_begins_with_alpha?)
  end
end
|
88
|
+
|
89
|
+
#
|
90
|
+
# insist that a string be lowercased.
|
91
|
+
#
|
92
|
+
# Mixin for scrubbers whose strings must be lower-case. Including it
# chains onto the host's sanitize (:lowercased) and valid? (:lowercase).
module Lowercased
  def self.included(base)
    base.alias_method_chain(:sanitize, :lowercased) # unless defined?(base.sanitize_without_lowercased)
    base.alias_method_chain(:valid?,   :lowercase)  # unless defined?(base.valid_without_lowercase?)
  end

  # Run the rest of the sanitize chain, then downcase the result.
  def sanitize_with_lowercased(str)
    sanitize_without_lowercased(str).downcase # call at end of chain!
  end

  # false as soon as str holds an upper-case character; otherwise whatever
  # the rest of the valid? chain returns.
  def valid_with_lowercase?(str)
    return false unless str !~ /[[:upper:]]/u
    valid_without_lowercase?(str)
  end
end
|
105
|
+
|
106
|
+
#
|
107
|
+
# start with a letter, and contain only A-Za-z0-9_
|
108
|
+
#
|
109
|
+
# Scrubber for identifiers: whitelist a-z 0-9 _, replacement '_', combined
# with the begins-with-a-letter and lower-case mixins (included in that order).
class Identifier < Scrub::Generic
  self.validator = /a-z0-9_/u
  self.replacer  = '_'
  self.complaint = "should be an identifier: it should start with a letter, and contain only a-z, 0-9 and '_'."
  include Scrub::BeginsWithAlpha
  include Scrub::Lowercased
end
|
116
|
+
|
117
|
+
#
|
118
|
+
# start with a letter, and contain only A-Za-z0-9_
|
119
|
+
#
|
120
|
+
# Scrubber for handles: same settings as Identifier -- whitelist a-z 0-9 _,
# replacement '_', plus the begins-with-a-letter and lower-case mixins.
class Handle < Scrub::Generic
  self.validator = /a-z0-9_/u
  self.replacer  = '_'
  self.complaint = "should be an identifier: it should start with a letter, and contain only a-z, 0-9 and '_'."
  include Scrub::BeginsWithAlpha
  include Scrub::Lowercased
end
|
127
|
+
|
128
|
+
# HANDLE_RE = %r{\A[a-z][]*\z}i # ascii, not :alpha: etc.
|
129
|
+
# HANDLE_MSG = "should start with a letter, and contain only characters like a-z0-9_-."
|
130
|
+
#
|
131
|
+
# # "Domain names are restricted to the ASCII letters a through z
|
132
|
+
# # (case-insensitive), the digits 0 through 9, and the hyphen, with some other
|
133
|
+
# # restrictions in terms of name length and position of hyphens."
|
134
|
+
# # (http://en.wikipedia.org/wiki/Domain_name#Overview)
|
135
|
+
# # http://tools.ietf.org/html/rfc1034
|
136
|
+
# DOMAIN_RE = %r{\A[a-z][a-z0-9\-][a-z0-9]\z}i # case insensitive
|
137
|
+
# DOMAIN_MSG = "should look like a domain name."
|
138
|
+
# DOMAIN_MORE = "only letters, digits or hyphens (-), start with a letter and end with a letter or number."
|
139
|
+
# Email validation pieces. The RE_* strings are regexp fragments that are
# interpolated into the two full-address patterns at the bottom.
MSG_EMAIL_BAD      = "should look like an email address (you@somethingsomething.com) and include only letters, numbers and . + - % please."
RE_EMAIL_NAME      = '[\w\.%\+\-]+'                            # what you actually see in practice
# NOTE: a bare character group (no brackets) -- wrap it in [...] when using it.
RE_EMAIL_N_RFC2822 = '0-9A-Z!#\$%\&\'\*\+_/=\?^\-`\{|\}~\.'    # technically allowed by RFC-2822
RE_DOMAIN_HEAD     = '(?:[A-Z0-9\-]+\.)+'
RE_DOMAIN_TLD      = '(?:[A-Z]{2}|com|org|net|edu|gov|mil|biz|info|mobi|name|aero|jobs|museum)'
RE_EMAIL_OK        = /\A#{RE_EMAIL_NAME}@#{RE_DOMAIN_HEAD}#{RE_DOMAIN_TLD}\z/i
# The local part is one or more characters from the RFC-2822 group.
RE_EMAIL_RFC2822   = /\A[#{RE_EMAIL_N_RFC2822}]+@#{RE_DOMAIN_HEAD}#{RE_DOMAIN_TLD}\z/i
|
146
|
+
|
147
|
+
end
|