rdf-sak 0.1.5
- checksums.yaml +7 -0
- data/.gitignore +16 -0
- data/.rspec +2 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/LICENSE +202 -0
- data/README.md +268 -0
- data/Rakefile +13 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/example/cleanup.xsl +14 -0
- data/example/matches.xhtml +11 -0
- data/example/transforms.ttl +58 -0
- data/lib/rdf-sak.rb +1 -0
- data/lib/rdf/sak.rb +2506 -0
- data/lib/rdf/sak/ci.rb +827 -0
- data/lib/rdf/sak/cli.rb +35 -0
- data/lib/rdf/sak/docstats.rb +188 -0
- data/lib/rdf/sak/document.rb +772 -0
- data/lib/rdf/sak/ibis.rb +248 -0
- data/lib/rdf/sak/mimemagic.rb +73 -0
- data/lib/rdf/sak/pav.rb +479 -0
- data/lib/rdf/sak/qb.rb +280 -0
- data/lib/rdf/sak/scovo.rb +51 -0
- data/lib/rdf/sak/tfo.rb +301 -0
- data/lib/rdf/sak/transform.rb +1172 -0
- data/lib/rdf/sak/urlrunner.rb +602 -0
- data/lib/rdf/sak/util.rb +2081 -0
- data/lib/rdf/sak/version.rb +5 -0
- data/rdf-sak.gemspec +60 -0
- metadata +366 -0
data/lib/rdf/sak/urlrunner.rb @@ -0,0 +1,602 @@
require 'net/http'
require 'concurrent'
require 'concurrent-edge'

require 'uri'
require 'uri/urn/uuid'
require 'time'

require 'rdf'
require 'rdf/rdfa'
#require 'rdf/vocab'
require 'rdf/vocab/dc'
require 'rdf/vocab/prov'
require 'rdf/sak/ci'
require 'rdf/sak/tfo'
require 'tidy_ffi'
require 'uuidtools'
require 'nokogiri'
require 'crass'

class RDF::SAK::URLRunner
  private

  UA = 'RDF::SAK::URLRunner/0.1'.freeze
  TIDY_OPTS = {
    wrap: 0,
    numeric_entities: true,
    tidy_mark: false,
    output_xhtml: true,
    custom_tags: 'inline',
  }.freeze

  TIDY_U  = RDF::URI('urn:x-dummy:tidy').freeze
  XHTMLNS = 'http://www.w3.org/1999/xhtml'.freeze
  TTL = "normalize-space(/html:html/html:head/html:title" \
    "[normalize-space(.) != ''][1])".freeze
  JLD = "//html:script[text()]" \
    "[normalize-space(@type) = 'application/ld+json']".freeze
  QF = /^([^?#]*)(?:\?([^#]*))?(?:#(.*?))?$/.freeze
  SF = /[^[:alpha:][:digit:]\/\?%@!$&'()*+,;=._~-]/.freeze
  TOKEN = /^([^\x0-\x20()<>@,;:\\"\/\[\]?=\x7f-\xff]+)$/n.freeze
  LANG_RE = /^[A-Za-z]+(?:[_-]+[0-9A-Za-z]+)*$/.freeze

  # xpath for various linkages
  XPATHNS = {
    html:  XHTMLNS,
    svg:   'http://www.w3.org/2000/svg',
    atom:  'http://www.w3.org/2005/Atom',
    xlink: 'http://www.w3.org/1999/xlink',
  }
  XBASE_XP = 'ancestor-or-self::*[@xml:base][1]/@xml:base'.freeze
  HTML_XP = %w[*[not(self::html:base)][@href]/@href
    *[@src]/@src object/@data *[@srcset]/@srcset
    form/@action].map { |e| '//html:%s' % e }.join(?|).freeze
  ATOM_XP = %w[uri content/@src category/@scheme generator/@uri icon id
    link/@href logo].map { |e| '//atom:%s' % e }.join(?|).freeze
  RSS_XP = %w[image docs source/@url enclosure/@url
    guid comments].map { |e| '//%s' % e }.join(?|).freeze
  XLINK_XP = '//*/@xlink:href'.freeze

  def uri_pp uri
    m = QF.match uri.to_s
    out = m[1]
    [[2, ??], [3, ?#]].each do |i, c|
      next if m[i].nil?
      # percent-encode anything SF matches (the original emitted bare
      # hex digits with no leading %, which would mangle the URI)
      clean = m[i].gsub(SF) { |s| sprintf('%%%02X', s.ord) }
      out += c + clean
    end

    out
  end

  def css_urls css
    out = []
    case css
    when Array
      css.each { |c| out += css_urls(c) } # (was find_urls, which doesn't exist)
    when Hash
      if css[:node] == :url
        out << css[:value]
      else
        out += css_urls(css.values)
      end
    end

    out.uniq.compact
  end

  def title_term title
    # xpath hands us a node set; take the first node (`is_a? Array` in
    # the original would never be true for a Nokogiri NodeSet)
    title = title.first if title.respond_to? :first
    return unless title

    text = title.content.strip
    unless text.empty?
      lang = if title.lang and LANG_RE.match? title.lang.strip
               title.lang.strip.downcase.tr_s '_-', ?-
             end
      return RDF::Literal(text, language: lang)
    end
  end
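
  # css_urls walks the node tree Crass produces, collecting :url
  # tokens, e.g. (illustrative):
  #
  #   css_urls Crass.parse('body { background: url(bg.png) }')
  #   # => ["bg.png"]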

  # sponge cartridges should add triples and return all the resolved
  # (RDF::)URIs to the caller
  SPONGE = {
    'application/xhtml+xml': -> content, uri {
      rs = RDF::URI(uri.to_s)

      # lol we have done this a million times before
      if title = content.xpath('/html:html/html:head/html:title', XPATHNS)
        if title = title_term(title)
          @repo << [rs, RDF::Vocab::DC.title, title]
        end
      end

      content.xpath(HTML_XP, XPATHNS).map do |node|
        # html will always be an attribute
        pred = case node.parent.name
               when 'script' then RDF::Vocab::DC.requires
               when 'a', 'link', 'area', 'form' then RDF::Vocab::DC.references
               else
                 RDF::Vocab::DC.hasPart
               end

        # srcset gets special treatment
        objs = if node.name == 'srcset'
                 node.content.strip.split(/\s*,\s+/).map do |u|
                   u.split.first.strip
                 end
               else
                 [node.content.strip]
               end.map { |u| RDF::URI((uri + u).to_s) }

        # phew
        objs.each { |o| @repo << [rs, pred, o] }
      end.flatten # don't forget to flatten
    },
    'application/atom+xml': -> content, uri {
      rs = RDF::URI(uri.to_s)

      if title = content.xpath('/atom:feed/atom:title', XPATHNS)
        if title = title_term(title)
          @repo << [rs, RDF::Vocab::DC.title, title]
        end
      end

      content.xpath(ATOM_XP, XPATHNS).map do |node|
        o = RDF::URI((uri + node.content.strip).to_s)
        @repo << [rs, RDF::Vocab::DC.references, o]
        o
      end
    },
    'application/x-rss+xml': -> content, uri {
      rs = RDF::URI(uri.to_s)

      if title = content.xpath('/rss/channel/title', XPATHNS)
        if title = title_term(title)
          @repo << [rs, RDF::Vocab::DC.title, title]
        end
      end

      content.xpath(RSS_XP, XPATHNS).map do |node|
        o = RDF::URI((uri + node.content.strip).to_s)
        @repo << [rs, RDF::Vocab::DC.references, o]
        o
      end
    },
    'image/svg+xml': -> content, uri {
      rs = RDF::URI(uri.to_s) # (missing in the original, though used below)

      if title = content.xpath('//svg:title', XPATHNS)
        if title = title_term(title)
          @repo << [rs, RDF::Vocab::DC.title, title]
        end
      end

      content.xpath(XLINK_XP, XPATHNS).map do |node|
        o = RDF::URI((uri + node.content.strip).to_s)
        @repo << [rs, RDF::Vocab::DC.references, o]
        o
      end
    },
    'text/css': -> content, uri {
      rs = RDF::URI(uri.to_s)
      css = Crass.parse content
      css_urls(css).map do |u|
        ro = RDF::URI((uri + u).to_s)
        @repo << [rs, RDF::Vocab::DC.requires, ro]
        ro
      end
    }
  }.freeze
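
  # each cartridge is keyed by a (symbol) content type and run via
  # instance_exec from #sponge below, e.g. (illustrative):
  #
  #   links = instance_exec doc, URI('https://example.com/'),
  #     &SPONGE[:'application/xhtml+xml']
  #
  # where `doc` is a parsed Nokogiri document; the returned RDF::URIs
  # are what #sponge filters for further traversal.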

  XMLMAP = {
    nil => {
      'rss'  => 'application/x-rss+xml',
      'html' => 'text/html',
    },
    XHTMLNS => 'application/xhtml+xml',
    'http://www.w3.org/2005/Atom' => 'application/atom+xml',
    'http://www.w3.org/2000/svg'  => 'image/svg+xml',
  }.freeze

  def xml_type doc
    r = doc.root
    # key on the namespace *href*; the Nokogiri namespace object itself
    # will never match the strings in XMLMAP
    if r and x = XMLMAP[r.namespace && r.namespace.href]
      if x.is_a? Hash
        return x[r.name] if x[r.name]
      else
        return x
      end
    end
    'application/xml'
  end
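
  # e.g. (illustrative):
  #
  #   xml_type Nokogiri.XML('<feed xmlns="http://www.w3.org/2005/Atom"/>')
  #   # => 'application/atom+xml'
  #   xml_type Nokogiri.XML('<rss version="2.0"/>')
  #   # => 'application/x-rss+xml'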

  public

  # NS = {
  #   rdf:  RDF::RDFV,
  #   rdfs: RDF::RDFS,
  #   owl:  RDF::OWL,
  #   xsd:  RDF::XSD,
  #   xhv:  RDF::Vocab::XHV,
  #   http: RDF::Vocabulary.new('http://www.w3.org/2011/http#'),
  #   vann: RDF::Vocabulary.new('http://purl.org/vocab/vann/'),
  #   skos: RDF::Vocab::SKOS,
  #   dcat: RDF::Vocab::DCAT,
  #   so:   RDF::Vocab::SCHEMA,
  #   dct:  RDF::Vocab::DC,
  #   ci:   RDF::SAK::CI,
  #   ogp:  RDF::Vocab::OG,
  #   foaf: RDF::Vocab::FOAF,
  #   org:  RDF::Vocab::ORG,
  #   bibo: RDF::Vocab::BIBO,
  #   qb:   RDF::Vocabulary.new('http://purl.org/linked-data/cube#'),
  # }.freeze

  def initialize store: nil, repo: nil, ua: nil, ignore: nil, traverse: nil
    @store = store
    @repo  = repo
    # @urls = Concurrent::Array.new
    @jobs = Concurrent::Array.new
    @seen = Concurrent::Map.new

    # @fchan = Concurrent::Promises::Channel.new 10
    # @schan = Concurrent::Promises::Channel.new 10
    # @tchan = Concurrent::Promises::Channel.new 10

    @fthrot = Concurrent::Throttle.new 5

    # @done = Concurrent::Cancellation.new

    @ua = ua ? ua.to_s : UA
    @ignore = ignore ? (ignore.respond_to?(:to_a) ? ignore.to_a : [ignore]) : []
    @traverse = traverse ?
      (traverse.respond_to?(:to_a) ? traverse.to_a : [traverse]) : []
    # other stuff

    # Signal.trap(:INT) { warn 'FART'; @done.origin.resolve }
  end

  def fetch url, redir = 10
    # `return` in a lambda is okay but in a block you have to use
    # `break`, and it complains if you do that

    url = URI(url.to_s) unless url.is_a? URI
    url.normalize!

    # XXX apparently you can't just *return* a fulfilled future, you
    # have to assign it to something.
    bailout = Concurrent::Promises.fulfilled_future nil
    return bailout unless %w[http https].include? url.scheme

    # nuke the fragment
    url.fragment = nil

    # make sure we don't do this a second time
    return bailout if seen? url
    @seen[url.to_s] = url

    ru = RDF::URI(url.to_s)

    # obtain last-modified from object
    q = RDF::Query.new { pattern [ru, RDF::Vocab::DC.modified, :m] }

    ims = q.execute(@repo).map do |s|
      s[:m].object.to_time.getgm if
        s[:m].literal? and s[:m].object.respond_to? :to_time
    end.compact.sort { |a, b| b <=> a }.first

    # XXX this is a little too low-level, don't you think?
    http = Net::HTTP.new(url.hostname, url.port)
    http.continue_timeout = 10
    http.open_timeout  = 30
    http.read_timeout  = 10
    http.write_timeout = 10
    http.use_ssl = url.is_a?(URI::HTTPS)
    http.start

    hdr = { 'User-Agent' => @ua, 'Connection' => 'close' }
    hdr['If-Modified-Since'] = ims.rfc2822 if ims
    req  = Net::HTTP::Get.new url, hdr
    resp = http.request req
    http.finish

    case resp
    when Net::HTTPSuccess
      Concurrent::Promises.fulfilled_future(resp)
    when Net::HTTPNotModified
      warn "Already seen #{url}"
      bailout
    when Net::HTTPRedirection
      raise Net::HTTPClientException.new "Too many redirects (#{redir})",
        resp if redir <= 0
      unless dest = resp['location']
        raise Net::HTTPBadResponse.new(
          "Redirect on #{url} missing Location header", resp)
      end

      dest = (url + dest).normalize

      @repo << [ru, RDF::SAK::CI.canonical, RDF::URI(dest.to_s)]

      raise Net::HTTPClientException.new "Loop detected on #{url}",
        resp if url == dest

      fetch(dest, redir - 1)
    else
      raise Net::HTTPClientException.new "Response failed #{url}", resp
    end
  end
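
  # e.g. (illustrative): fetch('https://example.com/').value resolves
  # to a Net::HTTPSuccess, or nil if the URL was already seen, is not
  # http(s), or came back 304 Not Modified; redirects are followed up
  # to 10 hops and recorded with ci:canonical along the way.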

  def store resp
    return unless resp

    now = Time.now.getgm # we mint a single "now" to use everywhere
    date = if d = resp['Date']
             # who knows what weird shit is coming off the wire
             d.gsub!(/^([^,]*(?:,[^,]*)?)(?:\s*,.*)?$/, "\\1")
             Time.httpdate(d).getgm rescue now
           else
             now # rfc says server MUST send a date header, but does it?
           end
    mtime = if lm = resp['Last-Modified']
              # lol god two Last-Modified headers no that's not messed up
              lm.gsub!(/^([^,]*(?:,[^,]*)?)(?:\s*,.*)?$/, "\\1")
              delta = now - date

              Time.httpdate(lm).getgm + delta rescue nil
            end
    lang = if resp['Content-Language']
             # same friggin deal (this must be a regexp; the original
             # passed the pattern as a string literal)
             resp['Content-Language'].strip.split(/\s*,+\s*/).first
           end
    charset = nil
    if type = resp['Content-Type']
      # and again, wtf
      type = type.split(/\s*,+\s*/).first || 'application/octet-stream'
      type, *params = type.strip.split(/\s*;\s*/).reject(&:empty?)
      params = params.map do |p|
        p.split(/\s*=\s*/, 2)
      end.reject { |p| p.length < 2 }.to_h.transform_keys(&:downcase)

      charset = params['charset'] if TOKEN.match? params['charset']
    end

    # (commented out in the original, which left `obj` undefined below)
    obj = @store.add resp.body, strict: false,
      type: type, charset: charset, language: lang, mtime: mtime

    s  = RDF::URI(resp.uri.to_s) # - the subject
    cs = RDF::Changeset.new      # - a receptacle for statements

    cs << [s, RDF::Vocab::DC.modified, obj.mtime.getgm]
    # #sponge depends on this triple to map the digest back to its URL
    cs << [s, RDF::Vocab::DC.hasVersion, RDF::URI(obj[:"sha-256"].to_s)]

    cs.apply @repo

    obj
  end
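
  # the header-cleanup regexp keeps just the first date when a server
  # or proxy stacks duplicates, e.g. (illustrative):
  #
  #   d = 'Tue, 01 Jan 2019 00:00:00 GMT, Tue, 01 Jan 2019 00:00:00 GMT'
  #   d.gsub(/^([^,]*(?:,[^,]*)?)(?:\s*,.*)?$/, "\\1")
  #   # => "Tue, 01 Jan 2019 00:00:00 GMT"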

  def tidy obj
    if obj and /html/i.match? obj.type
      # obtain original digest uri
      oldu = RDF::URI(obj[:"sha-256"].to_s)

      # first let's detect if this job has already been done
      RDF::Query.new do
        pattern [:a, RDF::SAK::TFO.input, oldu]
        pattern [:a, RDF::SAK::TFO.transform, TIDY_U]
        pattern [:a, RDF::SAK::TFO.output, :n]
      end.execute(@repo).map do |s|
        # can't completely trust what's in the repo so check then convert
        URI(s[:n].to_s) if s[:n].uri? and s[:n].scheme.downcase == 'ni'
      end.compact.each do |n|
        # just because it's in the rdf doesn't mean it's in the store
        out = @store.get n
        return out if out and !out.deleted? # don't return an empty record
      end

      # tidy the object and reinsert it back into the store as xhtml
      start = Time.now.getgm
      if clean = TidyFFI::Tidy.clean(obj.content.read, TIDY_OPTS)
        newobj = @store.add clean, mtime: obj.mtime,
          type: 'application/xhtml+xml', language: obj.language,
          charset: obj.charset, encoding: obj.encoding
        stop = Time.now.getgm
        newu = RDF::URI(newobj[:"sha-256"].to_s)

        q = RDF::Query.new do
          pattern [:a, RDF::SAK::TFO.input, oldu]
          pattern [:a, RDF::SAK::TFO.output, newu]
        end

        if q.execute(@repo).empty?
          s = RDF::URI(UUIDTools::UUID.random_create.to_uri)
          cs = RDF::Changeset.new
          cs << [s, RDF.type, RDF::SAK::TFO.Application]
          cs << [s, RDF::Vocab::PROV.startedAtTime, start]
          cs << [s, RDF::Vocab::PROV.endedAtTime, stop]
          cs << [s, RDF::SAK::TFO.transform, TIDY_U]
          cs << [s, RDF::SAK::TFO.input, oldu]
          cs << [s, RDF::SAK::TFO.output, newu]

          cs.apply @repo
        end

        newobj
      end
    end
  end

  def sponge obj
    return unless obj
    # if obj and /xml/.match? obj.type

    # get rdf stuff; the parentheses matter, or the .uniq.map chain
    # binds to the second result set alone
    ru = RDF::URI(obj[:"sha-256"].to_s)
    uri = (RDF::Query.new do
      pattern [:b, RDF::SAK::TFO.output, ru]
      pattern [:b, RDF::SAK::TFO.transform, TIDY_U]
      pattern [:b, RDF::SAK::TFO.input, :a]
      pattern [:s, RDF::Vocab::DC.hasVersion, :a]
    end.execute(@repo) + RDF::Query.new do
      pattern [:s, RDF::Vocab::DC.hasVersion, ru]
    end.execute(@repo)).uniq.map do |sol|
      u = sol[:s]
      u if u.uri? and u.scheme and %w[http https].include? u.scheme.downcase
    end.compact.first

    uuri    = URI(uri ? uri_pp(uri.to_s) : ru.to_s)
    content = obj.content
    type    = obj.type # (the original referenced `type` before defining it)
    sponge  = SPONGE[type.to_s.to_sym] # SPONGE keys are symbols, not strings

    if /xml/.match? type
      content = Nokogiri.XML(content, uuri.to_s)
      unless sponge
        type   = xml_type(content)
        sponge = SPONGE[type.to_sym] || SPONGE[:'image/svg+xml'] # svg is just xlink
      end
    end

    if sponge
      instance_exec(content, uuri, &sponge).compact.uniq.each do |link|
        enqueue link if traverse? link
      end
    end
  end

  def enqueue uri
    uri = URI(uri_pp uri.to_s).normalize

    return if seen? uri
    return if ignored? uri

    warn "enqueuing #{uri}"

    @fthrot.future(uri) do |u|
      fr = fetch u
      fr.on_rejection { |reason| warn "fetch fail: #{reason.inspect}" }
      fs = fr.then do |resp|
        begin
          store resp
        rescue Exception => e
          warn e
        end
      end
      fs.on_rejection { |reason| warn "store fail: #{reason.inspect}" }
      ft = fs.then do |obj|
        if obj and /html/i.match? obj.type
          warn "tidying #{obj[:"sha-256"]}"
          tidy obj
        else
          obj
        end
      end
      ft.on_rejection { |reason| warn "tidy fail: #{reason.inspect}" }
      ft.then { |obj| sponge obj }
    end.on_rejection { |reason| warn "throttle fail: #{reason.inspect}" }
  end

  def seen? uri
    !!@seen[uri.to_s]
  end

  def traverse? uri
    return if @traverse.empty?

    # the alternation needs its own group, or the anchors bind only to
    # the first and last domains (also dropping /o, which would freeze
    # the first instance's domain list into the pattern)
    re = /(?:^|\.)(?:#{@traverse.map { |t| Regexp.escape t }.join ?|})$/
    re.match? uri.host
  end

  def ignored? uri
    return if @ignore.empty?

    re = /(?:^|\.)(?:#{@ignore.map { |t| Regexp.escape t }.join ?|})$/
    re.match? uri.host
  end
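
  # e.g. with traverse: %w[example.com] (illustrative):
  #
  #   traverse? URI('https://sub.example.com/') # => true
  #   traverse? URI('https://example.org/')     # => false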

  def run urls, shuffle: false
    # ingest URLs and prune
    urls = urls.map { |u| URI(uri_pp u).normalize }.reject { |u| ignored? u }

    # optionally randomize
    urls.shuffle! if shuffle

    # now add the queue (enqueue returns nil for already-seen URLs,
    # hence the compact)
    urls.map { |url| enqueue url }.compact.map(&:wait)

    # while job = @jobs.shift
    #   job.wait
    # end

    self
  end
end


if __FILE__ == $0
  require 'pathname'
  require 'rdf/turtle'
  require 'rdf/lmdb'
  require 'store/digest'
  require 'commander'

  Commander.configure do
    program :name, 'URLRunner'
    program :version, '0.0.0'
    program :description, 'snarf a bunch of URLs for great justice'

    command :process do |c|
      c.syntax      = 'process [options] [urls]'
      c.description = 'just do the friggin urls already'

      c.option '-c', '--csv FILE',
        'a CSV file containing URLs in the first column'
      c.option '-i', '--ignore DOMAIN[,DOMAIN...]', Array,
        'domain(s) to ignore'
      c.option '-t', '--traverse DOMAIN[,DOMAIN...]', Array,
        'traverse links within hypermedia documents'
      c.option '-p', '--print', 'print the results when finished'
      c.option '-A', '--user-agent STRING', 'override the User-Agent string'
      c.option '--shuffle', 'shuffle the list of URLs'

      # wtf why did i have to do this
      c.option '-s', '--store DIR', 'Directory for digest store'
      c.option '-r', '--rdf DIR', 'Directory for RDF store'

      c.action do |args, options|
        raise ArgumentError, 'Store and RDF directories are required' unless
          options.store and options.rdf
        repo  = RDF::LMDB::Repository.new options.rdf, mapsize: 2**27
        store = Store::Digest.new dir: options.store, mapsize: 2**27

        urls = args.dup

        if options.csv
          require 'csv'
          csv = Pathname(options.csv).expand_path
          raise ArgumentError, "CSV file #{csv} is not readable" unless
            csv.file? and csv.readable?
          urls += CSV.read(csv).map(&:first).compact
        end

        RDF::SAK::URLRunner.new(
          repo: repo, store: store, ua: options.user_agent,
          ignore: options.ignore, traverse: options.traverse
        ).run urls, shuffle: options.shuffle

        if options.print
          print repo.dump :turtle # , prefixes: URLRunner::NS
        end
      end
    end

    # XXX these should work wtf
    # global_option '-s', '--store DIR', 'Directory for digest store'
    # global_option '-r', '--rdf DIR', 'Directory for RDF store'

    default_command :process
  end
end
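
For reference, a minimal driver equivalent to the :process command above, assuming rdf-lmdb and store-digest are installed and the gem's load path provides rdf/sak/urlrunner (the directory paths and URL here are hypothetical):

  require 'rdf/lmdb'
  require 'rdf/turtle'
  require 'store/digest'
  require 'rdf/sak/urlrunner'

  repo  = RDF::LMDB::Repository.new '/tmp/rdf-store', mapsize: 2**27
  store = Store::Digest.new dir: '/tmp/digest-store', mapsize: 2**27

  runner = RDF::SAK::URLRunner.new repo: repo, store: store,
    traverse: %w[example.com]
  runner.run ['https://example.com/']

  print repo.dump :turtle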