rdf-sak 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,602 @@
1
+ require 'net/http'
2
+ require 'concurrent'
3
+ require 'concurrent-edge'
4
+
5
+ require 'uri'
6
+ require 'uri/urn/uuid'
7
+ require 'time'
8
+
9
+ require 'rdf'
10
+ require 'rdf/rdfa'
11
+ #require 'rdf/vocab'
12
+ require 'rdf/vocab/dc'
13
+ require 'rdf/vocab/prov'
14
+ require 'rdf/sak/ci'
15
+ require 'rdf/sak/tfo'
16
+ require 'tidy_ffi'
17
+ require 'uuidtools'
18
+ require 'nokogiri'
19
+ require 'crass'
20
+
21
+ class RDF::SAK::URLRunner
22
+ private
23
+
24
# Default User-Agent header value, used when the constructor is not
# given one.
UA = 'RDF::SAK::URLRunner/0.1'.freeze

# Options passed to TidyFFI::Tidy.clean when converting HTML to XHTML.
TIDY_OPTS = {
  wrap:             0,
  numeric_entities: true,
  tidy_mark:        false,
  output_xhtml:     true,
  custom_tags:      'inline',
}.freeze

# Placeholder identifier for the "tidy" transform in TFO statements.
TIDY_U  = RDF::URI('urn:x-dummy:tidy').freeze
XHTMLNS = 'http://www.w3.org/1999/xhtml'.freeze

# XPath: whitespace-normalized text of the first non-empty <title>
# in the XHTML head.
TTL = "normalize-space(/html:html/html:head/html:title" \
  "[normalize-space(.) != ''][1])".freeze

# XPath: non-empty <script> elements typed as JSON-LD.
JLD = "//html:script[text()]" \
  "[normalize-space(@type) = 'application/ld+json']".freeze

# Splits a URI string into (1) everything before the query,
# (2) the query and (3) the fragment; used by uri_pp.
QF = /^([^?#]*)(?:\?([^#]*))?(?:#(.*?))?$/.freeze

# Matches any single character that uri_pp will not leave untouched
# in a query or fragment.
SF = /[^[:alpha:][:digit:]\/\?%@!$&'()*+,;=._~-]/.freeze

# Shape of a bare token; used to validate the charset parameter of a
# Content-Type header.
TOKEN = /^([^\x0-\x20()<>@,;:\\"\/\[\]?=\x7f-\xff]+)$/n.freeze

# Shape of an acceptable language tag; `_` is tolerated as a separator.
LANG_RE = /^[A-Za-z]+(?:[_-]+[0-9A-Za-z]+)*$/.freeze

# xpath for various linkages

# Prefix-to-namespace map handed to every xpath call. Frozen like the
# other constants so it cannot be altered by accident at runtime.
XPATHNS = {
  html:  XHTMLNS,
  svg:   'http://www.w3.org/2000/svg',
  atom:  'http://www.w3.org/2005/Atom',
  xlink: 'http://www.w3.org/1999/xlink',
}.freeze

# Nearest xml:base attribute on the context node or its ancestors.
XBASE_XP = 'ancestor-or-self::*[@xml:base][1]/@xml:base'.freeze

# Link-bearing attributes in XHTML (href on anything but <base>, src,
# object data, srcset, form action).
HTML_XP = %w[*[not(self::html:base)][@href]/@href
  *[@src]/@src object/@data *[@srcset]/@srcset
  form/@action].map { |e| '//html:%s' % e }.join(?|).freeze

# Link-bearing elements and attributes in Atom.
ATOM_XP = %w[uri content/@src category/@scheme generator/@uri icon id
  link/@href logo].map { |e| '//atom:%s' % e }.join(?|).freeze

# Link-bearing elements and attributes in (un-namespaced) RSS.
RSS_XP = %w[image docs source/@url enclosure/@url
  guid comments].map { |e| '//%s' % e }.join(?|).freeze

# Any XLink href, wherever it occurs.
XLINK_XP = '//*/@xlink:href'.freeze
60
+
61
# Percent-encode unsafe characters in the query and fragment of a URI.
#
# The string is split by QF into the part before the query, the query
# and the fragment. The first part is passed through untouched. In
# the query and fragment, every character matched by SF is replaced
# by the `%XX` escapes of its bytes, so a multi-byte character yields
# several escapes.
#
# @param uri [#to_s] the URI to clean up
# @return [String] the cleaned-up URI string
def uri_pp uri
  m = QF.match uri.to_s
  out = m[1].dup

  [[2, '?'], [3, '#']].each do |i, sep|
    next if m[i].nil?

    # two upper-case hex digits per byte, each with a leading percent
    clean = m[i].gsub(SF) { |s| s.bytes.map { |b| '%%%02X' % b }.join }
    out << sep << clean
  end

  out
end
72
+
73
# Recursively collect the values of all `url` nodes in a parsed CSS
# tree.
#
# The original recursed through `find_urls`, which is not defined in
# this file; the method now calls itself.
#
# @param css [Array, Hash, Object] a parse tree or fragment of one,
#   as nested arrays and hashes. A hash whose :node is :url
#   contributes its :value; any other hash is searched through its
#   values. Anything that is neither an Array nor a Hash yields nothing.
# @return [Array] the distinct, non-nil URL values in order of first
#   appearance
def css_urls css
  found =
    case css
    when Array
      css.flat_map { |c| css_urls c }
    when Hash
      css[:node] == :url ? [css[:value]] : css_urls(css.values)
    else
      []
    end

  found.uniq.compact
end
88
+
89
# Turn a title node into a language-tagged literal.
#
# Callers in this file pass the result of an xpath query, which is an
# array-like node set rather than a plain Array, so anything that
# responds to #to_ary is unwrapped to its first member. (A bare node
# is assumed not to respond to #to_ary -- NOTE(review): confirm against
# the XML library in use.)
#
# @param title [#content, #lang, #to_ary, nil] a node, or a list of
#   nodes of which only the first is used
# @return [RDF::Literal, nil] the stripped text, tagged with the
#   node's language when that language matches LANG_RE; nil when there
#   is no node or its text is blank
def title_term title
  title = title.first if title.respond_to? :to_ary
  return unless title

  text = title.content.strip
  return if text.empty?

  lang = title.lang
  lang = lang.strip if lang
  # normalize e.g. "en_US" to "en-us"; drop anything malformed
  lang = lang && LANG_RE.match?(lang) ? lang.downcase.tr_s('_-', '-') : nil

  RDF::Literal(text, language: lang)
end
101
+
102
# sponge cartridges should add triples and return all the resolved
# (RDF::)URIs to the caller
#
# Each cartridge is a lambda taking the parsed content and the
# document's URI. They are run with instance_exec, so `@repo`,
# `title_term` and `css_urls` refer to the runner instance.
#
# Keys are Strings: the original `'mime/type': value` syntax produced
# Symbol keys, while every lookup in this file uses a String, so no
# cartridge could ever be found.
SPONGE = {
  'application/xhtml+xml' => -> content, uri {
    rs = RDF::URI(uri.to_s)

    # lol we have done this a million times before
    title = title_term(
      content.xpath('/html:html/html:head/html:title', XPATHNS).first)
    @repo << [rs, RDF::Vocab::DC.title, title] if title

    content.xpath(HTML_XP, XPATHNS).map do |node|
      # html will always be an attribute
      pred = case node.parent.name
             when 'script' then RDF::Vocab::DC.requires
             when 'a', 'link', 'area', 'form' then RDF::Vocab::DC.references
             else RDF::Vocab::DC.hasPart
             end

      # srcset gets special treatment: a comma-separated list of
      # candidates, each a URL optionally followed by a descriptor
      refs = if node.name == 'srcset'
               node.content.strip.split(/\s*,\s+/).map do |u|
                 u.split.first.strip
               end
             else
               [node.content.strip]
             end
      objs = refs.map { |u| RDF::URI((uri + u).to_s) }

      # phew
      objs.each { |o| @repo << [rs, pred, o] }
    end.flatten # don't forget to flatten
  },
  'application/atom+xml' => -> content, uri {
    rs = RDF::URI(uri.to_s)

    title = title_term content.xpath('/atom:feed/atom:title', XPATHNS).first
    @repo << [rs, RDF::Vocab::DC.title, title] if title

    content.xpath(ATOM_XP, XPATHNS).map do |node|
      o = RDF::URI((uri + node.content.strip).to_s)
      @repo << [rs, RDF::Vocab::DC.references, o]
      o
    end
  },
  'application/x-rss+xml' => -> content, uri {
    rs = RDF::URI(uri.to_s)

    title = title_term content.xpath('/rss/channel/title', XPATHNS).first
    @repo << [rs, RDF::Vocab::DC.title, title] if title

    content.xpath(RSS_XP, XPATHNS).map do |node|
      o = RDF::URI((uri + node.content.strip).to_s)
      @repo << [rs, RDF::Vocab::DC.references, o]
      o
    end
  },
  'image/svg+xml' => -> content, uri {
    # the original never assigned `rs` in this cartridge
    rs = RDF::URI(uri.to_s)

    title = title_term content.xpath('//svg:title', XPATHNS).first
    @repo << [rs, RDF::Vocab::DC.title, title] if title

    content.xpath(XLINK_XP, XPATHNS).map do |node|
      o = RDF::URI((uri + node.content.strip).to_s)
      @repo << [rs, RDF::Vocab::DC.references, o]
      o
    end
  },
  'text/css' => -> content, uri {
    rs = RDF::URI(uri.to_s)
    # stored content is read with #read elsewhere in this file, so
    # slurp it here; a plain string passes straight through
    text = content.respond_to?(:read) ? content.read : content
    css = Crass.parse text
    css_urls(css).map do |u|
      ro = RDF::URI((uri + u).to_s)
      @repo << [rs, RDF::Vocab::DC.requires, ro]
      ro
    end
  }
}.freeze
190
+
191
# Maps the namespace of an XML document's root element to a MIME
# type. The nil entry covers roots with no namespace, which are told
# apart by element name instead.
XMLMAP = {
  nil     => { 'rss' => 'application/x-rss+xml', 'html' => 'text/html' },
  XHTMLNS => 'application/xhtml+xml',
  'http://www.w3.org/2005/Atom' => 'application/atom+xml',
  'http://www.w3.org/2000/svg'  => 'image/svg+xml',
}.freeze
200
+
201
# Guess a MIME type for a parsed XML document from its root element.
#
# XMLMAP is keyed by namespace URI strings (and nil), so the root's
# namespace is reduced to its href before the lookup; the original
# used the namespace object itself as the key, which never matches a
# String. A namespace that is already a plain value (String or nil)
# is used as-is. NOTE(review): assumes the parser's namespace object
# exposes #href -- confirm.
#
# @param doc [#root] the parsed document
# @return [String] the mapped type, or 'application/xml' when there
#   is no root element or nothing in XMLMAP matches
def xml_type doc
  root = doc.root
  return 'application/xml' unless root

  ns = root.namespace
  ns = ns.href if ns.respond_to? :href

  type = XMLMAP[ns]
  # un-namespaced documents are distinguished by root element name
  type = type[root.name] if type.is_a? Hash

  type || 'application/xml'
end
212
+
213
+ public
214
+
215
+ # NS = {
216
+ # rdf: RDF::RDFV,
217
+ # rdfs: RDF::RDFS,
218
+ # owl: RDF::OWL,
219
+ # xsd: RDF::XSD,
220
+ # xhv: RDF::Vocab::XHV,
221
+ # http: RDF::Vocabulary.new('http://www.w3.org/2011/http#'),
222
+ # vann: RDF::Vocabulary.new('http://purl.org/vocab/vann/'),
223
+ # skos: RDF::Vocab::SKOS,
224
+ # dcat: RDF::Vocab::DCAT,
225
+ # so: RDF::Vocab::SCHEMA,
226
+ # dct: RDF::Vocab::DC,
227
+ # ci: RDF::SAK::CI,
228
+ # ogp: RDF::Vocab::OG,
229
+ # foaf: RDF::Vocab::FOAF,
230
+ # org: RDF::Vocab::ORG,
231
+ # bibo: RDF::Vocab::BIBO,
232
+ # qb: RDF::Vocabulary.new('http://purl.org/linked-data/cube#'),
233
+ # }.freeze
234
+
235
# Set up a runner.
#
# @param store [Object, nil] the content store; kept as given
# @param repo [Object, nil] the RDF repository; kept as given
# @param ua [#to_s, nil] User-Agent string; falls back to UA
# @param ignore [#to_a, Object, nil] domain(s) to ignore
# @param traverse [#to_a, Object, nil] domain(s) whose links are followed
def initialize store: nil, repo: nil, ua: nil, ignore: nil, traverse: nil
  # nothing becomes [], a collection becomes an array, and a lone
  # value is wrapped in one
  listify = lambda do |x|
    next [] unless x
    x.respond_to?(:to_a) ? x.to_a : [x]
  end

  @store = store
  @repo  = repo
  # @urls = Concurrent::Array.new
  @jobs  = Concurrent::Array.new
  @seen  = Concurrent::Map.new

  # @fchan = Concurrent::Promises::Channel.new 10
  # @schan = Concurrent::Promises::Channel.new 10
  # @tchan = Concurrent::Promises::Channel.new 10

  @fthrot = Concurrent::Throttle.new 5

  # @done = Concurrent::Cancellation.new

  @ua       = ua ? ua.to_s : UA
  @ignore   = listify.call ignore
  @traverse = listify.call traverse
  # other stuff

  #Signal.trap(:INT) { warn 'FART'; @done.origin.resolve }
end
258
+
259
# Fetch a URL over HTTP(S), following redirects.
#
# The request is conditional (If-Modified-Since) when the repository
# already holds a dct:modified time for the URL. Every URL that gets
# as far as a request is recorded in @seen so it is not fetched twice.
#
# @param url [URI, #to_s] the address to fetch; a URI instance is
#   normalized and stripped of its fragment in place
# @param redir [Integer] how many more redirects may be followed
# @return [Concurrent::Promises::Future] fulfilled with the response
#   on success, or with nil when the URL is not http(s), has already
#   been seen, or has not been modified
# @raise [Net::HTTPClientException] on too many redirects, a redirect
#   to itself, or any other unsuccessful response
# @raise [Net::HTTPBadResponse] on a redirect with no Location header
def fetch url, redir = 10
  # `return` in a lambda is okay but in a block you have to use
  # `break`, and it complains if you do that

  url = URI(url.to_s) unless url.is_a? URI
  url.normalize!

  # XXX apparently you can't just *return* a fulfilled future, you
  # have to assign it to something.
  bailout = Concurrent::Promises.fulfilled_future nil
  return bailout unless %w[http https].include? url.scheme

  # nuke the fragment
  url.fragment = nil

  # make sure we don't do this a second time
  # NOTE(review): check-then-set is not atomic; two concurrent calls
  # for the same URL can both get past this point
  return bailout if seen? url
  @seen[url.to_s] = url

  ru = RDF::URI(url.to_s)

  # obtain last-modified from object (the most recent, if several)
  q = RDF::Query.new { pattern [ru, RDF::Vocab::DC.modified, :m] }

  ims = q.execute(@repo).map do |s|
    s[:m].object.to_time.getgm if
      s[:m].literal? and s[:m].object.respond_to? :to_time
  end.compact.max

  # XXX this is a little too low-level, don't you think?
  http = Net::HTTP.new(url.hostname, url.port)
  http.continue_timeout = 10
  http.open_timeout     = 30
  http.read_timeout     = 10
  http.write_timeout    = 10
  http.use_ssl          = url.is_a?(URI::HTTPS)

  hdr = { 'User-Agent' => @ua, 'Connection' => 'close' }
  # conditional requests take an HTTP-date ("... GMT"), not RFC 2822
  hdr['If-Modified-Since'] = ims.httpdate if ims
  req = Net::HTTP::Get.new url, hdr

  # block form closes the connection even when the request raises
  resp = http.start { |conn| conn.request req }

  case resp
  when Net::HTTPSuccess
    Concurrent::Promises.fulfilled_future(resp)
  when Net::HTTPNotModified
    warn "Already seen #{url}"
    bailout
  when Net::HTTPRedirection
    if redir <= 0
      raise Net::HTTPClientException.new(
        "Too many redirects (#{redir})", resp)
    end

    # HTTPBadResponse is a plain StandardError and takes only a
    # message, unlike the HTTP*Exception classes
    dest = resp['location'] or raise Net::HTTPBadResponse,
      "Redirect on #{url} missing Location header"

    dest = (url + dest).normalize

    @repo << [ru, RDF::SAK::CI.canonical, RDF::URI(dest.to_s)]

    if url == dest
      raise Net::HTTPClientException.new("Loop detected on #{url}", resp)
    end

    fetch(dest, redir - 1)
  else
    raise Net::HTTPClientException.new "Response failed #{url}", resp
  end
end
329
+
330
# Put a response body into the content store and record its
# modification time in the repository.
#
# Headers are parsed defensively: where a header appears to have been
# sent more than once (values joined by commas), only the first value
# is used.
#
# NOTE(review): the `@store.add` call was commented out in the
# original, which left `obj` undefined so that every call raised
# NameError. It is restored here exactly as it was written; confirm
# the keyword arguments against the store's API.
#
# @param resp [Net::HTTPResponse, nil] a successful response
# @return [Object, nil] whatever `@store.add` returns, or nil when
#   there is no response
def store resp
  return unless resp

  # keep only the first of possibly several comma-joined dates; a
  # date itself contains one comma, hence the optional second segment
  first_date = ->(v) { v.sub(/^([^,]*(?:,[^,]*)?)(?:\s*,.*)?$/, "\\1") }

  now = Time.now.getgm # we mint a single "now" to use everywhere

  # who knows what weird shit is coming off the wire
  date = if d = resp['Date']
           begin
             Time.httpdate(first_date.call(d)).getgm
           rescue ArgumentError
             now
           end
         else
           now # rfc says server MUST send a date header, but does it?
         end

  # lol god two Last-Modified headers no that's not messed up
  mtime = if lm = resp['Last-Modified']
            begin
              # shift by the difference between our clock and the
              # server's Date header
              Time.httpdate(first_date.call(lm)).getgm + (now - date)
            rescue ArgumentError
              nil
            end
          end

  # same friggin deal; the separator has to be a regex, not a string
  lang = if cl = resp['Content-Language']
           cl.strip.split(/\s*,+\s*/).first
         end

  charset = nil
  if type = resp['Content-Type']
    # and again, wtf
    type = type.split(/\s*,+\s*/).first || 'application/octet-stream'
    type, *params = type.strip.split(/\s*;\s*/).reject(&:empty?)
    params = params.map do |p|
      p.split(/\s*=\s*/, 2)
    end.reject { |p| p.length < 2 }.to_h.transform_keys(&:downcase)

    charset = params['charset'] if TOKEN.match? params['charset']
  end

  obj = @store.add resp.body, strict: false,
    type: type, charset: charset, language: lang, mtime: mtime

  s  = RDF::URI(resp.uri.to_s) # - the subject
  cs = RDF::Changeset.new      # - a receptacle for statements

  cs << [s, RDF::Vocab::DC.modified, obj.mtime.getgm]
  # NOTE(review): sponge looks for this statement to recover the
  # document's URL; it was disabled in the original and is left so
  #cs << [s, RDF::Vocab::DC.hasVersion, RDF::URI(obj[:"sha-256"].to_s)]

  cs.apply @repo

  obj
end
379
+
380
# Run an HTML object through tidy and store the result as XHTML.
#
# If the repository already records a tidy run for this input and
# its output is still in the store, that output is returned without
# running tidy again. Otherwise the cleaned markup is added to the
# store, and a TFO application linking input, transform and output
# is recorded unless the repository already links this input to this
# output.
#
# @param obj [Object, nil] a store object; only handled when its type
#   contains "html"
# @return [Object, nil] the XHTML store object; nil when obj is nil
#   or not HTML, or when tidy produces nothing
def tidy obj
  return unless obj and /html/i.match? obj.type

  # obtain original digest uri
  oldu = RDF::URI(obj[:"sha-256"].to_s)

  # first let's detect if this job has already been done
  prior = RDF::Query.new do
    pattern [:a, RDF::SAK::TFO.input, oldu]
    pattern [:a, RDF::SAK::TFO.transform, TIDY_U]
    pattern [:a, RDF::SAK::TFO.output, :n]
  end.execute(@repo).map do |s|
    # can't completely trust what's in the repo so check then convert;
    # a URI without a scheme is skipped rather than raising
    n = s[:n]
    URI(n.to_s) if n.uri? and n.scheme.to_s.downcase == 'ni'
  end.compact

  prior.each do |n|
    # just because it's in the rdf doesn't mean it's in the store
    out = @store.get n
    return out if out and !out.deleted? # don't return an empty record
  end

  # tidy the object and reinsert it back into the store as xhtml
  start = Time.now.getgm
  clean = TidyFFI::Tidy.clean(obj.content.read, TIDY_OPTS)
  return unless clean

  newobj = @store.add clean, mtime: obj.mtime,
    type: 'application/xhtml+xml', language: obj.language,
    charset: obj.charset, encoding: obj.encoding
  stop = Time.now.getgm
  newu = RDF::URI(newobj[:"sha-256"].to_s)

  q = RDF::Query.new do
    pattern [:a, RDF::SAK::TFO.input, oldu]
    pattern [:a, RDF::SAK::TFO.output, newu]
  end

  if q.execute(@repo).empty?
    s  = RDF::URI(UUIDTools::UUID.random_create.to_uri)
    cs = RDF::Changeset.new
    cs << [s, RDF.type, RDF::SAK::TFO.Application]
    cs << [s, RDF::Vocab::PROV.startedAtTime, start]
    cs << [s, RDF::Vocab::PROV.endedAtTime, stop]
    cs << [s, RDF::SAK::TFO.transform, TIDY_U]
    cs << [s, RDF::SAK::TFO.input, oldu]
    cs << [s, RDF::SAK::TFO.output, newu]

    cs.apply @repo
  end

  newobj
end
430
+
431
# Extract links (and a title) from a stored object and queue up the
# links that qualify for traversal.
#
# The document's URL is recovered from the repository: either
# directly through dct:hasVersion, or through a tidy run whose input
# was a version of the URL. When none is found, the object's digest
# URI stands in as the base.
#
# Fixes over the original: the two result sets are combined *before*
# being filtered (the block chain used to bind to the second one
# only); the content type is read from the object rather than from a
# local that had not been assigned yet; the fallback base is passed
# to URI() as a string.
#
# @param obj [Object, nil] a store object
# @return [Array, nil] the links the cartridge returned, or nil when
#   there is no object or no cartridge for its type
def sponge obj
  return unless obj
  #if obj and /xml/.match? obj.type

  # get rdf stuff
  ru = RDF::URI(obj[:"sha-256"].to_s)

  via_tidy = RDF::Query.new do
    pattern [:b, RDF::SAK::TFO.output, ru]
    pattern [:b, RDF::SAK::TFO.transform, TIDY_U]
    pattern [:b, RDF::SAK::TFO.input, :a]
    pattern [:s, RDF::Vocab::DC.hasVersion, :a]
  end.execute(@repo)

  direct = RDF::Query.new do
    pattern [:s, RDF::Vocab::DC.hasVersion, ru]
  end.execute(@repo)

  # first http(s) subject found, tidy lineage taking precedence
  subjects = (via_tidy.to_a + direct.to_a).map { |sol| sol[:s] }.uniq
  uri = subjects.find do |u|
    u.uri? and u.scheme and %w[http https].include? u.scheme.downcase
  end

  uuri    = URI(uri ? uri_pp(uri.to_s) : ru.to_s)
  content = obj.content
  type    = obj.type

  # tolerate the cartridge table being keyed by String or by Symbol
  lookup = ->(t) { t && (SPONGE[t.to_s] || SPONGE[t.to_s.to_sym]) }
  handler = lookup.call type

  if /xml/.match? type
    content = Nokogiri.XML(content, uuri.to_s)
    unless handler
      # generic XML type: sniff the root element instead
      type = xml_type(content)
      handler = lookup.call(type) ||
        lookup.call('image/svg+xml') # svg is just xlink
    end
  end

  return unless handler

  instance_exec(content, uuri, &handler).compact.uniq.each do |link|
    enqueue link if traverse? link
  end
end
468
+
469
# Schedule a URL to be fetched, stored, tidied and sponged.
#
# The work runs as a chain of futures started under the fetch
# throttle. The chain is flattened into the returned future, so
# waiting on the result waits for the whole pipeline rather than
# just for the chain to be set up. Failures at each stage are
# reported with `warn`.
#
# @param uri [#to_s] the address to process
# @return [Concurrent::Promises::Future, nil] the pipeline's future,
#   or nil when the URL has already been seen or is ignored
def enqueue uri
  uri = URI(uri_pp uri.to_s).normalize

  return if seen? uri
  return if ignored? uri

  warn "enqueuing #{uri}"

  @fthrot.future(uri) do |u|
    fr = fetch u
    fr.on_rejection { |reason| warn "fetch fail: #{reason.inspect}" }
    fs = fr.then do |resp|
      begin
        store resp
      rescue StandardError => e
        # report and carry on with nil; signals and exits are
        # deliberately not caught here
        warn e
      end
    end
    fs.on_rejection { |reason| warn "store fail: #{reason.inspect}" }
    ft = fs.then do |obj|
      if obj and /html/i.match? obj.type
        warn "tidying #{obj[:"sha-256"]}"
        tidy obj
      else
        obj
      end
    end
    ft.on_rejection { |reason| warn "tidy fail: #{reason.inspect}" }
    ft.then { |obj| sponge obj }
  end.flat.on_rejection { |reason| warn "throttle fail: #{reason.inspect}" }
end
500
+
501
# Whether a URL has already been recorded in @seen.
#
# @param uri [#to_s] the address to check
# @return [Boolean]
def seen? uri
  @seen[uri.to_s] ? true : false
end
504
+
505
# Whether links to this URI's host should be followed.
#
# A host qualifies when it is one of the configured domains or a
# subdomain of one. The alternatives are grouped so that both anchors
# apply to every domain, and the pattern is rebuilt on each call
# rather than cached with /o, which pinned it to whichever instance
# happened to call first. Matching ignores case.
#
# @param uri [#host] the address to test
# @return [Boolean, nil] nil when no traversal domains are configured
def traverse? uri
  return if @traverse.empty?

  domains = @traverse.map { |t| Regexp.escape t.to_s }.join '|'
  /(?:\A|\.)(?:#{domains})\z/i.match? uri.host
end
511
+
512
# Whether this URI's host is on the ignore list.
#
# A host is ignored when it is one of the configured domains or a
# subdomain of one. The alternatives are grouped so that both anchors
# apply to every domain, and the pattern is rebuilt on each call
# rather than cached with /o, which pinned it to whichever instance
# happened to call first. Matching ignores case.
#
# @param uri [#host] the address to test
# @return [Boolean, nil] nil when no ignored domains are configured
def ignored? uri
  return if @ignore.empty?

  domains = @ignore.map { |t| Regexp.escape t.to_s }.join '|'
  /(?:\A|\.)(?:#{domains})\z/i.match? uri.host
end
518
+
519
# Process a list of URLs and wait for them to finish.
#
# URLs are cleaned up and normalized; ignored and duplicate entries
# are dropped. `enqueue` returns nil for anything it declines, so
# those are skipped rather than waited on.
#
# @param urls [Enumerable<#to_s>] the addresses to process
# @param shuffle [Boolean] whether to randomize the processing order
# @return [self]
def run urls, shuffle: false
  # ingest URLs and prune
  urls = urls.map { |u| URI(uri_pp u).normalize }
  urls = urls.uniq.reject { |u| ignored? u }

  # optionally randomize
  urls.shuffle! if shuffle

  # now add the queue
  urls.map { |url| enqueue url }.compact.each(&:wait)

  # while job = @jobs.shift
  #   job.wait
  # end

  self
end
535
+ end
536
+
537
+
538
# Command-line entry point, active only when this file is run directly.
if __FILE__ == $PROGRAM_NAME
  require 'pathname'
  require 'rdf/turtle'
  require 'rdf/lmdb'
  require 'store/digest'
  require 'commander'

  Commander.configure do
    program :name, 'URLRunner'
    program :version, '0.0.0'
    program :description, 'snarf a bunch of URLs for great justice'

    command :process do |c|
      c.syntax = 'process [options] [urls]'
      c.description = 'just do the friggin urls already'

      c.option '-c', '--csv FILE',
        'a CSV file containing URLs in the first column'
      c.option '-i', '--ignore DOMAIN[,DOMAIN...]', Array,
        'domain(s) to ignore'
      c.option '-t', '--traverse DOMAIN[,DOMAIN...]', Array,
        'traverse links within hypermedia documents'
      c.option '-p', '--print', 'print the results when finished'
      c.option '-A', '--user-agent STRING', 'override the User-Agent string'
      c.option '--shuffle', 'shuffle the list of URLs'

      # wtf why did i have to do this
      c.option '-s', '--store DIR', 'Directory for digest store'
      c.option '-r', '--rdf DIR', 'Directory for RDF store'

      c.action do |args, options|
        # both backing stores are mandatory
        unless options.store && options.rdf
          raise ArgumentError, 'Store and RDF directories are required'
        end

        mapsize = 2**27
        repo  = RDF::LMDB::Repository.new options.rdf, mapsize: mapsize
        store = Store::Digest.new dir: options.store, mapsize: mapsize

        # URLs come from the command line, then from the first
        # column of the CSV file if one was given
        urls = args.dup

        if options.csv
          require 'csv'
          csv = Pathname(options.csv).expand_path
          unless csv.file? && csv.readable?
            raise ArgumentError, "CSV file #{csv} is not readable"
          end
          urls += CSV.read(csv).map(&:first).compact
        end

        runner = RDF::SAK::URLRunner.new(
          repo: repo, store: store, ua: options.user_agent,
          ignore: options.ignore, traverse: options.traverse
        )
        runner.run urls, shuffle: options.shuffle

        print repo.dump :turtle if options.print #, prefixes: URLRunner::NS
      end
    end

    # XXX these should work wtf
    #global_option '-s', '--store DIR', 'Directory for digest store'
    #global_option '-r', '--rdf DIR', 'Directory for RDF store'

    default_command :process
  end
end