rdf-sak 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,602 @@
1
+ require 'net/http'
2
+ require 'concurrent'
3
+ require 'concurrent-edge'
4
+
5
+ require 'uri'
6
+ require 'uri/urn/uuid'
7
+ require 'time'
8
+
9
+ require 'rdf'
10
+ require 'rdf/rdfa'
11
+ #require 'rdf/vocab'
12
+ require 'rdf/vocab/dc'
13
+ require 'rdf/vocab/prov'
14
+ require 'rdf/sak/ci'
15
+ require 'rdf/sak/tfo'
16
+ require 'tidy_ffi'
17
+ require 'uuidtools'
18
+ require 'nokogiri'
19
+ require 'crass'
20
+
21
+ class RDF::SAK::URLRunner
22
+ private
23
+
24
+ UA = 'RDF::SAK::URLRunner/0.1'.freeze
25
+ TIDY_OPTS = {
26
+ wrap: 0,
27
+ numeric_entities: true,
28
+ tidy_mark: false,
29
+ output_xhtml: true,
30
+ custom_tags: 'inline',
31
+ }.freeze
32
+
33
+ TIDY_U = RDF::URI('urn:x-dummy:tidy').freeze
34
+ XHTMLNS = 'http://www.w3.org/1999/xhtml'.freeze
35
+ TTL = "normalize-space(/html:html/html:head/html:title" \
36
+ "[normalize-space(.) != ''][1])".freeze
37
+ JLD = "//html:script[text()]" \
38
+ "[normalize-space(@type) = 'application/ld+json']".freeze
39
+ QF = /^([^?#]*)(?:\?([^#]*))?(?:#(.*?))?$/.freeze
40
+ SF = /[^[:alpha:][:digit:]\/\?%@!$&'()*+,;=._~-]/.freeze
41
+ TOKEN = /^([^\x0-\x20()<>@,;:\\"\/\[\]?=\x7f-\xff]+)$/n.freeze
42
+ LANG_RE = /^[A-Za-z]+(?:[_-]+[0-9A-Za-z]+)*$/.freeze
43
+
44
+ # xpath for various linkages
45
+ XPATHNS = {
46
+ html: XHTMLNS,
47
+ svg: 'http://www.w3.org/2000/svg',
48
+ atom: 'http://www.w3.org/2005/Atom',
49
+ xlink: 'http://www.w3.org/1999/xlink',
50
+ }
51
+ XBASE_XP = 'ancestor-or-self::*[@xml:base][1]/@xml:base'.freeze
52
+ HTML_XP = %w[*[not(self::html:base)][@href]/@href
53
+ *[@src]/@src object/@data *[@srcset]/@srcset
54
+ form/@action].map { |e| '//html:%s' % e }.join(?|).freeze
55
+ ATOM_XP = %w[uri content/@src category/@scheme generator/@uri icon id
56
+ link/@href logo].map { |e| '//atom:%s' % e }.join(?|).freeze
57
+ RSS_XP = %w[image docs source/@url enclosure/@url
58
+ guid comments].map { |e| '//%s' % e }.join(?|).freeze
59
+ XLINK_XP = '//*/@xlink:href'.freeze
60
+
61
+ def uri_pp uri
62
+ m = QF.match uri.to_s
63
+ out = m[1]
64
+ [[2, ??], [3, ?#]].each do |i, c|
65
+ next if m[i].nil?
66
+ clean = m[i].gsub(SF) { |s| sprintf('%X', s.ord) }
67
+ out += c + clean
68
+ end
69
+
70
+ out
71
+ end
72
+
73
+ def css_urls css
74
+ out = []
75
+ case css
76
+ when Array
77
+ css.each { |c| out += find_urls(c) }
78
+ when Hash
79
+ if css[:node] == :url
80
+ out << css[:value]
81
+ else
82
+ out += find_urls(css.values)
83
+ end
84
+ end
85
+
86
+ out.uniq.compact
87
+ end
88
+
89
+ def title_term title
90
+ title = title.first if title.is_a? Array
91
+ return unless title
92
+
93
+ text = title.content.strip
94
+ unless text.empty?
95
+ lang = if title.lang and LANG_RE.match? title.lang.strip
96
+ title.lang.strip.downcase.tr_s '_-', ?-
97
+ end
98
+ return RDF::Literal(text, language: lang)
99
+ end
100
+ end
101
+
102
+ # sponge cartridges should add triples and return all the resolved
103
+ # (RDF::)URIs to the caller
104
+ SPONGE = {
105
+ 'application/xhtml+xml': -> content, uri {
106
+ rs = RDF::URI(uri.to_s)
107
+
108
+ # lol we have done this a million times before
109
+ if title = content.xpath('/html:html/html:head/html:title', XPATHNS)
110
+ if title = title_term(title)
111
+ @repo << [rs, RDF::Vocab::DC.title, title]
112
+ end
113
+ end
114
+
115
+ content.xpath(HTML_XP, XPATHNS).map do |node|
116
+ # html will always be an attribute
117
+ pred = case node.parent.name
118
+ when 'script' then RDF::Vocab::DC.requires
119
+ when 'a', 'link', 'area', 'form' then RDF::Vocab::DC.references
120
+ else
121
+ RDF::Vocab::DC.hasPart
122
+ end
123
+
124
+ # srcset gets special treatment
125
+ objs = if node.name == 'srcset'
126
+ node.content.strip.split(/\s*,\s+/).map do |u|
127
+ u.split.first.strip
128
+ end
129
+ else
130
+ [node.content.strip]
131
+ end.map { |u| RDF::URI((uri + u).to_s) }
132
+
133
+ # phew
134
+ objs.each { |o| @repo << [rs, pred, o] }
135
+ end.flatten # don't forget to flatten
136
+ },
137
+ 'application/atom+xml': -> content, uri {
138
+ rs = RDF::URI(uri.to_s)
139
+
140
+ if title = content.xpath('/atom:feed/atom:title', XPATHNS)
141
+ if title = title_term(title)
142
+ @repo << [rs, RDF::Vocab::DC.title, title]
143
+ end
144
+ end
145
+
146
+ content.xpath(ATOM_XP, XPATHNS).map do |node|
147
+ o = RDF::URI((uri + node.content.strip).to_s)
148
+ @repo << [rs, RDF::Vocab::DC.references, o]
149
+ o
150
+ end
151
+ },
152
+ 'application/x-rss+xml': -> content, uri {
153
+ rs = RDF::URI(uri.to_s)
154
+
155
+ if title = content.xpath('/rss/channel/title', XPATHNS)
156
+ if title = title_term(title)
157
+ @repo << [rs, RDF::Vocab::DC.title, title]
158
+ end
159
+ end
160
+
161
+ content.xpath(RSS_XP, XPATHNS).map do |node|
162
+ o = RDF::URI((uri + node.content.strip).to_s)
163
+ @repo << [rs, RDF::Vocab::DC.references, o]
164
+ o
165
+ end
166
+ },
167
+ 'image/svg+xml': -> content, uri {
168
+ if title = content.xpath('//svg:title', XPATHNS)
169
+ if title = title_term(title)
170
+ @repo << [rs, RDF::Vocab::DC.title, title]
171
+ end
172
+ end
173
+
174
+ content.xpath(XLINK_XP, XPATHNS).map do |node|
175
+ o = RDF::URI((uri + node.content.strip).to_s)
176
+ @repo << [rs, RDF::Vocab::DC.references, o]
177
+ o
178
+ end
179
+ },
180
+ 'text/css': -> content, uri {
181
+ rs = RDF::URI(uri.to_s)
182
+ css = Crass.parse content
183
+ css_urls(css).map do |u|
184
+ ro = RDF::URI((uri + u).to_s)
185
+ @repo << [rs, RDF::Vocab::DC.requires, ro]
186
+ ro
187
+ end
188
+ }
189
+ }.freeze
190
+
191
  # Map a root element's namespace (or, for namespaceless documents, its
  # root element name) to a canonical content type; consulted by #xml_type.
  # A nil namespace keys a nested name => type table.
  XMLMAP = {
    nil => {
      'rss' => 'application/x-rss+xml',
      'html' => 'text/html',
    },
    XHTMLNS => 'application/xhtml+xml',
    'http://www.w3.org/2005/Atom' => 'application/atom+xml',
    'http://www.w3.org/2000/svg' => 'image/svg+xml',
  }.freeze
200
+
201
+ def xml_type doc
202
+ r = doc.root
203
+ if r and x = XMLMAP[r.namespace]
204
+ if x.is_a? Hash
205
+ return x[r.name] if x[r.name]
206
+ else
207
+ return x
208
+ end
209
+ end
210
+ 'application/xml'
211
+ end
212
+
213
+ public
214
+
215
+ # NS = {
216
+ # rdf: RDF::RDFV,
217
+ # rdfs: RDF::RDFS,
218
+ # owl: RDF::OWL,
219
+ # xsd: RDF::XSD,
220
+ # xhv: RDF::Vocab::XHV,
221
+ # http: RDF::Vocabulary.new('http://www.w3.org/2011/http#'),
222
+ # vann: RDF::Vocabulary.new('http://purl.org/vocab/vann/'),
223
+ # skos: RDF::Vocab::SKOS,
224
+ # dcat: RDF::Vocab::DCAT,
225
+ # so: RDF::Vocab::SCHEMA,
226
+ # dct: RDF::Vocab::DC,
227
+ # ci: RDF::SAK::CI,
228
+ # ogp: RDF::Vocab::OG,
229
+ # foaf: RDF::Vocab::FOAF,
230
+ # org: RDF::Vocab::ORG,
231
+ # bibo: RDF::Vocab::BIBO,
232
+ # qb: RDF::Vocabulary.new('http://purl.org/linked-data/cube#'),
233
+ # }.freeze
234
+
235
+ def initialize store: nil, repo: nil, ua: nil, ignore: nil, traverse: nil
236
+ @store = store
237
+ @repo = repo
238
+ # @urls = Concurrent::Array.new
239
+ @jobs = Concurrent::Array.new
240
+ @seen = Concurrent::Map.new
241
+
242
+ # @fchan = Concurrent::Promises::Channel.new 10
243
+ # @schan = Concurrent::Promises::Channel.new 10
244
+ # @tchan = Concurrent::Promises::Channel.new 10
245
+
246
+ @fthrot = Concurrent::Throttle.new 5
247
+
248
+ # @done = Concurrent::Cancellation.new
249
+
250
+ @ua = ua ? ua.to_s : UA
251
+ @ignore = ignore ? ignore.respond_to?(:to_a) ? ignore.to_a : [ignore] : []
252
+ @traverse = traverse ?
253
+ traverse.respond_to?(:to_a) ? traverse.to_a : [traverse] : []
254
+ # other stuff
255
+
256
+ #Signal.trap(:INT) { warn 'FART'; @done.origin.resolve }
257
+ end
258
+
259
+ def fetch url, redir = 10
260
+ # `return` in a lambda is okay but in a block you have to use
261
+ # `break`, and it complains if you do that
262
+
263
+ url = URI(url.to_s) unless url.is_a? URI
264
+ url.normalize!
265
+
266
+ # XXX apparently you can't just *return* a fulfilled future, you
267
+ # have to assign it to something.
268
+ bailout = Concurrent::Promises.fulfilled_future nil
269
+ return bailout unless %w[http https].include? url.scheme
270
+
271
+ # nuke the fragment
272
+ url.fragment = nil
273
+
274
+ # make sure we don't do this a second time
275
+ return bailout if seen? url
276
+ @seen[url.to_s] = url
277
+
278
+ ru = RDF::URI(url.to_s)
279
+
280
+ # obtain last-modified from object
281
+ q = RDF::Query.new { pattern [ru, RDF::Vocab::DC.modified, :m] }
282
+
283
+ ims = q.execute(@repo).map do |s|
284
+ s[:m].object.to_time.getgm if
285
+ s[:m].literal? and s[:m].object.respond_to? :to_time
286
+ end.compact.sort { |a, b| b <=> a }.first
287
+
288
+ # XXX this is a little too low-level, don't you think?
289
+ http = Net::HTTP.new(url.hostname, url.port)
290
+ http.continue_timeout = 10
291
+ http.open_timeout = 30
292
+ http.read_timeout = 10
293
+ http.write_timeout = 10
294
+ http.use_ssl = url.is_a?(URI::HTTPS)
295
+ http.start
296
+
297
+ hdr = { 'User-Agent' => @ua, 'Connection' => 'close' }
298
+ hdr['If-Modified-Since'] = ims.rfc2822 if ims
299
+ req = Net::HTTP::Get.new url, hdr
300
+ resp = http.request req
301
+ http.finish
302
+
303
+ case resp
304
+ when Net::HTTPSuccess
305
+ Concurrent::Promises.fulfilled_future(resp)
306
+ when Net::HTTPNotModified
307
+ warn "Already seen #{url}"
308
+ bailout
309
+ when Net::HTTPRedirection
310
+ raise Net::HTTPClientException.new "Too many redirects (#{redir})",
311
+ resp if redir <= 0
312
+ unless dest = resp['location']
313
+ raise Net::HTTPBadResponse.new(
314
+ "Redirect on #{url} missing Location header", resp)
315
+ end
316
+
317
+ dest = (url + dest).normalize
318
+
319
+ @repo << [ru, RDF::SAK::CI.canonical, RDF::URI(dest.to_s)]
320
+
321
+ raise Net::HTTPClientException.new "Loop detected on #{url}",
322
+ resp if url == dest
323
+
324
+ fetch(dest, redir - 1)
325
+ else
326
+ raise Net::HTTPClientException.new "Response failed #{url}", resp
327
+ end
328
+ end
329
+
330
+ def store resp
331
+ return unless resp
332
+
333
+ now = Time.now.getgm # we mint a single "now" to use everywhere
334
+ date = if d = resp['Date']
335
+ # who knows what weird shit is coming off the wire
336
+ d.gsub!(/^([^,]*(?:,[^,]*)?)(?:\s*,.*)?$/, "\\1")
337
+ Time.httpdate(d).getgm rescue now
338
+ else
339
+ now # rfc says server MUST send a date header, but does it?
340
+ end
341
+ mtime = if lm = resp['Last-Modified']
342
+ # lol god two Last-Modified headers no that's not messed up
343
+ lm.gsub!(/^([^,]*(?:,[^,]*)?)(?:\s*,.*)?$/, "\\1")
344
+ delta = now - date
345
+
346
+ Time.httpdate(lm).getgm + delta rescue nil
347
+ end
348
+ lang = if resp['Content-Language']
349
+ # same friggin deal
350
+ resp['Content-Language'].strip.split('\s*,+\s*').first
351
+ end
352
+ charset = nil
353
+ if type = resp['Content-Type']
354
+ # and again, wtf
355
+ type = type.split(/\s*,+\s*/).first || 'application/octet-stream'
356
+ type, *params = type.strip.split(/\s*;\s*/).reject(&:empty?)
357
+ params = params.map do |p|
358
+ p.split(/\s*=\s*/, 2)
359
+ end.reject { |p| p.length < 2 }.to_h.transform_keys(&:downcase)
360
+
361
+ charset = params['charset'] if TOKEN.match? params['charset']
362
+ end
363
+
364
+
365
+ # obj = @store.add resp.body, strict: false,
366
+ # type: type, charset: charset, language: lang, mtime: mtime
367
+
368
+ s = RDF::URI(resp.uri.to_s) # - the subject
369
+ cs = RDF::Changeset.new # - a receptacle for statements
370
+
371
+ cs << [s, RDF::Vocab::DC.modified, obj.mtime.getgm]
372
+ #cs << [s, RDF::Vocab::DC.hasVersion, RDF::URI(obj[:"sha-256"].to_s)]
373
+
374
+ cs.apply @repo
375
+
376
+ obj
377
+ #nil
378
+ end
379
+
380
  # Run an HTML store object through TidyFFI, producing (and storing) an
  # XHTML version, and record the transformation as a TFO application in
  # @repo. Returns the tidied store object, a previously tidied one if the
  # job was already done, or nil when +obj+ is nil or not HTML.
  def tidy obj
    if obj and /html/i.match? obj.type
      # obtain original digest uri
      oldu = RDF::URI(obj[:"sha-256"].to_s)

      # first let's detect if this job has already been done
      RDF::Query.new do
        pattern [:a, RDF::SAK::TFO.input, oldu]
        pattern [:a, RDF::SAK::TFO.transform, TIDY_U]
        pattern [:a, RDF::SAK::TFO.output, :n]
      end.execute(@repo).map do |s|
        # can't completely trust what's in the repo so check then convert
        URI(s[:n].to_s) if s[:n].uri? and s[:n].scheme.downcase == 'ni'
      end.compact.each do |n|
        # just because it's in the rdf doesn't mean it's in the store
        out = @store.get n
        return out if out and !out.deleted? # don't return an empty record
      end

      # tidy the object and reinsert it back into the store as xhtml
      start = Time.now.getgm
      if clean = TidyFFI::Tidy.clean(obj.content.read, TIDY_OPTS)
        newobj = @store.add clean, mtime: obj.mtime,
          type: 'application/xhtml+xml', language: obj.language,
          charset: obj.charset, encoding: obj.encoding
        stop = Time.now.getgm
        newu = RDF::URI(newobj[:"sha-256"].to_s)

        # only write the provenance record if an equivalent one isn't
        # already in the repo
        q = RDF::Query.new do
          pattern [:a, RDF::SAK::TFO.input, oldu]
          pattern [:a, RDF::SAK::TFO.output, newu]
        end

        if q.execute(@repo).empty?
          # mint a fresh UUID to name this transformation event
          s = RDF::URI(UUIDTools::UUID.random_create.to_uri)
          cs = RDF::Changeset.new
          cs << [s, RDF.type, RDF::SAK::TFO.Application]
          cs << [s, RDF::Vocab::PROV.startedAtTime, start]
          cs << [s, RDF::Vocab::PROV.endedAtTime, stop]
          cs << [s, RDF::SAK::TFO.transform, TIDY_U]
          cs << [s, RDF::SAK::TFO.input, oldu]
          cs << [s, RDF::SAK::TFO.output, newu]

          cs.apply @repo
        end

        # value of the whole method when tidying succeeded
        newobj
      end
    end
  end
430
+
431
+ def sponge obj
432
+ return unless obj
433
+ #if obj and /xml/.match? obj.type
434
+
435
+ # get rdf stuff
436
+ ru = RDF::URI(obj[:"sha-256"].to_s)
437
+ uri = RDF::Query.new do
438
+ pattern [:b, RDF::SAK::TFO.output, ru]
439
+ pattern [:b, RDF::SAK::TFO.transform, TIDY_U]
440
+ pattern [:b, RDF::SAK::TFO.input, :a]
441
+ pattern [:s, RDF::Vocab::DC.hasVersion, :a]
442
+ end.execute(@repo) + RDF::Query.new do
443
+ pattern [:s, RDF::Vocab::DC.hasVersion, ru]
444
+ end.execute(@repo).uniq.map do |sol|
445
+ u = sol[:s]
446
+ u if u.uri? and u.scheme and %w[http https].include? u.scheme.downcase
447
+ end.compact.first
448
+
449
+ uuri = URI(uri ? uri_pp(uri.to_s) : ru)
450
+ content = obj.content
451
+ sponge = SPONGE[obj.type]
452
+
453
+ if /xml/.match? type
454
+ content = Nokogiri.XML(content, uuri.to_s)
455
+ unless sponge
456
+ type = xml_type(content)
457
+ sponge = SPONGE[type] || SPONGE['image/svg+xml'] # svg is just xlink
458
+ end
459
+ end
460
+
461
+ if sponge
462
+ instance_exec(content, uuri, &sponge).compact.uniq.each do |link|
463
+ enqueue link if traverse? link
464
+ end
465
+ end
466
+
467
+ end
468
+
469
  # Schedule +uri+ for the full fetch -> store -> tidy -> sponge pipeline
  # on the fetch throttle. Returns the throttled future, or nil when the
  # URI has already been seen or is on the ignore list.
  def enqueue uri
    uri = URI(uri_pp uri.to_s).normalize

    return if seen? uri
    return if ignored? uri

    warn "enqueuing #{uri}"

    @fthrot.future(uri) do |u|
      fr = fetch u
      fr.on_rejection { |reason| warn "fetch fail: #{reason.inspect}" }
      fs = fr.then do |resp|
        begin
          store resp
        # NOTE(review): rescuing Exception is very broad (catches
        # SignalException etc.); presumably deliberate so one bad response
        # can't kill the run — confirm before narrowing
        rescue Exception => e
          warn e
        end
      end
      fs.on_rejection { |reason| warn "store fail: #{reason.inspect}" }
      ft = fs.then do |obj|
        # only HTML objects go through tidy; everything else passes through
        if obj and /html/i.match? obj.type
          warn "tidying #{obj[:"sha-256"]}"
          tidy obj
        else
          obj
        end
      end
      ft.on_rejection { |reason| warn "tidy fail: #{reason.inspect}" }
      ft.then { |obj| sponge obj }
    end.on_rejection { |reason| warn "throttle fail: #{reason.inspect}" }
  end
500
+
501
+ def seen? uri
502
+ !!@seen[uri.to_s]
503
+ end
504
+
505
+ def traverse? uri
506
+ return if @traverse.empty?
507
+
508
+ re = /(?:^|\.)#{@traverse.map { |t| Regexp.escape t }.join ?|}$/o
509
+ re.match? uri.host
510
+ end
511
+
512
+ def ignored? uri
513
+ return if @ignore.empty?
514
+
515
+ re = /(?:^|\.)#{@ignore.map { |t| Regexp.escape t }.join ?|}$/o
516
+ re.match? uri.host
517
+ end
518
+
519
+ def run urls, shuffle: false
520
+ # ingest URLs and prune
521
+ urls = urls.map { |u| URI(uri_pp u).normalize }.reject { |u| ignored? u }
522
+
523
+ # optionally randomize
524
+ urls.shuffle! if shuffle
525
+
526
+ # now add the queue
527
+ urls.map { |url| enqueue url }.map(&:wait)
528
+
529
+ # while job = @jobs.shift
530
+ # job.wait
531
+ # end
532
+
533
+ self
534
+ end
535
+ end
536
+
537
+
538
if __FILE__ == $0
  require 'pathname'
  require 'rdf/turtle'
  require 'rdf/lmdb'
  require 'store/digest'
  require 'commander'

  # Minimal CLI wrapper: collect URLs from argv and/or a CSV file, run
  # them through a URLRunner backed by LMDB stores, and optionally dump
  # the resulting graph as Turtle.
  Commander.configure do
    program :name, 'URLRunner'
    program :version, '0.0.0'
    program :description, 'snarf a bunch of URLs for great justice'

    command :process do |c|
      c.syntax = 'process [options] [urls]'
      c.description = 'just do the friggin urls already'

      c.option '-c', '--csv FILE',
        'a CSV file containing URLs in the first column'
      c.option '-i', '--ignore DOMAIN[,DOMAIN...]', Array,
        'domain(s) to ignore'
      c.option '-t', '--traverse DOMAIN[,DOMAIN...]', Array,
        'traverse links within hypermedia documents'
      c.option '-p', '--print', 'print the results when finished'
      c.option '-A', '--user-agent STRING', 'override the User-Agent string'
      c.option '--shuffle', 'shuffle the list of URLs'

      # wtf why did i have to do this
      c.option '-s', '--store DIR', 'Directory for digest store'
      c.option '-r', '--rdf DIR', 'Directory for RDF store'

      c.action do |args, options|
        raise ArgumentError, 'Store and RDF directories are required' unless
          options.store and options.rdf
        # 2**27 == 128 MiB LMDB map size for both stores
        repo = RDF::LMDB::Repository.new options.rdf, mapsize: 2**27
        store = Store::Digest.new dir: options.store, mapsize: 2**27

        urls = args.dup

        # append URLs from the first column of the CSV, if given
        if options.csv
          require 'csv'
          csv = Pathname(options.csv).expand_path
          raise ArgumentError, "CSV file #{csv} is not readable" unless
            csv.file? and csv.readable?
          urls += CSV.read(csv).map(&:first).compact
        end

        RDF::SAK::URLRunner.new(
          repo: repo, store: store, ua: options.user_agent,
          ignore: options.ignore, traverse: options.traverse
        ).run urls, shuffle: options.shuffle

        if options.print
          print repo.dump :turtle #, prefixes: URLRunner::NS
        end
      end
    end

    # XXX these should work wtf
    #global_option '-s', '--store DIR', 'Directory for digest store'
    #global_option '-r', '--rdf DIR', 'Directory for RDF store'

    default_command :process

  end
end