rdf-sak 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,2081 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'rdf/sak/version'
3
+
4
+ require 'uri'
5
+ require 'uri/urn'
6
+ require 'set'
7
+ require 'uuid-ncname'
8
+
9
+ require 'rdf'
10
+ require 'rdf/vocab'
11
+ require 'rdf/reasoner'
12
+ require 'rdf/vocab/skos'
13
+ require 'rdf/vocab/foaf'
14
+ require 'rdf/vocab/bibo'
15
+ require 'rdf/vocab/dc'
16
+ require 'rdf/vocab/dc11'
17
+
18
+ require 'rdf/sak/mimemagic'
19
+ require 'rdf/sak/ci'
20
+ require 'rdf/sak/tfo'
21
+ require 'rdf/sak/ibis'
22
+ require 'rdf/sak/pav'
23
+ require 'rdf/sak/qb'
24
+
25
unless RDF::List.respond_to? :from
  class RDF::List
    # NOTE: a bare `private` does not affect `def self.` methods in
    # Ruby, so get_list remains publicly callable.
    private

    # Walk the rdf:first / rdf:rest chain beginning at `subject` and
    # collect the members, reading from `repo` only.
    #
    # @param repo [#query] the graph to read
    # @param subject [RDF::Resource] head of the list
    # @param seen [Array] list nodes already visited (cycle guard)
    #
    # @return [Array] the list members, in order
    def self.get_list repo, subject, seen = []
      members = []

      until seen.include? subject
        seen << subject

        head = repo.query([subject, RDF.first, nil]).objects.first
        break unless head
        members << head

        # a literal can never be the next cell of the list
        tail = repo.query([subject, RDF.rest, nil]).objects.find do |term|
          !term.literal?
        end
        break unless tail
        break unless tail != RDF.nil

        subject = tail
      end

      members
    end

    public

    # Inflate a list from a graph but don't change the graph
    def self.from graph, subject
      members = get_list graph, subject
      self.new graph: graph, subject: subject, values: members
    end
  end
end
50
+
51
+ module RDF::SAK::Util
52
+
53
+ private
54
+
55
+ RDF::Reasoner.apply(:rdfs, :owl)
56
+
57
# Generic URI splitter with nine capture groups (this looks like the
# regex from RFC 3986 appendix B).
R3986 = /^(([^:\/?#]+):)?(\/\/([^\/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?$/

# Matches any single character outside the listed "safe" set.
SF = /[^[:alpha:][:digit:]\/\?%@!$&'()*+,:;=._~-]/n

# Same split as above but capturing only the five components
# (scheme, authority, path, query, fragment).
RFC3986 =
  /^(?:([^:\/?#]+):)?(?:\/\/([^\/?#]*))?([^?#]+)?(?:\?([^#]*))?(?:#(.*))?$/

# [prefix, suffix] pairs, one per component captured by RFC3986.
SEPS = [
  ['',   ':'],
  ['//', ''],
  ['',   ''],
  ['?',  ''],
  ['#',  ''],
].freeze
62
+
63
# Canned XPath expressions, keyed by purpose. The html:, atom:,
# xlink: and rdf: prefixes have to be bound by whoever evaluates them.
XPATH = {
  # <base href> lookup, once namespaced and once without the prefix
  htmlbase: lambda {
    qualified = 'ancestor-or-self::html:html[1]/' \
      'html:head[html:base[@href]][1]/html:base[@href][1]/@href'
    [qualified, qualified.gsub('html:', '')].join('|')
  }.call,
  xmlbase: 'ancestor-or-self::*[@xml:base][1]/@xml:base',
  lang: 'normalize-space((%s)[last()])' % (
    %w[lang xml:lang].map { |attr|
      'ancestor-or-self::*[@%s][1]/@%s' % [attr, attr]
    }.join('|')),
  literal: '(ancestor::*[@property][not(@content)]' \
    '[not(@resource|@href|@src) or @rel|@rev])[1]',
  leaves: 'descendant::html:section[not(descendant::html:section)]' \
    '[not(*[not(self::html:script)])]',
  headers: './*[1][%s]//text()' % (
    (1..6).map { |level| "self::html:h#{level}" }.join('|')),
  # elements that the MODERNIZE handlers know how to rewrite
  modernize: (
    ["//html:div[*[1][#{(1..6).map { |i| 'self::html:h%d' % i }.join('|')}]]"] +
    { div: %i[section figure], blockquote: :note,
      table: :figure, img: :figure }.map { |tag, classes|
      (classes.is_a?(Array) ? classes : [classes]).map { |cls|
        "//html:#{tag}[contains(concat(' ', " \
          "normalize-space(@class), ' '), ' #{cls} ')]"
      }
    }.flatten
  ).join('|'),
  dehydrate: '//html:a[count(*)=1][html:dfn|html:abbr|html:span]',
  rehydrate: %w[//html:dfn
    //html:abbr[not(parent::html:dfn)] //html:span].join('|') +
    '[not(parent::html:a)]',
  htmllinks: (
    %w[*[not(self::html:base)][@href]/@href
       *[@src]/@src object[@data]/@data *[@srcset]/@srcset
       form[@action]/@action].map { |expr| '//html:%s' % expr } +
    %w[//*[@xlink:href]/@xlink:href]
  ).join('|').freeze,
  atomlinks: %w[uri content/@src category/@scheme generator/@uri icon id
    link/@href logo].map { |expr| '//atom:%s' % expr }.join('|').freeze,
  rsslinks: %w[image/text()[1] docs/text()[1] source/@url enclosure/@url
    guid/text()[1] comments/text()[1]].map { |expr|
    '//%s' % expr }.join('|').freeze,
  xlinks: '//*[@xlink:href]/@xlink:href'.freeze,
  rdflinks: %w[about resource datatype].map { |attr|
    '//*[@rdf:%s]/@rdf:%s' % [attr, attr] }.join('|').freeze,
}
105
+
106
# Media type => link-harvesting XPath expression (looked up in XPATH).
LINK_MAP = {
  'text/html'             => :htmllinks,
  'application/xhtml+xml' => :htmllinks,
  'application/atom+xml'  => :atomlinks,
  'application/x-rss+xml' => :rsslinks,
  'application/rdf+xml'   => :rdflinks,
  'image/svg+xml'         => :xlinks,
}.map { |media_type, key| [media_type, XPATH[key]] }.to_h.freeze
114
+
115
# Converters keyed by coercion flag: nil/false stringify, :uri parses
# into a stdlib URI, :rdf yields an RDF::Node for "_:"-prefixed input
# and an RDF::URI for anything else.
URI_COERCIONS = {
  nil   => lambda { |term| term.to_s },
  false => lambda { |term| term.to_s },
  uri:     lambda { |term| URI.parse term.to_s },
  rdf:     lambda { |term|
    text = term.to_s
    if text.start_with?('_:')
      RDF::Node.new(text.delete_prefix '_:')
    else
      RDF::URI(text)
    end
  },
}

# A UUID with an optional urn:uuid: prefix; group 1 is the bare UUID.
UUID_RE = /^(?:urn:uuid:)?([0-9a-f]{8}(?:-[0-9a-f]{4}){4}[0-9a-f]{8})$/i
125
+
126
+ # okay labels: what do we want to do about them? poor man's fresnel!
127
+
128
+ # basic structure is an asserted base class corresponding to a
129
+ # ranked list of asserted predicates. to the subject we first
130
+ # match the closest class, then the closest property.
131
+
132
+ # if the instance data doesn't have an exact property mentioned in
133
+ # the spec, it may have an equivalent property or subproperty we
134
+ # may be able to use. we could imagine a scoring system analogous
135
+ # to the one used by CSS selectors, albeit using the topological
136
+ # distance of classes/predicates in the spec versus those in the
137
+ # instance data.
138
+
139
+ # think about dcterms:title is a subproperty of dc11:title even
140
+ # though they are actually more like equivalent properties;
141
+ # owl:equivalentProperty is not as big a conundrum as
142
+ # rdfs:subPropertyOf.
143
+
144
+ # if Q rdfs:subPropertyOf P then S Q O implies S P O. this is
145
+ # great but property Q may not be desirable to display.
146
+
147
+ # it may be desirable to be able to express properties to never
148
+ # use as a label, such as skos:hiddenLabel
149
+
150
+ # consider ranked alternates, sequences, sequences of alternates.
151
+ # (this is what fresnel does fyi)
152
+
153
# Label/description predicate specs per class. Each spec is a list of
# one or two ranked predicate lists: [main, alternate]. Specs with a
# single list get the main list reused as the alternate by the
# normalisation pass that follows this table.
STRINGS = {
  RDF::RDFS.Resource => {
    label: [
      # main
      [
        RDF::Vocab::SKOS.prefLabel,
        RDF::RDFS.label,
        RDF::Vocab::DC.title,
        RDF::Vocab::DC11.title,
        RDF::RDFV.value,
      ],
      # alt
      [
        RDF::Vocab::SKOS.altLabel,
        RDF::Vocab::DC.alternative,
      ],
    ],
    desc: [
      # main will be cloned into alt
      [
        RDF::Vocab::DC.abstract,
        RDF::Vocab::DC.description,
        RDF::Vocab::DC11.description,
        RDF::RDFS.comment,
        RDF::Vocab::SKOS.note,
      ],
    ],
  },
  RDF::Vocab::FOAF.Document => {
    label: [
      # main
      [
        RDF::Vocab::DC.title,
        RDF::Vocab::DC11.title,
      ],
      # alt
      [
        RDF::Vocab::BIBO.shortTitle,
        RDF::Vocab::DC.alternative,
      ],
    ],
    desc: [
      # main
      [
        RDF::Vocab::BIBO.abstract,
        RDF::Vocab::DC.abstract,
        RDF::Vocab::DC.description,
        RDF::Vocab::DC11.description,
      ],
      # alt
      [
        RDF::Vocab::BIBO.shortDescription,
      ],
    ],
  },
  RDF::Vocab::FOAF.Agent => {
    label: [
      # main (will get cloned into alt)
      [RDF::Vocab::FOAF.name],
    ],
    desc: [
      # main cloned into alt
      [RDF::Vocab::FOAF.status],
    ],
  },
}

# owl:Thing shares the very same spec object as rdfs:Resource
STRINGS[RDF::OWL.Thing] = STRINGS[RDF::RDFS.Resource]
196
+
197
# Validate and expand STRINGS in place. We iterate over a snapshot of
# the pairs (to_a) because new keys are added to STRINGS along the
# way, and a hash can't be modified while it is being iterated.
STRINGS.to_a.each do |type, struct|
  struct.values.each do |spec|
    # sanity checks on the shape of the spec
    raise 'STRINGS content must be an array of arrays' unless
      spec.is_a? Array
    raise 'Spec must contain 1 or 2 Array elements' if spec.empty?
    well_formed = spec.all? do |list|
      list.is_a?(Array) && list.all? { |term| RDF::Vocabulary.find_term(term) }
    end
    raise 'Spec must be array of arrays of terms' unless well_formed

    # keep at most [main, alt]
    spec.slice!(2, spec.length) if spec.length > 2

    spec.each do |preds|
      # splice each predicate's equivalent properties in right after
      # it, skipping any that already appear in the list
      idx = 0
      loop do
        extra = preds[idx].entail(:equivalentProperty) - preds
        preds.insert(idx + 1, *extra) unless extra.empty?

        # step past the predicate and whatever was just spliced in
        idx += extra.length + 1
        break unless idx < preds.length
      end

      # (terms are deliberately left as terms rather than strings)
    end

    # no alternates given: reuse the main list
    spec[1] ||= spec[0]
  end

  # seed equivalent classes up front so they need no lookup later
  type.entail(:equivalentClass).each do |same|
    STRINGS[same] ||= struct
  end

  # subclasses are intentionally not seeded; too costly here
end
243
+
244
# Ranked authorship and contribution predicates. Both lists are
# expanded with equivalent properties below and then frozen.
AUTHOR = [
  RDF::SAK::PAV.authoredBy,
  RDF::Vocab::DC.creator,
  RDF::Vocab::DC11.creator,
  RDF::Vocab::PROV.wasAttributedTo,
]
CONTRIB = [
  RDF::SAK::PAV.contributedBy,
  RDF::Vocab::DC.contributor,
  RDF::Vocab::DC11.contributor,
]

[AUTHOR, CONTRIB].each do |preds|
  idx = 0
  loop do
    # equivalents not yet listed go in right after their source
    extra = preds[idx].entail(:equivalentProperty) - preds
    preds.insert(idx + 1, *extra) unless extra.empty?
    idx += extra.length + 1
    break unless idx < preds.length
  end

  preds.freeze
end
259
+
260
# Normalise a prefix mapping to symbol keys and string values; nil or
# false keys and values become nil.
#
# @param prefixes [Hash, #to_h] the prefix mapping
# @param nonnil [false, true] drop pairs whose key or value is nil
#
# @return [Hash] a fresh hash
#
# @raise [ArgumentError] if the argument is neither a Hash nor #to_h-able
def sanitize_prefixes prefixes, nonnil = false
  unless prefixes.is_a?(Hash) || prefixes.respond_to?(:to_h)
    raise ArgumentError, 'prefixes must be a hash'
  end

  clean = {}
  prefixes.to_h.each do |key, value|
    clean[key ? key.to_s.to_sym : nil] = value ? value.to_s : nil
  end

  return clean unless nonnil

  clean.reject! { |key, value| key.nil? || value.nil? }
  clean
end
270
+
271
# Validate a coercion flag. Falsy input is passed straight through;
# anything else is symbolised and must be :uri or :rdf.
#
# @param coerce [nil, false, Symbol, #to_s]
#
# @return [nil, false, Symbol]
#
# @raise [RuntimeError] on any other truthy value
def assert_uri_coercion coerce
  return coerce unless coerce

  coerce = coerce.to_s.to_sym if coerce.respond_to? :to_s
  return coerce if %i[uri rdf].include?(coerce)

  raise 'coerce must be either :uri or :rdf'
end
279
+
280
# Pass a Nokogiri element through unchanged, or raise if the argument
# is anything else.
#
# @param node [Nokogiri::XML::Element]
#
# @return [Nokogiri::XML::Element] the same node
#
# @raise [RuntimeError] if node is not an element
def assert_xml_node node
  return node if node.is_a? Nokogiri::XML::Element

  raise 'Argument must be a Nokogiri::XML::Element'
end
285
+
286
# Work out the subject that applies to an XML/HTML element by looking
# at its resource-bearing attributes (@about, @resource, @href, @src,
# @typeof, @inlist) and, failing that, climbing to the parent.
#
# @param node [Nokogiri::XML::Element] presumably; only #parent,
#  #namespace, #name, #key?, #[], #attributes and #pointer_id are used
# @param prefixes [Hash, nil] prefix map; defaults to get_prefixes(node)
# @param base [URI, nil] base URI; defaults to get_base(node)
# @param coerce [nil, :uri, :rdf] handed to coerce_resource
# @param is_ancestor [false, true] whether this node is being
#  examined on behalf of one of its descendants
#
# @return [Object, nil] the coerced subject; blank nodes are returned
#  as RDF::Node regardless of `coerce`
def internal_subject_for node, prefixes: nil, base: nil, coerce: nil,
    is_ancestor: false

  # note we assign these AFTER the literal check or it will be wrong
  prefixes ||= get_prefixes node

  base ||= get_base node
  # NOTE(review): this only fires when base is still nil/false, which
  # looks inverted; left untouched pending confirmation.
  base = coerce_resource base, as: :uri unless base

  # answer a bunch of helpful questions about this element
  subject = nil
  parent  = node.parent
  ns_href = node.namespace.href if node.namespace
  up_ok   = %i[rel rev].none? { |a| node.key? a }

  # `||`/`&&` rather than `or`/`and`: the low-precedence forms bind
  # looser than `=`, which used to drop everything after the first
  # operand from the assigned value.
  is_root = !parent || parent.document?
  special = /^(?:[^:]+:)?(?:head|body)$/i === node.name &&
    (ns_href == 'http://www.w3.org/1999/xhtml' ||
     (parent && /^(?:[^:]+:)?html$/xi === parent.name))

  # if the node is being inspected as an ancestor to the
  # original node, we have to check it backwards.
  if is_ancestor
    # ah right @resource gets special treatment
    if subject = node[:resource]
      subject = resolve_curie subject,
        prefixes: prefixes, base: base, scalar: true
    else
      # then check @href and @src
      %i[href src].each do |attr|
        if node.key? attr
          # merge with the root and return it
          subject = base + node[attr]
          break
        end
      end
    end

    return coerce_resource subject, as: coerce if subject

    # note if we are being called with is_ancestor, that means none
    # of the nodes tested so far had anything resembling a resource
    # in them. this means @rel/@rev should be ignored, and we should
    # keep looking for a subject.
  end

  if node[:about]
    subject = resolve_curie node[:about],
      prefixes: prefixes, base: base, scalar: true

    # ignore coercion
    return subject if subject.is_a? RDF::Node
  elsif is_root
    subject = base
  elsif special
    # NOTE(review): `subject_for_internal` is not defined in the part
    # of the file visible here (this method is `internal_subject_for`);
    # confirm the name resolves.
    subject = subject_for_internal parent
  elsif node[:resource]
    # XXX resolve @about against potential curie
    subject = resolve_curie node[:resource], prefixes: prefixes, base: base
  elsif node[:href]
    subject = base + node[:href]
  elsif node[:src]
    subject = base + node[:src]
  elsif node[:typeof]
    # bnode the typeof attr

    # note we return bnodes irrespective of the rdf flag
    return RDF::Node('id-%016x' % node.attributes['typeof'].pointer_id)
  elsif node[:inlist]
    # bnode the inlist attr
    return RDF::Node('id-%016x' % node.attributes['inlist'].pointer_id)
  elsif (parent[:inlist] && %i[href src].none? { |a| parent.key? a }) ||
      (is_ancestor && !up_ok)
    # bnode the element
    return RDF::Node('id-%016x' % node.pointer_id)
  # elsif node[:id]
  else
    # NOTE(review): same unresolved name as above
    subject = subject_for_internal parent, is_ancestor: true
  end

  coerce_resource subject, as: coerce if subject
end
370
+
371
# Per-tag handlers that rewrite class-annotated legacy markup into
# its HTML5 counterpart. Each takes the element and mutates it; the
# table/img handlers go through `markup` to wrap the element.
MODERNIZE = {
  # div.figure => <figure> (unless already inside one), other => <section>
  div: lambda { |elem|
    if elem.classes.include? 'figure'
      elem.remove_class 'figure'
      elem.name = 'figure' unless elem.parent.name == 'figure'
    else
      elem.remove_class 'section'
      elem.name = 'section'
    end
  },
  # blockquote.note => <aside role="note">
  blockquote: lambda { |elem|
    elem.remove_class 'note'
    elem.name    = 'aside'
    elem['role'] = 'note'
  },
}.merge(
  # table.figure and img.figure get wrapped in a <figure> unless they
  # already sit directly inside one
  %i[table img].map { |tag|
    [tag, lambda { |elem|
      elem.remove_class 'figure'
      next if elem.parent.name == 'figure'

      copy = elem.dup
      markup replace: elem, spec: { [copy] => :figure }
    }]
  }.to_h)
401
+
402
# rdf term type tests: canonical node kind => predicate method to send
NTESTS = { uri: :uri?, blank: :node?, literal: :literal? }.freeze

# accepted spellings of a node kind => canonical kind
NMAP = {
  iri:     :uri,
  bnode:   :blank,
  uri:     :uri,
  blank:   :blank,
  literal: :literal,
}.freeze
406
+
407
+ public
408
+
409
# Turn a loose node-type spec into a list of canonical kinds
# (:uri, :blank, :literal). :resource expands to :uri plus :blank,
# aliases go through NMAP, unknown entries are dropped, and an empty
# result means "every kind".
#
# @param spec [Symbol, #to_a] one or more node kinds
# @param rev [false, true] whether the spec is for subjects, which
#  can never be literals
#
# @return [Array<Symbol>]
#
# @raise [RuntimeError] if rev is set and :literal was asked for
def coerce_node_spec spec, rev: false
  kinds = spec.respond_to?(:to_a) ? spec : [spec]
  kinds = kinds - [:resource] + [:uri, :blank] if kinds.include? :resource
  raise 'Subjects are never literals' if rev && kinds.include?(:literal)

  tests = NMAP.values_at(*kinds).compact.uniq
  tests = NTESTS.keys if tests.empty?
  tests.delete :literal if rev
  tests.uniq
end
419
+
420
# True if the node passes at least one of the type tests named in
# the spec (an array of NTESTS keys, as coerce_node_spec produces).
#
# @param node [RDF::Term] presumably; it is sent the NTESTS predicates
# @param spec [Array<Symbol>]
#
# @return [true, false]
def node_matches? node, spec
  spec.each { |kind| return true if node.send NTESTS[kind] }
  false
end
423
+
424
# Obtain all and only the rdf:types directly asserted on the subject.
# When `type` is supplied, the graph is not consulted at all: the
# supplied terms are used in place of the asserted ones.
#
# @param repo [RDF::Queryable]
# @param subject [RDF::Resource]
# @param type [RDF::Term, :to_a] optional override
#
# @return [Array] unique URI terms, run through RDF::Vocabulary.find_term
def self.asserted_types repo, subject, type = nil
  terms =
    if type
      given = type.respond_to?(:to_a) ? type.to_a : [type]
      given.select { |t| t.is_a? RDF::Value }.map { |t|
        RDF::Vocabulary.find_term t }
    else
      repo.query([subject, RDF.type, nil]).objects.map { |o|
        RDF::Vocabulary.find_term o }.compact
    end

  # whatever find_term could not resolve (nil) is dropped here
  terms.select { |t| t && t.uri? }.uniq
end
447
+
448
# Obtain a stack of types for an asserted initial type or set
# thereof. Returns an array of arrays, where the first is the
# asserted types and their inferred equivalents, and subsequent
# elements are immediate superclasses and their equivalents. A
# given URI will only appear once in the entire structure.
#
# @param rdftype [RDF::Term, :to_a]
#
# @return [Array]
#
def type_strata rdftype
  # coerce to an array, then squash and resolve to vocabulary terms
  rdftype = rdftype.respond_to?(:to_a) ? rdftype.to_a : [rdftype]
  rdftype = rdftype.uniq.map { |t| RDF::Vocabulary.find_term t }.compact

  # bail out early
  return [] if rdftype.empty?

  seen = Set.new

  # resolve to vocabulary terms, dedupe, and drop anything already
  # emitted in an earlier layer
  fresh = lambda do |terms|
    resolved = terms.map do |t|
      t.is_a?(RDF::Vocabulary::Term) ? t : RDF::Vocabulary.find_term(t)
    end
    resolved.compact.uniq - seen.to_a
  end

  # each pass emits one layer (classes plus their equivalents) and
  # queues that layer's superclasses as the input to the next pass
  queue  = [rdftype]
  strata = []

  until queue.empty?
    input = queue.shift

    # entail doesn't include the class itself, so add it explicitly
    layer = input.reduce([]) do |acc, q|
      acc << q
      q.uri? ? acc + q.entail(:equivalentClass) : acc
    end

    layer = fresh.call layer
    seen.merge layer

    # push current layer out
    strata.push layer.dup unless layer.empty?

    # now deal with subClassOf; note these are deliberately NOT
    # added to seen until they are processed as a layer themselves
    supers = layer.reduce([]) { |acc, q| acc + q.subClassOf }
    supers = fresh.call supers

    queue.push supers.dup unless supers.empty?
  end

  strata
end
521
+
522
# Expand one or more predicates into the set containing the
# predicates themselves, their equivalent properties, and
# (recursively) the subproperties of all of those.
#
# @param predicates [RDF::URI, Set, #to_set] the starting predicates
# @param seen [Set] predicates handled by an enclosing call; used
#  only to keep the recursion from revisiting them
#
# @return [Set]
#
# @raise [RuntimeError] if the input can't be made into a set, or
#  contains something that isn't an RDF::URI
#
def predicate_set predicates, seen: Set.new
  predicates = Set[predicates] if predicates.is_a? RDF::URI

  unless predicates.is_a? Set
    raise "predicates must be a set" unless predicates.respond_to? :to_set
    predicates = predicates.to_set
  end

  # shortcut
  return predicates if predicates.empty?

  unless predicates.all? { |pred| pred.is_a? RDF::URI }
    raise 'predicates must all be RDF::URI'
  end

  # widen with the equivalent properties of everything given
  equivalents = predicates.map { |pred| pred.entail :equivalentProperty }
  predicates |= equivalents.flatten.to_set

  # collect the subproperties of whatever hasn't been visited yet
  narrower = (predicates - seen).reduce(Set.new) do |acc, pred|
    acc | pred.subProperty.flatten.to_set
  end

  # uhh this whole "seen" business might not be necessary
  predicates | predicate_set(narrower - predicates - seen, seen: predicates)
end
562
+
563
# Returns subjects from the graph with entailment: everything that
# points at `object` via one of the predicates, plus (for non-literal
# objects) everything `object` points at via an inverse or symmetric
# property.
#
# @param repo [#query] the graph
# @param predicate [RDF::URI, #to_a] one or more predicates
# @param object [RDF::Term]
# @param entail [true, false] expand predicates through predicate_set
# @param only [Symbol, Array] node kinds to keep (see coerce_node_spec)
#
# @yield [subject, forward, reverse] when a block is given, each hit
#  along with the sets of forward and reverse predicates that found it
#
# @return [Array] the matching resources, or the block's results
#
def self.subjects_for repo, predicate, object, entail: true, only: []
  raise 'Object must be a Term' unless object.is_a? RDF::Term
  predicate = predicate.respond_to?(:to_a) ? predicate.to_a : [predicate]
  raise 'Predicate must be some kind of term' unless
    predicate.all? { |p| p.is_a? RDF::URI }

  only = coerce_node_spec only, rev: true

  predicate = predicate.map { |x| RDF::Vocabulary.find_term x }.compact
  predicate = predicate_set predicate if entail

  # resource => [forward predicates, reverse predicates]
  found    = {}
  inverses = Set.new

  predicate.each do |pred|
    repo.query([nil, pred, object]).subjects.each do |s|
      next unless node_matches? s, only

      (found[s] ||= [Set.new, Set.new])[0] << pred
    end

    # gather the reverse predicates while we're at it
    next if object.literal?

    inverses += pred.inverseOf.to_set
    inverses << pred if pred.type.include? RDF::OWL.SymmetricProperty
  end

  unless object.literal?
    inverses = predicate_set inverses if entail

    inverses.each do |pred|
      repo.query([object, pred, nil]).objects.each do |o|
        next unless node_matches? o, only

        (found[o] ||= [Set.new, Set.new])[1] << pred
      end
    end
  end

  # run this through a block to get access to the predicates
  return found.map { |node, sets| yield node, *sets } if block_given?

  found.keys
end
619
+
620
# Returns objects from the graph with entailment: everything
# `subject` points at via one of the predicates, plus everything that
# points at `subject` via an inverse or symmetric property.
#
# @param repo [#query] the graph
# @param subject [RDF::Resource]
# @param predicate [RDF::URI, #to_a] one or more predicates
# @param entail [true, false] expand predicates through predicate_set
# @param only [Symbol, Array] node kinds to keep (see coerce_node_spec)
# @param datatype [RDF::URI, #to_a, nil] when given, literals must
#  carry one of these datatypes
#
# @yield [object, forward, reverse] when a block is given, each hit
#  along with the sets of forward and reverse predicates that found it
#
# @return [Array] the matching terms, or the block's results
#
def self.objects_for repo, subject, predicate,
    entail: true, only: [], datatype: nil
  raise "Subject must be a resource, not #{subject.inspect}" unless
    subject.is_a? RDF::Resource
  predicate = predicate.respond_to?(:to_a) ? predicate.to_a : [predicate]
  raise "Predicate must be a term, not #{predicate.first.class}" unless
    predicate.all? { |p| p.is_a? RDF::URI }

  predicate = predicate.map { |x| RDF::Vocabulary.find_term x }.compact

  only = coerce_node_spec only

  datatype = datatype.respond_to?(:to_a) ? datatype.to_a : [datatype]
  datatype = datatype.compact
  raise 'Datatype must be some kind of term' unless
    datatype.all? { |p| p.is_a? RDF::URI }

  # fluff this out
  predicate = predicate_set predicate if entail

  # term => [forward predicates, reverse predicates]
  found = {}

  predicate.each do |pred|
    repo.query([subject, pred, nil]).objects.each do |o|
      # make sure it's in the spec
      next unless node_matches? o, only

      # constrain literals to the requested datatypes, if any
      next if o.literal? && !datatype.empty? &&
        !datatype.include?(o.datatype)

      (found[o] ||= [Set.new, Set.new]).first << pred
    end
  end

  # the reverse direction is pointless if only literals are wanted
  unless only == [:literal]
    inverses = predicate.reduce(Set.new) do |acc, pred|
      acc += pred.inverseOf.to_set
      acc << pred if pred.type.include? RDF::OWL.SymmetricProperty
      acc
    end
    inverses = predicate_set inverses if entail

    inverses.each do |pred|
      repo.query([nil, pred, subject]).subjects.each do |s|
        # no need to check datatype; subject is never a literal
        next unless node_matches? s, only

        (found[s] ||= [Set.new, Set.new]).last << pred
      end
    end
  end

  # run this through a block to get access to the predicates
  return found.map { |node, sets| yield node, *sets } if block_given?

  found.keys
end
694
+
695
# Obtain the canonical UUID for the given URI
#
# Candidates are gathered through ci:canonical / ci:alias / owl:sameAs
# links and through slug literals, ranked, pruned of anything that has
# been replaced, and sorted best-first.
#
# @param repo [RDF::Queryable]
# @param uri [RDF::URI, URI, to_s] the subject of the inquiry
# @param unique [true, false] return a single resource/nil or an array
# @param published [true, false] whether to restrict to published docs
# @param scache [Hash] memo of `repo.has_subject?` results, by URI
# @param ucache [Hash] memo of ranked results, by the coerced input
# @param base [Object, nil] handed to coerce_resource, terminal_slug
#  and published?
#
# @return [RDF::URI, Array]
#
def self.canonical_uuid repo, uri, unique: true, published: false,
    scache: {}, ucache: {}, base: nil
  # make sure this is actually a uri
  orig = uri = coerce_resource uri, base
  unless uri.is_a? RDF::Node
    tu = URI(uri_pp(uri).to_s).normalize

    # a path that is nothing but a UUID is treated as that urn:uuid:
    if tu.path && !tu.fragment &&
        UUID_RE.match?(uu = tu.path.delete_prefix(?/))
      tu = URI('urn:uuid:' + uu.downcase)
    end

    # unconditionally overwrite uri
    uri = RDF::URI(tu.to_s)

    # now check if it's a uuid
    if tu.respond_to? :uuid
      # if it's a uuid and we have it as a subject, return it
      return uri if scache[uri] ||= repo.has_subject?(uri)
      # note i don't want to screw around right now dealing with the
      # case that a UUID might not itself be canonical
    end
  end

  # spit up the cache if present
  if out = ucache[orig]
    return unique ? out.first : out
  end

  # otherwise we proceed:

  # goal: return the most "appropriate" UUID for the given URI

  # booleans have no #to_i, hence this table. it is used for every
  # boolean-to-bit conversion below (the slug ranking and the final
  # sort used to reach for a `BITS` constant that isn't defined in
  # the visible part of this file).
  bits = { nil => 0, false => 0, true => 1 }

  # rank (0 is higher):
  # * (00) exact & canonical == 0,
  # * (01) exact == 1,
  # * (10) inexact & canonical == 2,
  # * (11) inexact == 3.

  # handle path parameters by generating a bunch of candidates
  uris = if uri.respond_to? :path and uri.path.start_with? ?/
           # split any path parameters off
           uu, *pp = split_pp uri
           if pp.empty?
             [uri] # no path parameters
           else
             uu = RDF::URI(uu.to_s)
             bp = uu.path # base path
             # longest parameter list first, bare path last
             (0..pp.length).to_a.reverse.map do |i|
               u = uu.dup
               u.path = ([bp] + pp.take(i)).join(';')
               u
             end
           end
         else
           [uri] # not a pathful URI
         end

  # collect the candidates by URI
  sa = predicate_set [RDF::SAK::CI.canonical,
    RDF::SAK::CI.alias, RDF::OWL.sameAs]
  candidates = nil
  uris.each do |u|
    candidates = subjects_for(repo, sa, u, entail: false) do |s, f|
      # canonical => 0, merely aliased => 1
      [s, { rank: bits[f.include?(RDF::SAK::CI.canonical)] ^ 1,
            published: published?(repo, s),
            mtime: dates_for(repo, s).last || DateTime.new }]
    end.compact.to_h
    break unless candidates.empty?
  end

  # now collect by slug
  slug = terminal_slug uri, base: base
  if slug and !slug.empty?
    exact = uri == coerce_resource(slug, base) # slug represents exact match
    sl = [RDF::SAK::CI['canonical-slug'], RDF::SAK::CI.slug]
    [RDF::XSD.string, RDF::XSD.token].each do |t|
      subjects_for(repo, sl, RDF::Literal(slug, datatype: t)) do |s, f|
        # default to lowest rank if this candidate is new
        entry = candidates[s] ||= {
          published: published?(repo, s, base: base),
          rank: 0b11, mtime: dates_for(repo, s).last || DateTime.new }
        # true is 1 and false is zero so we xor this too
        rank = (bits[exact] << 1 | bits[f.include?(sl[0])]) ^ 0b11
        # now amend the rank if we have found a better one
        entry[:rank] = rank if rank < entry[:rank]
      end
    end
  end

  # only urn:uuid: subjects are of interest
  candidates.delete_if { |s, _| !/^urn:uuid:/.match?(s.to_s) }

  # scan all the candidates for replacements and remove any
  # candidates that have been replaced
  candidates.to_a.each do |k, v|
    # a resource doesn't count as a replacement for itself
    reps = replacements_for(repo, k, published: published) - [k]
    next if reps.empty?

    v[:replaced] = true
    reps.each do |r|
      c = candidates[r] ||= { rank: v[:rank],
        published: published?(repo, r),
        mtime: dates_for(repo, r).last || v[:mtime] || DateTime.new }
      # we give the replacement the rank and mtime of the
      # resource being replaced if it scores better
      c[:rank]  = v[:rank]  if v[:rank]  < c[:rank]
      c[:mtime] = v[:mtime] if v[:mtime] > c[:mtime]
    end
  end

  # now we can remove all unpublished candidates if the context is
  # published
  candidates.select! do |_, v|
    !v[:replaced] && (published ? v[:published] : true)
  end

  # now we sort by rank and date; the highest-ranking newest
  # candidate is the one. the rationale:
  #
  # * an exact match is better than an inexact one
  # * a canonical match is better than non-canonical
  #   (note this is four bits: exact, canon(exact), inexact,
  #   canon(inexact); !canon(exact) should rank higher than
  #   canon(inexact))
  # * unreplaced is better than replaced
  # * newer is better than older (though no reason an older item
  #   can't replace a newer one)
  # * published is better than not, unless the context is
  #   unpublished and an unpublished document replaces a published one
  out = candidates.sort do |a, b|
    _, va = a
    _, vb = b
    cb = published ? bits[vb[:published]] <=> bits[va[:published]] : 0
    cr = va[:rank] <=> vb[:rank]
    if cb != 0
      cb
    elsif cr != 0
      cr
    else
      vb[:mtime] <=> va[:mtime]
    end
  end.map { |x| x.first }.compact

  # set cache
  ucache[orig] = out

  unique ? out.first : out
end
863
+
864
# Sort rank of URI schemes; anything not listed ranks after these.
SCHEME_RANK = { https: 0, http: 1 }.freeze

# Comparison function for sorting RDF terms: URIs sort before
# non-URIs, https before http before every other scheme, and, for
# http(s) when `www` is given, by host without a leading "www.", then
# by www/no-www preference, then by whatever follows the authority.
#
# @param a [RDF::Value]
# @param b [RDF::Value]
# @param www [nil, false, true] nil to ignore "www."; true to sort
#  www hosts first, false to sort them last
#
# @return [-1, 0, 1] (or whatever the terms' own <=> returns)
#
# @raise [RuntimeError] if either comparand is not an RDF::Value
def cmp_resource a, b, www: nil
  raise 'Comparands must be instances of RDF::Value' unless
    [a, b].all? { |x| x.is_a? RDF::Value }

  # URI beats non-URI
  return a <=> b unless a.uri? || b.uri?
  return -1 unless b.uri?
  return 1 unless a.uri?

  # https beats http beats other; a URI with no scheme at all counts
  # as "other" instead of raising on nil
  as = a.scheme.to_s.downcase.to_sym
  bs = b.scheme.to_s.downcase.to_sym
  cmp = SCHEME_RANK.fetch(as, 2) <=> SCHEME_RANK.fetch(bs, 2)

  # bail out early
  return cmp unless cmp == 0

  # this would have returned if the schemes ranked differently, as
  # such we only need to test one of them
  if SCHEME_RANK.key?(as) and not www.nil?
    # prefer www or no-www depending on truthiness of `www`
    pref = www ? { false => 1, true => 0 } : { false => 0, true => 1 }
    re = /^(?:(www)\.)?(.*?)$/

    ah = re.match(a.host.to_s.downcase)[1, 2]
    bh = re.match(b.host.to_s.downcase)[1, 2]

    # compare hosts sans www
    cmp = ah[1] <=> bh[1]
    return cmp unless cmp == 0

    # now compare presence of www
    cmp = pref[ah[0] == 'www'] <=> pref[bh[0] == 'www']
    return cmp unless cmp == 0

    # if we're still here, compare the path/query/fragment. a URI
    # with nothing after the authority doesn't match the pattern, so
    # String#[] (nil on no match) is used and it compares as empty.
    re = /^.*?\/\/.*?(\/.*)$/
    al = a.to_s[re, 1].to_s
    bl = b.to_s[re, 1].to_s

    return al <=> bl
  end

  a <=> b
end
918
+
919
# Comparison function that orders two terms by their labels.
#
# @param repo the graph, handed to label_for on a cache miss
# @param a the left comparand
# @param b the right comparand
# @param labels [Hash, nil] term => label struct; element 1 of the
#  struct is what gets compared. Lookups that had to go to label_for
#  are stored back into this hash.
# @param supplant [true, false] whether to call label_for for terms
#  missing from `labels`; otherwise the term itself is compared
# @param reverse [false, true] flip the order
#
# @return [-1, 0, 1]
def self.cmp_label repo, a, b, labels: nil, supplant: true, reverse: false
  labels ||= {}

  # try supplied label or fall back
  left, right = [a, b].map do |term|
    known = labels[term]
    next known[1] if known

    found = supplant && label_for(repo, term)
    next term unless found

    labels[term] = found
    found[1]
  end

  left, right = right, left if reverse
  left.to_s <=> right.to_s
end
938
+
939
# Obtain the "best" dereferenceable URI for the subject.
# Optionally returns all candidates.
#
# @param repo [RDF::Queryable]
# @param subject [RDF::Resource]
# @param base [Object, nil] base URI; slug candidates and UUID-path
#  candidates are built from it
# @param unique [true, false] flag for unique return value
# @param rdf [true, false] flag to specify RDF::URI vs URI
# @param slugs [true, false] flag to include slugs
# @param fragment [true, false] flag to include fragment URIs
#
# @return [RDF::URI, URI, Array]
#
def self.canonical_uri repo, subject, base: nil,
    unique: true, rdf: true, slugs: false, fragment: false
  subject = coerce_resource subject, base

  # try to find it first: explicit links to resources that are
  # themselves subjects in the graph, best first
  linked = objects_for(repo, subject,
    [RDF::SAK::CI.canonical, RDF::OWL.sameAs],
    entail: false, only: :resource)
  out = linked.select { |o| repo.has_subject? o }
  out = out.sort { |a, b| cmp_resource a, b }

  # try to generate in lieu
  if subject.uri? and (out.empty? or slugs)
    if slugs
      slug_literals = objects_for(repo, subject,
        [RDF::SAK::CI['canonical-slug'], RDF::SAK::CI.slug],
        only: :literal)
      out += slug_literals.map { |o| base + o.value }
    end

    uri = URI(uri_pp(subject.to_s))
    if base and uri.respond_to? :uuid
      # hang the bare UUID off the root of the base
      b = base.clone
      b.query = b.fragment = nil
      b.path = '/' + uri.uuid
      out << RDF::URI.new(b.to_s)
    else
      out << subject
    end
  end

  # remove all URIs with fragments unless specified, but only if
  # doing so leaves something behind
  unless fragment
    whole = out.reject(&:fragment)
    out = whole unless whole.empty?
  end

  # coerce to URI objects if specified
  out.map! { |u| URI(uri_pp u.to_s) } unless rdf

  unique ? out.first : out.uniq
end
994
+
995
# Determine whether the URI represents a published document, i.e.
# whether any of its +bibo:status+ values is +bibo:status/published+
# (or, when +circulated+ is set, the +circulated+ term of RDF::SAK::CI).
#
# @param repo the graph to query
# @param uri the resource in question
# @param circulated [false, true] also accept the circulated status
# @param base base URI used to coerce +uri+
#
# @return [true, false]
def self.published? repo, uri, circulated: false, base: nil
  resource = coerce_resource uri, base
  statuses = objects_for(
    repo, resource, RDF::Vocab::BIBO.status, only: :resource).to_set

  accepted = Set.new
  accepted << RDF::Vocab::BIBO['status/published']
  accepted << RDF::SAK::CI.circulated if circulated

  statuses.intersect? accepted
end
1013
+
1014
# Obtain a key-value structure for the given subject, optionally
# constraining the result by node type (:resource, :uri/:iri,
# :blank/:bnode, :literal). Keys are predicates (vocabulary terms
# where RDF::Vocabulary can find one), values are sorted,
# de-duplicated arrays of the adjacent nodes.
#
# @param repo the graph to query
# @param subject of the inquiry
# @param base base URI passed along when coercing the subject to a UUID
# @param rev map in reverse (subject is treated as the object)
# @param only one or more node types
# @param uuids coerce resources to UUIDs if possible
# @param canon coerce resources to their canonical URIs
# @param ucache cache handed to +canonical_uuid+
# @param scache cache handed to +canonical_uuid+
#
# @return [Hash]
#
def self.struct_for repo, subject, base: nil,
    rev: false, only: [], uuids: false, canon: false, ucache: {}, scache: {}
  only = coerce_node_spec only

  # coerce the subject, keeping the original if no UUID turns up
  if uuids
    subject = canonical_uuid(repo, subject,
      base: base, scache: scache, ucache: ucache) || subject
  end

  struct = {}
  repo.query(rev ? [nil, nil, subject] : [subject, nil, nil]) do |stmt|
    # this will skip over any term not matching the type
    term = rev ? stmt.subject : stmt.object
    next unless node_matches? term, only

    if term.resource?
      if uuids
        # NOTE(review): a term already present in ucache gets no UUID
        # here at all (the cached value is not read) -- confirm intent
        uuid = ucache.key?(term) ? nil :
          canonical_uuid(repo, term, scache: scache, ucache: ucache)
        term = uuid || (canon ? canonical_uri(repo, term) : term)
      elsif canon
        term = canonical_uri(repo, term)
      end
    end

    key = RDF::Vocabulary.find_term(stmt.predicate) || stmt.predicate
    bucket = (struct[key] ||= [])
    bucket << term if term # canonical_uri may have produced nil
  end

  # XXX in here we can do fun stuff like filter/sort by language/datatype
  struct.each_value { |bucket| bucket.sort!.uniq! }

  struct
end
1062
+
1063
# Obtain the most appropriate label(s) for the subject's type(s).
# Returns one or more (depending on the `unique` flag)
# predicate-object pairs in order of preference. Preference follows
# the type strata of the subject: for each class with an entry in
# STRINGS, the designated predicates are tried in order.
#
# @param repo [RDF::Queryable]
# @param subject [RDF::Resource]
# @param candidates [Hash, nil] predicate-to-literals structure for the
#   subject, if already retrieved (otherwise fetched via +struct_for+)
# @param unique [true, false] only return the first pair
# @param type [RDF::Term, Array] supply asserted types if already retrieved
# @param lang [nil] not currently implemented (will be conneg)
# @param desc [false, true] retrieve description instead of label
# @param alt [false, true] retrieve alternate instead of main
# @param base [nil] accepted but not used by this method
#
# @return [Array] either a predicate-object pair or an array of pairs.
#
# @raise [ArgumentError] if +repo+ is not an RDF::Queryable
#
def self.label_for repo, subject, candidates: nil, unique: true, type: nil,
    lang: nil, desc: false, alt: false, base: nil
  raise ArgumentError, 'no repo!' unless repo.is_a? RDF::Queryable
  return unless subject.is_a? RDF::Value and subject.resource?

  asserted = asserted_types repo, subject, type

  # get all the inferred types by layer; add default class if needed
  strata = type_strata asserted
  if strata.empty? or not strata[-1].include?(RDF::RDFS.Resource)
    strata.push [RDF::RDFS.Resource]
  end

  # get the key-value pairs for the subject
  candidates ||= struct_for repo, subject, only: :literal

  kind = desc ? :desc : :label # which family of predicates
  slot = alt ? 1 : 0           # main or alternate stack

  seen  = {}
  pairs = []
  strata.each do |layer|
    layer.each do |cls|
      spec  = STRINGS[cls] or next
      preds = spec[kind][slot] or next
      preds.each do |pred|
        values = candidates[pred] or next
        values.each do |value|
          pair = [pred, value]
          pairs << pair unless seen[pair]
          seen[pair] = true
        end
      end
    end
  end

  # XXX still to do: filter by the desired language(s)
  unique ? pairs[0] : pairs.uniq
end
1125
+
1126
# Assuming the subject is a thing that has authors, return the
# list of authors. Try bibo:authorList first for an explicit
# ordering, then continue to the various other predicates; authors
# found that way are appended, sorted by the text of their label
# (or by the resource itself when it has no label).
#
# @param repo [RDF::Queryable]
# @param subject [RDF::Resource]
# @param unique [false, true] only return the first author
# @param contrib [false, true] return contributors instead of authors
# @param base [nil] accepted but not used by this method
#
# @return [RDF::Value, Array]
#
def authors_for repo, subject, unique: false, contrib: false, base: nil
  authors = []

  # try the author list
  lp = [RDF::Vocab::BIBO[contrib ? :contributorList : :authorList]]
  lp += lp.first.entail(:equivalentProperty) # XXX cache this
  lp.each do |pred|
    o = repo.first_object([subject, pred, nil])
    next unless o
    # note this use of RDF::List is not particularly well-documented
    authors += RDF::List.from(repo, o).to_a
  end

  # now try various permutations of the author/contributor predicate
  unsorted = []
  (contrib ? CONTRIB : AUTHOR).each do |pred|
    unsorted += repo.query([subject, pred, nil]).objects
  end

  # sort the unordered authors by name: label_for yields a
  # [predicate, object] pair (or nil), and it is the object we want
  sorted = unsorted.uniq.sort_by do |a|
    ((label_for(repo, a) || [])[1] || a).to_s
  end

  authors += sorted

  unique ? authors.first : authors.uniq
end
1164
+
1165
# Find the terminal replacements for the given subject, if any exist.
# The replacement chain is followed through +dct:replaces+ (in
# reverse) and +dct:isReplacedBy+ until it runs out.
#
# @param repo the graph to query
# @param subject the resource being replaced
# @param published indicate the context is published, in which case
#   the last *published* resources along the chain are returned
# @param base base URI used to coerce the subject
#
# @return [Array] the replacements (empty if there are none)
#
def self.replacements_for repo, subject, published: true, base: nil
  subject = coerce_resource subject, base

  # `seen` is a hash mapping resources to publication status and
  # subsequent replacements. it collects all the resources in the
  # replacement chain in :fwd (replaces) and :rev (replaced-by)
  # members, along with a boolean :pub. `seen` also performs a
  # duty as cycle-breaking sentinel.

  seen = {}
  queue = [subject]
  while (test = queue.shift)
    # fwd is "replaces", rev is "replaced by"
    entry = seen[test] ||= {
      pub: published?(repo, test), fwd: Set.new, rev: Set.new }

    # look for replacements of the resource we just dequeued (not of
    # the original subject, or we would never get past the first hop)
    found = (
      subjects_for(repo, RDF::Vocab::DC.replaces, test) +
        objects_for(repo, test, RDF::Vocab::DC.isReplacedBy,
          only: :resource)
    ).uniq

    found.each do |r| # r = replacement
      next if seen.include? r
      seen[r] = { pub: published?(repo, r), fwd: Set.new, rev: Set.new }
      seen[r][:fwd] << test
      entry[:rev] << r
      queue << r
    end
  end

  # the terminal resources are the ones nothing else replaces
  out = seen.map { |k, v| v[:rev].empty? ? k : nil }.compact - [subject]

  # if we're calling from a published context, we return the
  # (topologically) last published resource(s), even if they are
  # replaced ultimately by unpublished resources.
  if published
    pubout = out.select { |o| seen[o][:pub] }
    # if there is anything left after this, return it
    return pubout unless pubout.empty?

    # otherwise walk backwards through the graph we just walked
    # forwards: take the union of the :fwd members of `out`, keep the
    # published ones, and repeat until something (or nothing) is left
    loop do
      # XXX THIS NEEDS A TEST CASE
      out = seen.values_at(*out).map { |v| v[:fwd] }.reduce(:+).to_a
      break if out.empty?
      pubout = out.select { |o| seen[o][:pub] }
      return pubout unless pubout.empty?
    end
  end

  out
end
1231
+
1232
# Obtain dates for the subject as instances of Date(Time). This is
# just shorthand for a common application of `objects_for`: literals
# of the given datatype(s) are fetched and unwrapped to their native
# values.
#
# @param repo the graph to query
# @param subject the resource in question
# @param predicate the date predicate (dct:date by default)
# @param datatype the acceptable literal datatype(s)
#
# @return [Array] of dates, sorted and de-duplicated
#
def self.dates_for repo, subject, predicate: RDF::Vocab::DC.date,
    datatype: [RDF::XSD.date, RDF::XSD.dateTime]
  dates = objects_for(repo, subject, predicate,
                      only: :literal, datatype: datatype) { |lit| lit.object }
  dates.sort.uniq
end
1249
+
1250
# Obtain any specified MIME types for the subject. Just shorthand
# for a common application of `objects_for`: token literals that
# contain a slash are turned into (downcased) RDF::SAK::MimeMagic
# objects, anything else is dropped.
#
# @param repo the graph to query
# @param subject the resource in question
# @param predicate the format predicate (dct:format by default)
# @param datatype the acceptable literal datatype(s)
#
# @return [Array] of internet media types, sorted and de-duplicated
#
def formats_for repo, subject, predicate: RDF::Vocab::DC.format,
    datatype: [RDF::XSD.token]
  types = objects_for(repo, subject, predicate,
                      only: :literal, datatype: datatype) do |literal|
    token = literal.object
    # no slash means this can't be a media type
    RDF::SAK::MimeMagic.new(token.to_s.downcase) if token =~ /\//
  end

  types.compact.sort.uniq
end
1268
+
1269
# Determine the effective base URI for an XML node: an absolute
# +<base href>+ (for HTML) or +xml:base+ found in the document wins
# over the supplied default.
#
# @param xmlnode the node whose context is being examined
# @param base [URI, #to_s] the fallback base URI
#
# @return [URI] the base declared in the document if it is absolute,
#   otherwise +base+
#
def self.base_for xmlnode, base
  fallback = base.is_a?(URI) ? base : URI(base.to_s)

  declared =
    if xmlnode.at_xpath('self::html:*|/html', XPATHNS)
      # a missing <base> stringifies to '' which is never absolute
      xmlnode.at_xpath(XPATH[:htmlbase], XPATHNS)
    else
      found = xmlnode.root.at_xpath(XPATH[:xmlbase])
      return fallback unless found
      found
    end

  candidate = URI(declared.to_s.strip)
  candidate.absolute? ? candidate : fallback
end
1284
+
1285
# Traverse links based on content type. The XPath used to find the
# links is looked up in LINK_MAP by media type (parameters such as
# +;charset=...+ are ignored), falling back to XPATH[:xlinks].
#
# @param node the node (or document) to search; must respond to +xpath+
# @param type [String] the content type of the document
# @yieldparam link each matching node
#
# @return [Enumerator] if no block is given, otherwise the result of
#   iterating over the matched nodes
#
def self.traverse_links node, type: 'application/xhtml+xml', &block
  # without the `return` the enumerator was thrown away and we fell
  # through to calling a nil block
  return enum_for(:traverse_links, node, type: type) unless block

  type = type.strip.downcase.gsub(/\s*;.*/, '')
  xpath = LINK_MAP.fetch type, XPATH[:xlinks]
  node.xpath(xpath, XPATHNS).each { |link| block.call link }
end
1292
+
1293
+
1294
+ # XXX OTHER STUFF
1295
+
1296
# Isolate an element into a new document.
#
# @param doc the source document
# @param xpath [String] expression selecting the element; only the
#   first match is used
# @param reindent [true, false] whether to reindent the isolated tree
# @param prefixes [Hash] namespace bindings for the XPath expression
#
# @return [nil, Nokogiri::XML::Node, Nokogiri::XML::Document] a copy of
#   the root element for the default xpath, a new document holding a
#   copy of the first match otherwise, or nil if nothing matched or
#   the expression did not parse
#
def subtree doc, xpath = '/*', reindent: true, prefixes: {}
  # at this time we shouldn't try to do anything cute with the xpath
  # even though it is attractive to want to prune out prefixes

  # the default expression is effectively a noop
  return doc.root.dup if xpath == '/*'

  begin
    matches = doc.xpath xpath, prefixes
    usable  = matches && matches.is_a?(Nokogiri::XML::NodeSet) &&
      !matches.empty?
    return unless usable

    Nokogiri::XML::Document.new.tap do |isolated|
      isolated << matches.first.dup
      # parens make this the method, not the keyword argument
      reindent(isolated.root) if reindent
    end
  rescue Nokogiri::SyntaxError
    nil
  end
end
1316
+
1317
# Reindent the text nodes beneath an element so that whitespace
# containing newlines reflects the nesting depth. Text nodes without
# any leading/trailing newline are left as they are.
#
# @param node the element whose children are reindented (recursively)
# @param depth [Integer] the nesting depth of +node+
# @param indent [String] the unit of indentation
#
# @return the node that was passed in
#
def reindent node, depth = 0, indent = ' '
  kids  = node.children
  child = kids && kids.first

  while child
    if child.element?
      # recurse into the element
      reindent child, depth + 1, indent
    elsif child.text?
      text = child.content || ''

      # leading: optional horizontal whitespace, at least one newline
      # (of whatever kind), then any other whitespace
      leading = !text.sub!(/\A[ \t]*[\r\n]+\s*/, '').nil?

      # dos newlines in the middle can always be put back later
      text.gsub!(/\r+/, '')

      # trailing: any whitespace, at least one newline, then optional
      # horizontal whitespace up to the end of the string (this can
      # only hit if something other than whitespace is left over)
      trailing = !text.sub!(/\s*[\r\n]+[ \t]*\z/, '').nil?

      # non-empty text gets indented whitespace *prepended*; one level
      # deeper unless it is the first child
      if leading && !text.empty?
        text = "\n" + (indent * (depth + (child.previous ? 1 : 0))) + text
      end

      # whitespace is *appended* whenever the original had a newline
      # at all; one level deeper unless this is the last child
      if leading || trailing
        text << "\n" + (indent * (depth + (child.next ? 1 : 0)))
      end

      child.content = text
    end

    child = child.next
  end

  node
end
1373
+
1374
# The XHTML element namespace.
XHTMLNS = 'http://www.w3.org/1999/xhtml'.freeze

# The XHTML vocabulary namespace.
XHV = 'http://www.w3.org/1999/xhtml/vocab#'.freeze

# Prefix bindings passed along with XPath expressions.
XPATHNS = {
  html:  XHTMLNS,
  svg:   'http://www.w3.org/2000/svg',
  atom:  'http://www.w3.org/2005/Atom',
  xlink: 'http://www.w3.org/1999/xlink',
}.freeze
1382
+
1383
+ ######## URI STUFF ########
1384
+
1385
# Preprocess a URI string so that it can be handed to +URI.parse+
# without crashing: stray percent signs are escaped, each URI
# component is percent-encoded as sparingly as possible, and hex
# escapes are upper-cased.
#
# @param uri [#to_s] The URI string in question
# @param extra [#to_s] Character class of any extra characters to escape
# @return [String] The sanitized (appropriately escaped) URI string
#
def uri_pp uri, extra = ''
  # take care of malformed escapes
  raw = uri.to_s.b.gsub(/%(?![0-9A-Fa-f]{2})/n, '%25')

  unless extra.empty?
    raw.gsub!(/([#{Regexp.quote extra}])/) { |c| sprintf('%%%02X', c.ord) }
  end

  # we want the minimal amount of escaping so the components are
  # escaped one by one and glued back together with their separators
  out = ''.dup
  RFC3986.match(raw).captures.zip(SEPS) do |part, (lead, trail)|
    next if part.nil?
    out << lead
    out << part.b.gsub(SF) { |c| sprintf('%%%02X', c.ord) }
    out << trail
  end

  # make sure escaped hex is upper case like the rfc says
  out.gsub(/(%[0-9A-Fa-f]{2})/, &:upcase)
end
1412
+
1413
# Given a URI as input, split any query parameters into an array of
# key-value pairs. If +:only+ is true, this will just return the
# pairs. Otherwise it will prepend the query-less URI to the array,
# and can be captured with an idiom like +uri, *qp = split_qp uri+.
# A URI without a query yields no pairs, and a URI object passed in
# is never modified.
#
# @param uri [URI,#to_s] The URI to extract parameters from
# @param only [false, true] whether to only return the parameters
# @return [Array] (See description)
#
def split_qp uri, only: false
  # work on a copy so clearing the query can't leak back to the caller
  uri = uri.is_a?(URI) ? uri.dup : URI(uri_pp uri.to_s)

  # the query is nil when there isn't one, which the decoder rejects
  qp = URI.decode_www_form(uri.query.to_s)
  return qp if only

  uri.query = nil
  [uri] + qp
end
1429
+
1430
# Given a URI as input, split any path parameters out of the last
# path segment. Works the same way as #split_qp: the parameter-less
# (normalized) URI comes first unless +:only+ is set.
#
# @param uri [URI,#to_s] The URI to extract parameters from
# @param only [false, true] whether to only return the parameters
# @return [Array] (See description)
# @raise [URI::InvalidURIError] if the URI (or the resulting path) is
#   invalid; the message includes the offending URI
#
def split_pp uri, only: false
  normalized =
    begin
      (uri.is_a?(URI) ? uri : URI(uri_pp uri.to_s)).normalize
    rescue URI::InvalidURIError => e
      # these stock error messages don't even tell you what the uri is
      raise URI::InvalidURIError, "#{e.message} (#{uri.to_s})"
    end

  # nothing to split if there is no path; hand back what we were given
  unless normalized.path
    return only ? [] : [uri]
  end

  *segments, last = normalized.path.split('/', -1)
  stem, *params = last.split(';', -1)
  bp = (segments + [stem]).join '/'

  uri = normalized.dup
  begin
    uri.path = bp
  rescue URI::InvalidURIError => e
    # these stock error messages don't even tell you what the uri is
    m = e.message
    raise URI::InvalidURIError, "#{m} (#{uri.to_s}, #{bp})"
  end

  only ? params : [uri] + params
end
1465
+
1466
# Split the path parameters off the last segment of a path string.
#
# @param path [String, nil] the path
# @param only [false, true] whether to only return the parameters
# @return [Array<String>] the parameters, preceded by the path without
#   them unless +:only+ is set
#
def split_pp2 path, only: false
  # ugh apparently we need a special case for ''.split
  return only ? [] : [''] unless path and not path.empty?

  *segments, last = path.to_s.split('/', -1) # path segments
  stem, *params = last.to_s.split(';', -1)   # path parameters
  base_path = [*segments, stem].join('/')    # base path

  only ? params : [base_path, *params]
end
1476
+
1477
# Coerce a stringlike argument into a URI. Optionally resolves
# against a +base+, and the coercion can be tuned to either URI or
# RDF::URI via +:as+. Blank node identifiers (+_:...+) are never
# resolved against the base and are always coerced as RDF. If
# resolving against the base fails, a warning is emitted and nil is
# returned.
#
# @param arg [URI, RDF::URI, #to_s] The input string
# @param base [URI, RDF::URI, #to_s] The optional base URI
# @param as [:rdf, :uri, nil] The optional coercion type
# @return [URI, RDF::URI, String, nil]
# @raise [ArgumentError] if +arg+ can't be turned into a string
#
def coerce_resource arg, base = nil, as: :rdf
  as = assert_uri_coercion as

  # already the right kind of object: nothing to do
  if as
    wanted = { uri: URI, rdf: RDF::URI }[as]
    return arg if arg.is_a? wanted
  end

  raise ArgumentError, 'arg must be stringable' unless arg.respond_to? :to_s

  str = arg.to_s.strip

  if as && str.start_with?('_:')
    # override the coercion if this is a blank node
    as = :rdf
  elsif base
    begin
      root = base.is_a?(URI) ? base : URI(uri_pp base.to_s.strip)
      str = root.merge str
    rescue URI::InvalidURIError => e
      warn "attempted to coerce #{str} which turned out to be invalid: #{e}"
      return
    end
  end

  URI_COERCIONS[as].call str
end
1508
+
1509
# Coerce a stringlike argument into a UUID URN. Will accept a bare
# UUID, a +urn:uuid:+ string or URI object (in any letter case), or
# (for non-URI input) a UUID NCName, which is decoded first. The
# result is always lower case.
#
# @param arg [URI, RDF::URI, #to_s] the UUID in one of the above forms
# @param base [URI, RDF::URI, #to_s] optional base, passed through to
#   +coerce_resource+
# @return the UUID URN as produced by +coerce_resource+
# @raise [ArgumentError] if the input does not amount to a UUID
#
def coerce_uuid_urn arg, base = nil
  str = arg.to_s.strip

  # if this is an ncname then change it (URI objects are taken at
  # face value); this has to happen before downcasing because
  # ncnames are case-sensitive
  unless arg.is_a?(URI) || arg.is_a?(RDF::URI)
    str = UUID::NCName::from_ncname(str, version: 1).to_s if
      str.match?(/\A[A-P](?:[0-9A-Z_-]{20}|[2-7A-Z]{24})[A-P]\z/i)
  end

  # normalize through the string form: URI objects can't be rebuilt
  # from a single string via `arg.class.new`
  str = str.downcase

  # now the string is either a UUID or it isn't
  str = "urn:uuid:#{str}" unless str.start_with? 'urn:uuid:'

  raise ArgumentError, 'not a UUID' unless
    str.match?(/\Aurn:uuid:[0-9a-f]{8}(?:-[0-9a-f]{4}){4}[0-9a-f]{8}\z/)

  coerce_resource str, base
end
1531
+
1532
# Get the last non-empty path segment of the URI, without any path
# parameters and with colons escaped.
#
# @param uri the URI in question
# @param base base URI used to coerce +uri+
#
# @return [String, nil] the slug, the empty string if there is none,
#   or nil if the coerced URI has no notion of a path
def terminal_slug uri, base: nil
  uri = coerce_resource uri, base
  return unless uri.respond_to? :path

  path = uri.path or return ''
  # strip the leading and trailing slashes
  inner = /^\/+(.*?)\/*$/.match(path) or return ''
  segment = inner[1].split(/\/+/).last or return ''

  # we need to escape colons or it will think it's absolute
  uri_pp(segment.split(/;+/).first || '', ':')
end
1550
+
1551
# Resolve a string or array or attribute node containing one or more
# terms/CURIEs against a set of prefixes. The CURIE can be a string,
# Nokogiri::XML::Attr, or an array thereof. Strings are stripped and
# split on whitespace. +:prefixes+ can be supplied or gleaned from
# +:refnode+, which itself can be gleaned if +curie+ is a
# Nokogiri::XML::Attr. Returns an array of (attempted) resolved
# terms unless +:scalar+ is true, in which case only the first URI
# is returned. When +:noop+ is true, this method will always return
# a value. Can coerce results to either RDF::URI or URI objects.
#
# @note +:vocab+ overrides, and is the same as supplying
#   +prefix[nil]+.
# @note +:base+ and +:term+ are accepted but not consulted by this
#   method as written.
#
# @param curie [#to_s, Nokogiri::XML::Attr,Array] One or more CURIEs
# @param prefixes [#to_h] The hash of prefixes (nil key is equivalent
#   to vocab)
# @param vocab [nil,#to_s] An optional vocabulary URI
# @param refnode [nil, Nokogiri::XML::Element] A reference node for resolution
# @param term [false, true] Whether to treat the input as an RDFa _term_
# @param noop [true, false] Whether to pass the input through unchanged
#   (rather than yield nil) if the CURIE can't be resolved
# @param scalar [false, true] Whether to return a scalar value
# @param coerce [nil, :rdf, :uri] Desired type coercion for the output
#
# @return [nil,URI,RDF::URI,Array<nil,URI,RDF::URI>]
#
def resolve_curie curie, prefixes: {}, vocab: nil, base: nil,
    refnode: nil, term: false, noop: true, scalar: false, coerce: nil
  prefixes = sanitize_prefixes prefixes

  unless !coerce || %i[uri rdf].include?(coerce)
    raise 'coerce must be either :uri or :rdf'
  end

  # coerce curie to its value(s) and set refnode if not present
  tokens =
    if curie.is_a? Nokogiri::XML::Attr
      refnode ||= curie.parent
      curie.value.strip.split
    elsif curie.respond_to? :to_a
      items = curie.to_a
      unless items.all? { |x| x.respond_to? :to_s }
        raise ArgumentError, 'if curie is an array, it has to be all strings'
      end
      items.map { |x| x.to_s.strip.split }.flatten
    else
      unless curie.respond_to? :to_s
        raise ArgumentError, 'curie must be stringable'
      end
      curie.to_s.strip.split
    end

  if refnode
    unless refnode.is_a? Nokogiri::XML::Element
      raise ArgumentError, 'refnode must be an element'
    end
    # explicitly supplied prefixes win over the ones in the document
    prefixes = get_prefixes refnode if prefixes.empty?
  end

  # now we overwrite the vocab
  if vocab
    unless vocab.respond_to? :to_s
      raise ArgumentError, 'vocab must be stringable'
    end
    prefixes[nil] = vocab.to_s.strip
  end

  resolved = tokens.map do |token|
    # optional square brackets, optional prefix, then the rest
    prefix, slug = /^\[?(?:([^:]+):)?(.*?)\]?$/.match(token).captures
    key = prefix && prefix.to_sym
    namespace = prefixes[key]
    value = namespace ? namespace + slug : (token if noop)
    value && coerce ? URI_COERCIONS[coerce].call(value) : value
  end

  scalar ? resolved.first : resolved
end
1626
+
1627
# Abbreviate one or more URIs into one or more CURIEs if we
# can. Will pass terms through unchanged if +noop:+ is true, or if
# false, return nil for any URI that can't be abbreviated this
# way. Takes a hash of prefix-URI mappings where the keys are assumed
# to be symbols or +nil+ to express the current vocabulary, which can
# be overridden via +vocab:+. The input is never modified.
#
# @note Only +noop: true+ can be guaranteed to return a value.
#
# @param term [Array<#to_s>, #to_s] the term(s)
# @param prefixes [Hash<Symbol,nil>, #to_h] the prefix mappings
# @param vocab [#to_s] current vocabulary, overrides +prefixes[nil]+
# @param noop [true, false] whether or not to pass terms through
# @param sort [true, false] whether or not to sort (only if +noop:+)
# @return [String, nil, Array<String,nil>] the (maybe) abbreviated term(s)
# @raise [ArgumentError] if +vocab+ can't be turned into a string
#
def abbreviate term, prefixes: {}, vocab: nil, noop: true, sort: true
  # this returns a duplicate that we can mess with
  prefixes = sanitize_prefixes prefixes

  # sanitize vocab
  raise ArgumentError, 'vocab must be nil or stringable' unless
    vocab.nil? or vocab.respond_to? :to_s
  prefixes[nil] = vocab.to_s if vocab

  scalar = !term.respond_to?(:to_a)
  terms  = scalar ? [term] : term.to_a

  # namespace URIs paired with their prefixes, longest URI first so
  # the most specific namespace wins; computed once, not per term
  candidates = prefixes.invert.reject { |uri, _| uri.nil? }.sort do |a, b|
    b.first.length <=> a.first.length
  end

  # note `map` rather than `map!`: the array may belong to the caller
  out = terms.map do |t|
    t = t.to_s
    slug = nil # we want this value to be nil if no match and !noop

    candidates.each do |uri, pfx|
      slug = t.delete_prefix uri
      # this is saying the URI either doesn't match or abbreviates to ""
      if slug == t or pfx.nil? && slug.empty?
        slug = nil
      else
        # it's already a slug so we add a prefix if there is one
        slug = '%s:%s' % [pfx, slug] unless pfx.nil?
        break # we have our match
      end
    end

    # at this point slug is either an abbreviated term or nil, so:
    slug ||= t if noop
    slug
  end

  # only sort if noop is set (otherwise there may be nils)
  out.sort! if noop && sort

  scalar ? out.first : out
end
1687
+
1688
+ ######## RDFA/XML STUFF ########
1689
+
1690
# Returns the base URI from the perspective of the given element.
# Can optionally be coerced into either a URI or RDF::URI. Also
# takes a default value, used when the document declares no base.
#
# @param elem [Nokogiri::XML::Node] the context element
# @param default [nil, #to_s] the default URI
# @param coerce [nil, :uri, :rdf] the coercion scheme, if any
# @return [nil, String, URI, RDF::URI] the context's base URI
def get_base elem, default: nil, coerce: nil
  # keep the validated value, like the other methods that take +coerce+
  coerce = assert_uri_coercion coerce

  if elem.document?
    elem = elem.root
    return unless elem
  end

  # (X)HTML keeps its base in <base href>, everything else in xml:base
  xpath = (elem.namespace && elem.namespace.href == XHTMLNS or
    elem.at_xpath('/html')) ? :htmlbase : :xmlbase

  # now we go looking for the attribute
  if base = elem.at_xpath(XPATH[xpath], XPATHNS)
    base = base.value.strip
  elsif default
    base = default.to_s.strip
  end

  # clear it out if it's the empty string
  base = nil if base and base.empty?

  # eh that's about all the input sanitation we're gonna get
  base && coerce ? URI_COERCIONS[coerce].call(base) : base
end
1723
+
1724
# Given an X(HT)ML element, returns a hash of prefixes of the form
# +{ prefix: "vocab" }+, where the current +@vocab+ is represented
# by the +nil+ key. An optional +:traverse+ parameter can be set to
# +false+ to prevent ascending the node tree. Any XML namespace
# declarations are superseded by the +@prefix+ attribute. Returns
# any +@vocab+ declaration found as the +nil+ key.
#
# @note The +descend: true+ parameter assumes we are trying to
#   collect all the namespaces in use in the entire subtree, rather
#   than resolve any particular CURIE. As such, the _first_ prefix
#   mapping in document order is preserved over subsequent/descendant
#   ones.
#
# @param elem [Nokogiri::XML::Node] The context element
# @param traverse [true, false] whether or not to traverse the tree
# @param coerce [nil, :rdf, :uri] a type coercion for the URIs, if any
# @param descend [false, true] go _down_ the tree instead of up
# @return [Hash] Depending on +:traverse+, either all prefixes
#   merged, or just the ones asserted in the element.
def get_prefixes elem, traverse: true, coerce: nil, descend: false
  coerce = assert_uri_coercion coerce

  # deal with a common phenomenon
  elem = elem.root if elem.is_a? Nokogiri::XML::Document

  # get namespace definitions first (minus the default namespace)
  prefix = elem.namespaces.reject { |k, _| k == 'xmlns' }
  prefix = prefix.transform_keys { |k| k.split(?:)[1].to_sym }

  # now do the prefix attribute
  if elem.key? 'prefix'
    # XXX note this assumes largely that the input is clean
    elem['prefix'].strip.split.each_slice(2) do |k, v|
      pfx = k.split(?:)[0] or next # otherwise error
      prefix[pfx.to_sym] = v
    end
  end

  # encode the vocab as the null prefix
  if vocab = elem['vocab']
    vocab = vocab.strip
    # note that a specified but empty @vocab means kill any existing vocab
    prefix[nil] = vocab.empty? ? nil : vocab
  end

  # don't forget we can coerce (but leave a killed vocab as nil)
  if coerce
    prefix.transform_values! { |v| v && URI_COERCIONS[coerce].call(v) }
  end

  # don't proceed if `traverse` is false
  return prefix unless traverse

  # save us having to recurse in ruby by using xpath implemented in c
  xpath = '%s::*[namespace::*|@prefix|@vocab]' %
    (descend ? :descendant : :ancestor)
  elem.xpath(xpath).each do |e|
    # this will always merge our prefix on top irrespective of direction
    prefix = get_prefixes(e, traverse: false, coerce: coerce).merge prefix
  end

  prefix
end
1785
+
1786
# Given an X(HT)ML element, return the nearest RDFa _subject_.
# Optionally takes +:prefixes+ and +:base+ parameters which override
# anything found in the document tree. If XPATH[:literal] selects a
# node relative to the given one, the search starts from that node
# instead.
#
# @param node [Nokogiri::XML::Element] the node
# @param prefixes [Hash] Prefix mapping. Overrides derived values.
# @param base [#to_s,URI,RDF::URI] Base URI, overrides as well.
# @param coerce [nil, :rdf, :uri] the coercion regime
#
# @return [URI,RDF::URI,String] the subject
#
def subject_for node, prefixes: nil, base: nil, coerce: :rdf
  assert_xml_node node
  coerce = assert_uri_coercion coerce

  start = node.at_xpath(XPATH[:literal]) || node
  internal_subject_for start, prefixes: prefixes, base: base, coerce: coerce
end
1808
+
1809
# Run every element selected by XPATH[:modernize] through the handler
# registered in MODERNIZE under the element's name.
#
# @param doc the document to process
# @return the result of iterating over the selected elements
#
def modernize doc
  doc.xpath(XPATH[:modernize], XPATHNS).each do |elem|
    handler = MODERNIZE[elem.name.to_sym]
    # gotta instance_exec because `markup` is otherwise unbound
    instance_exec elem, &handler
  end
end
1815
+
1816
# Strip all the links surrounding and RDFa attributes off
# +dfn+/+abbr+/+span+ tags. Assuming a construct like +<a
# rel="some:relation" href="#..." typeof="skos:Concept"><dfn
# property="some:property">Term</dfn></a>+ is a link to a glossary
# entry, this method returns the term back to an undecorated state
# (+<dfn>Term</dfn>+).
#
# @param doc the document to process (modified in place)
# @return the result of iterating over the selected elements
#
def dehydrate doc
  rdfa = %w[about resource typeof rel rev property datatype]

  doc.xpath(XPATH[:dehydrate], XPATHNS).each do |wrapper|
    # swap the wrapper for a copy of its first child element
    bare = wrapper.replace wrapper.elements.first.dup
    rdfa.each { |attr| bare.delete attr if bare.key? attr }
  end
end
1831
+
1832
# Scan all the +dfn+/+abbr+/+span+ tags in the document that are not
# already wrapped in a link. The text (or +@content+) of each element
# is turned into a literal and looked up as a statement object in the
# graph. If exactly one subject is found, it becomes the basis of a
# link. If there are zero subjects, or more than one, the block (when
# given) is called to pick a definite subject (e.g., via a user
# interface) or otherwise add one.
#
# (maybe add +code+/+kbd+/+samp+/+var+/+time+ one day too)
#
# @param doc [#xpath] the document (or node) to scan
# @param graph [#query] the graph to search for matching literals
# @yield [cand, graph] called when the number of candidates is not
#  exactly one
# @yieldparam cand [Hash] candidates of the form
#  +{ subject => { stmts: [...], types: [...] } }+
# @yieldparam graph [#query] the graph, so the block can manipulate it
# @yieldreturn [RDF::Term, nil] the chosen subject, or nil to skip
# @raise [ArgumentError] if the block returns something other than nil
#  or an RDF::Term
# @raise [RuntimeError] if the block returns a subject that has no
#  statement in the graph relating it to the literal
# @return the node set matched by +XPATH[:rehydrate]+
def rehydrate doc, graph, &block
  doc.xpath(XPATH[:rehydrate], XPATHNS).each do |e|
    lang = e.xpath(XPATH[:lang]).to_s.strip
    # dt = e['datatype'] # XXX no datatype rn
    text = (e['content'] || e.xpath('.//text()').to_a.join).strip

    # now we have the literal; when a language is in scope the
    # language-tagged form goes first, followed by the plain one
    lit = [RDF::Literal(text)]
    lit.unshift RDF::Literal(text, language: lang) unless lang.empty?

    # candidates, keyed by subject
    cand = {}
    lit.flat_map { |t| graph.query(object: t).to_a }.each do |x|
      y = cand[x.subject] ||= {}
      (y[:stmts] ||= []) << x
      y[:types] ||= graph.query([x.subject, RDF.type, nil]).objects.sort
    end

    # if there's only one candidate, this is basically a noop
    chosen = cand.keys.first if cand.size == 1

    # call the block to reconcile any gaps or conflicts
    if block_given? && cand.size != 1
      # the block is expected to return one of the candidates or
      # nil. we call the block with the graph so that the block can
      # manipulate its contents.
      chosen = block.call cand, graph
      raise ArgumentError, 'block must return nil or a term' unless
        chosen.nil? || chosen.is_a?(RDF::Term)
    end

    # nothing to link to; leave the element alone
    next unless chosen

    # we assume this has been retrieved from the graph
    cc = cand[chosen]
    unless cc
      cc = cand[chosen] = {}
      cc[:stmts] = graph.query([chosen, nil, lit[0]]).to_a.sort
      cc[:types] = graph.query([chosen, RDF.type, nil]).objects.sort
      # if there are no statements then the graph was not
      # appropriately populated (the message has to be double-quoted
      # for the interpolation to take effect)
      raise "Missing a statement relating #{chosen} to #{text}" if
        cc[:stmts].empty?
    end

    # we should actually probably move any prefix/vocab/xmlns
    # declarations from the inner node to the outer one (although
    # in practice this will be an unlikely configuration)
    pfx = get_prefixes e

    # here we have pretty much everything except for the prefixes
    # and wherever we want to actually link to.
    inner = e.dup
    spec = { [inner] => :a, href: '' }
    # we should have types
    unless cc[:types].empty?
      spec[:typeof] = abbreviate(cc[:types], prefixes: pfx)
    end

    markup replace: e, spec: spec
  end
  # return maybe the elements that did/didn't get changed?
end
1907
+
1908
+ ######## RENDERING STUFF ########
1909
+
1910
# Given a structure of the form +{ predicate => [objects] }+,
# rearrange the structure into one more amenable to rendering
# RDFa. Returns a hash of the form +{ resources: { r1 => Set[p1, pn]
# }, literals: { l1 => Set[p2, pm] }, types: Set[t1, tn], datatypes:
# Set[d1, dn] }+. This inverted structure can then be conveniently
# traversed to generate the RDFa. An optional block lets us examine
# the predicate-object pairs as they go by.
#
# Non-literal objects of +rdf:type+ go into +types+ rather than
# +resources+; literals always go into +literals+, whatever the
# predicate.
#
# @param struct [Hash] The struct of the designated form
# @yield [p, o] An optional block is given the predicate-object pair
# @return [Hash] The inverted structure, as described.
#
def prepare_collation struct, &block
  resources = {}
  literals  = {}
  datatypes = Set.new
  types     = Set.new

  struct.each do |p, v|
    v.each do |o|
      block.call p, o if block

      if o.literal?
        (literals[o] ||= Set.new) << p
        # collect the datatype (same predicate the sibling methods use)
        datatypes << o.datatype if o.datatype?
      elsif p == RDF::RDFV.type
        # separate the type
        types << o
      else
        # collect the resource
        (resources[o] ||= Set.new) << p
      end
    end
  end

  { resources: resources, literals: literals,
    datatypes: datatypes, types: types }
end
1953
+
1954
# Given a hash of prefixes and an array of nodes, obtain the subset
# of prefixes that abbreviate the nodes. Scans RDF URIs as well as
# RDF::Literal datatypes. Anything in +nodes+ that is not an
# RDF::Term is ignored.
#
# @param prefixes [#to_h] The prefixes, of the form +{ k: "v" }+
# @param nodes [#to_a] The nodes to supply (an array of RDF::Term)
# @raise [RuntimeError] if +nodes+ does not respond to +to_a+
# @return [Hash] The prefix subset
def prefix_subset prefixes, nodes
  prefixes = sanitize_prefixes prefixes, true

  raise 'nodes must be arrayable' unless nodes.respond_to? :to_a

  # sniff out all the URIs and datatypes; go through to_a, which is
  # what the guard above checks for, so that e.g. nil counts as empty
  resources = Set.new
  nodes.to_a.each do |n|
    next unless n.is_a? RDF::Term
    if n.literal? && n.datatype?
      resources << n.datatype
    elsif n.uri?
      resources << n
    end
  end

  # now we abbreviate all the resources and keep just the prefix part;
  # whatever could not be abbreviated comes back as nil (noop: false)
  curies = abbreviate(resources.to_a,
    prefixes: prefixes, noop: false, sort: false)
  pfx = curies.compact.map { |curie| curie.split(':').first.to_sym }.to_set

  # now we return the subset
  prefixes.select { |k, _| pfx.include? k.to_sym }
end
1986
+
1987
# Recursively flatten an arbitrary data structure into the set of RDF
# terms found inside it.
#
# An RDF::Term yields a set containing just itself; anything else
# that responds to +to_a+ is walked member by member; everything else
# contributes nothing.
#
# @param struct [Object] the structure to walk
# @return [Set] the terms found
def smush_struct struct
  return Set[struct] if struct.is_a? RDF::Term
  return Set.new unless struct.respond_to? :to_a

  struct.to_a.map { |member| smush_struct(member).to_a }.flatten.to_set
end
1999
+
2000
# Invert a structure of the form +{ predicate => [objects] }+ into
# +{ object => Set[predicates] }+.
#
# @param struct [Hash] the predicate-to-objects structure
# @return [Hash] each object mapped to the set of predicates it
#  appeared under
def invert_struct struct
  inverted = {}

  struct.each do |pred, objs|
    objs.each { |obj| (inverted[obj] ||= Set.new).add pred }
  end

  inverted
end
2012
+
2013
# Build the spec hash for a title: the text goes under the +'#title'+
# key and the abbreviated predicates under +:property+, plus language
# and datatype attributes where they apply.
#
# @param predicates [Object] the predicate(s), handed to +abbreviate+
# @param content [Object] the title itself; expected to respond to
#  +to_s+, +language?+, +language+, +datatype?+ and +datatype+
#  (presumably an RDF::Literal)
# @param prefixes [Hash] prefix mapping handed to +abbreviate+
# @param vocab [Object, nil] vocabulary handed to +abbreviate+
# @param lang [Object, nil] the language already in effect
# @param xhtml [true, false] whether to emit +xml:lang+ next to +lang+
# @return [Hash] the spec
def title_tag predicates, content,
    prefixes: {}, vocab: nil, lang: nil, xhtml: true

  # begin with the tag
  tag = {
    '#title' => content.to_s,
    property: abbreviate(predicates, prefixes: prefixes, vocab: vocab),
  }

  # the language gets set if the content has one that differs from
  # the one in effect; failing that, an xsd:string under a language
  # already in effect resets it to the empty string
  own = content.language if content.language? && content.language != lang
  own ||= (content.datatype == RDF::XSD.string && lang ? '' : nil)

  tag['xml:lang'] = own if own && xhtml
  tag[:lang]      = own if own

  # xsd:string is never spelled out
  if content.datatype? && content.datatype != RDF::XSD.string
    tag[:datatype] = abbreviate(content.datatype,
      prefixes: prefixes, vocab: vocab)
  end

  tag
end
2036
+
2037
+ ######## MISC STUFF ########
2038
+
2039
# Obtain the given type together with everything reachable from it
# through owl:equivalentClass entailment and its subclasses,
# transitively.
#
# @param rdftype [RDF::Term] the type to start from; it is looked up
#  with +RDF::Vocabulary.find_term+
#
# @raise [RuntimeError] if no such term can be found
#
# @return [Array] the terms, in the order they were first encountered

def all_related rdftype
  root = RDF::Vocabulary.find_term(rdftype)
  raise "No type #{rdftype.to_s}" unless root

  pending = [root] # work queue
  seen    = {}     # terms already encountered (insertion-ordered)

  while (current = pending.shift)
    seen[current] = current

    # only vocabulary terms can be asked about classes
    next unless current.uri? && current.respond_to?(:class?)

    # queue a related term unless it has been seen, then record it
    visit = lambda do |other|
      pending << other unless seen[other]
      seen[other] = other unless other == current
    end

    current.entail(:equivalentClass).each(&visit)
    current.subClass.each(&visit)
  end

  seen.keys
end
2076
+
2077
+
2078
+
2079
+ # duplicate instance methods as module methods
2080
+ extend self
2081
+ end