rdf-sak 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2081 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'rdf/sak/version'
3
+
4
+ require 'uri'
5
+ require 'uri/urn'
6
+ require 'set'
7
+ require 'uuid-ncname'
8
+
9
+ require 'rdf'
10
+ require 'rdf/vocab'
11
+ require 'rdf/reasoner'
12
+ require 'rdf/vocab/skos'
13
+ require 'rdf/vocab/foaf'
14
+ require 'rdf/vocab/bibo'
15
+ require 'rdf/vocab/dc'
16
+ require 'rdf/vocab/dc11'
17
+
18
+ require 'rdf/sak/mimemagic'
19
+ require 'rdf/sak/ci'
20
+ require 'rdf/sak/tfo'
21
+ require 'rdf/sak/ibis'
22
+ require 'rdf/sak/pav'
23
+ require 'rdf/sak/qb'
24
+
25
+ unless RDF::List.respond_to? :from
26
+ class RDF::List
27
+ private
28
+
29
+ def self.get_list repo, subject, seen = []
30
+ out = []
31
+ return out if seen.include? subject
32
+ seen << subject
33
+ first = repo.query([subject, RDF.first, nil]).objects.first or return out
34
+ out << first
35
+ rest = repo.query([subject, RDF.rest, nil]).objects.select do |x|
36
+ !x.literal?
37
+ end.first or return out
38
+
39
+ out + (rest != RDF.nil ? get_list(repo, rest, seen) : [])
40
+ end
41
+
42
+ public
43
+
44
+ # Inflate a list from a graph but don't change the graph
45
+ def self.from graph, subject
46
+ self.new graph: graph, subject: subject, values: get_list(graph, subject)
47
+ end
48
+ end
49
+ end
50
+
51
+ module RDF::SAK::Util
52
+
53
+ private
54
+
55
+ RDF::Reasoner.apply(:rdfs, :owl)
56
+
57
+ R3986 = /^(([^:\/?#]+):)?(\/\/([^\/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?$/
58
+ SF = /[^[:alpha:][:digit:]\/\?%@!$&'()*+,:;=._~-]/n
59
+ RFC3986 =
60
+ /^(?:([^:\/?#]+):)?(?:\/\/([^\/?#]*))?([^?#]+)?(?:\?([^#]*))?(?:#(.*))?$/
61
+ SEPS = [['', ?:], ['//', ''], ['', ''], [??, ''], [?#, '']].freeze
62
+
63
+ XPATH = {
64
+ htmlbase: proc {
65
+ x = ['ancestor-or-self::html:html[1]/' \
66
+ 'html:head[html:base[@href]][1]/html:base[@href][1]/@href']
67
+ (x << x.first.gsub('html:', '')).join ?| }.call,
68
+ xmlbase: 'ancestor-or-self::*[@xml:base][1]/@xml:base',
69
+ lang: 'normalize-space((%s)[last()])' %
70
+ %w[lang xml:lang].map do |a|
71
+ 'ancestor-or-self::*[@%s][1]/@%s' % [a,a]
72
+ end.join(?|),
73
+ literal: '(ancestor::*[@property][not(@content)]' \
74
+ '[not(@resource|@href|@src) or @rel|@rev])[1]',
75
+ leaves: 'descendant::html:section[not(descendant::html:section)]' \
76
+ '[not(*[not(self::html:script)])]',
77
+ headers: './*[1][%s]//text()' %
78
+ (1..6).map { |x| "self::html:h#{x}" }.join(?|),
79
+ modernize: ([
80
+ "//html:div[*[1][#{(1..6).map { |i| 'self::html:h%d' % i }.join ?|}]]"] +
81
+ { div: %i[section figure], blockquote: :note,
82
+ table: :figure, img: :figure }.map do |k, v|
83
+ (v.is_a?(Array) ? v : [v]).map do |cl|
84
+ "//html:#{k}[contains(concat(' ', " \
85
+ "normalize-space(@class), ' '), ' #{cl} ')]"
86
+ end
87
+ end.flatten).join(?|),
88
+ dehydrate: '//html:a[count(*)=1][html:dfn|html:abbr|html:span]',
89
+ rehydrate: %w[//html:dfn
90
+ //html:abbr[not(parent::html:dfn)] //html:span].join(?|) +
91
+ '[not(parent::html:a)]',
92
+ htmllinks: (%w[*[not(self::html:base)][@href]/@href
93
+ *[@src]/@src object[@data]/@data *[@srcset]/@srcset
94
+ form[@action]/@action].map { |e|
95
+ '//html:%s' % e} + %w[//*[@xlink:href]/@xlink:href]).join(?|).freeze,
96
+ atomlinks: %w[uri content/@src category/@scheme generator/@uri icon id
97
+ link/@href logo].map { |e| '//atom:%s' % e }.join(?|).freeze,
98
+ rsslinks: %w[image/text()[1] docs/text()[1] source/@url enclosure/@url
99
+ guid/text()[1] comments/text()[1]].map { |e|
100
+ '//%s' % e }.join(?|).freeze,
101
+ xlinks: '//*[@xlink:href]/@xlink:href'.freeze,
102
+ rdflinks: %w[about resource datatype].map { |e|
103
+ '//*[@rdf:%s]/@rdf:%s' % [e, e] }.join(?|).freeze,
104
+ }
105
+
106
+ LINK_MAP = {
107
+ 'text/html' => :htmllinks,
108
+ 'application/xhtml+xml' => :htmllinks,
109
+ 'application/atom+xml' => :atomlinks,
110
+ 'application/x-rss+xml' => :rsslinks,
111
+ 'application/rdf+xml' => :rdflinks,
112
+ 'image/svg+xml' => :xlinks,
113
+ }.transform_values { |v| XPATH[v] }.freeze
114
+
115
+ URI_COERCIONS = {
116
+ nil => -> t { t.to_s },
117
+ false => -> t { t.to_s },
118
+ uri: -> t { URI.parse t.to_s },
119
+ rdf: -> t {
120
+ t = t.to_s
121
+ t.start_with?('_:') ? RDF::Node.new(t.delete_prefix '_:') : RDF::URI(t) },
122
+ }
123
+
124
+ UUID_RE = /^(?:urn:uuid:)?([0-9a-f]{8}(?:-[0-9a-f]{4}){4}[0-9a-f]{8})$/i
125
+
126
+ # okay labels: what do we want to do about them? poor man's fresnel!
127
+
128
+ # basic structure is an asserted base class corresponding to a
129
+ # ranked list of asserted predicates. to the subject we first
130
+ # match the closest class, then the closest property.
131
+
132
+ # if the instance data doesn't have an exact property mentioned in
133
+ # the spec, it may have an equivalent property or subproperty we
134
+ # may be able to use. we could imagine a scoring system analogous
135
+ # to the one used by CSS selectors, albeit using the topological
136
+ # distance of classes/predicates in the spec versus those in the
137
+ # instance data.
138
+
139
+ # think about dcterms:title is a subproperty of dc11:title even
140
+ # though they are actually more like equivalent properties;
141
+ # owl:equivalentProperty is not as big a conundrum as
142
+ # rdfs:subPropertyOf.
143
+
144
+ # if Q rdfs:subPropertyOf P then S Q O implies S P O. this is
145
+ # great but property Q may not be desirable to display.
146
+
147
+ # it may be desirable to be able to express properties to never
148
+ # use as a label, such as skos:hiddenLabel
149
+
150
+ # consider ranked alternates, sequences, sequences of alternates.
151
+ # (this is what fresnel does fyi)
152
+
153
+ STRINGS = {
154
+ RDF::RDFS.Resource => {
155
+ label: [
156
+ # main
157
+ [RDF::Vocab::SKOS.prefLabel, RDF::RDFS.label,
158
+ RDF::Vocab::DC.title, RDF::Vocab::DC11.title, RDF::RDFV.value],
159
+ # alt
160
+ [RDF::Vocab::SKOS.altLabel, RDF::Vocab::DC.alternative],
161
+ ],
162
+ desc: [
163
+ # main will be cloned into alt
164
+ [RDF::Vocab::DC.abstract, RDF::Vocab::DC.description,
165
+ RDF::Vocab::DC11.description, RDF::RDFS.comment,
166
+ RDF::Vocab::SKOS.note],
167
+ ],
168
+ },
169
+ RDF::Vocab::FOAF.Document => {
170
+ label: [
171
+ # main
172
+ [RDF::Vocab::DC.title, RDF::Vocab::DC11.title],
173
+ # alt
174
+ [RDF::Vocab::BIBO.shortTitle, RDF::Vocab::DC.alternative],
175
+ ],
176
+ desc: [
177
+ # main
178
+ [RDF::Vocab::BIBO.abstract, RDF::Vocab::DC.abstract,
179
+ RDF::Vocab::DC.description, RDF::Vocab::DC11.description],
180
+ # alt
181
+ [RDF::Vocab::BIBO.shortDescription],
182
+ ],
183
+ },
184
+ RDF::Vocab::FOAF.Agent => {
185
+ label: [
186
+ # main (will get cloned into alt)
187
+ [RDF::Vocab::FOAF.name],
188
+ ],
189
+ desc: [
190
+ # main cloned into alt
191
+ [RDF::Vocab::FOAF.status],
192
+ ],
193
+ },
194
+ }
195
+ STRINGS[RDF::OWL.Thing] = STRINGS[RDF::RDFS.Resource]
196
+
197
+ # note this is to_a because "can't modify a hash during iteration"
198
+ # which i guess is sensible, so we generate a set of pairs first
199
+ STRINGS.to_a.each do |type, struct|
200
+ struct.values.each do |lst|
201
+ # assert a whole bunch of stuff
202
+ raise 'STRINGS content must be an array of arrays' unless
203
+ lst.is_a? Array
204
+ raise 'Spec must contain 1 or 2 Array elements' if lst.empty?
205
+ raise 'Spec must be array of arrays of terms' unless
206
+ lst.all? { |x| x.is_a? Array and x.all? { |y|
207
+ RDF::Vocabulary.find_term(y) } }
208
+
209
+ # prune this to two elements (not that there should be more than)
210
+ lst.slice!(2, lst.length) if lst.length > 2
211
+
212
+ # pre-fill equivalent properties
213
+ lst.each do |preds|
214
+ # for each predicate, find its equivalent properties
215
+
216
+ # splice them in after the current predicate only if they
217
+ # are not already explicitly in the list
218
+ i = 0
219
+ loop do
220
+ equiv = preds[i].entail(:equivalentProperty) - preds
221
+ preds.insert(i + 1, *equiv) unless equiv.empty?
222
+
223
+ i += equiv.length + 1
224
+ break if i >= preds.length
225
+ end
226
+
227
+ # this just causes too many problems otherwise
228
+ # preds.map! { |p| p.to_s }
229
+ end
230
+
231
+ # duplicate main predicates to alternatives
232
+ lst[1] ||= lst[0]
233
+ end
234
+
235
+ # may as well seed equivalent classes so we don't have to look them up
236
+ type.entail(:equivalentClass).each do |equiv|
237
+ STRINGS[equiv] ||= struct
238
+ end
239
+
240
+ # tempting to do subclasses too but it seems pretty costly in
241
+ # this framework; save it for the clojure version
242
+ end
243
+
244
+ AUTHOR = [RDF::SAK::PAV.authoredBy, RDF::Vocab::DC.creator,
245
+ RDF::Vocab::DC11.creator, RDF::Vocab::PROV.wasAttributedTo]
246
+ CONTRIB = [RDF::SAK::PAV.contributedBy, RDF::Vocab::DC.contributor,
247
+ RDF::Vocab::DC11.contributor]
248
+ [AUTHOR, CONTRIB].each do |preds|
249
+ i = 0
250
+ loop do
251
+ equiv = preds[i].entail(:equivalentProperty) - preds
252
+ preds.insert(i + 1, *equiv) unless equiv.empty?
253
+ i += equiv.length + 1
254
+ break if i >= preds.length
255
+ end
256
+
257
+ preds.freeze
258
+ end
259
+
260
+ def sanitize_prefixes prefixes, nonnil = false
261
+ raise ArgumentError, 'prefixes must be a hash' unless
262
+ prefixes.is_a? Hash or prefixes.respond_to? :to_h
263
+ prefixes = prefixes.to_h.map do |k, v|
264
+ [k ? k.to_s.to_sym : nil, v ? v.to_s : nil]
265
+ end.to_h
266
+
267
+ prefixes.reject! { |k, v| k.nil? || v.nil? } if nonnil
268
+ prefixes
269
+ end
270
+
271
+ def assert_uri_coercion coerce
272
+ if coerce
273
+ coerce = coerce.to_s.to_sym if coerce.respond_to? :to_s
274
+ raise 'coerce must be either :uri or :rdf' unless
275
+ %i[uri rdf].include?(coerce)
276
+ end
277
+ coerce
278
+ end
279
+
280
+ def assert_xml_node node
281
+ raise 'Argument must be a Nokogiri::XML::Element' unless
282
+ node.is_a? Nokogiri::XML::Element
283
+ node
284
+ end
285
+
286
+ def internal_subject_for node, prefixes: nil, base: nil, coerce: nil,
287
+ is_ancestor: false
288
+
289
+ # note we assign these AFTER the literal check or it will be wrong
290
+ prefixes ||= get_prefixes node
291
+
292
+ base ||= get_base node
293
+ base = coerce_resource base, as: :uri unless base
294
+
295
+ # answer a bunch of helpful questions about this element
296
+ subject = nil
297
+ parent = node.parent
298
+ ns_href = node.namespace.href if node.namespace
299
+ up_ok = %i[rel rev].none? { |a| node.key? a }
300
+ is_root = !parent or parent.document?
301
+ special = /^(?:[^:]+:)?(?:head|body)$/i === node.name and
302
+ (ns_href == 'http://www.w3.org/1999/xhtml' or
303
+ /^(?:[^:]+:)?html$/xi === parent.name)
304
+
305
+ # if the node is being inspected as an ancestor to the
306
+ # original node, we have to check it backwards.
307
+ if is_ancestor
308
+ # ah right @resource gets special treatment
309
+ if subject = node[:resource]
310
+ subject = resolve_curie subject,
311
+ prefixes: prefixes, base: base, scalar: true
312
+ else
313
+ # then check @href and @src
314
+ %i[href src].each do |attr|
315
+ if node.key? attr
316
+ # merge with the root and return it
317
+ subject = base + node[attr]
318
+ break
319
+ end
320
+ end
321
+ end
322
+
323
+ return coerce_resource subject, as: coerce if subject
324
+
325
+ # note if we are being called with is_ancestor, that means
326
+ # the original node (or indeed any of the nodes previously
327
+ # tested) do not have anything resembling a resource in them. this
328
+ # means @rel/@rev should be ignored, and we should keep
329
+ # looking for a subject.
330
+ end
331
+
332
+ if node[:about]
333
+
334
+ subject = resolve_curie node[:about],
335
+ prefixes: prefixes, base: base, scalar: true
336
+
337
+ # ignore coercion
338
+ return subject if subject.is_a? RDF::Node
339
+
340
+ elsif is_root
341
+ subject = base
342
+ elsif special
343
+ subject = internal_subject_for parent
344
+ elsif node[:resource]
345
+ # XXX resolve @about against potential curie
346
+ subject = resolve_curie node[:resource], prefixes: prefixes, base: base
347
+ elsif node[:href]
348
+ subject = base + node[:href]
349
+ elsif node[:src]
350
+ subject = base + node[:src]
351
+ elsif node[:typeof]
352
+ # bnode the typeof attr
353
+
354
+ # note we return bnodes irrespective of the rdf flag
355
+ return RDF::Node('id-%016x' % node.attributes['typeof'].pointer_id)
356
+ elsif node[:inlist]
357
+ # bnode the inlist attr
358
+ return RDF::Node('id-%016x' % node.attributes['inlist'].pointer_id)
359
+ elsif (parent[:inlist] && %i[href src].none? { |a| parent.key? a }) ||
360
+ (is_ancestor && !up_ok)
361
+ # bnode the element
362
+ return RDF::Node('id-%016x' % node.pointer_id)
363
+ # elsif node[:id]
364
+ else
365
+ subject = internal_subject_for parent, is_ancestor: true
366
+ end
367
+
368
+ coerce_resource subject, as: coerce if subject
369
+ end
370
+
371
+ MODERNIZE = {
372
+ div: -> e {
373
+ if e.classes.include? 'figure'
374
+ e.remove_class 'figure'
375
+ e.name = 'figure' unless e.parent.name == 'figure'
376
+ else
377
+ e.remove_class 'section'
378
+ e.name = 'section'
379
+ end
380
+ },
381
+ blockquote: -> e {
382
+ e.remove_class 'note'
383
+ e.name = 'aside'
384
+ e['role'] = 'note'
385
+ },
386
+ table: -> e {
387
+ e.remove_class 'figure'
388
+ unless e.parent.name == 'figure'
389
+ inner = e.dup
390
+ markup replace: e, spec: { [inner] => :figure }
391
+ end
392
+ },
393
+ img: -> e {
394
+ e.remove_class 'figure'
395
+ unless e.parent.name == 'figure'
396
+ inner = e.dup
397
+ markup replace: e, spec: { [inner] => :figure }
398
+ end
399
+ },
400
+ }
401
+
402
+ # rdf term type tests
403
+ NTESTS = { uri: :"uri?", blank: :"node?", literal: :"literal?" }.freeze
404
+ NMAP = ({ iri: :uri, bnode: :blank }.merge(
405
+ [:uri, :blank, :literal].map { |x| [x, x] }.to_h)).freeze
406
+
407
+ public
408
+
409
+ def coerce_node_spec spec, rev: false
410
+ spec = [spec] unless spec.respond_to? :to_a
411
+ spec = spec - [:resource] + [:uri, :blank] if spec.include? :resource
412
+ raise 'Subjects are never literals' if rev and spec.include? :literal
413
+
414
+ spec = NMAP.values_at(*spec).reject(&:nil?).uniq
415
+ spec = NTESTS.keys if spec.empty?
416
+ spec.delete :literal if rev
417
+ spec.uniq
418
+ end
419
+
420
+ def node_matches? node, spec
421
+ spec.any? { |k| node.send NTESTS[k] }
422
+ end
423
+
424
+ # Obtain all and only the rdf:types directly asserted on the subject.
425
+ #
426
+ # @param repo [RDF::Queryable]
427
+ # @param subject [RDF::Resource]
428
+ # @param type [RDF::Term, :to_a]
429
+ #
430
+ # @return [Array]
431
+ def self.asserted_types repo, subject, type = nil
432
+ asserted = nil
433
+
434
+ if type
435
+ type = type.respond_to?(:to_a) ? type.to_a : [type]
436
+ asserted = type.select { |t| t.is_a? RDF::Value }.map do |t|
437
+ RDF::Vocabulary.find_term t
438
+ end
439
+ end
440
+
441
+ asserted ||= repo.query([subject, RDF.type, nil]).objects.map do |o|
442
+ RDF::Vocabulary.find_term o
443
+ end.compact
444
+
445
+ asserted.select { |t| t && t.uri? }.uniq
446
+ end
447
+
448
+ # Obtain a stack of types for an asserted initial type or set
449
+ # thereof. Returns an array of arrays, where the first is the
450
+ # asserted types and their inferred equivalents, and subsequent
451
+ # elements are immediate superclasses and their equivalents. A
452
+ # given URI will only appear once in the entire structure.
453
+ #
454
+ # @param rdftype [RDF::Term, :to_a]
455
+ #
456
+ # @return [Array]
457
+ #
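+ # @example Hypothetical sketch (the exact strata depend on the loaded vocabularies):
+ #   type_strata RDF::Vocab::FOAF.Person
+ #   # => e.g. [[foaf:Person plus equivalents], [foaf:Agent, ...], ...]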
458
+ def type_strata rdftype
459
+ # first we coerce this to an array
460
+ if rdftype.respond_to? :to_a
461
+ rdftype = rdftype.to_a
462
+ else
463
+ rdftype = [rdftype]
464
+ end
465
+
466
+ # now squash and coerce
467
+ rdftype = rdftype.uniq.map { |t| RDF::Vocabulary.find_term t }.compact
468
+
469
+ # bail out early
470
+ return [] if rdftype.empty?
471
+
472
+ # essentially what we want to do is construct a layer of
473
+ # asserted classes and their inferred equivalents, then probe
474
+ # the classes in the first layer for subClassOf assertions,
475
+ # which will form the second layer, and so on.
476
+
477
+ queue = [rdftype]
478
+ strata = []
479
+ seen = Set.new
480
+
481
+ while qin = queue.shift
482
+ qwork = []
483
+
484
+ qin.each do |q|
485
+ qwork << q # entail doesn't include q
486
+ qwork += q.entail(:equivalentClass) if q.uri?
487
+ end
488
+
489
+ # grep and flatten
490
+ qwork = qwork.map do |t|
491
+ next t if t.is_a? RDF::Vocabulary::Term
492
+ RDF::Vocabulary.find_term t
493
+ end.compact.uniq - seen.to_a
494
+ seen |= qwork
495
+
496
+ # warn "qwork == #{qwork.inspect}"
497
+
498
+ # push current layer out
499
+ strata.push qwork.dup unless qwork.empty?
500
+
501
+ # now deal with subClassOf
502
+ qsuper = []
503
+ qwork.each { |q| qsuper += q.subClassOf }
504
+
505
+ # grep and flatten this too
506
+ qsuper = qsuper.map do |t|
507
+ next t if t.is_a? RDF::Vocabulary::Term
508
+ RDF::Vocabulary.find_term t
509
+ end.compact.uniq - seen.to_a
510
+ # do not append qsuper to seen!
511
+
512
+ # warn "qsuper == #{qsuper.inspect}"
513
+
514
+ # same deal, conditionally push the input queue
515
+ queue.push qsuper.dup unless qsuper.empty?
516
+ end
517
+
518
+ # voila
519
+ strata
520
+ end
521
+
522
+ # Compute the set of entailed predicates for the given predicate(s).
523
+ #
524
+ # The result contains the input predicates, their equivalent
525
+ # properties, and (recursively) all of their subproperties.
526
+ #
527
+ # @param predicates [RDF::URI, Set, #to_set] the predicate(s) to expand
528
+ # @param seen [Set] internal accumulator guarding against cycles
529
+ #
530
+ # @return [Set] the expanded set of predicates
531
+ #
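+ # @example Hypothetical sketch (the closure depends on the loaded vocabularies):
+ #   predicate_set RDF::Vocab::DC.title
+ #   # => Set of dcterms:title plus any equivalent properties and subproperties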
532
+ def predicate_set predicates, seen: Set.new
533
+ predicates = Set[predicates] if predicates.is_a? RDF::URI
534
+ unless predicates.is_a? Set
535
+ raise "predicates must be a set" unless predicates.respond_to? :to_set
536
+ predicates = predicates.to_set
537
+ end
538
+
539
+ # shortcut
540
+ return predicates if predicates.empty?
541
+
542
+ raise 'predicates must all be RDF::URI' unless predicates.all? do |p|
543
+ p.is_a? RDF::URI
544
+ end
545
+
546
+ # first we generate the set of equivalent properties for the given
547
+ # properties
548
+ predicates += predicates.map do |p|
549
+ p.entail :equivalentProperty
550
+ end.flatten.to_set
551
+
552
+ # then we take the resulting set of properties and
553
+ # compute their subproperties
554
+ subp = Set.new
555
+ (predicates - seen).each do |p|
556
+ subp += p.subProperty.flatten.to_set
557
+ end
558
+
559
+ # uhh this whole "seen" business might not be necessary
560
+ predicates + predicate_set(subp - predicates - seen, seen: predicates)
561
+ end
562
+
563
+ # Returns subjects from the graph with entailment.
564
+ #
565
+ # @param repo [RDF::Queryable]
566
+ # @param predicate [RDF::URI, #to_a] the predicate(s) to match
567
+ # @param object [RDF::Term] the object to match
568
+ # @param entail [true, false] whether to include entailed predicates
569
+ # @param only [Symbol, Array<Symbol>] optional node type constraint(s)
570
+ #
571
+ # @return [Array] the matching subjects
572
+ #
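+ # @example Hypothetical usage (repo and object URI are illustrative):
+ #   subjects_for repo, RDF::Vocab::DC.creator, RDF::URI('https://example.com/me')
+ #   # => array of subjects related via dct:creator or an entailed predicate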
573
+ def self.subjects_for repo, predicate, object, entail: true, only: []
574
+ raise 'Object must be a Term' unless object.is_a? RDF::Term
575
+ predicate = predicate.respond_to?(:to_a) ? predicate.to_a : [predicate]
576
+ raise 'Predicate must be some kind of term' unless
577
+ predicate.all? { |p| p.is_a? RDF::URI }
578
+
579
+ only = coerce_node_spec only, rev: true
580
+
581
+ predicate = predicate.map { |x| RDF::Vocabulary.find_term x }.compact
582
+ predicate = predicate_set predicate if entail
583
+
584
+ out = {}
585
+ revp = Set.new
586
+ predicate.each do |p|
587
+ repo.query([nil, p, object]).subjects.each do |s|
588
+ next unless node_matches? s, only
589
+
590
+ entry = out[s] ||= [Set.new, Set.new]
591
+ entry[0] << p
592
+ end
593
+
594
+ # do this here while we're at it
595
+ unless object.literal?
596
+ revp += p.inverseOf.to_set
597
+ revp << p if p.type.include? RDF::OWL.SymmetricProperty
598
+ end
599
+ end
600
+
601
+ unless object.literal?
602
+ revp = predicate_set revp if entail
603
+
604
+ revp.each do |p|
605
+ repo.query([object, p, nil]).objects.each do |o|
606
+ next unless node_matches? o, only
607
+
608
+ entry = out[o] ||= [Set.new, Set.new]
609
+ entry[1] << p
610
+ end
611
+ end
612
+ end
613
+
614
+ # run this through a block to get access to the predicates
615
+ return out.map { |p, v| yield p, *v } if block_given?
616
+
617
+ out.keys
618
+ end
619
+
620
+ # Returns objects from the graph with entailment.
621
+ #
622
+ # @param repo [RDF::Queryable]
623
+ # @param subject [RDF::Resource]
624
+ # @param predicate [RDF::URI, #to_a] the predicate(s) to match
625
+ # @param entail [true, false] whether to include entailed predicates
626
+ # @param only [Symbol, Array<Symbol>] optional node type constraint(s)
627
+ # @param datatype [RDF::URI, Array] optional literal datatype filter(s)
628
+ #
629
+ # @return [Array] the matching objects
630
+ #
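+ # @example Hypothetical usage (repo and subject are illustrative):
+ #   objects_for repo, subject, RDF::Vocab::DC.title, only: :literal
+ #   # => array of title literals, including any via entailed predicates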
631
+ def self.objects_for repo, subject, predicate,
632
+ entail: true, only: [], datatype: nil
633
+ raise "Subject must be a resource, not #{subject.inspect}" unless
634
+ subject.is_a? RDF::Resource
635
+ predicate = predicate.respond_to?(:to_a) ? predicate.to_a : [predicate]
636
+ raise "Predicate must be a term, not #{predicate.first.class}" unless
637
+ predicate.all? { |p| p.is_a? RDF::URI }
638
+
639
+ predicate = predicate.map { |x| RDF::Vocabulary.find_term x }.compact
640
+
641
+ only = coerce_node_spec only
642
+
643
+ datatype = (
644
+ datatype.respond_to?(:to_a) ? datatype.to_a : [datatype]).compact
645
+ raise 'Datatype must be some kind of term' unless
646
+ datatype.all? { |p| p.is_a? RDF::URI }
647
+
648
+ # fluff this out
649
+ predicate = predicate_set predicate if entail
650
+
651
+ out = {}
652
+ predicate.each do |p|
653
+ repo.query([subject, p, nil]).objects.each do |o|
654
+
655
+ # make sure it's in the spec
656
+ next unless node_matches? o, only
657
+
658
+ # constrain output
659
+ next if o.literal? and
660
+ !(datatype.empty? or datatype.include?(o.datatype))
661
+
662
+ entry = out[o] ||= [Set.new, Set.new]
663
+ entry.first << p
664
+ end
665
+ end
666
+
667
+ # now we do the reverse
668
+ unless only == [:literal]
669
+ # generate reverse predicates
670
+ revp = Set.new
671
+ predicate.each do |p|
672
+ revp += p.inverseOf.to_set
673
+ revp << p if p.type.include? RDF::OWL.SymmetricProperty
674
+ end
675
+ revp = predicate_set revp if entail
676
+
677
+ # now scan 'em
678
+ revp.each do |p|
679
+ repo.query([nil, p, subject]).subjects.each do |s|
680
+ next unless node_matches? s, only
681
+ # no need to check datatype; subject is never a literal
682
+
683
+ entry = out[s] ||= [Set.new, Set.new]
684
+ entry.last << p
685
+ end
686
+ end
687
+ end
688
+
689
+ # run this through a block to get access to the predicates
690
+ return out.map { |p, v| yield p, *v } if block_given?
691
+
692
+ out.keys
693
+ end
694
+
695
+ # Obtain the canonical UUID for the given URI
696
+ #
697
+ # @param repo [RDF::Queryable]
698
+ # @param uri [RDF::URI, URI, to_s] the subject of the inquiry
699
+ # @param unique [true, false] return a single resource/nil or an array
700
+ # @param published [true, false] whether to restrict to published docs
701
+ #
702
+ # @return [RDF::URI, Array]
703
+ #
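+ # @example Illustrative only; the outcome depends entirely on the repo contents:
+ #   canonical_uuid repo, 'https://example.com/my-page', base: base
+ #   # => RDF::URI("urn:uuid:...") for the best-ranked candidate, or nil if none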
704
+ def self.canonical_uuid repo, uri, unique: true, published: false,
705
+ scache: {}, ucache: {}, base: nil
706
+ # make sure this is actually a uri
707
+ orig = uri = coerce_resource uri, base
708
+ unless uri.is_a? RDF::Node
709
+ tu = URI(uri_pp(uri).to_s).normalize
710
+
711
+ if tu.path && !tu.fragment &&
712
+ UUID_RE.match?(uu = tu.path.delete_prefix(?/))
713
+ tu = URI('urn:uuid:' + uu.downcase)
714
+ end
715
+
716
+ # unconditionally overwrite uri
717
+ uri = RDF::URI(tu.to_s)
718
+
719
+ # now check if it's a uuid
720
+ if tu.respond_to? :uuid
721
+ # warn "lol uuid #{orig}"
722
+ # if it's a uuid, check that we have it as a subject
723
+ # if we have it as a subject, return it
724
+ return uri if scache[uri] ||= repo.has_subject?(uri)
725
+ # note i don't want to screw around right now dealing with the
726
+ # case that a UUID might not itself be canonical
727
+ end
728
+ end
729
+
730
+ # spit up the cache if present
731
+ if out = ucache[orig]
732
+ # warn "lol cached #{orig}"
733
+ return unique ? out.first : out
734
+ end
735
+
736
+ # otherwise we proceed:
737
+
738
+ # goal: return the most "appropriate" UUID for the given URI
739
+
740
+ # it is so lame i have to do this
741
+ bits = { nil => 0, false => 0, true => 1 }
742
+
743
+ # rank (0 is higher):
744
+ # * (00) exact & canonical == 0,
745
+ # * (01) exact == 1,
746
+ # * (10) inexact & canonical == 2,
747
+ # * (11) inexact == 3.
748
+
749
+ # warn "WTF URI #{uri}"
750
+
751
+ # handle path parameters by generating a bunch of candidates
752
+ uris = if uri.respond_to? :path and uri.path.start_with? ?/
753
+ # split any path parameters off
754
+ uu, *pp = split_pp uri
755
+ if pp.empty?
756
+ [uri] # no path parameters
757
+ else
758
+ uu = RDF::URI(uu.to_s)
759
+ bp = uu.path # base path
760
+ (0..pp.length).to_a.reverse.map do |i|
761
+ u = uu.dup
762
+ u.path = ([bp] + pp.take(i)).join(';')
763
+ u
764
+ end
765
+ end
766
+ else
767
+ [uri] # not a pathful URI
768
+ end
769
+
770
+ # collect the candidates by URI
771
+ sa = predicate_set [RDF::SAK::CI.canonical,
772
+ RDF::SAK::CI.alias, RDF::OWL.sameAs]
773
+ candidates = nil
774
+ uris.each do |u|
775
+ candidates = subjects_for(repo, sa, u, entail: false) do |s, f|
776
+ # there is no #to_i for booleans and also we xor this number
777
+ [s, { rank: bits[f.include?(RDF::SAK::CI.canonical)] ^ 1,
778
+ published: published?(repo, s),
779
+ mtime: dates_for(repo, s).last || DateTime.new }]
780
+ end.compact.to_h
781
+ break unless candidates.empty?
782
+ end
783
+
784
+ # now collect by slug
785
+ slug = terminal_slug uri, base: base
786
+ if slug and !slug.empty?
787
+ exact = uri == coerce_resource(slug, base) # slug represents exact match
788
+ sl = [RDF::SAK::CI['canonical-slug'], RDF::SAK::CI.slug]
789
+ [RDF::XSD.string, RDF::XSD.token].each do |t|
790
+ subjects_for(repo, sl, RDF::Literal(slug, datatype: t)) do |s, f|
791
+ # default to lowest rank if this candidate is new
792
+ entry = candidates[s] ||= {
793
+ published: published?(repo, s, base: base),
794
+ rank: 0b11, mtime: dates_for(repo, s).last || DateTime.new }
795
+ # true is 1 and false is zero so we xor this too
796
+ rank = (bits[exact] << 1 | bits[f.include?(sl[0])]) ^ 0b11
797
+ # now amend the rank if we have found a better one
798
+ entry[:rank] = rank if rank < entry[:rank]
799
+ end
800
+ end
801
+ end
802
+
803
+ candidates.delete_if { |s, _| !/^urn:uuid:/.match?(s.to_s) }
804
+
805
+ # scan all the candidates for replacements and remove any
806
+ # candidates that have been replaced
807
+ candidates.to_a.each do |k, v|
808
+ # note that
809
+ reps = replacements_for(repo, k, published: published) - [k]
810
+ unless reps.empty?
811
+ v[:replaced] = true
812
+ reps.each do |r|
813
+ c = candidates[r] ||= { rank: v[:rank],
814
+ published: published?(repo, r),
815
+ mtime: dates_for(repo, r).last || v[:mtime] || DateTime.new }
816
+ # we give the replacement the rank and mtime of the
817
+ # resource being replaced if it scores better
818
+ c[:rank] = v[:rank] if v[:rank] < c[:rank]
819
+ c[:mtime] = v[:mtime] if v[:mtime] > c[:mtime]
820
+ end
821
+ end
822
+ end
823
+
824
+ # now we can remove all unpublished candidates if the context is
825
+ # published
826
+ candidates.select! do |_, v|
827
+ !v[:replaced] && (published ? v[:published] : true)
828
+ end
829
+
830
+ # now we sort by rank and date; the highest-ranking newest
831
+ # candidate is the one
832
+
833
+ out = candidates.sort do |a, b|
834
+ _, va = a
835
+ _, vb = b
836
+ cb = published ? bits[vb[:published]] <=> bits[va[:published]] : 0
837
+ cr = va[:rank] <=> vb[:rank]
838
+ cb == 0 ? cr == 0 ? vb[:mtime] <=> va[:mtime] : cr : cb
839
+ end.map { |x| x.first }.compact
840
+
841
+ # set cache
842
+ ucache[orig] = out
843
+
844
+ #warn "lol not cached #{orig}"
845
+
846
+ unique ? out.first : out
847
+
848
+ # an exact match is better than an inexact one
849
+
850
+ # a canonical match is better than non-canonical
851
+
852
+ # note this is four bits: exact, canon(exact), inexact, canon(inexact)
853
+ # !canon(exact) should rank higher than canon(inexact)
854
+
855
+ # unreplaced is better than replaced
856
+
857
+ # newer is better than older (though no reason an older item
858
+ # can't replace a newer one)
859
+
860
+ # published is better than not, unless the context is
861
+ # unpublished and an unpublished document replaces a published one
862
+ end
863
+
864
+ SCHEME_RANK = { https: 0, http: 1 }
865
+
866
+ def cmp_resource a, b, www: nil
867
+ raise 'Comparands must be instances of RDF::Value' unless
868
+ [a, b].all? { |x| x.is_a? RDF::Value }
869
+
870
+ # URI beats non-URI
871
+ if a.uri?
872
+ if b.uri?
873
+ # https beats http beats other
874
+ as = a.scheme.downcase.to_sym
875
+ bs = b.scheme.downcase.to_sym
876
+ cmp = SCHEME_RANK.fetch(as, 2) <=> SCHEME_RANK.fetch(bs, 2)
877
+
878
+ # bail out early
879
+ return cmp unless cmp == 0
880
+
881
+ # this would have returned if the schemes were different, as
882
+ # such we only need to test one of them
883
+ if [:http, :https].any?(as) and not www.nil?
884
+ # if www is non-nil, prefer www or no-www depending on
885
+ # truthiness of `www` parameter
886
+ pref = [false, true].zip(www ? [1, 0] : [0, 1]).to_h
887
+ re = /^(?:(www)\.)?(.*?)$/
888
+
889
+ ah = re.match(a.host.to_s.downcase)[1,2]
890
+ bh = re.match(b.host.to_s.downcase)[1,2]
891
+
892
+ # compare hosts sans www
893
+ cmp = ah[1] <=> bh[1]
894
+ return cmp unless cmp == 0
895
+
896
+ # now compare presence of www
897
+ cmp = pref[ah[0] == 'www'] <=> pref[bh[0] == 'www']
898
+ return cmp unless cmp == 0
899
+
900
+ # if we're still here, compare the path/query/fragment
901
+ re = /^.*?\/\/.*?(\/.*)$/
902
+ al = re.match(a.to_s)[1].to_s
903
+ bl = re.match(b.to_s)[1].to_s
904
+
905
+ return al <=> bl
906
+ end
907
+
908
+ return a <=> b
909
+ else
910
+ return -1
911
+ end
912
+ elsif b.uri?
913
+ return 1
914
+ else
915
+ return a <=> b
916
+ end
917
+ end
918
+
919
+ def self.cmp_label repo, a, b, labels: nil, supplant: true, reverse: false
920
+ labels ||= {}
921
+
922
+ # try supplied label or fall back
923
+ pair = [a, b].map do |x|
924
+ if labels[x]
925
+ labels[x][1]
926
+ elsif supplant and y = label_for(repo, x)
927
+ labels[x] = y
928
+ y[1]
929
+ else
930
+ x
931
+ end
932
+ end
933
+
934
+ pair.reverse! if reverse
935
+ # warn "#{pair[0]} <=> #{pair[1]}"
936
+ pair[0].to_s <=> pair[1].to_s
937
+ end
938
+
939
+ # Obtain the "best" dereferenceable URI for the subject.
940
+ # Optionally returns all candidates.
941
+ #
942
+ # @param repo [RDF::Queryable]
943
+ # @param subject [RDF::Resource]
944
+ # @param unique [true, false] flag for unique return value
945
+ # @param rdf [true, false] flag to specify RDF::URI vs URI
946
+ # @param slugs [true, false] flag to include slugs
947
+ # @param fragment [true, false] flag to include fragment URIs
948
+ #
949
+ # @return [RDF::URI, URI, Array]
950
+ #
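+ # @example Illustrative sketch (subject and base are hypothetical):
+ #   canonical_uri repo, subject, base: RDF::URI('https://example.com/')
+ #   # => the "best" dereferenceable URI, or an array of candidates if unique: false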
951
+ def self.canonical_uri repo, subject, base: nil,
952
+ unique: true, rdf: true, slugs: false, fragment: false
953
+ subject = coerce_resource subject, base
954
+ out = []
955
+
956
+ # try to find it first
957
+ out = objects_for(repo, subject, [RDF::SAK::CI.canonical, RDF::OWL.sameAs],
958
+ entail: false, only: :resource).select do |o|
959
+ # only consider the subjects
960
+ repo.has_subject? o
961
+ end.sort { |a, b| cmp_resource a, b }
962
+
963
+ # try to generate in lieu
964
+ if subject.uri? and (out.empty? or slugs)
965
+
966
+ out += objects_for(repo, subject,
967
+ [RDF::SAK::CI['canonical-slug'], RDF::SAK::CI.slug],
968
+ only: :literal).map do |o|
969
+ base + o.value
970
+ end if slugs
971
+
972
+ uri = URI(uri_pp(subject.to_s))
973
+ if base and uri.respond_to? :uuid
974
+ b = base.clone
975
+ b.query = b.fragment = nil
976
+ b.path = '/' + uri.uuid
977
+ out << RDF::URI.new(b.to_s)
978
+ else
979
+ out << subject
980
+ end
981
+ end
982
+
983
+ # remove all URIs with fragments unless specified
984
+ unless fragment
985
+ tmp = out.reject(&:fragment)
986
+ out = tmp unless tmp.empty?
987
+ end
988
+
989
+ # coerce to URI objects if specified
990
+ out.map! { |u| URI(uri_pp u.to_s) } unless rdf
991
+
992
+ unique ? out.first : out.uniq
993
+ end
994
+
995
+ # Determine whether the URI represents a published document.
996
+ #
997
+ # @param repo
998
+ # @param uri
999
+ #
1000
+ # @return [true, false]
1001
+ def self.published? repo, uri, circulated: false, base: nil
1002
+ uri = coerce_resource uri, base
1003
+ candidates = objects_for(
1004
+ repo, uri, RDF::Vocab::BIBO.status, only: :resource).to_set
1005
+
1006
+ test = Set[RDF::Vocab::BIBO['status/published']]
1007
+ test << RDF::SAK::CI.circulated if circulated
1008
+
1009
+ # warn candidates, test, candidates & test
1010
+
1011
+ !(candidates & test).empty?
1012
+ end
1013
+
1014
+ # Obtain a key-value structure for the given subject, optionally
1015
+ # constraining the result by node type (:resource, :uri/:iri,
1016
+ # :blank/:bnode, :literal)
1017
+ #
1018
+ # @param repo
1019
+ # @param subject of the inquiry
1020
+ # @param rev map in reverse
1021
+ # @param only one or more node types
1022
+ # @param uuids coerce resources to UUIDs if possible
1023
+ #
1024
+ # @return [Hash]
1025
+ #
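+ # @example Rough sketch of the return shape (predicates and values are hypothetical):
+ #   struct_for repo, subject, only: :literal
+ #   # => { dcterms:title => [#<RDF::Literal ...>], rdfs:label => [...], ... }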
1026
+ def self.struct_for repo, subject, base: nil,
1027
+ rev: false, only: [], uuids: false, canon: false, ucache: {}, scache: {}
1028
+ only = coerce_node_spec only
1029
+
1030
+ # coerce the subject
1031
+ subject = canonical_uuid(repo, subject,
1032
+ base: base, scache: scache, ucache: ucache) || subject if uuids
1033
+
1034
+ rsrc = {}
1035
+ pattern = rev ? [nil, nil, subject] : [subject, nil, nil]
1036
+ repo.query(pattern) do |stmt|
1037
+ # this will skip over any term not matching the type
1038
+ node = rev ? stmt.subject : stmt.object
1039
+ next unless node_matches? node, only
1040
+
1041
+ # coerce the node to uuid if told to
1042
+ if node.resource?
1043
+ if uuids
1044
+ uu = canonical_uuid(repo, node, scache: scache, ucache: ucache) unless
1045
+ ucache.key? node
1046
+ node = uu || (canon ? canonical_uri(repo, node) : node)
1047
+ elsif canon
1048
+ node = canonical_uri(repo, node)
1049
+ end
1050
+ end
1051
+
1052
+ p = RDF::Vocabulary.find_term(stmt.predicate) || stmt.predicate
1053
+ o = rsrc[p] ||= []
1054
+ o.push node if node # may be nil
1055
+ end
1056
+
1057
+ # XXX in here we can do fun stuff like filter/sort by language/datatype
1058
+ rsrc.values.each { |v| v.sort!.uniq! }
1059
+
1060
+ rsrc
1061
+ end
1062
+
1063
+ # Obtain the most appropriate label(s) for the subject's type(s).
1064
+ # Returns one or more (depending on the `unique` flag)
1065
+ # predicate-object pairs in order of preference.
1066
+ #
1067
+ # @param repo [RDF::Queryable]
1068
+ # @param subject [RDF::Resource]
1069
+ # @param unique [true, false] only return the first pair
1070
+ # @param type [RDF::Term, Array] supply asserted types if already retrieved
1071
+ # @param lang [nil] not currently implemented (will be conneg)
1072
+ # @param desc [false, true] retrieve description instead of label
1073
+ # @param alt [false, true] retrieve alternate instead of main
1074
+ #
1075
+ # @return [Array] either a predicate-object pair or an array of pairs.
1076
+ #
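+ # @example Illustrative; the actual pair depends on the subject's types and data:
+ #   label_for repo, subject
+ #   # => e.g. [skos:prefLabel, #<RDF::Literal "Some label">]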
1077
+ def self.label_for repo, subject, candidates: nil, unique: true, type: nil,
1078
+ lang: nil, desc: false, alt: false, base: nil
1079
+ raise ArgumentError, 'no repo!' unless repo.is_a? RDF::Queryable
1080
+ return unless subject.is_a? RDF::Value and subject.resource?
1081
+
1082
+ asserted = asserted_types repo, subject, type
1083
+
1084
+ # get all the inferred types by layer; add default class if needed
1085
+ strata = type_strata asserted
1086
+ strata.push [RDF::RDFS.Resource] if
1087
+ strata.empty? or not strata[-1].include?(RDF::RDFS.Resource)
1088
+
1089
+ # get the key-value pairs for the subject
1090
+ candidates ||= struct_for repo, subject, only: :literal
1091
+
1092
+ seen = {}
1093
+ accum = []
1094
+ strata.each do |lst|
1095
+ lst.each do |cls|
1096
+ next unless STRINGS[cls] and
1097
+ preds = STRINGS[cls][desc ? :desc : :label][alt ? 1 : 0]
1098
+ # warn cls
1099
+ preds.each do |p|
1100
+ # warn p.inspect
1101
+ next unless vals = candidates[p]
1102
+ vals.each do |v|
1103
+ pair = [p, v]
1104
+ accum.push(pair) unless seen[pair]
1105
+ seen[pair] = true
1106
+ end
1107
+ end
1108
+ end
1109
+ end
1110
+
1111
+ # try that for now
1112
+ unique ? accum[0] : accum.uniq
1113
+
1114
+ # what we want to do is match the predicates from the subject to
1115
+ # the predicates in the label designation
1116
+
1117
+ # get label predicate stack(s) for RDF type(s)
1118
+
1119
+ # get all predicates in order (use alt stack if doubly specified)
1120
+
1121
+ # filter out desired language(s)
1122
+
1123
+ # XXX note we will probably want to return the predicate as well
1124
+ end
1125
+
1126
+ # Assuming the subject is a thing that has authors, return the
1127
+ # list of authors. Try bibo:authorList first for an explicit
1128
+ # ordering, then continue to the various other predicates.
1129
+ #
1130
+ # @param repo [RDF::Queryable]
1131
+ # @param subject [RDF::Resource]
1132
+ # @param unique [false, true] only return the first author
1133
+ # @param contrib [false, true] return contributors instead of authors
1134
+ #
1135
+ # @return [RDF::Value, Array]
1136
+ #
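+ # @example Hypothetical usage (doc is an illustrative subject):
+ #   authors_for repo, doc                              # => array of author resources
+ #   authors_for repo, doc, contrib: true, unique: true # => first contributor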
1137
+ def authors_for repo, subject, unique: false, contrib: false, base: nil
1138
+ authors = []
1139
+
1140
+ # try the author list
1141
+ lp = [RDF::Vocab::BIBO[contrib ? :contributorList : :authorList]]
1142
+ lp += lp.first.entail(:equivalentProperty) # XXX cache this
1143
+ lp.each do |pred|
1144
+ o = repo.first_object([subject, pred, nil])
1145
+ next unless o
1146
+ # note this use of RDF::List is not particularly well-documented
1147
+ authors += RDF::List.from(repo, o).to_a
1148
+ end
1149
+
1150
+ # now try various permutations of the author/contributor predicate
1151
+ unsorted = []
1152
+ preds = contrib ? CONTRIB : AUTHOR
1153
+ preds.each do |pred|
1154
+ unsorted += repo.query([subject, pred, nil]).objects
1155
+ end
1156
+
1157
+ # prefetch the author names
1158
+ labels = authors.map { |a| [a, label_for(repo, a)] }.to_h
1159
+
1160
+ authors += unsorted.uniq.sort { |a, b| labels[a] <=> labels[b] }
1161
+
1162
+ unique ? authors.first : authors.uniq
1163
+ end
1164
+
1165
+ # Find the terminal replacements for the given subject, if any exist.
1166
+ #
1167
+ # @param repo
1168
+ # @param subject
1169
+ # @param published indicate the context is published
1170
+ #
1171
+ # @return [Array] the terminal replacement(s), if any
1172
+ #
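+ # @example Illustrative; empty when nothing in the graph replaces the subject:
+ #   replacements_for repo, old_version, published: true
+ #   # => the terminal published replacement(s) of old_version, if any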
1173
+ def self.replacements_for repo, subject, published: true, base: nil
1174
+ subject = coerce_resource subject, base
1175
+
1176
+ # `seen` is a hash mapping resources to publication status and
1177
+ # subsequent replacements. it collects all the resources in the
1178
+ # replacement chain in :fwd (replaces) and :rev (replaced-by)
1179
+ # members, along with a boolean :pub. `seen` also performs a
1180
+ # duty as cycle-breaking sentinel.
1181
+
1182
+ seen = {}
1183
+ queue = [subject]
1184
+ while (test = queue.shift)
1185
+ # fwd is "replaces", rev is "replaced by"
1186
+ entry = seen[test] ||= {
1187
+ pub: published?(repo, test), fwd: Set.new, rev: Set.new }
1188
+ queue += (
1189
+ subjects_for(repo, RDF::Vocab::DC.replaces, test) +
1190
+ objects_for(repo, test, RDF::Vocab::DC.isReplacedBy,
1191
+ only: :resource)
1192
+ ).uniq.map do |r| # r = replacement
1193
+ next if seen.include? r
1194
+ seen[r] ||= { pub: published?(repo, r), fwd: Set.new, rev: Set.new }
1195
+ seen[r][:fwd] << test
1196
+ entry[:rev] << r
1197
+ r
1198
+ end.compact.uniq
1199
+ end
1200
+
1201
+ # if we're calling from a published context, we return the
1202
+ # (topologically) last published resource(s), even if they are
1203
+ # replaced ultimately by unpublished resources.
1204
+
1205
+ out = seen.map { |k, v| v[:rev].empty? ? k : nil }.compact - [subject]
1206
+
1207
+ # now we modify `out` based on the publication status of the context
1208
+ if published
1209
+ pubout = out.select { |o| seen[o][:pub] }
1210
+ # if there is anything left after this, return it
1211
+ return pubout unless pubout.empty?
1212
+ # now we want to find the penultimate elements of `seen` that
1213
+ # are farthest along the replacement chain but whose status is
1214
+ # published
1215
+
1216
+ # start with `out`, take the union of their :fwd members, then
1217
+ # take the subset of those which are published. if the result
1218
+ # is empty, repeat. (this is walking backwards through the
1219
+ # graph we just walked forwards through to construct `seen`)
1220
+ loop do
1221
+ # XXX THIS NEEDS A TEST CASE
1222
+ out = seen.values_at(*out).map { |v| v[:fwd] }.reduce(:+).to_a
1223
+ break if out.empty?
1224
+ pubout = out.select { |o| seen[o][:pub] }
1225
+ return pubout unless pubout.empty?
1226
+ end
1227
+ end
1228
+
1229
+ out
1230
+ end
1231
+
1232
+ # Obtain dates for the subject as instances of Date(Time). This is
1233
+ # just shorthand for a common application of `objects_for`.
1234
+ #
1235
+ # @param repo
1236
+ # @param subject
1237
+ # @param predicate
1238
+ # @param datatype
1239
+ #
1240
+ # @return [Array] of dates
1241
+ #
1242
+ def self.dates_for repo, subject, predicate: RDF::Vocab::DC.date,
1243
+ datatype: [RDF::XSD.date, RDF::XSD.dateTime]
1244
+ objects_for(
1245
+ repo, subject, predicate, only: :literal, datatype: datatype) do |o|
1246
+ o.object
1247
+ end.sort.uniq
1248
+ end
1249
+
1250
+ # Obtain any specified MIME types for the subject. Just shorthand
1251
+ # for a common application of `objects_for`.
1252
+ #
1253
+ # @param repo
1254
+ # @param subject
1255
+ # @param predicate
1256
+ # @param datatype
1257
+ #
1258
+ # @return [Array] of internet media types
1259
+ #
1260
+ def formats_for repo, subject, predicate: RDF::Vocab::DC.format,
1261
+ datatype: [RDF::XSD.token]
1262
+ objects_for(
1263
+ repo, subject, predicate, only: :literal, datatype: datatype) do |o|
1264
+ t = o.object
1265
+ t =~ /\// ? RDF::SAK::MimeMagic.new(t.to_s.downcase) : nil
1266
+ end.compact.sort.uniq
1267
+ end
1268
+
1269
+ def self.base_for xmlnode, base
1270
+ base = URI(base.to_s) unless base.is_a? URI
1271
+ out = base
1272
+
1273
+ if xmlnode.at_xpath('self::html:*|/html', XPATHNS)
1274
+ b = URI(xmlnode.at_xpath(XPATH[:htmlbase], XPATHNS).to_s.strip)
1275
+
1276
+ out = b if b.absolute?
1277
+ elsif b = xmlnode.root.at_xpath(XPATH[:xmlbase])
1278
+ b = URI(b.to_s.strip)
1279
+ out = b if b.absolute?
1280
+ end
1281
+
1282
+ out
1283
+ end
1284
+
1285
+ # Traverse links based on content type.
1286
+ def self.traverse_links node, type: 'application/xhtml+xml', &block
1287
+ return enum_for :traverse_links, node, type: type unless block
1288
+ type = type.strip.downcase.gsub(/\s*;.*/, '')
1289
+ xpath = LINK_MAP.fetch type, XPATH[:xlinks]
1290
+ node.xpath(xpath, XPATHNS).each { |node| block.call node }
1291
+ end
1292
+
1293
+
1294
+ # XXX OTHER STUFF
1295
+
1296
+ # isolate an element into a new document
1297
+ def subtree doc, xpath = '/*', reindent: true, prefixes: {}
1298
+ # at this time we shouldn't try to do anything cute with the xpath
1299
+ # even though it is attractive to want to prune out prefixes
1300
+
1301
+ # how about we start with a noop
1302
+ return doc.root.dup if xpath == '/*'
1303
+
1304
+ begin
1305
+ nodes = doc.xpath xpath, prefixes
1306
+ return unless
1307
+ nodes and nodes.is_a?(Nokogiri::XML::NodeSet) and !nodes.empty?
1308
+ out = Nokogiri::XML::Document.new
1309
+ out << nodes.first.dup
1310
+ reindent out.root if reindent
1311
+ out
1312
+ rescue Nokogiri::SyntaxError
1313
+ return
1314
+ end
1315
+ end
1316
+
1317
+ # reindent text nodes
1318
+ def reindent node, depth = 0, indent = ' '
1319
+ kids = node.children
1320
+ if kids and child = kids.first
1321
+ loop do
1322
+ if child.element?
1323
+ # recurse into the element
1324
+ reindent child, depth + 1, indent
1325
+ elsif child.text?
1326
+ text = child.content || ''
1327
+
1328
+ # optional horizontal whitespace followed by at least
1329
+ # one newline (we don't care what kind), followed by
1330
+ # optional horizontal or vertical whitespace
1331
+ preamble = !!text.gsub!(/\A[ \t]*[\r\n]+\s*/, '')
1332
+
1333
+ # then we don't care what's in the middle, but hey let's get
1334
+ # rid of dos newlines because we can always put them back
1335
+ # later if we absolutely have to
1336
+ text.gsub!(/\r+/, '')
1337
+
1338
+ # then optionally any whitespace followed by at least
1339
+ # another newline again, followed by optional horizontal
1340
+ # whitespace and then the end of the string
1341
+ epilogue = !!text.gsub!(/\s*[\r\n]+[ \t]*\z/, '')
1342
+
1343
+ # if we prune these off we'll have a text node that is
1344
+ # either the empty string or it isn't (note we will only
1345
+ # register an epilogue if the text has some non-whitespace
1346
+ # in it, because otherwise the first regex would have
1347
+ # snagged everything, so it's probably redundant)
1348
+
1349
+ # if it's *not* empty then we *prepend* indented whitespace
1350
+ if preamble and !text.empty?
1351
+ d = depth + (child.previous ? 1 : 0)
1352
+ text = "\n" + (indent * d) + text
1353
+ end
1354
+
1355
+ # then we unconditionally *append*, (modulo there being a
1356
+ # newline in the original at all), but we have to check by
1357
+ # how much: if this is *not* the last node then depth + 1,
1358
+ # otherwise depth
1359
+ if preamble or epilogue
1360
+ d = depth + (child.next ? 1 : 0)
1361
+ text << "\n" + (indent * d)
1362
+ end
1363
+
1364
+ child.content = text
1365
+ end
1366
+
1367
+ break unless child = child.next
1368
+ end
1369
+ end
1370
+
1371
+ node
1372
+ end
1373
+
1374
+ XHTMLNS = 'http://www.w3.org/1999/xhtml'.freeze
1375
+ XHV = 'http://www.w3.org/1999/xhtml/vocab#'.freeze
1376
+ XPATHNS = {
1377
+ html: XHTMLNS,
1378
+ svg: 'http://www.w3.org/2000/svg',
1379
+ atom: 'http://www.w3.org/2005/Atom',
1380
+ xlink: 'http://www.w3.org/1999/xlink',
1381
+ }.freeze
1382
+
1383
+ ######## URI STUFF ########
1384
+
1385
+ # Preprocess a URI string so that it can be handed to +URI.parse+
1386
+ # without crashing.
1387
+ #
1388
+ # @param uri [#to_s] The URI string in question
1389
+ # @param extra [#to_s] Character class of any extra characters to escape
1390
+ # @return [String] The sanitized (appropriately escaped) URI string
1391
+
1392
+ # really gotta stop carting this thing around
1393
+ def uri_pp uri, extra = ''
1394
+ # take care of malformed escapes
1395
+ uri = uri.to_s.b.gsub(/%(?![0-9A-Fa-f]{2})/n, '%25')
1396
+ uri.gsub!(/([#{Regexp.quote extra}])/) do |s|
1397
+ sprintf('%%%02X', s.ord)
1398
+ end unless extra.empty?
1399
+ # we want the minimal amount of escaping so we split out the separators
1400
+ out = ''
1401
+ parts = RFC3986.match(uri).captures
1402
+ parts.each_index do |i|
1403
+ next if parts[i].nil?
1404
+ out << SEPS[i].first
1405
+ out << parts[i].b.gsub(SF) { |s| sprintf('%%%02X', s.ord) }
1406
+ out << SEPS[i].last
1407
+ end
1408
+
1409
+ # make sure escaped hex is upper case like the rfc says
1410
+ out.gsub(/(%[0-9A-Fa-f]{2})/) { |x| x.upcase }
1411
+ end
1412
+
1413
+ # Given a URI as input, split any query parameters into an array of
1414
+ # key-value pairs. If +:only+ is true, this will just return the
1415
+ # pairs. Otherwise it will prepend the query-less URI to the array,
1416
+ # and can be captured with an idiom like +uri, *qp = split_qp uri+.
1417
+ #
1418
+ # @param uri [URI,#to_s] The URI to extract parameters from
1419
+ # @param only [false, true] whether to only return the parameters
1420
+ # @return [Array] (See description)
1421
+ #
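+ # @example Hypothetical URI, shown for the shape of the return value:
+ #   uri, *qp = split_qp 'https://example.com/x?a=1&b=2'
+ #   # uri => <https://example.com/x>, qp => [["a", "1"], ["b", "2"]]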
1422
+ def split_qp uri, only: false
1423
+ uri = URI(uri_pp uri.to_s) unless uri.is_a? URI
1424
+ qp = uri.query ? URI.decode_www_form(uri.query) : []
1425
+ return qp if only
1426
+ uri.query = nil
1427
+ [uri] + qp
1428
+ end
1429
+
1430
+ # Given a URI as input, split any path parameters out of the last
1431
+ # path segment. Works the same way as #split_qp.
1432
+ #
1433
+ # @param uri [URI,#to_s] The URI to extract parameters from
1434
+ # @param only [false, true] whether to only return the parameters
1435
+ # @return [Array] (See description)
1436
+ #
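+ # @example Hypothetical URI with path parameters:
+ #   uri, *pp = split_pp 'https://example.com/doc;v=1;lang=en'
+ #   # uri => <https://example.com/doc>, pp => ["v=1", "lang=en"]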
1437
+ def split_pp uri, only: false
1438
+ begin
1439
+ u = (uri.is_a?(URI) ? uri : URI(uri_pp uri.to_s)).normalize
1440
+
1441
+ rescue URI::InvalidURIError => e
1442
+ # these stock error messages don't even tell you what the uri is
1443
+ raise URI::InvalidURIError, "#{e.message} (#{uri.to_s})"
1444
+ end
1445
+
1446
+ return only ? [] : [uri] unless u.path
1447
+ uri = u
1448
+
1449
+ ps = uri.path.split '/', -1
1450
+ pp = ps.pop.split ';', -1
1451
+ bp = (ps + [pp.shift]).join '/'
1452
+ uri = uri.dup
1453
+
1454
+ begin
1455
+ uri.path = bp
1456
+ rescue URI::InvalidURIError => e
1457
+ # these stock error messages don't even tell you what the uri is
1458
+ m = e.message
1459
+ raise URI::InvalidURIError, "#{m} (#{uri.to_s}, #{bp})"
1460
+ end
1461
+
1462
+ return pp if only
1463
+ [uri] + pp
1464
+ end
1465
+
1466
+ def split_pp2 path, only: false
1467
+ # ugh apparently we need a special case for ''.split
1468
+ return only ? [] : [''] if !path or path.empty?
1469
+
1470
+ ps = path.to_s.split ?/, -1 # path segments
1471
+ pp = ps.pop.to_s.split ?;, -1 # path parameters
1472
+ bp = (ps + [pp.shift]).join ?/ # base path
1473
+
1474
+ only ? pp : [bp] + pp
1475
+ end
1476
+
1477
+ # Coerce a stringlike argument into a URI. Raises an exception if
1478
+ # the string can't be turned into a valid URI. Optionally resolves
1479
+ # against a +base+, and the coercion can be tuned to either URI or
1480
+ # RDF::URI via +:as+.
1481
+ #
1482
+ # @param arg [URI, RDF::URI, #to_s] The input string
1483
+ # @param base [URI, RDF::URI, #to_s] The optional base URI
1484
+ # @param as [:rdf, :uri, nil] The optional coercion type
1485
+ # @return [URI, RDF::URI, String]
1486
+ #
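+ # @example Illustrative coercions (URIs are hypothetical):
+ #   coerce_resource 'foo', RDF::URI('https://example.com/')
+ #   # => RDF::URI("https://example.com/foo")
+ #   coerce_resource '_:b1'  # => RDF::Node, since blank nodes stay in RDF space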
1487
+ def coerce_resource arg, base = nil, as: :rdf
1488
+ as = assert_uri_coercion as
1489
+ return arg if as and arg.is_a?({ uri: URI, rdf: RDF::URI }[as])
1490
+ raise ArgumentError, 'arg must be stringable' unless arg.respond_to? :to_s
1491
+
1492
+ arg = arg.to_s.strip
1493
+
1494
+ if arg.start_with? '_:' and as
1495
+ # override the coercion if this is a blank node
1496
+ as = :rdf
1497
+ elsif base
1498
+ begin
1499
+ arg = (base.is_a?(URI) ? base : URI(uri_pp base.to_s.strip)).merge arg
1500
+ rescue URI::InvalidURIError => e
1501
+ warn "attempted to coerce #{arg} which turned out to be invalid: #{e}"
1502
+ return
1503
+ end
1504
+ end
1505
+
1506
+ URI_COERCIONS[as].call arg
1507
+ end
1508
+
1509
+ # Coerce a stringlike argument into a UUID URN. Will expand a compact NCName-encoded UUID first.
1510
+ def coerce_uuid_urn arg, base = nil
1511
+ # if this is an ncname then change it
1512
+ if ([URI, RDF::URI] & arg.class.ancestors).empty? &&
1513
+ arg.respond_to?(:to_s)
1514
+ arg = arg.to_s
1515
+
1516
+ # coerce ncname to uuid
1517
+ arg = UUID::NCName::from_ncname(arg, version: 1) if arg =~
1518
+ /^[A-P](?:[0-9A-Z_-]{20}|[2-7A-Z]{24})[A-P]$/i
1519
+
1520
+ # now the string is either a UUID or it isn't
1521
+ arg = "urn:uuid:#{arg}" unless arg.start_with? 'urn:uuid:'
1522
+ else
1523
+ arg = arg.class.new arg.to_s.downcase unless arg == arg.to_s.downcase
1524
+ end
1525
+
1526
+ raise ArgumentError, 'not a UUID' unless
1527
+ arg.to_s =~ /^urn:uuid:[0-9a-f]{8}(?:-[0-9a-f]{4}){4}[0-9a-f]{8}$/
1528
+
1529
+ arg = coerce_resource arg, base
1530
+ end
1531
+
1532
+ # Get the last non-empty path segment of the URI
1533
+ #
1534
+ # @param uri
1535
+ #
1536
+ # @return [String]
1537
+ def terminal_slug uri, base: nil
1538
+ uri = coerce_resource uri, base
1539
+ return unless uri.respond_to? :path
1540
+ if p = uri.path
1541
+ if p = /^\/+(.*?)\/*$/.match(p)
1542
+ if p = p[1].split(/\/+/).last
1543
+ # we need to escape colons or it will think it's absolute
1544
+ return uri_pp(p.split(/;+/).first || '', ':')
1545
+ end
1546
+ end
1547
+ end
1548
+ ''
1549
+ end
1550
+
1551
+ # Resolve a string or array or attribute node containing one or more
1552
+ # terms/CURIEs against a set of prefixes. The CURIE can be a string,
1553
+ # Nokogiri::XML::Attr, or an array thereof. Strings are stripped and
1554
+ # split on whitespace. +:prefixes+ and +:base+ can be supplied or
1555
+ # gleaned from +:refnode+, which itself can be gleaned if +curie+ is
1556
+ # a Nokogiri::XML::Attr. Returns an array of (attempted) resolved
1557
+ # terms unless +:scalar+ is true, in which case only the first URI
1558
+ # is returned. When +:noop+ is true, this method will always return
1559
+ # a value. Can coerce results to either RDF::URI or URI objects.
1560
+ #
1561
+ # @note +:vocab+ overrides, and is the same as supplying
1562
+ # +prefix[nil]+. It is only meaningful when +:term+ (i.e., when we
1563
+ # expect the input to be an RDFa term) is true.
1564
+ #
1565
+ # @param curie [#to_s, Nokogiri::XML::Attr,Array] One or more CURIEs
1566
+ # @param prefixes [#to_h] The hash of prefixes (nil key is equivalent
1567
+ # to vocab)
1568
+ # @param vocab [nil,#to_s] An optional base URI
1569
+ # @param refnode [nil, Nokogiri::XML::Element] A reference node for resolution
1570
+ # @param term [false, true] Whether to treat the input as an RDFa _term_
1571
+ # @param noop [true, false] Whether to skip if the CURIE can't be resolved
1572
+ # @param scalar [false, true] Whether to return a scalar value
1573
+ # @param coerce [nil, :rdf, :uri] Desired type coercion for the output
1574
+ #
1575
+ # @return [nil,URI,RDF::URI,Array<nil,URI,RDF::URI>]
1576
+ #
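+ # @example Illustrative (the prefix mapping is hypothetical):
+ #   resolve_curie 'dct:title', scalar: true, coerce: :rdf,
+ #     prefixes: { dct: 'http://purl.org/dc/terms/' }
+ #   # => RDF::URI("http://purl.org/dc/terms/title")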
1577
+ def resolve_curie curie, prefixes: {}, vocab: nil, base: nil,
1578
+ refnode: nil, term: false, noop: true, scalar: false, coerce: nil
1579
+ prefixes = sanitize_prefixes prefixes
1580
+
1581
+ raise 'coerce must be either :uri or :rdf' if coerce and
1582
+ not %i[uri rdf].include?(coerce)
1583
+
1584
+ # coerce curie to its value and set refnode if not present
1585
+ if curie.is_a? Nokogiri::XML::Attr
1586
+ refnode ||= curie.parent
1587
+ curie = curie.value.strip.split
1588
+ elsif curie.respond_to? :to_a
1589
+ curie = curie.to_a
1590
+ raise ArgumentError,
1591
+ 'if curie is an array, it has to be all strings' unless
1592
+ curie.all? { |x| x.respond_to? :to_s }
1593
+ curie = curie.map { |x| x.to_s.strip.split }.flatten
1594
+ else
1595
+ raise ArgumentError, 'curie must be stringable' unless
1596
+ curie.respond_to? :to_s
1597
+ curie = curie.to_s.strip.split
1598
+ end
1599
+
1600
+ if refnode
1601
+ raise ArgumentError, 'refnode must be an element' unless
1602
+ refnode.is_a? Nokogiri::XML::Element
1603
+ prefixes = get_prefixes refnode if prefixes.empty?
1604
+ end
1605
+
1606
+ # now we overwrite the vocab
1607
+ if vocab
1608
+ raise ArgumentError, 'vocab must be stringable' unless
1609
+ vocab.respond_to? :to_s
1610
+ prefixes[nil] = vocab.to_s.strip
1611
+ end
1612
+
1613
+ out = curie.map do |c|
1614
+ prefix, slug = /^\[?(?:([^:]+):)?(.*?)\]?$/.match(c).captures
1615
+ prefix = prefix.to_sym if prefix
1616
+ tmp = if prefixes[prefix]
1617
+ prefixes[prefix] + slug
1618
+ else
1619
+ noop ? c : nil
1620
+ end
1621
+ tmp && coerce ? URI_COERCIONS[coerce].call(tmp) : tmp
1622
+ end
1623
+
1624
+ scalar ? out.first : out
1625
+ end
1626
+
1627
+ # Abbreviate one or more URIs into one or more CURIEs if we
1628
+ # can. Will pass the term through if +noop:+ is true, or if false, return nil for
1629
+ # any URI that can't be abbreviated this way. Takes a hash of
1630
+ # prefix-URI mappings where the keys are assumed to be symbols or
1631
+ # +nil+ to express the current vocabulary, which can be overridden
1632
+ # via +vocab:+.
1633
+ #
1634
+ # @note Only +noop: true+ can be guaranteed to return a value.
1635
+ #
1636
+ # @param term [Array<#to_s>, #to_s] the term(s)
1637
+ # @param prefixes [Hash<Symbol,nil>, #to_h] the prefix mappings
1638
+ # @param vocab [#to_s] current vocabulary, overrides +prefixes[nil]+
1639
+ # @param noop [true, false] whether or not to pass terms through
1640
+ # @param sort [true, false] whether or not to sort (only if +noop:+)
1641
+ # @return [String, nil, Array<String,nil>] the (maybe) abbreviated term(s)
1642
+ #
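+ # @example Illustrative (the prefix mapping is hypothetical):
+ #   abbreviate 'http://purl.org/dc/terms/title',
+ #     prefixes: { dct: 'http://purl.org/dc/terms/' }
+ #   # => "dct:title"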
1643
+ def abbreviate term, prefixes: {}, vocab: nil, noop: true, sort: true
1644
+ # this returns a duplicate that we can mess with
1645
+ prefixes = sanitize_prefixes prefixes
1646
+
1647
+ # sanitize vocab
1648
+ raise ArgumentError, 'vocab must be nil or stringable' unless
1649
+ vocab.nil? or vocab.respond_to? :to_s
1650
+ prefixes[nil] = vocab.to_s if vocab
1651
+ scalar = true
1652
+
1653
+ term = if term.respond_to? :to_a
1654
+ scalar = false
1655
+ term.to_a
1656
+ else [term]; end
1657
+
1658
+ rev = prefixes.invert
1659
+
1660
+ term.map! do |t|
1661
+ t = t.to_s
1662
+ slug = nil # we want this value to be nil if no match and !noop
1663
+
1664
+ # try matching each prefix URI from longest to shortest
1665
+ rev.sort { |a, b| b.first.length <=> a.first.length }.each do |uri, pfx|
1666
+ slug = t.delete_prefix uri
1667
+ # this is saying the URI either doesn't match or abbreviates to ""
1668
+ if slug == t or pfx.nil? && slug.empty?
1669
+ slug = nil
1670
+ else
1671
+ # it's already a slug so we add a prefix if there is one
1672
+ slug = '%s:%s' % [pfx, slug] unless pfx.nil?
1673
+ break # we have our match
1674
+ end
1675
+ end
1676
+
1677
+ # at this point slug is either an abbreviated term or nil, so:
1678
+ slug ||= t if noop
1679
+ slug
1680
+ end
1681
+
1682
+ # only sort if noop is set
1683
+ term.sort! if noop && sort
1684
+
1685
+ scalar ? term.first : term
1686
+ end
1687
+
1688
+ ######## RDFA/XML STUFF ########
1689
+
1690
+ # Returns the base URI from the perspective of the given element.
1691
+ # Can optionally be coerced into either a URI or RDF::URI. Also
1692
+ # takes a default value.
1693
+ #
1694
+ # @param elem [Nokogiri::XML::Node] the context element
1695
+ # @param default [nil, #to_s] the default URI
1696
+ # @param coerce [nil, :uri, :rdf] the coercion scheme, if any
1697
+ # @return [nil, String, URI, RDF::URI] the context's base URI
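+ #
+ # An illustrative call; the markup and URL are hypothetical, and the
+ # +:uri+ coercion is assumed to hand back a +URI+ object:
+ #
+ # @example Pick up the base URI declared in the document head
+ #   # given <head>...<base href="https://example.com/"/>...</head>
+ #   get_base node, coerce: :uri
+ #   # => #<URI::HTTPS https://example.com/>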
1698
+ def get_base elem, default: nil, coerce: nil
1699
+ assert_uri_coercion coerce
1700
+
1701
+ if elem.document?
1702
+ elem = elem.root
1703
+ return unless elem
1704
+ end
1705
+
1706
+ # get the xpath
1707
+ xpath = (elem.namespace && elem.namespace.href == XHTMLNS or
1708
+ elem.at_xpath('/html')) ? :htmlbase : :xmlbase
1709
+
1710
+ # now we go looking for the attribute
1711
+ if base = elem.at_xpath(XPATH[xpath], XPATHNS)
1712
+ base = base.value.strip
1713
+ else
1714
+ base = default.to_s.strip if default
1715
+ end
1716
+
1717
+ # clear it out if it's the empty string
1718
+ base = nil if base and base.empty?
1719
+
1720
+ # eh that's about all the input sanitation we're gonna get
1721
+ base && coerce ? URI_COERCIONS[coerce].call(base) : base
1722
+ end
1723
+
1724
+ # Given an X(HT)ML element, returns a hash of prefixes of the form
1725
+ # +{ prefix: "vocab" }+, where the current +@vocab+ is represented
1726
+ # by the +nil+ key. An optional +:traverse+ parameter can be set to
1727
+ # +false+ to prevent ascending the node tree. Any XML namespace
1728
+ # declarations are superseded by the +@prefix+ attribute. Returns
1729
+ # any +@vocab+ declaration found as the +nil+ key.
1730
+ #
1731
+ # @note The +descend: true+ parameter assumes we are trying to
1732
+ # collect all the namespaces in use in the entire subtree, rather
1733
+ # than resolve any particular CURIE. As such, the _first_ prefix
1734
+ # mapping in document order is preserved over subsequent/descendant
1735
+ # ones.
1736
+ #
1737
+ # @param elem [Nokogiri::XML::Node] The context element
1738
+ # @param traverse [true, false] whether or not to traverse the tree
1739
+ # @param coerce [nil, :rdf, :uri] a type coercion for the URIs, if any
1740
+ # @param descend [false, true] go _down_ the tree instead of up
1741
+ # @return [Hash] Depending on +:traverse+, either all prefixes
1742
+ # merged, or just the ones asserted in the element.
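+ #
+ # An illustrative call; the element and its attributes are hypothetical:
+ #
+ # @example Collect prefix mappings from an element
+ #   # given <body vocab="https://example.com/vocab#"
+ #   #             prefix="dct: http://purl.org/dc/terms/">...</body>
+ #   get_prefixes body
+ #   # => { dct: "http://purl.org/dc/terms/",
+ #   #      nil => "https://example.com/vocab#" }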
1743
+ def get_prefixes elem, traverse: true, coerce: nil, descend: false
1744
+ coerce = assert_uri_coercion coerce
1745
+
1746
+ # deal with a common phenomenon
1747
+ elem = elem.root if elem.is_a? Nokogiri::XML::Document
1748
+
1749
+ # get namespace definitions first
1750
+ prefix = elem.namespaces.reject do |k, _| k == 'xmlns'
1751
+ end.transform_keys { |k| k.split(?:)[1].to_sym }
1752
+
1753
+ # now do the prefix attribute
1754
+ if elem.key? 'prefix'
1755
+ # XXX note this assumes largely that the input is clean
1756
+ elem['prefix'].strip.split.each_slice(2) do |k, v|
1757
+ pfx = k.split(?:)[0] or next # otherwise error
1758
+ prefix[pfx.to_sym] = v
1759
+ end
1760
+ end
1761
+
1762
+ # encode the vocab as the null prefix
1763
+ if vocab = elem['vocab']
1764
+ vocab.strip!
1765
+ # note that a specified but empty @vocab means kill any existing vocab
1766
+ prefix[nil] = vocab.empty? ? nil : vocab
1767
+ end
1768
+
1769
+ # don't forget we can coerce
1770
+ prefix.transform_values! { |v| COERCIONS[coerce].call v } if coerce
1771
+
1772
+ # don't proceed if `traverse` is false
1773
+ return prefix unless traverse
1774
+
1775
+ # save us having to recurse in ruby by using xpath implemented in c
1776
+ xpath = '%s::*[namespace::*|@prefix|@vocab]' %
1777
+ (descend ? :descendant : :ancestor)
1778
+ elem.xpath(xpath).each do |e|
1779
+ # this will always merge our prefix on top irrespective of direction
1780
+ prefix = get_prefixes(e, traverse: false, coerce: coerce).merge prefix
1781
+ end
1782
+
1783
+ prefix
1784
+ end
1785
+
1786
+ # Given an X(HT)ML element, return the nearest RDFa _subject_.
1787
+ # Optionally takes +:prefix+ and +:base+ parameters which override
1788
+ # anything found in the document tree.
1789
+ #
1790
+ # @param node [Nokogiri::XML::Element] the node
1791
+ # @param prefixes [Hash] Prefix mapping. Overrides derived values.
1792
+ # @param base [#to_s,URI,RDF::URI] Base URI, overrides as well.
1793
+ # @param coerce [nil, :rdf, :uri] the coercion regime
1794
+ #
1795
+ # @return [URI,RDF::URI,String] the subject
1796
+ #
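+ # A hedged sketch; the markup, the base URI, and the resolved subject
+ # are all illustrative and assume ordinary RDFa subject resolution:
+ #
+ # @example Find the subject governing an element
+ #   # given <section about="#introduction"><p>...</p></section>
+ #   subject_for paragraph, base: 'https://example.com/doc'
+ #   # => RDF::URI("https://example.com/doc#introduction")
+ #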
1797
+ def subject_for node, prefixes: nil, base: nil, coerce: :rdf
1798
+ assert_xml_node node
1799
+ coerce = assert_uri_coercion coerce
1800
+
1801
+ if n = node.at_xpath(XPATH[:literal])
1802
+ return internal_subject_for n,
1803
+ prefixes: prefixes, base: base, coerce: coerce
1804
+ end
1805
+
1806
+ internal_subject_for node, prefixes: prefixes, base: base, coerce: coerce
1807
+ end
1808
+
1809
+ def modernize doc
1810
+ doc.xpath(XPATH[:modernize], XPATHNS).each do |e|
1811
+ # gotta instance_exec because `markup` is otherwise unbound
1812
+ instance_exec e, &MODERNIZE[e.name.to_sym]
1813
+ end
1814
+ end
1815
+
1816
+ # Strip the links surrounding, and the RDFa attributes from,
1817
+ # +dfn+/+abbr+/+span+ tags. Assuming a construct like +<a
1818
+ # rel="some:relation" href="#..." typeof="skos:Concept"><dfn
1819
+ # property="some:property">Term</dfn></a>+ is a link to a glossary
1820
+ # entry, this method returns the term back to an undecorated state
1821
+ # (+<dfn>Term</dfn>+).
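+ #
+ # Spelled out with the construct described above:
+ #
+ # @example Before and after
+ #   <!-- before -->
+ #   <a rel="some:relation" href="#term" typeof="skos:Concept">
+ #     <dfn property="some:property">Term</dfn></a>
+ #   <!-- after -->
+ #   <dfn>Term</dfn>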
1822
+
1823
+ def dehydrate doc
1824
+ doc.xpath(XPATH[:dehydrate], XPATHNS).each do |e|
1825
+ e = e.replace e.elements.first.dup
1826
+ %w[about resource typeof rel rev property datatype].each do |a|
1827
+ e.delete a if e.key? a
1828
+ end
1829
+ end
1830
+ end
1831
+
1832
+ # Scan all the +dfn+/+abbr+/+span+ tags in the document that are not
1833
+ # already wrapped in a link. This method scans the text (or
1834
+ # +@content+) of each element and compares it to the contents of the
1835
+ # graph. If the process locates a subject, it will use that subject
1836
+ # as the basis of a link. If there are zero subjects, or more than
1837
+ # one, then the method executes a block which can be used to pick
1838
+ # (e.g., via user interface) a definite subject or otherwise add one.
1839
+
1840
+ # (maybe add +code+/+kbd+/+samp+/+var+/+time+ one day too)
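+ #
+ # A minimal sketch of how the block is meant to be used; the selection
+ # strategy shown is purely illustrative:
+ #
+ # @example Resolve ambiguous terms with a block
+ #   rehydrate doc, graph do |candidates, graph|
+ #     # +candidates+ maps each subject to { stmts: [...], types: [...] };
+ #     # return one of the subjects (an RDF::Term) or nil to leave the
+ #     # element untouched
+ #     candidates.keys.first
+ #   end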
1841
+
1842
+ def rehydrate doc, graph, &block
1843
+ doc.xpath(XPATH[:rehydrate], XPATHNS).each do |e|
1844
+ lang = e.xpath(XPATH[:lang]).to_s.strip
1845
+ # dt = e['datatype'] # XXX no datatype rn
1846
+ text = (e['content'] || e.xpath('.//text()').to_a.join).strip
1847
+
1848
+ # now we have the literal
1849
+ lit = [RDF::Literal(text)]
1850
+ lit.unshift RDF::Literal(text, language: lang) unless lang.empty?
1851
+
1852
+ # candidates
1853
+ cand = {}
1854
+ lit.map do |t|
1855
+ graph.query(object: t).to_a
1856
+ end.flatten.each do |x|
1857
+ y = cand[x.subject] ||= {}
1858
+ (y[:stmts] ||= []) << x
1859
+ y[:types] ||= graph.query([x.subject, RDF.type, nil]).objects.sort
1860
+ end
1861
+
1862
+ # if there's only one candidate, this is basically a noop
1863
+ chosen = cand.keys.first if cand.size == 1
1864
+
1865
+ # call the block to reconcile any gaps or conflicts
1866
+ if block_given? and cand.size != 1
1867
+ # the block is expected to return one of the candidates or
1868
+ # nil. we call the block with the graph so that the block can
1869
+ # manipulate its contents.
1870
+ chosen = block.call cand, graph
1871
+ raise ArgumentError, 'block must return nil or a term' unless
1872
+ chosen.nil? or chosen.is_a? RDF::Term
1873
+ end
1874
+
1875
+ if chosen
1876
+ # we assume this has been retrieved from the graph
1877
+ cc = cand[chosen]
1878
+ unless cc
1879
+ cc = cand[chosen] = {}
1880
+ cc[:stmts] = graph.query([chosen, nil, lit[0]]).to_a.sort
1881
+ cc[:types] = graph.query([chosen, RDF.type, nil]).objects.sort
1882
+ # if either of these is empty, the graph was not
1883
+ # appropriately populated
1884
+ raise "Missing a statement relating #{chosen} to #{text}" if
1885
+ cc[:stmts].empty?
1886
+ end
1887
+
1888
+ # we should actually probably move any prefix/vocab/xmlns
1889
+ # declarations from the inner node to the outer one (although
1890
+ # in practice this will be an unlikely configuration)
1891
+ pfx = get_prefixes e
1892
+
1893
+ # here we have pretty much everything except for the prefixes
1894
+ # and wherever we want to actually link to.
1895
+
1896
+ inner = e.dup
1897
+ spec = { [inner] => :a, href: '' }
1898
+ # we should have types
1899
+ spec[:typeof] = abbreviate cc[:types], prefixes: pfx unless
1900
+ cc[:types].empty?
1901
+
1902
+ markup replace: e, spec: spec
1903
+ end
1904
+ end
1905
+ # return maybe the elements that did/didn't get changed?
1906
+ end
1907
+
1908
+ ######## RENDERING STUFF ########
1909
+
1910
+ # Given a structure of the form +{ predicate => [objects] }+,
1911
+ # rearrange the structure into one more amenable to rendering
1912
+ # RDFa. Returns a hash of the form +{ resources: { r1 => Set[p1, pn]
1913
+ # }, literals: { l1 => Set[p2, pm] }, types: Set[t1, tn], datatypes:
1914
+ # Set[d1, dn] }+. This inverted structure can then be conveniently
1915
+ # traversed to generate the RDFa. An optional block lets us examine
1916
+ # the predicate-object pairs as they go by.
1917
+ #
1918
+ # @param struct [Hash] The struct of the designated form
1919
+ # @yield [p, o] An optional block is given the predicate-object pair
1920
+ # @return [Hash] The inverted structure, as described.
1921
+ #
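+ # A quick sketch with illustrative terms:
+ #
+ # @example Collate a predicate-object struct for rendering
+ #   struct = {
+ #     RDF::Vocab::DC.creator => [RDF::URI('https://example.com/me')],
+ #     RDF::Vocab::DC.title   => [RDF::Literal('Spam')],
+ #     RDF.type               => [RDF::Vocab::FOAF.Document],
+ #   }
+ #   prepare_collation struct
+ #   # => { resources: { RDF::URI('https://example.com/me') => Set[RDF::Vocab::DC.creator] },
+ #   #      literals:  { RDF::Literal('Spam') => Set[RDF::Vocab::DC.title] },
+ #   #      datatypes: Set[], types: Set[RDF::Vocab::FOAF.Document] }
+ #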
1922
+ def prepare_collation struct, &block
1923
+ resources = {}
1924
+ literals = {}
1925
+ datatypes = Set.new
1926
+ types = Set.new
1927
+
1928
+ struct.each do |p, v|
1929
+ v.each do |o|
1930
+ block.call p, o if block
1931
+
1932
+ if o.literal?
1933
+ literals[o] ||= Set.new
1934
+ literals[o].add p
1935
+ # collect the datatype
1936
+ datatypes.add o.datatype if o.has_datatype?
1937
+ else
1938
+ if p == RDF::RDFV.type
1939
+ # separate the type
1940
+ types.add o
1941
+ else
1942
+ # collect the resource
1943
+ resources[o] ||= Set.new
1944
+ resources[o].add p
1945
+ end
1946
+ end
1947
+ end
1948
+ end
1949
+
1950
+ { resources: resources, literals: literals,
1951
+ datatypes: datatypes, types: types }
1952
+ end
1953
+
1954
+ # Given a hash of prefixes and an array of nodes, obtain the
1955
+ # subset of prefixes that abbreviate the nodes. Scans RDF URIs as
1956
+ # well as RDF::Literal datatypes.
1957
+ #
1958
+ # @param prefixes [#to_h] The prefixes, of the form +{ k: "v" }+
1959
+ # @param nodes [Array<RDF::Term>] The nodes to supply
1960
+ # @return [Hash] The prefix subset
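+ #
+ # An illustrative call; the prefix map is hypothetical and is assumed to
+ # survive +sanitize_prefixes+ unchanged:
+ #
+ # @example Pare a prefix map down to the prefixes actually used
+ #   prefix_subset({ dct: 'http://purl.org/dc/terms/',
+ #                   foaf: 'http://xmlns.com/foaf/0.1/' },
+ #                 [RDF::Vocab::DC.title])
+ #   # => { dct: "http://purl.org/dc/terms/" }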
1961
+ def prefix_subset prefixes, nodes
1962
+ prefixes = sanitize_prefixes prefixes, true
1963
+
1964
+ raise 'nodes must be arrayable' unless nodes.respond_to? :to_a
1965
+
1966
+ # sniff out all the URIs and datatypes
1967
+ resources = Set.new
1968
+ nodes.each do |n|
1969
+ next unless n.is_a? RDF::Term
1970
+ if n.literal? && n.datatype?
1971
+ resources << n.datatype
1972
+ elsif n.uri?
1973
+ resources << n
1974
+ end
1975
+ end
1976
+
1977
+ # now we abbreviate all the resources
1978
+ pfx = abbreviate(resources.to_a,
1979
+ prefixes: prefixes, noop: false, sort: false).uniq.compact.map do |p|
1980
+ p.split(?:).first.to_sym
1981
+ end.uniq.to_set
1982
+
1983
+ # now we return the subset
1984
+ prefixes.select { |k, _| pfx.include? k.to_sym }
1985
+ end
1986
+
1987
+ # Flatten an arbitrary data structure into a set of RDF terms (anything that is not an RDF::Term is ignored)
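+ #
+ # For instance (terms are illustrative):
+ #
+ # @example Flatten a struct into its terms
+ #   smush_struct({ RDF::Vocab::DC.title => [RDF::Literal('Spam')] })
+ #   # => a Set containing RDF::Vocab::DC.title and RDF::Literal('Spam')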
1988
+ def smush_struct struct
1989
+ out = Set.new
1990
+
1991
+ if struct.is_a? RDF::Term
1992
+ out << struct
1993
+ elsif struct.respond_to? :to_a
1994
+ out |= struct.to_a.map { |s| smush_struct(s).to_a }.flatten.to_set
1995
+ end
1996
+
1997
+ out
1998
+ end
1999
+
2000
+ def invert_struct struct
2001
+ nodes = {}
2002
+
2003
+ struct.each do |p, v|
2004
+ v.each do |o|
2005
+ nodes[o] ||= Set.new
2006
+ nodes[o] << p
2007
+ end
2008
+ end
2009
+
2010
+ nodes
2011
+ end
2012
+
2013
+ def title_tag predicates, content,
2014
+ prefixes: {}, vocab: nil, lang: nil, xhtml: true
2015
+
2016
+ # begin with the tag
2017
+ tag = { '#title' => content.to_s,
2018
+ property: abbreviate(predicates, prefixes: prefixes, vocab: vocab) }
2019
+
2020
+ # we set the language if it exists and is different from the
2021
+ # body OR if it is xsd:string we set it to the empty string
2022
+ lang = (content.language? && content.language != lang ?
2023
+ content.language : nil) || (content.datatype == RDF::XSD.string &&
2024
+ lang ? '' : nil)
2025
+ if lang
2026
+ tag['xml:lang'] = lang if xhtml
2027
+ tag[:lang] = lang
2028
+ end
2029
+ if content.datatype? && content.datatype != RDF::XSD.string
2030
+ tag[:datatype] = abbreviate(content.datatype,
2031
+ prefixes: prefixes, vocab: vocab)
2032
+ end
2033
+
2034
+ tag
2035
+ end
2036
+
2037
+ ######## MISC STUFF ########
2038
+
2039
+ # Obtain everything that is an owl:equivalentClass or
2040
+ # rdfs:subClassOf the given type.
2041
+ #
2042
+ # @param rdftype [RDF::Term]
2043
+ #
2044
+ # @return [Array]
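+ #
+ # A hedged sketch; the exact expansion depends on which vocabularies and
+ # entailments RDF::Reasoner has loaded:
+ #
+ # @example Expand a class into its equivalents and subclasses
+ #   all_related RDF::Vocab::FOAF.Document
+ #   # => [RDF::Vocab::FOAF.Document, ...its equivalent and subordinate classes]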
2045
+
2046
+ def all_related rdftype
2047
+ t = RDF::Vocabulary.find_term(rdftype) or raise "No type #{rdftype.to_s}"
2048
+ q = [t] # queue
2049
+ c = {} # cache
2050
+
2051
+ while term = q.shift
2052
+ # add term to cache
2053
+ c[term] = term
2054
+
2055
+ # keep this from tripping up
2056
+ next unless term.uri? and term.respond_to? :class?
2057
+
2058
+ # entail equivalent classes
2059
+ term.entail(:equivalentClass).each do |ec|
2060
+ # add equivalent classes to queue (if not already cached)
2061
+ q.push ec unless c[ec]
2062
+ c[ec] = ec unless ec == term
2063
+ end
2064
+
2065
+ # entail subclasses
2066
+ term.subClass.each do |sc|
2067
+ # add subclasses to queue (if not already cached)
2068
+ q.push sc unless c[sc]
2069
+ c[sc] = sc unless sc == term
2070
+ end
2071
+ end
2072
+
2073
+ # smush the result
2074
+ c.keys
2075
+ end
2076
+
2077
+
2078
+
2079
+ # duplicate instance methods as module methods
2080
+ extend self
2081
+ end