rdf-sak 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +16 -0
- data/.rspec +2 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/LICENSE +202 -0
- data/README.md +268 -0
- data/Rakefile +13 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/example/cleanup.xsl +14 -0
- data/example/matches.xhtml +11 -0
- data/example/transforms.ttl +58 -0
- data/lib/rdf-sak.rb +1 -0
- data/lib/rdf/sak.rb +2506 -0
- data/lib/rdf/sak/ci.rb +827 -0
- data/lib/rdf/sak/cli.rb +35 -0
- data/lib/rdf/sak/docstats.rb +188 -0
- data/lib/rdf/sak/document.rb +772 -0
- data/lib/rdf/sak/ibis.rb +248 -0
- data/lib/rdf/sak/mimemagic.rb +73 -0
- data/lib/rdf/sak/pav.rb +479 -0
- data/lib/rdf/sak/qb.rb +280 -0
- data/lib/rdf/sak/scovo.rb +51 -0
- data/lib/rdf/sak/tfo.rb +301 -0
- data/lib/rdf/sak/transform.rb +1172 -0
- data/lib/rdf/sak/urlrunner.rb +602 -0
- data/lib/rdf/sak/util.rb +2081 -0
- data/lib/rdf/sak/version.rb +5 -0
- data/rdf-sak.gemspec +60 -0
- metadata +366 -0
data/lib/rdf/sak/util.rb
ADDED
@@ -0,0 +1,2081 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require 'rdf/sak/version'
|
3
|
+
|
4
|
+
require 'uri'
|
5
|
+
require 'uri/urn'
|
6
|
+
require 'set'
|
7
|
+
require 'uuid-ncname'
|
8
|
+
|
9
|
+
require 'rdf'
|
10
|
+
require 'rdf/vocab'
|
11
|
+
require 'rdf/reasoner'
|
12
|
+
require 'rdf/vocab/skos'
|
13
|
+
require 'rdf/vocab/foaf'
|
14
|
+
require 'rdf/vocab/bibo'
|
15
|
+
require 'rdf/vocab/dc'
|
16
|
+
require 'rdf/vocab/dc11'
|
17
|
+
|
18
|
+
require 'rdf/sak/mimemagic'
|
19
|
+
require 'rdf/sak/ci'
|
20
|
+
require 'rdf/sak/tfo'
|
21
|
+
require 'rdf/sak/ibis'
|
22
|
+
require 'rdf/sak/pav'
|
23
|
+
require 'rdf/sak/qb'
|
24
|
+
|
25
|
+
# Backport: older releases of the rdf gem lack RDF::List.from, which
# inflates a list from a graph without modifying the graph. Only patch
# when the method is absent so newer rdf versions are left untouched.
unless RDF::List.respond_to? :from
  class RDF::List
    private
    # NOTE(review): a bare +private+ does not affect methods defined
    # with `def self.`; get_list remains publicly callable — confirm
    # whether hiding it was intended.

    # Walk an rdf:first/rdf:rest chain starting at +subject+,
    # collecting the rdf:first objects. +seen+ guards against cycles
    # in malformed lists; traversal stops at rdf:nil, at a missing
    # rdf:first, or when rdf:rest has no non-literal object.
    def self.get_list repo, subject, seen = []
      out = []
      return out if seen.include? subject
      seen << subject
      first = repo.query([subject, RDF.first, nil]).objects.first or return out
      out << first
      # rdf:rest must be a resource; literals are skipped as invalid
      rest = repo.query([subject, RDF.rest, nil]).objects.select do |x|
        !x.literal?
      end.first or return out

      out + (rest != RDF.nil ? get_list(repo, rest, seen) : [])
    end

    public

    # Inflate a list from a graph but don't change the graph
    def self.from graph, subject
      self.new graph: graph, subject: subject, values: get_list(graph, subject)
    end
  end
end
|
50
|
+
|
51
|
+
module RDF::SAK::Util
|
52
|
+
|
53
|
+
private
|
54
|
+
|
55
|
+
RDF::Reasoner.apply(:rdfs, :owl)
|
56
|
+
|
57
|
+
# URI splitter from RFC 3986 Appendix B (capturing variant): groups
# cover scheme, authority, path, query, fragment.
R3986 = /^(([^:\/?#]+):)?(\/\/([^\/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?$/
# Characters *not* allowed to pass through unescaped in a URI
# path/query context (complement of unreserved + sub-delims and a few
# extras); used when percent-encoding. ASCII-8BIT (/n) on purpose.
SF = /[^[:alpha:][:digit:]\/\?%@!$&'()*+,:;=._~-]/n
# Same RFC 3986 split with non-capturing groups: exactly five
# captures — scheme, authority, path, query, fragment.
RFC3986 =
  /^(?:([^:\/?#]+):)?(?:\/\/([^\/?#]*))?([^?#]+)?(?:\?([^#]*))?(?:#(.*))?$/
# [prefix, suffix] pairs for reassembling the five RFC3986 components
# (scheme:, //authority, path, ?query, #fragment) back into a string.
SEPS = [['', ?:], ['//', ''], ['', ''], [??, ''], [?#, '']].freeze
|
62
|
+
|
63
|
+
# Library of XPath expressions used throughout, keyed by purpose.
# Several entries are computed at load time; the html:-prefixed
# variants assume the usual XHTML/Atom/XLink/RDF namespace bindings.
XPATH = {
  # nearest html:base (and non-namespaced fallback) under the root html
  htmlbase: proc {
    x = ['ancestor-or-self::html:html[1]/' \
      'html:head[html:base[@href]][1]/html:base[@href][1]/@href']
    (x << x.first.gsub('html:', '')).join ?| }.call,
  # closest ancestral @xml:base
  xmlbase: 'ancestor-or-self::*[@xml:base][1]/@xml:base',
  # effective language: innermost @lang or @xml:lang
  lang: 'normalize-space((%s)[last()])' %
    %w[lang xml:lang].map do |a|
      'ancestor-or-self::*[@%s][1]/@%s' % [a,a]
    end.join(?|),
  # nearest RDFa ancestor whose content is this element's literal
  literal: '(ancestor::*[@property][not(@content)]' \
    '[not(@resource|@href|@src) or @rel|@rev])[1]',
  # deepest sections containing nothing but scripts
  leaves: 'descendant::html:section[not(descendant::html:section)]' \
    '[not(*[not(self::html:script)])]',
  # text of a leading h1..h6 heading
  headers: './*[1][%s]//text()' %
    (1..6).map { |x| "self::html:h#{x}" }.join(?|),
  # legacy class-based markup to upgrade (see MODERNIZE)
  modernize: ([
    "//html:div[*[1][#{(1..6).map { |i| 'self::html:h%d' % i }.join ?|}]]"] +
    { div: %i[section figure], blockquote: :note,
      table: :figure, img: :figure }.map do |k, v|
      (v.is_a?(Array) ? v : [v]).map do |cl|
        "//html:#{k}[contains(concat(' ', " \
          "normalize-space(@class), ' '), ' #{cl} ')]"
      end
    end.flatten).join(?|),
  # anchors wrapping exactly one dfn/abbr/span
  dehydrate: '//html:a[count(*)=1][html:dfn|html:abbr|html:span]',
  # dfn/abbr/span not already inside an anchor
  rehydrate: %w[//html:dfn
    //html:abbr[not(parent::html:dfn)] //html:span].join(?|) +
    '[not(parent::html:a)]',
  # outbound link attributes, per document type:
  htmllinks: (%w[*[not(self::html:base)][@href]/@href
    *[@src]/@src object[@data]/@data *[@srcset]/@srcset
    form[@action]/@action].map { |e|
      '//html:%s' % e} + %w[//*[@xlink:href]/@xlink:href]).join(?|).freeze,
  atomlinks: %w[uri content/@src category/@scheme generator/@uri icon id
    link/@href logo].map { |e| '//atom:%s' % e }.join(?|).freeze,
  rsslinks: %w[image/text()[1] docs/text()[1] source/@url enclosure/@url
    guid/text()[1] comments/text()[1]].map { |e|
      '//%s' % e }.join(?|).freeze,
  xlinks: '//*[@xlink:href]/@xlink:href'.freeze,
  rdflinks: %w[about resource datatype].map { |e|
    '//*[@rdf:%s]/@rdf:%s' % [e, e] }.join(?|).freeze,
}
|
105
|
+
|
106
|
+
# Map a content type to the XPath expression that extracts outbound
# links for that format; symbols are resolved against XPATH at load
# time, so the values are the final XPath strings.
LINK_MAP = {
  'text/html'             => :htmllinks,
  'application/xhtml+xml' => :htmllinks,
  'application/atom+xml'  => :atomlinks,
  'application/x-rss+xml' => :rsslinks,
  'application/rdf+xml'   => :rdflinks,
  'image/svg+xml'         => :xlinks,
}.transform_values { |v| XPATH[v] }.freeze
|
114
|
+
|
115
|
+
# Coercion lambdas keyed by the caller's requested representation:
# nil/false yield a plain String, :uri a stdlib URI, :rdf an RDF::URI
# (or an RDF::Node for strings with a "_:" blank-node prefix).
URI_COERCIONS = {
  nil => -> t { t.to_s },
  false => -> t { t.to_s },
  uri: -> t { URI.parse t.to_s },
  rdf: -> t {
    t = t.to_s
    t.start_with?('_:') ? RDF::Node.new(t.delete_prefix '_:') : RDF::URI(t) },
}

# Matches an RFC 4122 UUID, optionally urn:uuid:-prefixed; the single
# capture is the bare 8-4-4-4-12 UUID (the final {4} quantified group
# plus the trailing {8} cover the last 12-hex-digit segment).
UUID_RE = /^(?:urn:uuid:)?([0-9a-f]{8}(?:-[0-9a-f]{4}){4}[0-9a-f]{8})$/i
|
125
|
+
|
126
|
+
# okay labels: what do we want to do about them? poor man's fresnel!
|
127
|
+
|
128
|
+
# basic structure is an asserted base class corresponding to a
|
129
|
+
# ranked list of asserted predicates. to the subject we first
|
130
|
+
# match the closest class, then the closest property.
|
131
|
+
|
132
|
+
# if the instance data doesn't have an exact property mentioned in
|
133
|
+
# the spec, it may have an equivalent property or subproperty we
|
134
|
+
# may be able to use. we could imagine a scoring system analogous
|
135
|
+
# to the one used by CSS selectors, albeit using the topological
|
136
|
+
# distance of classes/predicates in the spec versus those in the
|
137
|
+
# instance data.
|
138
|
+
|
139
|
+
# think about dcterms:title is a subproperty of dc11:title even
|
140
|
+
# though they are actually more like equivalent properties;
|
141
|
+
# owl:equivalentProperty is not as big a conundrum as
|
142
|
+
# rdfs:subPropertyOf.
|
143
|
+
|
144
|
+
# if Q rdfs:subPropertyOf P then S Q O implies S P O. this is
|
145
|
+
# great but property Q may not be desirable to display.
|
146
|
+
|
147
|
+
# it may be desirable to be able to express properties to never
|
148
|
+
# use as a label, such as skos:hiddenLabel
|
149
|
+
|
150
|
+
# consider ranked alternates, sequences, sequences of alternates.
|
151
|
+
# (this is what fresnel does fyi)
|
152
|
+
|
153
|
+
# Poor man's Fresnel lens registry: for each (asserted) base class,
# ranked predicate lists for rendering a label and a description.
# Each spec is [main, alt]; when alt is omitted it is cloned from
# main by the seeding loop below.
STRINGS = {
  RDF::RDFS.Resource => {
    label: [
      # main
      [RDF::Vocab::SKOS.prefLabel, RDF::RDFS.label,
        RDF::Vocab::DC.title, RDF::Vocab::DC11.title, RDF::RDFV.value],
      # alt
      [RDF::Vocab::SKOS.altLabel, RDF::Vocab::DC.alternative],
    ],
    desc: [
      # main will be cloned into alt
      [RDF::Vocab::DC.abstract, RDF::Vocab::DC.description,
        RDF::Vocab::DC11.description, RDF::RDFS.comment,
        RDF::Vocab::SKOS.note],
    ],
  },
  RDF::Vocab::FOAF.Document => {
    label: [
      # main
      [RDF::Vocab::DC.title, RDF::Vocab::DC11.title],
      # alt
      [RDF::Vocab::BIBO.shortTitle, RDF::Vocab::DC.alternative],
    ],
    desc: [
      # main
      [RDF::Vocab::BIBO.abstract, RDF::Vocab::DC.abstract,
        RDF::Vocab::DC.description, RDF::Vocab::DC11.description],
      # alt
      [RDF::Vocab::BIBO.shortDescription],
    ],
  },
  RDF::Vocab::FOAF.Agent => {
    label: [
      # main (will get cloned into alt)
      [RDF::Vocab::FOAF.name],
    ],
    desc: [
      # main cloned into alt
      [RDF::Vocab::FOAF.status],
    ],
  },
}
# owl:Thing renders exactly like rdfs:Resource (shared structure, not
# a copy).
STRINGS[RDF::OWL.Thing] = STRINGS[RDF::RDFS.Resource]
|
196
|
+
|
197
|
+
# note this is to_a because "can't modify a hash during iteration"
|
198
|
+
# which i guess is sensible, so we generate a set of pairs first
|
199
|
+
# Seed STRINGS at load time: validate each spec, splice in entailed
# equivalent properties, clone main predicates into alt where alt is
# missing, and register equivalent classes.
# note this is to_a because "can't modify a hash during iteration"
# which i guess is sensible, so we generate a set of pairs first
STRINGS.to_a.each do |type, struct|
  struct.values.each do |lst|
    # assert a whole bunch of stuff
    raise 'STRINGS content must be an array of arrays' unless
      lst.is_a? Array
    raise 'Spec must contain 1 or 2 Array elements' if lst.empty?
    raise 'Spec must be array of arrays of terms' unless
      lst.all? { |x| x.is_a? Array and x.all? { |y|
        RDF::Vocabulary.find_term(y) } }

    # prune this to two elements (not that there should be more than)
    lst.slice!(2, lst.length) if lst.length > 2

    # pre-fill equivalent properties
    lst.each do |preds|
      # for each predicate, find its equivalent properties

      # splice them in after the current predicate only if they
      # are not already explicitly in the list; the index arithmetic
      # skips over the freshly inserted equivalents so they are not
      # themselves re-expanded against the growing list
      i = 0
      loop do
        equiv = preds[i].entail(:equivalentProperty) - preds
        preds.insert(i + 1, *equiv) unless equiv.empty?

        i += equiv.length + 1
        break if i >= preds.length
      end

      # this just causes too many problems otherwise
      # preds.map! { |p| p.to_s }
    end

    # duplicate main predicates to alternatives
    lst[1] ||= lst[0]
  end

  # may as well seed equivalent classes so we don't have to look them up
  type.entail(:equivalentClass).each do |equiv|
    STRINGS[equiv] ||= struct
  end

  # tempting to do subclasses too but it seems pretty costly in
  # this framework; save it for the clojure version
end
|
243
|
+
|
244
|
+
# Ranked predicate lists for finding a document's author(s) and
# contributor(s); expanded in place below with entailed equivalent
# properties (same splice technique as the STRINGS seeding loop),
# then frozen.
AUTHOR = [RDF::SAK::PAV.authoredBy, RDF::Vocab::DC.creator,
  RDF::Vocab::DC11.creator, RDF::Vocab::PROV.wasAttributedTo]
CONTRIB = [RDF::SAK::PAV.contributedBy, RDF::Vocab::DC.contributor,
  RDF::Vocab::DC11.contributor]
[AUTHOR, CONTRIB].each do |preds|
  i = 0
  loop do
    # splice entailed equivalents in after the current predicate,
    # skipping the insertions so they are not re-expanded
    equiv = preds[i].entail(:equivalentProperty) - preds
    preds.insert(i + 1, *equiv) unless equiv.empty?
    i += equiv.length + 1
    break if i >= preds.length
  end

  preds.freeze
end
|
259
|
+
|
260
|
+
# Normalize a prefix mapping: truthy keys become Symbols, truthy
# values become Strings; falsy keys/values pass through as nil. When
# +nonnil+ is true, pairs with a nil key or nil value are dropped.
#
# @param prefixes [Hash, #to_h] the mapping to normalize
# @param nonnil [true, false] whether to prune nil keys/values
#
# @return [Hash] the normalized mapping
#
def sanitize_prefixes prefixes, nonnil = false
  unless prefixes.is_a?(Hash) || prefixes.respond_to?(:to_h)
    raise ArgumentError, 'prefixes must be a hash'
  end

  out = prefixes.to_h.each_with_object({}) do |(key, val), acc|
    acc[key ? key.to_s.to_sym : nil] = val ? val.to_s : nil
  end

  nonnil ? out.reject { |k, v| k.nil? || v.nil? } : out
end
|
270
|
+
|
271
|
+
# Validate a URI-coercion selector. Falsy input passes through
# untouched; anything truthy is symbolized and must be :uri or :rdf.
#
# @param coerce [nil, false, Symbol, String] the requested coercion
#
# @return [nil, false, Symbol] the validated selector
#
def assert_uri_coercion coerce
  return coerce unless coerce

  coerce = coerce.to_s.to_sym if coerce.respond_to? :to_s
  unless %i[uri rdf].include? coerce
    raise 'coerce must be either :uri or :rdf'
  end

  coerce
end
|
279
|
+
|
280
|
+
# Ensure the argument is a Nokogiri XML element, returning it so the
# check can be chained inline.
#
# @param node [Nokogiri::XML::Element] the node to check
#
# @return [Nokogiri::XML::Element] the same node
#
def assert_xml_node node
  unless node.is_a? Nokogiri::XML::Element
    raise 'Argument must be a Nokogiri::XML::Element'
  end

  node
end
|
285
|
+
|
286
|
+
# Derive the RDFa subject for an XML/HTML element by inspecting its
# RDFa attributes (@about, @resource, @href, @src, @typeof, @inlist)
# and, failing that, recursing up the ancestor chain with
# is_ancestor: true, which reverses the attribute priority.
#
# @param node [Nokogiri::XML::Element] the element to inspect
# @param prefixes [Hash, nil] CURIE prefix map; derived when nil
# @param base [RDF::URI, nil] base URI; derived when nil
# @param coerce [nil, :uri, :rdf] output coercion
# @param is_ancestor [true, false] whether +node+ is an ancestor of
#   the element originally asked about
#
# @return [RDF::Resource, nil] the subject, if one can be derived
#
def internal_subject_for node, prefixes: nil, base: nil, coerce: nil,
    is_ancestor: false

  # note we assign these AFTER the literal check or it will be wrong
  prefixes ||= get_prefixes node

  base ||= get_base node
  # NOTE(review): `unless base` means this coerces only when base is
  # *still* nil/false — i.e. it coerces nil; `if base` looks like the
  # intent. Confirm.
  base = coerce_resource base, as: :uri unless base

  # answer a bunch of helpful questions about this element
  subject = nil
  parent = node.parent
  ns_href = node.namespace.href if node.namespace
  up_ok = %i[rel rev].none? { |a| node.key? a }
  # NOTE(review): `or`/`and` bind looser than `=`, so is_root receives
  # only !parent and special receives only the head/body regexp match;
  # the right-hand conditions are evaluated but their results are
  # discarded. Likely a precedence bug (use ||/&&) — confirm intended
  # behavior before changing, as downstream logic may rely on it.
  is_root = !parent or parent.document?
  special = /^(?:[^:]+:)?(?:head|body)$/i === node.name and
    (ns_href == 'http://www.w3.org/1999/xhtml' or
    /^(?:[^:]+:)?html$/xi === parent.name)

  # if the node is being inspected as an ancestor to the
  # original node, we have to check it backwards.
  if is_ancestor
    # ah right @resource gets special treatment
    if subject = node[:resource]
      subject = resolve_curie subject,
        prefixes: prefixes, base: base, scalar: true
    else
      # then check @href and @src
      %i[href src].each do |attr|
        if node.key? attr
          # merge with the root and return it
          subject = base + node[attr]
          break
        end
      end
    end

    return coerce_resource subject, as: coerce if subject

    # note if we are being called with is_ancestor, that means
    # the original node (or indeed any of the nodes previously
    # tested) have anything resembling a resource in them. this
    # means @rel/@rev should be ignored, and we should keep
    # looking for a subject.
  end

  if node[:about]

    subject = resolve_curie node[:about],
      prefixes: prefixes, base: base, scalar: true

    # ignore coercion
    return subject if subject.is_a? RDF::Node

  elsif is_root
    subject = base
  elsif special
    # NOTE(review): subject_for_internal is not defined in this file
    # chunk — confirm it exists elsewhere, or whether
    # internal_subject_for (this method) was intended.
    subject = subject_for_internal parent
  elsif node[:resource]
    # XXX resolve @about against potential curie
    subject = resolve_curie node[:resource], prefixes: prefixes, base: base
  elsif node[:href]
    subject = base + node[:href]
  elsif node[:src]
    subject = base + node[:src]
  elsif node[:typeof]
    # bnode the typeof attr

    # note we return bnodes irrespective of the rdf flag
    return RDF::Node('id-%016x' % node.attributes['typeof'].pointer_id)
  elsif node[:inlist]
    # bnode the inlist attr
    return RDF::Node('id-%016x' % node.attributes['inlist'].pointer_id)
  elsif (parent[:inlist] && %i[href src].none? { |a| parent.key? a }) ||
      (is_ancestor && !up_ok)
    # bnode the element
    return RDF::Node('id-%016x' % node.pointer_id)
  # elsif node[:id]
  else
    subject = subject_for_internal parent, is_ancestor: true
  end

  coerce_resource subject, as: coerce if subject
end
|
370
|
+
|
371
|
+
# Per-element fixups that upgrade legacy class-based markup to HTML5
# sectioning elements; keys correspond to the elements matched by
# XPATH[:modernize]. Each lambda mutates its Nokogiri element in
# place.
MODERNIZE = {
  # div.figure -> figure (unless already wrapped); otherwise section
  div: -> e {
    if e.classes.include? 'figure'
      e.remove_class 'figure'
      e.name = 'figure' unless e.parent.name == 'figure'
    else
      e.remove_class 'section'
      e.name = 'section'
    end
  },
  # blockquote.note -> aside[role=note]
  blockquote: -> e {
    e.remove_class 'note'
    e.name = 'aside'
    e['role'] = 'note'
  },
  # table.figure -> wrap a copy in a figure (markup is a sibling
  # helper; its replace/spec contract is not visible in this chunk)
  table: -> e {
    e.remove_class 'figure'
    unless e.parent.name == 'figure'
      inner = e.dup
      markup replace: e, spec: { [inner] => :figure }
    end
  },
  # img.figure -> same figure-wrapping treatment as table
  img: -> e {
    e.remove_class 'figure'
    unless e.parent.name == 'figure'
      inner = e.dup
      markup replace: e, spec: { [inner] => :figure }
    end
  },
}
|
401
|
+
|
402
|
+
# rdf term type tests
|
403
|
+
# rdf term type tests: map a node-kind symbol to the RDF::Term
# predicate method that recognizes it.
NTESTS = { uri: :"uri?", blank: :"node?", literal: :"literal?" }.freeze
# Alias normalization (:iri => :uri, :bnode => :blank) layered over
# the identity mapping for the canonical kinds.
NMAP = ({ iri: :uri, bnode: :blank }.merge(
  [:uri, :blank, :literal].map { |x| [x, x] }.to_h)).freeze
|
406
|
+
|
407
|
+
public
|
408
|
+
|
409
|
+
# Canonicalize a node-kind spec into a unique array drawn from
# [:uri, :blank, :literal]. :resource expands to :uri + :blank, an
# empty spec means "any kind", and with rev: true the spec describes
# statement subjects, so :literal is rejected (explicit) or stripped
# (implicit).
#
# @param spec [Symbol, Array, #to_a] requested node kinds
# @param rev [true, false] whether the spec is for subjects
#
# @return [Array] canonical node kinds
#
def coerce_node_spec spec, rev: false
  spec = [spec] unless spec.respond_to? :to_a
  if spec.include? :resource
    spec = spec - [:resource] + [:uri, :blank]
  end
  raise 'Subjects are never literals' if rev && spec.include?(:literal)

  out = NMAP.values_at(*spec).compact.uniq
  out = NTESTS.keys if out.empty?
  out -= [:literal] if rev
  out.uniq
end
|
419
|
+
|
420
|
+
# True when +node+ is one of the kinds named in +spec+ (kind symbols
# are resolved to RDF::Term predicate methods via NTESTS).
def node_matches? node, spec
  spec.any? do |kind|
    node.send NTESTS[kind]
  end
end
|
423
|
+
|
424
|
+
# Obtain all and only the rdf:types directly asserted on the subject.
# When an explicit +type+ (a term or list of terms) is supplied, it
# is resolved against the known vocabularies instead of querying the
# repository.
#
# @param repo [RDF::Queryable]
# @param subject [RDF::Resource]
# @param type [RDF::Term, :to_a]
#
# @return [Array]
def self.asserted_types repo, subject, type = nil
  asserted = if type
    candidates = type.respond_to?(:to_a) ? type.to_a : [type]
    candidates.select { |t| t.is_a? RDF::Value }.map do |t|
      RDF::Vocabulary.find_term t
    end
  end

  # no explicit type given: ask the repository
  asserted ||= repo.query([subject, RDF.type, nil]).objects.map do |o|
    RDF::Vocabulary.find_term o
  end.compact

  # unresolvable terms come back nil; keep only distinct URIs
  asserted.select { |t| t && t.uri? }.uniq
end
|
447
|
+
|
448
|
+
# Obtain a stack of types for an asserted initial type or set
# thereof. Returns an array of arrays, where the first is the
# asserted types and their inferred equivalents, and subsequent
# elements are immediate superclasses and their equivalents. A
# given URI will only appear once in the entire structure.
#
# @param rdftype [RDF::Term, :to_a]
#
# @return [Array] strata, outermost (asserted) layer first
#
def type_strata rdftype
  # first we coerce this to an array
  if rdftype.respond_to? :to_a
    rdftype = rdftype.to_a
  else
    rdftype = [rdftype]
  end

  # now squash and coerce
  rdftype = rdftype.uniq.map { |t| RDF::Vocabulary.find_term t }.compact

  # bail out early
  return [] if rdftype.empty?

  # essentially what we want to do is construct a layer of
  # asserted classes and their inferred equivalents, then probe
  # the classes in the first layer for subClassOf assertions,
  # which will form the second layer, and so on.

  queue = [rdftype]
  strata = []
  seen = Set.new

  # breadth-first over superclass layers
  while qin = queue.shift
    qwork = []

    qin.each do |q|
      qwork << q # entail doesn't include q
      qwork += q.entail(:equivalentClass) if q.uri?
    end

    # grep and flatten; subtract seen so each term appears in at most
    # one stratum
    qwork = qwork.map do |t|
      next t if t.is_a? RDF::Vocabulary::Term
      RDF::Vocabulary.find_term t
    end.compact.uniq - seen.to_a
    seen |= qwork

    # warn "qwork == #{qwork.inspect}"

    # push current layer out
    strata.push qwork.dup unless qwork.empty?

    # now deal with subClassOf
    qsuper = []
    qwork.each { |q| qsuper += q.subClassOf }

    # grep and flatten this too
    qsuper = qsuper.map do |t|
      next t if t.is_a? RDF::Vocabulary::Term
      RDF::Vocabulary.find_term t
    end.compact.uniq - seen.to_a
    # do not append qsuper to seen! (they are marked seen only once
    # they actually enter a stratum on a later iteration)

    # warn "qsuper == #{qsuper.inspect}"

    # same deal, conditionally push the input queue
    queue.push qsuper.dup unless qsuper.empty?
  end

  # voila
  strata
end
|
521
|
+
|
522
|
+
# Expand a predicate (or set of predicates) into the set closed under
# owl:equivalentProperty and rdfs:subPropertyOf entailment.
# (NOTE(review): the YARD block that previously sat here described
# objects_for — it appears to have been misplaced.)
#
# @param predicates [RDF::URI, Set, #to_set] seed predicate(s);
#   assumed to resolve to vocabulary terms responding to #entail and
#   #subProperty — TODO confirm for plain RDF::URI inputs
# @param seen [Set] recursion guard; internal use
#
# @return [Set] the expanded predicate set
#
def predicate_set predicates, seen: Set.new
  predicates = Set[predicates] if predicates.is_a? RDF::URI
  unless predicates.is_a? Set
    raise "predicates must be a set" unless predicates.respond_to? :to_set
    predicates = predicates.to_set
  end

  # shortcut
  return predicates if predicates.empty?

  raise 'predicates must all be RDF::URI' unless predicates.all? do |p|
    p.is_a? RDF::URI
  end

  # first we generate the set of equivalent properties for the given
  # properties
  predicates += predicates.map do |p|
    p.entail :equivalentProperty
  end.flatten.to_set

  # then we take the resulting set of properties and
  # compute their subproperties
  subp = Set.new
  (predicates - seen).each do |p|
    subp += p.subProperty.flatten.to_set
  end

  # uhh this whole "seen" business might not be necessary
  # recurse on only the newly discovered subproperties, marking the
  # current set as seen so the recursion terminates
  predicates + predicate_set(subp - predicates - seen, seen: predicates)
end
|
562
|
+
|
563
|
+
# Returns subjects from the graph with entailment.
#
# @param repo [RDF::Queryable] repository to search
# @param predicate [RDF::URI, #to_a] predicate(s) to match
# @param object [RDF::Term] the object all statements share
# @param entail [true, false] whether to expand predicates
# @param only [Symbol, Array] node-kind spec (see #coerce_node_spec)
#
# @yield [subject, forward, reverse] each subject with the predicate
#   sets that selected it
# @return [Array] subjects (or the block's results when given)
#
def self.subjects_for repo, predicate, object, entail: true, only: []
  raise 'Object must be a Term' unless object.is_a? RDF::Term
  predicate = predicate.respond_to?(:to_a) ? predicate.to_a : [predicate]
  raise 'Predicate must be some kind of term' unless
    predicate.all? { |p| p.is_a? RDF::URI }

  only = coerce_node_spec only, rev: true

  predicate = predicate.map { |x| RDF::Vocabulary.find_term x }.compact
  predicate = predicate_set predicate if entail

  # out maps each subject to a pair of sets:
  # [forward predicates that matched, reverse predicates that matched]
  out = {}
  revp = Set.new # inverse/symmetric predicates, gathered below
  predicate.each do |p|
    repo.query([nil, p, object]).subjects.each do |s|
      next unless node_matches? s, only

      entry = out[s] ||= [Set.new, Set.new]
      entry[0] << p
    end

    # do this here while we're at it
    unless object.literal?
      revp += p.inverseOf.to_set
      revp << p if p.type.include? RDF::OWL.SymmetricProperty
    end
  end

  # scan the reverse direction: statements where +object+ is the
  # subject of an inverse/symmetric predicate
  unless object.literal?
    revp = predicate_set revp if entail

    revp.each do |p|
      repo.query([object, p, nil]).objects.each do |o|
        next unless node_matches? o, only

        entry = out[o] ||= [Set.new, Set.new]
        entry[1] << p
      end
    end
  end

  # run this through a block to get access to the predicates
  return out.map { |p, v| yield p, *v } if block_given?

  out.keys
end
|
619
|
+
|
620
|
+
# Returns objects from the graph with entailment.
#
# @param repo [RDF::Queryable] repository to search
# @param subject [RDF::Resource] the subject all statements share
# @param predicate [RDF::URI, #to_a] predicate(s) to match
# @param entail [true, false] whether to expand predicates
# @param only [Symbol, Array] node-kind spec (see #coerce_node_spec)
# @param datatype [RDF::URI, #to_a, nil] restrict literal objects to
#   these datatypes
#
# @yield [object, forward, reverse] each object with the predicate
#   sets that selected it
# @return [Array] objects (or the block's results when given)
#
def self.objects_for repo, subject, predicate,
    entail: true, only: [], datatype: nil
  raise "Subject must be a resource, not #{subject.inspect}" unless
    subject.is_a? RDF::Resource
  predicate = predicate.respond_to?(:to_a) ? predicate.to_a : [predicate]
  raise "Predicate must be a term, not #{predicate.first.class}" unless
    predicate.all? { |p| p.is_a? RDF::URI }

  predicate = predicate.map { |x| RDF::Vocabulary.find_term x }.compact

  only = coerce_node_spec only

  datatype = (
    datatype.respond_to?(:to_a) ? datatype.to_a : [datatype]).compact
  raise 'Datatype must be some kind of term' unless
    datatype.all? { |p| p.is_a? RDF::URI }

  # fluff this out
  predicate = predicate_set predicate if entail

  # out maps each object to a pair of sets:
  # [forward predicates that matched, reverse predicates that matched]
  out = {}
  predicate.each do |p|
    repo.query([subject, p, nil]).objects.each do |o|

      # make sure it's in the spec
      next unless node_matches? o, only

      # constrain output
      next if o.literal? and
        !(datatype.empty? or datatype.include?(o.datatype))

      entry = out[o] ||= [Set.new, Set.new]
      entry.first << p
    end
  end

  # now we do the reverse
  unless only == [:literal]
    # generate reverse predicates
    revp = Set.new
    predicate.each do |p|
      revp += p.inverseOf.to_set
      revp << p if p.type.include? RDF::OWL.SymmetricProperty
    end
    revp = predicate_set revp if entail

    # now scan 'em
    revp.each do |p|
      repo.query([nil, p, subject]).subjects.each do |s|
        next unless node_matches? s, only
        # no need to check datatype; subject is never a literal

        entry = out[s] ||= [Set.new, Set.new]
        entry.last << p
      end
    end
  end

  # run this through a block to get access to the predicates
  return out.map { |p, v| yield p, *v } if block_given?

  out.keys
end
|
694
|
+
|
695
|
+
# Obtain the canonical UUID for the given URI
#
# @param repo [RDF::Queryable]
# @param uri [RDF::URI, URI, to_s] the subject of the inquiry
# @param unique [true, false] return a single resource/nil or an array
# @param published [true, false] whether to restrict to published docs
# @param scache [Hash] subject-presence cache (uri => boolean)
# @param ucache [Hash] memoized results keyed by the original input
# @param base [RDF::URI, nil] base URI for resolving relative input
#
# @return [RDF::URI, Array]
#
def self.canonical_uuid repo, uri, unique: true, published: false,
    scache: {}, ucache: {}, base: nil
  # make sure this is actually a uri
  orig = uri = coerce_resource uri, base
  unless uri.is_a? RDF::Node
    tu = URI(uri_pp(uri).to_s).normalize

    # a bare UUID in the path is promoted to a urn:uuid:
    if tu.path && !tu.fragment &&
        UUID_RE.match?(uu = tu.path.delete_prefix(?/))
      tu = URI('urn:uuid:' + uu.downcase)
    end

    # unconditionally overwrite uri
    uri = RDF::URI(tu.to_s)

    # now check if it's a uuid
    if tu.respond_to? :uuid
      # warn "lol uuid #{orig}"
      # if it's a uuid, check that we have it as a subject
      # if we have it as a subject, return it
      return uri if scache[uri] ||= repo.has_subject?(uri)
      # note i don't want to screw around right now dealing with the
      # case that a UUID might not itself be canonical
    end
  end

  # spit up the cache if present
  if out = ucache[orig]
    # warn "lol cached #{orig}"
    return unique ? out.first : out
  end

  # otherwise we proceed:

  # goal: return the most "appropriate" UUID for the given URI

  # it is so lame i have to do this
  bits = { nil => 0, false => 0, true => 1 }
  # NOTE(review): this `bits` local is only used in the first
  # candidate pass; the slug pass and final sort reference a BITS
  # constant that is not defined in this file chunk — confirm BITS
  # exists elsewhere, otherwise those lines raise NameError.

  # rank (0 is higher):
  # * (00) exact & canonical == 0,
  # * (01) exact == 1,
  # * (10) inexact & canonical == 2,
  # * (11) inexact == 3.

  # warn "WTF URI #{uri}"

  # handle path parameters by generating a bunch of candidates,
  # longest (most parameters retained) first
  uris = if uri.respond_to? :path and uri.path.start_with? ?/
    # split any path parameters off
    uu, *pp = split_pp uri
    if pp.empty?
      [uri] # no path parameters
    else
      uu = RDF::URI(uu.to_s)
      bp = uu.path # base path
      (0..pp.length).to_a.reverse.map do |i|
        u = uu.dup
        u.path = ([bp] + pp.take(i)).join(';')
        u
      end
    end
  else
    [uri] # not a pathful URI
  end

  # collect the candidates by URI; first candidate URI that yields
  # anything wins
  sa = predicate_set [RDF::SAK::CI.canonical,
    RDF::SAK::CI.alias, RDF::OWL.sameAs]
  candidates = nil
  uris.each do |u|
    candidates = subjects_for(repo, sa, u, entail: false) do |s, f|
      # there is no #to_i for booleans and also we xor this number
      [s, { rank: bits[f.include?(RDF::SAK::CI.canonical)] ^ 1,
        published: published?(repo, s),
        mtime: dates_for(repo, s).last || DateTime.new }]
    end.compact.to_h
    break unless candidates.empty?
  end

  # now collect by slug
  slug = terminal_slug uri, base: base
  if slug and !slug.empty?
    exact = uri == coerce_resource(slug, base) # slug represents exact match
    sl = [RDF::SAK::CI['canonical-slug'], RDF::SAK::CI.slug]
    [RDF::XSD.string, RDF::XSD.token].each do |t|
      subjects_for(repo, sl, RDF::Literal(slug, datatype: t)) do |s, f|
        # default to lowest rank if this candidate is new
        entry = candidates[s] ||= {
          published: published?(repo, s, base: base),
          rank: 0b11, mtime: dates_for(repo, s).last || DateTime.new }
        # true is 1 and false is zero so we xor this too
        rank = (BITS[exact] << 1 | BITS[f.include?(sl[0])]) ^ 0b11
        # now amend the rank if we have found a better one
        entry[:rank] = rank if rank < entry[:rank]
      end
    end
  end

  # only urn:uuid: subjects are viable results
  candidates.delete_if { |s, _| !/^urn:uuid:/.match?(s.to_s) }

  # scan all the candidates for replacements and remove any
  # candidates that have been replaced
  candidates.to_a.each do |k, v|
    # note that
    reps = replacements_for(repo, k, published: published) - [k]
    unless reps.empty?
      v[:replaced] = true
      reps.each do |r|
        c = candidates[r] ||= { rank: v[:rank],
          published: published?(repo, r),
          mtime: dates_for(repo, r).last || v[:mtime] || DateTime.new }
        # we give the replacement the rank and mtime of the
        # resource being replaced if it scores better
        c[:rank] = v[:rank] if v[:rank] < c[:rank]
        c[:mtime] = v[:mtime] if v[:mtime] > c[:mtime]
      end
    end
  end

  # now we can remove all unpublished candidates if the context is
  # published
  candidates.select! do |_, v|
    !v[:replaced] && (published ? v[:published] : true)
  end

  # now we sort by rank and date; the highest-ranking newest
  # candidate is the one

  out = candidates.sort do |a, b|
    _, va = a
    _, vb = b
    cb = published ? BITS[vb[:published]] <=> BITS[va[:published]] : 0
    cr = va[:rank] <=> vb[:rank]
    cb == 0 ? cr == 0 ? vb[:mtime] <=> va[:mtime] : cr : cb
  end.map { |x| x.first }.compact

  # set cache
  ucache[orig] = out

  #warn "lol not cached #{orig}"

  unique ? out.first : out

  # an exact match is better than an inexact one

  # a canonical match is better than non-canonical

  # note this is four bits: exact, canon(exact), inexact, canon(inexact)
  # !canon(exact) should rank higher than canon(inexact)

  # unreplaced is better than replaced

  # newer is better than older (though no reason an older item
  # can't replace a newer one)

  # published is better than not, unless the context is
  # unpublished and an unpublished document replaces a published one
end
|
863
|
+
|
864
|
+
SCHEME_RANK = { https: 0, http: 1 } # scheme preference for cmp_resource; anything else ranks 2 via fetch default
|
865
|
+
|
866
|
+
# Compare two RDF values for canonical-preference ordering: URIs
# beat non-URIs, https beats http beats any other scheme, and a
# `www.` host prefix can be preferred (or dispreferred) via `www:`.
#
# @param a [RDF::Value] the first comparand
# @param b [RDF::Value] the second comparand
# @param www [nil, true, false] tri-state preference for a `www.`
#   host prefix; nil means no preference
#
# @return [Integer] -1, 0, or 1, suitable for Enumerable#sort
#
def cmp_resource a, b, www: nil
  raise 'Comparands must be instances of RDF::Value' unless
    [a, b].all? { |x| x.is_a? RDF::Value }

  # URI beats non-URI
  if a.uri?
    if b.uri?
      # https beats http beats other
      as = a.scheme.downcase.to_sym
      bs = b.scheme.downcase.to_sym
      cmp = SCHEME_RANK.fetch(as, 2) <=> SCHEME_RANK.fetch(bs, 2)

      # bail out early
      return cmp unless cmp == 0

      # this would have returned if the schemes were different, as
      # such we only need to test one of them
      if [:http, :https].any?(as) and not www.nil?
        # if www is non-nil, prefer www or no-www depending on
        # truthiness of `www` parameter
        pref = [false, true].zip(www ? [1, 0] : [0, 1]).to_h
        re = /^(?:(www)\.)?(.*?)$/

        ah = re.match(a.host.to_s.downcase)[1,2]
        bh = re.match(b.host.to_s.downcase)[1,2]

        # compare hosts sans www
        cmp = ah[1] <=> bh[1]
        return cmp unless cmp == 0

        # now compare presence of www
        cmp = pref[ah[0] == 'www'] <=> pref[bh[0] == 'www']
        return cmp unless cmp == 0

        # if we're still here, compare the path/query/fragment;
        # an authority-only URI (e.g. `http://foo.com`) does not
        # match this pattern, so guard against a nil MatchData
        # instead of crashing on [] (the old code did `.match(x)[1]`)
        re = /^.*?\/\/.*?(\/.*)$/
        al = (re.match(a.to_s) || [])[1].to_s
        bl = (re.match(b.to_s) || [])[1].to_s

        return al <=> bl
      end

      return a <=> b
    else
      return -1
    end
  elsif b.uri?
    return 1
  else
    return a <=> b
  end
end
|
918
|
+
|
919
|
+
# Compare two resources by their labels for sorting purposes.
#
# @param repo [RDF::Queryable] the repository to look labels up in
# @param a [RDF::Resource] the first comparand
# @param b [RDF::Resource] the second comparand
# @param labels [nil, Hash] optional cache of subject => [pred, value]
# @param supplant [true, false] whether to fetch labels not in the cache
# @param reverse [false, true] whether to invert the comparison
#
# @return [Integer] -1, 0, or 1
#
def self.cmp_label repo, a, b, labels: nil, supplant: true, reverse: false
  labels ||= {}

  # resolve each comparand to its label value: cached pair first,
  # then a fresh lookup (cached for next time), then the resource
  # itself as a last resort
  resolved = [a, b].map do |subj|
    cached = labels[subj]
    if cached
      cached[1]
    else
      fresh = supplant ? label_for(repo, subj) : nil
      if fresh
        labels[subj] = fresh
        fresh[1]
      else
        subj
      end
    end
  end

  resolved.reverse! if reverse
  # warn "#{resolved[0]} <=> #{resolved[1]}"
  resolved.first.to_s <=> resolved.last.to_s
end
|
938
|
+
|
939
|
+
# Obtain the "best" dereferenceable URI for the subject.
|
940
|
+
# Optionally returns all candidates.
|
941
|
+
#
|
942
|
+
# @param repo [RDF::Queryable]
|
943
|
+
# @param subject [RDF::Resource]
|
944
|
+
# @param unique [true, false] flag for unique return value
|
945
|
+
# @param rdf [true, false] flag to specify RDF::URI vs URI
|
946
|
+
# @param slugs [true, false] flag to include slugs
|
947
|
+
# @param fragment [true, false] flag to include fragment URIs
|
948
|
+
#
|
949
|
+
# @return [RDF::URI, URI, Array]
|
950
|
+
#
|
951
|
+
def self.canonical_uri repo, subject, base: nil,
    unique: true, rdf: true, slugs: false, fragment: false
  subject = coerce_resource subject, base
  out = []

  # try to find it first: explicit ci:canonical / owl:sameAs
  # assertions, restricted to terms that actually occur as subjects
  # in the graph, ordered by cmp_resource preference
  out = objects_for(repo, subject, [RDF::SAK::CI.canonical, RDF::OWL.sameAs],
    entail: false, only: :resource).select do |o|
    # only consider the subjects
    repo.has_subject? o
  end.sort { |a, b| cmp_resource a, b }

  # try to generate in lieu
  if subject.uri? and (out.empty? or slugs)

    # resolve any (canonical-)slug literals against the base
    # (note: this assumes `base` responds to `+` — TODO confirm
    # callers always pass a base when slugs: is true)
    out += objects_for(repo, subject,
      [RDF::SAK::CI['canonical-slug'], RDF::SAK::CI.slug],
      only: :literal).map do |o|
      base + o.value
    end if slugs

    uri = URI(uri_pp(subject.to_s))
    if base and uri.respond_to? :uuid
      # a urn:uuid: subject maps to /<uuid> directly under the base
      b = base.clone
      b.query = b.fragment = nil
      b.path = '/' + uri.uuid
      out << RDF::URI.new(b.to_s)
    else
      out << subject
    end
  end

  # remove all URIs with fragments unless specified
  unless fragment
    tmp = out.reject(&:fragment)
    out = tmp unless tmp.empty?
  end

  # coerce to URI objects if specified
  out.map! { |u| URI(uri_pp u.to_s) } unless rdf

  unique ? out.first : out.uniq
end
|
994
|
+
|
995
|
+
# Determine whether the URI represents a published document.
|
996
|
+
#
|
997
|
+
# @param repo
|
998
|
+
# @param uri
|
999
|
+
#
|
1000
|
+
# @return [true, false]
|
1001
|
+
def self.published? repo, uri, circulated: false, base: nil
  subject = coerce_resource uri, base

  # the statuses that count as "published" in this context
  accepted = [RDF::Vocab::BIBO['status/published']]
  accepted << RDF::SAK::CI.circulated if circulated

  # every bibo:status asserted on the subject
  statuses = objects_for(
    repo, subject, RDF::Vocab::BIBO.status, only: :resource)

  # true iff any asserted status is one we accept
  statuses.any? { |s| accepted.include? s }
end
|
1013
|
+
|
1014
|
+
# Obtain a key-value structure for the given subject, optionally
|
1015
|
+
# constraining the result by node type (:resource, :uri/:iri,
|
1016
|
+
# :blank/:bnode, :literal)
|
1017
|
+
#
|
1018
|
+
# @param repo
|
1019
|
+
# @param subject of the inquiry
|
1020
|
+
# @param rev map in reverse
|
1021
|
+
# @param only one or more node types
|
1022
|
+
# @param uuids coerce resources to if possible
|
1023
|
+
#
|
1024
|
+
# @return [Hash]
|
1025
|
+
#
|
1026
|
+
def self.struct_for repo, subject, base: nil,
    rev: false, only: [], uuids: false, canon: false, ucache: {}, scache: {}
  only = coerce_node_spec only

  # coerce the subject
  subject = canonical_uuid(repo, subject,
    base: base, scache: scache, ucache: ucache) || subject if uuids

  rsrc = {}
  # rev: true inverts the traversal: gather statements *pointing at*
  # the subject rather than emanating from it
  pattern = rev ? [nil, nil, subject] : [subject, nil, nil]
  repo.query(pattern) do |stmt|
    # this will skip over any term not matching the type
    node = rev ? stmt.subject : stmt.object
    next unless node_matches? node, only

    # coerce the node to uuid if told to
    if node.resource?
      if uuids
        # NOTE(review): when ucache already has the node, uu stays
        # nil and we fall through to the canon branch rather than
        # reading the cached value — confirm this is intended
        uu = canonical_uuid(repo, node, scache: scache, ucache: ucache) unless
          ucache.key? node
        node = uu || (canon ? canonical_uri(repo, node) : node)
      elsif canon
        node = canonical_uri(repo, node)
      end
    end

    # normalize the predicate to a known vocabulary term when possible
    p = RDF::Vocabulary.find_term(stmt.predicate) || stmt.predicate
    o = rsrc[p] ||= []
    o.push node if node # may be nil
  end

  # XXX in here we can do fun stuff like filter/sort by language/datatype
  rsrc.values.each { |v| v.sort!.uniq! }

  rsrc
end
|
1062
|
+
|
1063
|
+
# Obtain the most appropriate label(s) for the subject's type(s).
|
1064
|
+
# Returns one or more (depending on the `unique` flag)
|
1065
|
+
# predicate-object pairs in order of preference.
|
1066
|
+
#
|
1067
|
+
# @param repo [RDF::Queryable]
|
1068
|
+
# @param subject [RDF::Resource]
|
1069
|
+
# @param unique [true, false] only return the first pair
|
1070
|
+
# @param type [RDF::Term, Array] supply asserted types if already retrieved
|
1071
|
+
# @param lang [nil] not currently implemented (will be conneg)
|
1072
|
+
# @param desc [false, true] retrieve description instead of label
|
1073
|
+
# @param alt [false, true] retrieve alternate instead of main
|
1074
|
+
#
|
1075
|
+
# @return [Array] either a predicate-object pair or an array of pairs.
|
1076
|
+
#
|
1077
|
+
def self.label_for repo, subject, candidates: nil, unique: true, type: nil,
    lang: nil, desc: false, alt: false, base: nil
  raise ArgumentError, 'no repo!' unless repo.is_a? RDF::Queryable
  return unless subject.is_a? RDF::Value and subject.resource?

  # the subject's asserted rdf:type(s) determine which label
  # predicates apply (type: can pre-supply them)
  asserted = asserted_types repo, subject, type

  # get all the inferred types by layer; add default class if needed
  strata = type_strata asserted
  strata.push [RDF::RDFS.Resource] if
    strata.empty? or not strata[-1].include?(RDF::RDFS.Resource)

  # get the key-value pairs for the subject
  candidates ||= struct_for repo, subject, only: :literal

  seen = {}
  accum = []
  # walk the strata from most to least specific, collecting unique
  # (predicate, value) pairs in preference order
  strata.each do |lst|
    lst.each do |cls|
      # STRINGS is indexed [class][:label|:desc][main|alt] to a list
      # of predicates — presumably defined earlier in this file;
      # confirm against the STRINGS declaration
      next unless STRINGS[cls] and
        preds = STRINGS[cls][desc ? :desc : :label][alt ? 1 : 0]
      # warn cls
      preds.each do |p|
        # warn p.inspect
        next unless vals = candidates[p]
        vals.each do |v|
          pair = [p, v]
          accum.push(pair) unless seen[pair]
          seen[pair] = true
        end
      end
    end
  end

  # try that for now
  unique ? accum[0] : accum.uniq

  # what we want to do is match the predicates from the subject to
  # the predicates in the label designation

  # get label predicate stack(s) for RDF type(s)

  # get all predicates in order (use alt stack if doubly specified)

  # filter out desired language(s)

  # XXX note we will probably want to return the predicate as well
end
|
1125
|
+
|
1126
|
+
# Assuming the subject is a thing that has authors, return the
|
1127
|
+
# list of authors. Try bibo:authorList first for an explicit
|
1128
|
+
# ordering, then continue to the various other predicates.
|
1129
|
+
#
|
1130
|
+
# @param repo [RDF::Queryable]
|
1131
|
+
# @param subject [RDF::Resource]
|
1132
|
+
# @param unique [false, true] only return the first author
|
1133
|
+
# @param contrib [false, true] return contributors instead of authors
|
1134
|
+
#
|
1135
|
+
# @return [RDF::Value, Array]
|
1136
|
+
#
|
1137
|
+
def authors_for repo, subject, unique: false, contrib: false, base: nil
  authors = []

  # try the author list
  lp = [RDF::Vocab::BIBO[contrib ? :contributorList : :authorList]]
  lp += lp.first.entail(:equivalentProperty) # XXX cache this
  lp.each do |pred|
    o = repo.first_object([subject, pred, nil])
    next unless o
    # note this use of RDF::List is not particularly well-documented
    authors += RDF::List.from(repo, o).to_a
  end

  # now try various permutations of the author/contributor predicate
  unsorted = []
  preds = contrib ? CONTRIB : AUTHOR
  preds.each do |pred|
    unsorted += repo.query([subject, pred, nil]).objects
  end
  unsorted.uniq!

  # prefetch the names of the *unsorted* authors — these are the
  # ones we are about to sort by label (the explicitly-listed
  # authors above keep their list order). the previous code built
  # this map from `authors`, leaving every unsorted entry without a
  # key, so the sort comparator returned nil and raised.
  labels = unsorted.map { |a| [a, label_for(repo, a)] }.to_h

  # sort by the label literal, falling back to the resource itself
  # when no label exists so the comparison never fails
  authors += unsorted.sort_by do |a|
    pair = labels[a] # label_for returns a [predicate, value] pair
    (pair ? pair[1] : a).to_s
  end

  unique ? authors.first : authors.uniq
end
|
1164
|
+
|
1165
|
+
# Find the terminal replacements for the given subject, if any exist.
|
1166
|
+
#
|
1167
|
+
# @param repo
|
1168
|
+
# @param subject
|
1169
|
+
# @param published indicate the context is published
|
1170
|
+
#
|
1171
|
+
# @return [Set]
|
1172
|
+
#
|
1173
|
+
def self.replacements_for repo, subject, published: true, base: nil
  subject = coerce_resource subject, base

  # `seen` is a hash mapping resources to publication status and
  # subsequent replacements. it collects all the resources in the
  # replacement chain in :fwd (replaces) and :rev (replaced-by)
  # members, along with a boolean :pub. `seen` also performs a
  # duty as cycle-breaking sentinel.

  seen = {}
  queue = [subject]
  while (test = queue.shift)
    # fwd is "replaces", rev is "replaced by"
    entry = seen[test] ||= {
      pub: published?(repo, test), fwd: Set.new, rev: Set.new }
    queue += (
      # these queries are anchored on `test` (the node we just
      # dequeued), not the original subject; the previous code
      # queried `subject` here, so the walk could never advance
      # past the first hop of the replacement chain
      subjects_for(repo, RDF::Vocab::DC.replaces, test) +
      objects_for(repo, test, RDF::Vocab::DC.isReplacedBy,
        only: :resource)
    ).uniq.map do |r| # r = replacement
      next if seen.include? r
      seen[r] ||= { pub: published?(repo, r), fwd: Set.new, rev: Set.new }
      seen[r][:fwd] << test
      entry[:rev] << r
      r
    end.compact.uniq
  end

  # if we're calling from a published context, we return the
  # (topologically) last published resource(s), even if they are
  # replaced ultimately by unpublished resources.

  # terminal resources: those with no outbound replaced-by edges
  out = seen.map { |k, v| v[:rev].empty? ? k : nil }.compact - [subject]

  # now we modify `out` based on the publication status of the context
  if published
    pubout = out.select { |o| seen[o][:pub] }
    # if there is anything left after this, return it
    return pubout unless pubout.empty?
    # now we want to find the penultimate elements of `seen` that
    # are farthest along the replacement chain but whose status is
    # published

    # start with `out`, take the union of their :fwd members, then
    # take the subset of those which are published. if the result
    # is empty, repeat. (this is walking backwards through the
    # graph we just walked forwards through to construct `seen`)
    loop do
      # XXX THIS NEEDS A TEST CASE
      out = seen.values_at(*out).map { |v| v[:fwd] }.reduce(:+).to_a
      break if out.empty?
      pubout = out.select { |o| seen[o][:pub] }
      return pubout unless pubout.empty?
    end
  end

  out
end
|
1231
|
+
|
1232
|
+
# Obtain dates for the subject as instances of Date(Time). This is
|
1233
|
+
# just shorthand for a common application of `objects_for`.
|
1234
|
+
#
|
1235
|
+
# @param repo
|
1236
|
+
# @param subject
|
1237
|
+
# @param predicate
|
1238
|
+
# @param datatype
|
1239
|
+
#
|
1240
|
+
# @return [Array] of dates
|
1241
|
+
#
|
1242
|
+
def self.dates_for repo, subject, predicate: RDF::Vocab::DC.date,
    datatype: [RDF::XSD.date, RDF::XSD.dateTime]
  # collect the matching literals, unwrap them to Date/DateTime via
  # the block, then order and deduplicate
  found = objects_for(repo, subject, predicate,
    only: :literal, datatype: datatype) { |o| o.object }
  found.sort.uniq
end
|
1249
|
+
|
1250
|
+
# Obtain any specified MIME types for the subject. Just shorthand
|
1251
|
+
# for a common application of `objects_for`.
|
1252
|
+
#
|
1253
|
+
# @param repo
|
1254
|
+
# @param subject
|
1255
|
+
# @param predicate
|
1256
|
+
# @param datatype
|
1257
|
+
#
|
1258
|
+
# @return [Array] of internet media types
|
1259
|
+
#
|
1260
|
+
def formats_for repo, subject, predicate: RDF::Vocab::DC.format,
    datatype: [RDF::XSD.token]
  # collect the literal values, keeping only those shaped like a
  # type/subtype media type and wrapping them as MimeMagic objects
  types = objects_for(repo, subject, predicate,
    only: :literal, datatype: datatype) do |o|
    value = o.object
    value =~ /\// ? RDF::SAK::MimeMagic.new(value.to_s.downcase) : nil
  end
  # non-matching values came back nil, so drop them before ordering
  types.compact.sort.uniq
end
|
1268
|
+
|
1269
|
+
# Obtain the base URI in effect for the given XML node, preferring
# an absolute <html:base href> (for HTML documents) or xml:base (on
# the document root) over the supplied default.
#
# @param xmlnode [Nokogiri::XML::Node] the context node
# @param base [URI, #to_s] the fallback base URI
#
# @return [URI] the effective base URI
#
def self.base_for xmlnode, base
  base = URI(base.to_s) unless base.is_a? URI
  out = base

  # an (X)HTML document can override the base via <base href="..."/>
  if xmlnode.at_xpath('self::html:*|/html', XPATHNS)
    b = URI(xmlnode.at_xpath(XPATH[:htmlbase], XPATHNS).to_s.strip)

    # only an absolute base may replace the supplied one
    out = b if b.absolute?
  elsif b = xmlnode.root.at_xpath(XPATH[:xmlbase])
    # otherwise consult xml:base on the document root
    b = URI(b.to_s.strip)
    out = b if b.absolute?
  end

  out
end
|
1284
|
+
|
1285
|
+
# Traverse links based on content type.
#
# @param node [Nokogiri::XML::Node] the node to scan for links
# @param type [#to_s] the content type selecting which XPath to use
# @yield [link] each matching link node
#
# @return [Enumerator] when called without a block
#
def self.traverse_links node, type: 'application/xhtml+xml', &block
  # without `return` the Enumerator was discarded and execution fell
  # through to `block.call` on a nil block
  return enum_for :traverse_links, node, type: type unless block
  # strip any media-type parameters (e.g. ;charset=utf-8)
  type = type.strip.downcase.gsub(/\s*;.*/, '')
  xpath = LINK_MAP.fetch type, XPATH[:xlinks]
  # (renamed the block parameter so it no longer shadows `node`)
  node.xpath(xpath, XPATHNS).each { |link| block.call link }
end
|
1292
|
+
|
1293
|
+
|
1294
|
+
# XXX OTHER STUFF
|
1295
|
+
|
1296
|
+
# Isolate an element into a new document.
#
# @param doc [Nokogiri::XML::Document] the source document
# @param xpath [String] expression selecting the element to excise
# @param reindent [true, false] whether to reflow indentation
# @param prefixes [Hash] namespace prefixes for the xpath
#
# @return [Nokogiri::XML::Node, Nokogiri::XML::Document, nil] a dup
#   of the root for the noop xpath, a fresh document otherwise, or
#   nil when nothing matches or the xpath is malformed
#
def subtree doc, xpath = '/*', reindent: true, prefixes: {}
  # at this time we shouldn't try to do anything cute with the xpath
  # even though it is attractive to want to prune out prefixes

  # how about we start with a noop
  # (note this early return skips the reindent step entirely)
  return doc.root.dup if xpath == '/*'

  begin
    nodes = doc.xpath xpath, prefixes
    return unless
      nodes and nodes.is_a?(Nokogiri::XML::NodeSet) and !nodes.empty?
    out = Nokogiri::XML::Document.new
    out << nodes.first.dup
    # NOTE(review): the `reindent:` keyword shadows the #reindent
    # method; this still parses as a method call (it takes an
    # argument), but the shadowing is worth confirming/renaming
    reindent out.root if reindent
    out
  rescue Nokogiri::SyntaxError
    return
  end
end
|
1316
|
+
|
1317
|
+
# Re-flow the whitespace-only margins of the text nodes under +node+
# so nested elements are indented +depth+ levels of +indent+.
# Mutates the tree in place and returns the same node.
#
# @param node [Nokogiri::XML::Node] node whose children get reflowed
# @param depth [Integer] current nesting depth
# @param indent [String] one unit of indentation
#
# @return [Nokogiri::XML::Node] the (mutated) input node
#
def reindent node, depth = 0, indent = ' '
  kids = node.children
  if kids and child = kids.first
    loop do
      if child.element?
        # recurse into the element
        reindent child, depth + 1, indent
      elsif child.text?
        text = child.content || ''

        # optional horizontal whitespace followed by at least
        # one newline (we don't care what kind), followed by
        # optional horizontal or vertical whitespace
        preamble = !!text.gsub!(/\A[ \t]*[\r\n]+\s*/, '')

        # then we don't care what's in the middle, but hey let's get
        # rid of dos newlines because we can always put them back
        # later if we absolutely have to
        text.gsub!(/\r+/, '')

        # then optionally any whitespace followed by at least
        # another newline again, followed by optional horizontal
        # whitespace and then the end of the string
        epilogue = !!text.gsub!(/\s*[\r\n]+[ \t]*\z/, '')

        # if we prune these off we'll have a text node that is
        # either the empty string or it isn't (note we will only
        # register an epilogue if the text has some non-whitespace
        # in it, because otherwise the first regex would have
        # snagged everything, so it's probably redundant)

        # if it's *not* empty then we *prepend* indented whitespace
        if preamble and !text.empty?
          d = depth + (child.previous ? 1 : 0)
          text = "\n" + (indent * d) + text
        end

        # then we unconditionally *append*, (modulo there being a
        # newline in the original at all), but we have to check by
        # how much: if this is *not* the last node then depth + 1,
        # otherwise depth
        if preamble or epilogue
          d = depth + (child.next ? 1 : 0)
          text << "\n" + (indent * d)
        end

        child.content = text
      end

      break unless child = child.next
    end
  end

  node
end
|
1373
|
+
|
1374
|
+
XHTMLNS = 'http://www.w3.org/1999/xhtml'.freeze # XHTML namespace
XHV = 'http://www.w3.org/1999/xhtml/vocab#'.freeze # XHTML (RDFa) vocabulary
# prefix => namespace map handed to Nokogiri xpath queries
XPATHNS = {
  html: XHTMLNS,
  svg: 'http://www.w3.org/2000/svg',
  atom: 'http://www.w3.org/2005/Atom',
  xlink: 'http://www.w3.org/1999/xlink',
}.freeze
|
1382
|
+
|
1383
|
+
######## URI STUFF ########
|
1384
|
+
|
1385
|
+
# Preprocess a URI string so that it can be handed to +URI.parse+
|
1386
|
+
# without crashing.
|
1387
|
+
#
|
1388
|
+
# @param uri [#to_s] The URI string in question
|
1389
|
+
# @param extra [#to_s] Character class of any extra characters to escape
|
1390
|
+
# @return [String] The sanitized (appropriately escaped) URI string
|
1391
|
+
|
1392
|
+
# really gotta stop carting this thing around
|
1393
|
+
def uri_pp uri, extra = ''
  # take care of malformed escapes
  uri = uri.to_s.b.gsub(/%(?![0-9A-Fa-f]{2})/n, '%25')
  # percent-encode any caller-specified extra characters
  uri.gsub!(/([#{Regexp.quote extra}])/) do |s|
    sprintf('%%%02X', s.ord)
  end unless extra.empty?
  # we want the minimal amount of escaping so we split out the separators
  # (RFC3986, SEPS and SF are module-level constants defined elsewhere
  # in this file — presumably the URI-splitting regex, per-part
  # separator pairs, and the per-part unsafe-character pattern; confirm
  # against their declarations)
  out = ''
  parts = RFC3986.match(uri).captures
  parts.each_index do |i|
    next if parts[i].nil?
    out << SEPS[i].first
    out << parts[i].b.gsub(SF) { |s| sprintf('%%%02X', s.ord) }
    out << SEPS[i].last
  end

  # make sure escaped hex is upper case like the rfc says
  out.gsub(/(%[0-9A-Fa-f]{2})/) { |x| x.upcase }
end
|
1412
|
+
|
1413
|
+
# Given a URI as input, split any query parameters into an array of
|
1414
|
+
# key-value pairs. If +:only+ is true, this will just return the
|
1415
|
+
# pairs. Otherwise it will prepend the query-less URI to the array,
|
1416
|
+
# and can be captured with an idiom like +uri, *qp = split_qp uri+.
|
1417
|
+
#
|
1418
|
+
# @param uri [URI,#to_s] The URI to extract parameters from
|
1419
|
+
# @param only [false, true] whether to only return the parameters
|
1420
|
+
# @return [Array] (See description)
|
1421
|
+
#
|
1422
|
+
def split_qp uri, only: false
  uri = URI(uri_pp uri.to_s) unless uri.is_a? URI
  # URI.decode_www_form raises ArgumentError on nil, so bail out
  # early when there is no query string at all
  return only ? [] : [uri] unless uri.query
  qp = URI::decode_www_form(uri.query)
  return qp if only
  uri.query = nil
  [uri] + qp
end
|
1429
|
+
|
1430
|
+
# Given a URI as input, split any path parameters out of the last
|
1431
|
+
# path segment. Works the same way as #split_pp.
|
1432
|
+
#
|
1433
|
+
# @param uri [URI,#to_s] The URI to extract parameters from
|
1434
|
+
# @param only [false, true] whether to only return the parameters
|
1435
|
+
# @return [Array] (See description)
|
1436
|
+
#
|
1437
|
+
def split_pp uri, only: false
  begin
    u = (uri.is_a?(URI) ? uri : URI(uri_pp uri.to_s)).normalize

  rescue URI::InvalidURIError => e
    # these stock error messages don't even tell you what the uri is
    raise URI::InvalidURIError, "#{e.message} (#{uri.to_s})"
  end

  # NOTE(review): this returns the *original* argument, not the
  # normalized copy `u` — confirm that is intentional
  return only ? [] : [uri] unless u.path
  uri = u

  # split the path into segments, peel the parameters (;a;b) off the
  # last one, and rejoin the remainder as the base path
  ps = uri.path.split '/', -1
  pp = ps.pop.split ';', -1
  bp = (ps + [pp.shift]).join '/'
  uri = uri.dup

  begin
    uri.path = bp
  rescue URI::InvalidURIError => e
    # these stock error messages don't even tell you what the uri is
    m = e.message
    raise URI::InvalidURIError, "#{m} (#{uri.to_s}, #{bp})"
  end

  return pp if only
  [uri] + pp
end
|
1465
|
+
|
1466
|
+
# Split the path parameters out of the last segment of a bare path
# string; like #split_pp but without any URI parsing.
#
# @param path [#to_s, nil] the path to dissect
# @param only [false, true] whether to return only the parameters
#
# @return [Array] the base path (unless +only+) followed by any
#   path parameters
#
def split_pp2 path, only: false
  # ''.split misbehaves, so dispose of blank/absent input up front
  return only ? [] : [''] if !path or path.empty?

  segments = path.to_s.split ?/, -1
  # the final segment carries the semicolon-delimited parameters
  params = segments.pop.to_s.split ?;, -1
  # reattach the parameter-less final segment to form the base path
  stem = (segments << params.shift).join ?/

  only ? params : params.unshift(stem)
end
|
1476
|
+
|
1477
|
+
# Coerce a stringlike argument into a URI. Raises an exception if
|
1478
|
+
# the string can't be turned into a valid URI. Optionally resolves
|
1479
|
+
# against a +base+, and the coercion can be tuned to either URI or
|
1480
|
+
# RDF::URI via +:as+.
|
1481
|
+
#
|
1482
|
+
# @param arg [URI, RDF::URI, #to_s] The input string
|
1483
|
+
# @param base [URI, RDF::URI, #to_s] The optional base URI
|
1484
|
+
# @param as [:rdf, :uri, nil] The optional coercion type
|
1485
|
+
# @return [URI, RDF::URI, String]
|
1486
|
+
#
|
1487
|
+
def coerce_resource arg, base = nil, as: :rdf
  as = assert_uri_coercion as
  # already the right type? hand it straight back
  return arg if as and arg.is_a?({ uri: URI, rdf: RDF::URI }[as])
  raise ArgumentError, 'arg must be stringable' unless arg.respond_to? :to_s

  arg = arg.to_s.strip

  if arg.start_with? '_:' and as
    # override the coercion if this is a blank node
    as = :rdf
  elsif base
    begin
      # resolve relative references against the (sanitized) base
      arg = (base.is_a?(URI) ? base : URI(uri_pp base.to_s.strip)).merge arg
    rescue URI::InvalidURIError => e
      # best-effort: warn and return nil rather than propagate
      warn "attempted to coerce #{arg} which turned out to be invalid: #{e}"
      return
    end
  end

  URI_COERCIONS[as].call arg
end
|
1508
|
+
|
1509
|
+
# Coerce a stringlike argument into a UUID URN. Will raise ArgumentError if the argument cannot be coerced into a valid urn:uuid:.
|
1510
|
+
def coerce_uuid_urn arg, base = nil
  # if this is an ncname then change it
  if ([URI, RDF::URI] & arg.class.ancestors).empty? &&
      arg.respond_to?(:to_s)
    arg = arg.to_s

    # coerce ncname to uuid
    arg = UUID::NCName::from_ncname(arg, version: 1) if arg =~
      /^[A-P](?:[0-9A-Z_-]{20}|[2-7A-Z]{24})[A-P]$/i

    # now the string is either a UUID or it isn't
    arg = "urn:uuid:#{arg}" unless arg.start_with? 'urn:uuid:'
  else
    # already a URI object: just normalize the case
    arg = arg.class.new arg.to_s.downcase unless arg == arg.to_s.downcase
  end

  # the pattern reads 8-(4){4}-8 but matches the canonical 8-4-4-4-12
  # layout: the last dash group plus the trailing {8} form the final
  # 12-hex segment
  raise ArgumentError, 'not a UUID' unless
    arg.to_s =~ /^urn:uuid:[0-9a-f]{8}(?:-[0-9a-f]{4}){4}[0-9a-f]{8}$/

  arg = coerce_resource arg, base
end
|
1531
|
+
|
1532
|
+
# Get the last non-empty path segment of the URI
|
1533
|
+
#
|
1534
|
+
# @param uri
|
1535
|
+
#
|
1536
|
+
# @return [String]
|
1537
|
+
def terminal_slug uri, base: nil
  uri = coerce_resource uri, base
  # coerce_resource may return nil or a blank node with no path
  return unless uri.respond_to? :path
  if p = uri.path
    # strip leading/trailing slashes
    if p = /^\/+(.*?)\/*$/.match(p)
      # take the last non-empty segment
      if p = p[1].split(/\/+/).last
        # we need to escape colons or it will think it's absolute
        return uri_pp(p.split(/;+/).first || '', ':')
      end
    end
  end
  ''
end
|
1550
|
+
|
1551
|
+
# Resolve a string or array or attribute node containing one or more
|
1552
|
+
# terms/CURIEs against a set of prefixes. The CURIE can be a string,
|
1553
|
+
# Nokogiri::XML::Attr, or an array thereof. Strings are stripped and
|
1554
|
+
# split on whitespace. +:prefixes+ and +:base+ can be supplied or
|
1555
|
+
# gleaned from +:refnode+, which itself can be gleaned if +curie+ is
|
1556
|
+
# a Nokogiri::XML::Attr. Returns an array of (attempted) resolved
|
1557
|
+
# terms unless +:scalar+ is true, in which case only the first URI
|
1558
|
+
# is returned. When +:noop+ is true, this method will always return
|
1559
|
+
# a value. Can coerce results to either RDF::URI or URI objects.
|
1560
|
+
#
|
1561
|
+
# @note +:vocab+ overrides, and is the same as supplying
|
1562
|
+
# +prefix[nil]+. It is only meaningful when +:term+ (i.e., when we
|
1563
|
+
# expect the input to be an RDFa term) is true.
|
1564
|
+
#
|
1565
|
+
# @param curie [#to_s, Nokogiri::XML::Attr,Array] One or more CURIEs
|
1566
|
+
# @param prefixes [#to_h] The hash of prefixes (nil key is equivalent
|
1567
|
+
# to vocab)
|
1568
|
+
# @param vocab [nil,#to_s] An optional base URI
|
1569
|
+
# @param refnode [nil, Nokogiri::XML::Element] A reference node for resolution
|
1570
|
+
# @param term [false, true] Whether to treat the input as an RDFa _term_
|
1571
|
+
# @param noop [true, false] Whether to skip if the CURIE can't be resolved
|
1572
|
+
# @param scalar [false, true] Whether to return a scalar value
|
1573
|
+
# @param coerce [nil, :rdf, :uri] Desired type coercion for the output
|
1574
|
+
#
|
1575
|
+
# @return [nil,URI,RDF::URI,Array<nil,URI,RDF::URI>]
|
1576
|
+
#
|
1577
|
+
def resolve_curie curie, prefixes: {}, vocab: nil, base: nil,
    refnode: nil, term: false, noop: true, scalar: false, coerce: nil
  prefixes = sanitize_prefixes prefixes

  raise 'coerce must be either :uri or :rdf' if coerce and
    not %i[uri rdf].include?(coerce)

  # coerce curie to its value and set refnode if not present
  if curie.is_a? Nokogiri::XML::Attr
    refnode ||= curie.parent
    curie = curie.value.strip.split
  elsif curie.respond_to? :to_a
    curie = curie.to_a
    raise ArgumentError,
      'if curie is an array, it has to be all strings' unless
      curie.all? { |x| x.respond_to? :to_s }
    curie = curie.map { |x| x.to_s.strip.split }.flatten
  else
    raise ArgumentError, 'curie must be stringable' unless
      curie.respond_to? :to_s
    curie = curie.to_s.strip.split
  end

  if refnode
    raise ArgumentError, 'refnode must be an element' unless
      refnode.is_a? Nokogiri::XML::Element
    # glean prefix mappings from the reference node if none supplied
    prefixes = get_prefixes refnode if prefixes.empty?
  end

  # now we overwrite the vocab
  if vocab
    raise ArgumentError, 'vocab must be stringable' unless
      vocab.respond_to? :to_s
    prefixes[nil] = vocab.to_s.strip
  end

  # NOTE(review): the term: flag is accepted but not referenced in
  # this body — confirm whether it is still meant to alter behavior

  out = curie.map do |c|
    # optional [] wrapper (SafeCURIE), optional prefix:, then slug
    prefix, slug = /^\[?(?:([^:]+):)?(.*?)\]?$/.match(c).captures
    prefix = prefix.to_sym if prefix
    tmp = if prefixes[prefix]
            prefixes[prefix] + slug
          else
            # unresolvable: pass through verbatim when noop, else nil
            noop ? c : nil
          end
    tmp && coerce ? URI_COERCIONS[coerce].call(tmp) : tmp
  end

  scalar ? out.first : out
end
|
1626
|
+
|
1627
|
+
# Abbreviate one or more URIs into one or more CURIEs if we
|
1628
|
+
# can. Will pass the term through if +noop:+ is true, or if false, return nil for
|
1629
|
+
# any URI that can't be abbreviated this way. Takes a hash of
|
1630
|
+
# prefix-URI mappings where the keys are assumed to be symbols or
|
1631
|
+
# +nil+ to express the current vocabulary, which can be overridden
|
1632
|
+
# via +vocab:+.
|
1633
|
+
#
|
1634
|
+
# @note Only +noop: true+ can be guaranteed to return a value.
|
1635
|
+
#
|
1636
|
+
# @param term [Array<#to_s>, #to_s] the term(s)
|
1637
|
+
# @param prefixes [Hash<Symbol,nil>, #to_h] the prefix mappings
|
1638
|
+
# @param vocab [#to_s] current vocabulary, overrides +prefixes[nil]+
|
1639
|
+
# @param noop [true, false] whether or not to pass terms through
|
1640
|
+
# @param sort [true, false] whether or not to sort (only if +noop:+)
|
1641
|
+
# @return [String, nil, Array<String,nil>] the (maybe) abbreviated term(s)
|
1642
|
+
#
|
1643
|
+
def abbreviate term, prefixes: {}, vocab: nil, noop: true, sort: true
|
1644
|
+
# this returns a duplicate that we can mess with
|
1645
|
+
prefixes = sanitize_prefixes prefixes
|
1646
|
+
|
1647
|
+
# sanitize vocab
|
1648
|
+
raise ArgumentError, 'vocab must be nil or stringable' unless
|
1649
|
+
vocab.nil? or vocab.respond_to? :to_s
|
1650
|
+
prefixes[nil] = vocab.to_s if vocab
|
1651
|
+
scalar = true
|
1652
|
+
|
1653
|
+
term = if term.respond_to? :to_a
|
1654
|
+
scalar = false
|
1655
|
+
term.to_a
|
1656
|
+
else [term]; end
|
1657
|
+
|
1658
|
+
rev = prefixes.invert
|
1659
|
+
|
1660
|
+
term.map! do |t|
|
1661
|
+
t = t.to_s
|
1662
|
+
slug = nil # we want this value to be nil if no match and !noop
|
1663
|
+
|
1664
|
+
# try matching each prefix URI from longest to shortest
|
1665
|
+
rev.sort { |a, b| b.first.length <=> a.first.length }.each do |uri, pfx|
|
1666
|
+
slug = t.delete_prefix uri
|
1667
|
+
# this is saying the URI either doesn't match or abbreviates to ""
|
1668
|
+
if slug == t or pfx.nil? && slug.empty?
|
1669
|
+
slug = nil
|
1670
|
+
else
|
1671
|
+
# it's already a slug so we add a prefix if there is one
|
1672
|
+
slug = '%s:%s' % [pfx, slug] unless pfx.nil?
|
1673
|
+
break # we have our match
|
1674
|
+
end
|
1675
|
+
end
|
1676
|
+
|
1677
|
+
# at this point slug is either an abbreviated term or nil, so:
|
1678
|
+
slug ||= t if noop
|
1679
|
+
slug
|
1680
|
+
end
|
1681
|
+
|
1682
|
+
# only sort if noop is set
|
1683
|
+
term.sort! if noop && sort
|
1684
|
+
|
1685
|
+
scalar ? term.first : term
|
1686
|
+
end
|
1687
|
+
|
1688
|
+
######## RDFA/XML STUFF ########
|
1689
|
+
|
1690
|
+
# Returns the base URI from the perspective of the given element.
# Can optionally be coerced into either a URI or RDF::URI. Also
# takes a default value.
#
# @param elem [Nokogiri::XML::Node] the context element
# @param default [nil, #to_s] the default URI
# @param coerce [nil, :uri, :rdf] the coercion scheme, if any
# @return [nil, String, URI, RDF::URI] the context's base URI
def get_base elem, default: nil, coerce: nil
  assert_uri_coercion coerce

  # documents have no attributes; drop down to the root element
  if elem.document?
    elem = elem.root
    return unless elem
  end

  # decide which flavour of base declaration applies here
  html = elem.namespace && elem.namespace.href == XHTMLNS ||
    elem.at_xpath('/html')
  attr = elem.at_xpath(XPATH[html ? :htmlbase : :xmlbase], XPATHNS)

  # take the declared base if present, otherwise the supplied default
  base = attr ? attr.value.strip : (default.to_s.strip if default)

  # an empty string counts as no base at all
  base = nil if base && base.empty?

  # eh that's about all the input sanitation we're gonna get
  base && coerce ? URI_COERCIONS[coerce].call(base) : base
end
|
1723
|
+
|
1724
|
+
# Given an X(HT)ML element, returns a hash of prefixes of the form
# +{ prefix: "vocab" }+, where the current +@vocab+ is represented
# by the +nil+ key. An optional +:traverse+ parameter can be set to
# +false+ to prevent ascending the node tree. Any XML namespace
# declarations are superseded by the +@prefix+ attribute. Returns
# any +@vocab+ declaration found as the +nil+ key.
#
# @note The +descend: true+ parameter assumes we are trying to
#  collect all the namespaces in use in the entire subtree, rather
#  than resolve any particular CURIE. As such, the _first_ prefix
#  mapping in document order is preserved over subsequent/descendant
#  ones.
#
# @param elem [Nokogiri::XML::Node] The context element
# @param traverse [true, false] whether or not to traverse the tree
# @param coerce [nil, :rdf, :uri] a type coercion for the URIs, if any
# @param descend [false, true] go _down_ the tree instead of up
# @return [Hash] Depending on +:traverse+, either all prefixes
#  merged, or just the ones asserted in the element.
def get_prefixes elem, traverse: true, coerce: nil, descend: false
  coerce = assert_uri_coercion coerce

  # deal with a common phenomenon
  elem = elem.root if elem.is_a? Nokogiri::XML::Document

  # get namespace definitions first
  prefix = elem.namespaces.reject do |k, _| k == 'xmlns'
  end.transform_keys { |k| k.split(?:)[1].to_sym }

  # now do the prefix attribute
  if elem.key? 'prefix'
    # XXX note this assumes largely that the input is clean
    elem['prefix'].strip.split.each_slice(2) do |k, v|
      pfx = k.split(?:)[0] or next # otherwise error
      prefix[pfx.to_sym] = v
    end
  end

  # encode the vocab as the null prefix
  if vocab = elem['vocab']
    vocab.strip!
    # note that a specified but empty @vocab means kill any existing vocab
    prefix[nil] = vocab.empty? ? nil : vocab
  end

  # don't forget we can coerce
  prefix.transform_values! { |v| COERCIONS[coerce].call v } if coerce

  # don't proceed if `traverse` is false
  return prefix unless traverse

  # save us having to recurse in ruby by using xpath implemented in c
  xpath = '%s::*[namespace::*|@prefix|@vocab]' %
    (descend ? :descendant : :ancestor)
  elem.xpath(xpath).each do |e|
    # this will always merge our prefix on top irrespective of direction
    # XXX fix: this previously called the nonexistent `get_prefix`
    # (singular), raising NoMethodError whenever traversal reached a
    # node carrying namespace/@prefix/@vocab declarations
    prefix = get_prefixes(e, traverse: false, coerce: coerce).merge prefix
  end

  prefix
end
|
1785
|
+
|
1786
|
+
# Given an X(HT)ML element, return the nearest RDFa _subject_.
# Optionally takes +:prefix+ and +:base+ parameters which override
# anything found in the document tree.
#
# @param node [Nokogiri::XML::Element] the node
# @param prefixes [Hash] Prefix mapping. Overrides derived values.
# @param base [#to_s,URI,RDF::URI] Base URI, overrides as well.
# @param coerce [nil, :rdf, :uri] the coercion regime
#
# @return [URI,RDF::URI,String] the subject
#
def subject_for node, prefixes: nil, base: nil, coerce: :rdf
  assert_xml_node node
  coerce = assert_uri_coercion coerce

  # prefer the nearest literal context node when there is one,
  # otherwise resolve against the node itself
  target = node.at_xpath(XPATH[:literal]) || node

  internal_subject_for target, prefixes: prefixes, base: base, coerce: coerce
end
|
1808
|
+
|
1809
|
+
# Run the registered MODERNIZE handlers over every matching element
# in the document, normalizing legacy markup in place.
#
# @param doc [Nokogiri::XML::Document] the document to modernize
def modernize doc
  doc.xpath(XPATH[:modernize], XPATHNS).each do |node|
    handler = MODERNIZE[node.name.to_sym]
    # gotta instance_exec because `markup` is otherwise unbound
    instance_exec node, &handler
  end
end
|
1815
|
+
|
1816
|
+
# Strip all the links surrounding and RDFa attributes off
# +dfn+/+abbr+/+span+ tags. Assuming a construct like +<a
# rel="some:relation" href="#..." typeof="skos:Concept"><dfn
# property="some:property">Term</dfn></a>+ is a link to a glossary
# entry, this method returns the term back to an undecorated state
# (+<dfn>Term</dfn>+).
#
# @param doc [Nokogiri::XML::Document] the document to process
def dehydrate doc
  doc.xpath(XPATH[:dehydrate], XPATHNS).each do |e|
    # XXX fix: guard against a match with no child element; this
    # previously raised NoMethodError on nil.dup
    inner = e.elements.first or next
    e = e.replace inner.dup
    %w[about resource typeof rel rev property datatype].each do |a|
      e.delete a if e.key? a
    end
  end
end
|
1831
|
+
|
1832
|
+
# Scan all the +dfn+/+abbr+/+span+ tags in the document that are not
# already wrapped in a link. This method scans the text (or
# +@content+) of each element and compares it to the contents of the
# graph. If the process locates a subject, it will use that subject
# as the basis of a link. If there are zero subjects, or more than
# one, then the method executes a block which can be used to pick
# (e.g., via user interface) a definite subject or otherwise add one.
#
# (maybe add +code+/+kbd+/+samp+/+var+/+time+ one day too)
#
# @param doc [Nokogiri::XML::Document] the document
# @param graph [RDF::Queryable] the graph to find candidate subjects in
# @yield [cand, graph] yields the candidate map for disambiguation
# @yieldreturn [RDF::Term, nil] the chosen subject, if any
def rehydrate doc, graph, &block
  doc.xpath(XPATH[:rehydrate], XPATHNS).each do |e|
    lang = e.xpath(XPATH[:lang]).to_s.strip
    # dt = e['datatype'] # XXX no datatype rn
    text = (e['content'] || e.xpath('.//text()').to_a.join).strip

    # now we have the literal; language-tagged variant takes precedence
    lit = [RDF::Literal(text)]
    lit.unshift RDF::Literal(text, language: lang) unless lang.empty?

    # candidates
    cand = {}
    lit.flat_map do |t|
      graph.query(object: t).to_a
    end.each do |x|
      y = cand[x.subject] ||= {}
      (y[:stmts] ||= []) << x
      y[:types] ||= graph.query([x.subject, RDF.type, nil]).objects.sort
    end

    # if there's only one candidate, this is basically a noop
    chosen = cand.keys.first if cand.size == 1

    # call the block to reconcile any gaps or conflicts
    if block_given? and cand.size != 1
      # the block is expected to return one of the candidates or
      # nil. we call the block with the graph so that the block can
      # manipulate its contents.
      chosen = block.call cand, graph
      raise ArgumentError, 'block must return nil or a term' unless
        chosen.nil? or chosen.is_a? RDF::Term
    end

    if chosen
      # we assume this has been retrieved from the graph
      cc = cand[chosen]
      unless cc
        cc = cand[chosen] = {}
        cc[:stmts] = graph.query([chosen, nil, lit[0]]).to_a.sort
        cc[:types] = graph.query([chosen, RDF.type, nil]).objects.sort
        # if either of these are empty then the graph was not
        # appropriately populated
        # XXX fix: this message was single-quoted, so the
        # interpolations never actually happened
        raise "Missing a statement relating #{chosen} to #{text}" if
          cc[:stmts].empty?
      end

      # we should actually probably move any prefix/vocab/xmlns
      # declarations from the inner node to the outer one (although
      # in practice this will be an unlikely configuration)
      pfx = get_prefixes e

      # here we have pretty much everything except for the prefixes
      # and wherever we want to actually link to.
      inner = e.dup
      spec = { [inner] => :a, href: '' }
      # we should have types
      spec[:typeof] = abbreviate cc[:types], prefixes: pfx unless
        cc[:types].empty?

      markup replace: e, spec: spec
    end
  end
  # return maybe the elements that did/didn't get changed?
end
|
1907
|
+
|
1908
|
+
######## RENDERING STUFF ########
|
1909
|
+
|
1910
|
+
# Given a structure of the form +{ predicate => [objects] }+,
# rearrange the structure into one more amenable to rendering
# RDFa. Returns a hash of the form +{ resources: { r1 => Set[p1, pn]
# }, literals: { l1 => Set[p2, pm] }, types: Set[t1, tn], datatypes:
# Set[d1, dn] }+. This inverted structure can then be conveniently
# traversed to generate the RDFa. An optional block lets us examine
# the predicate-object pairs as they go by.
#
# @param struct [Hash] The struct of the designated form
# @yield [p, o] An optional block is given the predicate-object pair
# @return [Hash] The inverted structure, as described.
#
def prepare_collation struct, &block
  out = { resources: {}, literals: {},
    datatypes: Set.new, types: Set.new }

  struct.each do |predicate, objects|
    objects.each do |obj|
      block.call predicate, obj if block

      if obj.literal?
        (out[:literals][obj] ||= Set.new) << predicate
        # collect the datatype
        out[:datatypes] << obj.datatype if obj.has_datatype?
      elsif predicate == RDF::RDFV.type
        # separate the type
        out[:types] << obj
      else
        # collect the resource
        (out[:resources][obj] ||= Set.new) << predicate
      end
    end
  end

  out
end
|
1953
|
+
|
1954
|
+
# Given a hash of prefixes and an array of nodes, obtain the the
# subset of prefixes that abbreviate the nodes. Scans RDF URIs as
# well as RDF::Literal datatypes.
#
# @param prefixes [#to_h] The prefixes, of the form +{ k: "v" }+
# @param nodes [Array<RDF::Term>] The nodes to supply
# @return [Hash] The prefix subset
def prefix_subset prefixes, nodes
  prefixes = sanitize_prefixes prefixes, true

  raise 'nodes must be arrayable' unless nodes.respond_to? :to_a

  # collect every URI and literal datatype in sight
  resources = Set.new
  nodes.each do |node|
    next unless node.is_a? RDF::Term
    if node.literal? && node.datatype?
      resources << node.datatype
    elsif node.uri?
      resources << node
    end
  end

  # abbreviate the resources and retain only the prefix halves
  used = abbreviate(resources.to_a,
    prefixes: prefixes, noop: false, sort: false).uniq.compact.map do |curie|
    curie.split(?:).first.to_sym
  end.to_set

  # hand back only the mappings that actually got used
  prefixes.select { |k, _| used.include? k.to_sym }
end
|
1986
|
+
|
1987
|
+
# Recursively flatten any (possibly nested) data structure into a
# set of RDF terms. Anything that is neither a term nor arrayable
# contributes nothing.
#
# @param struct [Object] an arbitrary structure
# @return [Set] the terms found within
def smush_struct struct
  return Set[struct] if struct.is_a? RDF::Term

  if struct.respond_to? :to_a
    struct.to_a.reduce(Set.new) { |set, member| set | smush_struct(member) }
  else
    Set.new
  end
end
|
1999
|
+
|
2000
|
+
# Invert a structure of the form +{ predicate => [objects] }+ into
# one of the form +{ object => Set[predicates] }+.
#
# @param struct [Hash] the structure to invert
# @return [Hash] the inverted structure
def invert_struct struct
  struct.each_with_object({}) do |(predicate, objects), inverted|
    objects.each { |o| (inverted[o] ||= Set.new) << predicate }
  end
end
|
2012
|
+
|
2013
|
+
# Build a +#title+ tag spec from a literal +content+ and the
# +predicates+ relating it to the subject, abbreviated into
# +@property+. Language and datatype are carried over from the
# literal where applicable.
#
# @param predicates [Array] predicates to abbreviate into +@property+
# @param content [RDF::Literal] the title content (language/datatype-aware)
# @param prefixes [Hash] prefix mappings passed to +abbreviate+
# @param vocab [#to_s] current vocabulary, overrides +prefixes[nil]+
# @param lang [String, nil] language of the surrounding context, if any
# @param xhtml [true, false] whether to also emit +xml:lang+
# @return [Hash] the tag spec
def title_tag predicates, content,
    prefixes: {}, vocab: nil, lang: nil, xhtml: true

  # begin with the tag
  tag = { '#title' => content.to_s,
    property: abbreviate(predicates, prefixes: prefixes, vocab: vocab) }

  # we set the language if it exists and is different from the
  # body OR if it is xsd:string we set it to the empty string
  # (an explicit empty @lang overrides the context language)
  lang = (content.language? && content.language != lang ?
    content.language : nil) || (content.datatype == RDF::XSD.string &&
    lang ? '' : nil)
  if lang
    tag['xml:lang'] = lang if xhtml
    tag[:lang] = lang
  end
  # a non-string datatype is surfaced as @datatype
  if content.datatype? && content.datatype != RDF::XSD.string
    tag[:datatype] = abbreviate(content.datatype,
      prefixes: prefixes, vocab: vocab)
  end

  tag
end
|
2036
|
+
|
2037
|
+
######## MISC STUFF ########
|
2038
|
+
|
2039
|
+
# Obtain everything that is an owl:equivalentClass or
# rdfs:subClassOf the given type.
#
# @param rdftype [RDF::Term]
#
# @return [Array]
def all_related rdftype
  root = RDF::Vocabulary.find_term(rdftype) or
    raise "No type #{rdftype.to_s}"

  seen  = {} # cache of everything we have encountered
  queue = [root]

  while term = queue.shift
    # record the term itself
    seen[term] = term

    # keep this from tripping up
    next unless term.uri? and term.respond_to? :class?

    # entail equivalent classes (enqueue only unseen ones)
    term.entail(:equivalentClass).each do |eq|
      queue.push eq unless seen[eq]
      seen[eq] = eq unless eq == term
    end

    # entail subclasses (likewise)
    term.subClass.each do |sub|
      queue.push sub unless seen[sub]
      seen[sub] = sub unless sub == term
    end
  end

  # smush the result
  seen.keys
end
|
2076
|
+
|
2077
|
+
|
2078
|
+
|
2079
|
+
# duplicate instance methods as module methods
|
2080
|
+
extend self
|
2081
|
+
end
|