rdf-sak 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +16 -0
- data/.rspec +2 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/LICENSE +202 -0
- data/README.md +268 -0
- data/Rakefile +13 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/example/cleanup.xsl +14 -0
- data/example/matches.xhtml +11 -0
- data/example/transforms.ttl +58 -0
- data/lib/rdf-sak.rb +1 -0
- data/lib/rdf/sak.rb +2506 -0
- data/lib/rdf/sak/ci.rb +827 -0
- data/lib/rdf/sak/cli.rb +35 -0
- data/lib/rdf/sak/docstats.rb +188 -0
- data/lib/rdf/sak/document.rb +772 -0
- data/lib/rdf/sak/ibis.rb +248 -0
- data/lib/rdf/sak/mimemagic.rb +73 -0
- data/lib/rdf/sak/pav.rb +479 -0
- data/lib/rdf/sak/qb.rb +280 -0
- data/lib/rdf/sak/scovo.rb +51 -0
- data/lib/rdf/sak/tfo.rb +301 -0
- data/lib/rdf/sak/transform.rb +1172 -0
- data/lib/rdf/sak/urlrunner.rb +602 -0
- data/lib/rdf/sak/util.rb +2081 -0
- data/lib/rdf/sak/version.rb +5 -0
- data/rdf-sak.gemspec +60 -0
- metadata +366 -0
data/lib/rdf/sak/util.rb
ADDED
@@ -0,0 +1,2081 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require 'rdf/sak/version'
|
3
|
+
|
4
|
+
require 'uri'
|
5
|
+
require 'uri/urn'
|
6
|
+
require 'set'
|
7
|
+
require 'uuid-ncname'
|
8
|
+
|
9
|
+
require 'rdf'
|
10
|
+
require 'rdf/vocab'
|
11
|
+
require 'rdf/reasoner'
|
12
|
+
require 'rdf/vocab/skos'
|
13
|
+
require 'rdf/vocab/foaf'
|
14
|
+
require 'rdf/vocab/bibo'
|
15
|
+
require 'rdf/vocab/dc'
|
16
|
+
require 'rdf/vocab/dc11'
|
17
|
+
|
18
|
+
require 'rdf/sak/mimemagic'
|
19
|
+
require 'rdf/sak/ci'
|
20
|
+
require 'rdf/sak/tfo'
|
21
|
+
require 'rdf/sak/ibis'
|
22
|
+
require 'rdf/sak/pav'
|
23
|
+
require 'rdf/sak/qb'
|
24
|
+
|
25
|
+
# Backport: give RDF::List a non-destructive constructor that inflates
# an existing rdf:List out of a graph. Only installed when the loaded
# rdf gem does not already provide RDF::List.from.
unless RDF::List.respond_to? :from
  class RDF::List
    private

    # Walk the rdf:first/rdf:rest chain rooted at +subject+ and collect
    # the member terms. The +seen+ accumulator breaks cycles.
    #
    # @param repo [RDF::Queryable] graph to read from
    # @param subject [RDF::Resource] head of the (sub)list
    # @param seen [Array] subjects already visited
    # @return [Array] the list members in order
    def self.get_list repo, subject, seen = []
      members = []
      # a subject we have already visited means the list is cyclic; stop
      return members if seen.include? subject
      seen << subject

      head = repo.query([subject, RDF.first, nil]).objects.first
      return members unless head
      members << head

      # rdf:rest must be a resource; a literal tail is malformed, so we
      # simply truncate the list there
      tail = repo.query([subject, RDF.rest, nil]).objects.reject(&:literal?).first
      return members unless tail

      tail == RDF.nil ? members : members + get_list(repo, tail, seen)
    end

    public

    # Inflate a list from a graph but don't change the graph.
    def self.from graph, subject
      new graph: graph, subject: subject, values: get_list(graph, subject)
    end
  end
end
|
50
|
+
|
51
|
+
module RDF::SAK::Util
|
52
|
+
|
53
|
+
private
|
54
|
+
|
55
|
+
  # load the RDFS and OWL entailment rules once at module load time;
  # the #entail calls made throughout this module depend on this
  RDF::Reasoner.apply(:rdfs, :owl)

  # URI-splitting regex from RFC 3986 appendix B (capturing variant:
  # delimiters are included in the outer captures)
  R3986 = /^(([^:\/?#]+):)?(\/\/([^\/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?$/
  # bytes that are NOT safe in a loosely pct-encoded URI component;
  # /n forces ASCII-8BIT so this can be applied to raw byte strings
  SF = /[^[:alpha:][:digit:]\/\?%@!$&'()*+,:;=._~-]/n
  # same RFC 3986 split with non-capturing delimiter groups, so the five
  # captures are exactly scheme, authority, path, query, fragment
  RFC3986 =
    /^(?:([^:\/?#]+):)?(?:\/\/([^\/?#]*))?([^?#]+)?(?:\?([^#]*))?(?:#(.*))?$/
  # [prefix, suffix] pairs used to reassemble the five RFC3986 components
  # back into a URI string, in component order
  SEPS = [['', ?:], ['//', ''], ['', ''], [??, ''], [?#, '']].freeze
|
62
|
+
|
63
|
+
  # Canned XPath expressions used throughout the module. Entries ending
  # in "links" extract outbound references from the corresponding
  # document type; the html-flavoured ones assume the `html:` (and
  # `xlink:`/`atom:`/`rdf:`) prefixes are bound by the caller.
  XPATH = {
    # <base href> in the XHTML head; the proc generates both the
    # namespaced and un-namespaced variants and joins them with |
    htmlbase: proc {
      x = ['ancestor-or-self::html:html[1]/' \
        'html:head[html:base[@href]][1]/html:base[@href][1]/@href']
      (x << x.first.gsub('html:', '')).join ?| }.call,
    xmlbase: 'ancestor-or-self::*[@xml:base][1]/@xml:base',
    # nearest @lang or @xml:lang on or above the context node
    lang: 'normalize-space((%s)[last()])' %
      %w[lang xml:lang].map do |a|
        'ancestor-or-self::*[@%s][1]/@%s' % [a,a]
      end.join(?|),
    # nearest RDFa literal-bearing ancestor
    literal: '(ancestor::*[@property][not(@content)]' \
      '[not(@resource|@href|@src) or @rel|@rev])[1]',
    # deepest sections that contain nothing but (optionally) scripts
    leaves: 'descendant::html:section[not(descendant::html:section)]' \
      '[not(*[not(self::html:script)])]',
    # text of a leading h1..h6 child
    headers: './*[1][%s]//text()' %
      (1..6).map { |x| "self::html:h#{x}" }.join(?|),
    # legacy class-based markup that MODERNIZE (below) knows how to fix
    modernize: ([
      "//html:div[*[1][#{(1..6).map { |i| 'self::html:h%d' % i }.join ?|}]]"] +
      { div: %i[section figure], blockquote: :note,
        table: :figure, img: :figure }.map do |k, v|
        (v.is_a?(Array) ? v : [v]).map do |cl|
          "//html:#{k}[contains(concat(' ', " \
            "normalize-space(@class), ' '), ' #{cl} ')]"
        end
      end.flatten).join(?|),
    dehydrate: '//html:a[count(*)=1][html:dfn|html:abbr|html:span]',
    rehydrate: %w[//html:dfn
      //html:abbr[not(parent::html:dfn)] //html:span].join(?|) +
      '[not(parent::html:a)]',
    htmllinks: (%w[*[not(self::html:base)][@href]/@href
      *[@src]/@src object[@data]/@data *[@srcset]/@srcset
      form[@action]/@action].map { |e|
        '//html:%s' % e} + %w[//*[@xlink:href]/@xlink:href]).join(?|).freeze,
    atomlinks: %w[uri content/@src category/@scheme generator/@uri icon id
      link/@href logo].map { |e| '//atom:%s' % e }.join(?|).freeze,
    rsslinks: %w[image/text()[1] docs/text()[1] source/@url enclosure/@url
      guid/text()[1] comments/text()[1]].map { |e|
        '//%s' % e }.join(?|).freeze,
    xlinks: '//*[@xlink:href]/@xlink:href'.freeze,
    rdflinks: %w[about resource datatype].map { |e|
      '//*[@rdf:%s]/@rdf:%s' % [e, e] }.join(?|).freeze,
  }
|
105
|
+
|
106
|
+
  # Content-type => link-extraction XPath, resolved through XPATH above
  # so each value is the ready-to-use expression string.
  LINK_MAP = {
    'text/html' => :htmllinks,
    'application/xhtml+xml' => :htmllinks,
    'application/atom+xml' => :atomlinks,
    'application/x-rss+xml' => :rsslinks,
    'application/rdf+xml' => :rdflinks,
    'image/svg+xml' => :xlinks,
  }.transform_values { |v| XPATH[v] }.freeze
|
114
|
+
|
115
|
+
  # Output coercions keyed by the `coerce:` flag accepted around the
  # module: nil/false => plain String, :uri => URI, :rdf => RDF term
  # (a "_:"-prefixed string becomes an RDF::Node, anything else RDF::URI).
  URI_COERCIONS = {
    nil => -> t { t.to_s },
    false => -> t { t.to_s },
    uri: -> t { URI.parse t.to_s },
    rdf: -> t {
      t = t.to_s
      t.start_with?('_:') ? RDF::Node.new(t.delete_prefix '_:') : RDF::URI(t) },
  }

  # Matches a UUID with or without the urn:uuid: prefix. Note the odd
  # shape still matches 8-4-4-4-12: the {4} dash-groups consume
  # 8-4-4-4-4 and the trailing {8} finishes the final 12-hex segment.
  UUID_RE = /^(?:urn:uuid:)?([0-9a-f]{8}(?:-[0-9a-f]{4}){4}[0-9a-f]{8})$/i
|
125
|
+
|
126
|
+
# okay labels: what do we want to do about them? poor man's fresnel!
|
127
|
+
|
128
|
+
# basic structure is an asserted base class corresponding to a
|
129
|
+
# ranked list of asserted predicates. to the subject we first
|
130
|
+
# match the closest class, then the closest property.
|
131
|
+
|
132
|
+
# if the instance data doesn't have an exact property mentioned in
|
133
|
+
# the spec, it may have an equivalent property or subproperty we
|
134
|
+
# may be able to use. we could imagine a scoring system analogous
|
135
|
+
# to the one used by CSS selectors, albeit using the topological
|
136
|
+
# distance of classes/predicates in the spec versus those in the
|
137
|
+
# instance data.
|
138
|
+
|
139
|
+
# think about dcterms:title is a subproperty of dc11:title even
|
140
|
+
# though they are actually more like equivalent properties;
|
141
|
+
# owl:equivalentProperty is not as big a conundrum as
|
142
|
+
# rdfs:subPropertyOf.
|
143
|
+
|
144
|
+
# if Q rdfs:subPropertyOf P then S Q O implies S P O. this is
|
145
|
+
# great but property Q may not be desirable to display.
|
146
|
+
|
147
|
+
# it may be desirable to be able to express properties to never
|
148
|
+
# use as a label, such as skos:hiddenLabel
|
149
|
+
|
150
|
+
# consider ranked alternates, sequences, sequences of alternates.
|
151
|
+
# (this is what fresnel does fyi)
|
152
|
+
|
153
|
+
  # Label/description lookup spec, keyed by class: for each class, a
  # ranked list of predicates for the :label and :desc of an instance.
  # Each value is [main, alt] arrays of predicates (alt is filled from
  # main by the fix-up loop below when absent). See the "poor man's
  # fresnel" commentary above for the rationale.
  STRINGS = {
    RDF::RDFS.Resource => {
      label: [
        # main
        [RDF::Vocab::SKOS.prefLabel, RDF::RDFS.label,
          RDF::Vocab::DC.title, RDF::Vocab::DC11.title, RDF::RDFV.value],
        # alt
        [RDF::Vocab::SKOS.altLabel, RDF::Vocab::DC.alternative],
      ],
      desc: [
        # main will be cloned into alt
        [RDF::Vocab::DC.abstract, RDF::Vocab::DC.description,
          RDF::Vocab::DC11.description, RDF::RDFS.comment,
          RDF::Vocab::SKOS.note],
      ],
    },
    RDF::Vocab::FOAF.Document => {
      label: [
        # main
        [RDF::Vocab::DC.title, RDF::Vocab::DC11.title],
        # alt
        [RDF::Vocab::BIBO.shortTitle, RDF::Vocab::DC.alternative],
      ],
      desc: [
        # main
        [RDF::Vocab::BIBO.abstract, RDF::Vocab::DC.abstract,
          RDF::Vocab::DC.description, RDF::Vocab::DC11.description],
        # alt
        [RDF::Vocab::BIBO.shortDescription],
      ],
    },
    RDF::Vocab::FOAF.Agent => {
      label: [
        # main (will get cloned into alt)
        [RDF::Vocab::FOAF.name],
      ],
      desc: [
        # main cloned into alt
        [RDF::Vocab::FOAF.status],
      ],
    },
  }
  # owl:Thing shares the generic rdfs:Resource spec (same object, not a copy)
  STRINGS[RDF::OWL.Thing] = STRINGS[RDF::RDFS.Resource]
|
196
|
+
|
197
|
+
  # Load-time fix-up of STRINGS: validates each spec, splices in
  # owl:equivalentProperty entailments after each predicate, duplicates
  # main => alt where alt is missing, and seeds equivalent classes.
  #
  # note this is to_a because "can't modify a hash during iteration"
  # which i guess is sensible, so we generate a set of pairs first
  STRINGS.to_a.each do |type, struct|
    struct.values.each do |lst|
      # assert a whole bunch of stuff
      raise 'STRINGS content must be an array of arrays' unless
        lst.is_a? Array
      raise 'Spec must contain 1 or 2 Array elements' if lst.empty?
      raise 'Spec must be array of arrays of terms' unless
        lst.all? { |x| x.is_a? Array and x.all? { |y|
          RDF::Vocabulary.find_term(y) } }

      # prune this to two elements (not that there should be more than)
      lst.slice!(2, lst.length) if lst.length > 2

      # pre-fill equivalent properties
      lst.each do |preds|
        # for each predicate, find its equivalent properties

        # splice them in after the current predicate only if they
        # are not already explicitly in the list; the index hops over
        # the freshly inserted run so entailments are not re-entailed
        i = 0
        loop do
          equiv = preds[i].entail(:equivalentProperty) - preds
          preds.insert(i + 1, *equiv) unless equiv.empty?

          i += equiv.length + 1
          break if i >= preds.length
        end

        # this just causes too many problems otherwise
        # preds.map! { |p| p.to_s }
      end

      # duplicate main predicates to alternatives
      # NOTE(review): this aliases the same array rather than cloning
      # it, despite the "cloned" comments above — confirm intended
      lst[1] ||= lst[0]
    end

    # may as well seed equivalent classes so we don't have to look them up
    type.entail(:equivalentClass).each do |equiv|
      STRINGS[equiv] ||= struct
    end

    # tempting to do subclasses too but it seems pretty costly in
    # this framework; save it for the clojure version
  end
|
243
|
+
|
244
|
+
  # Ranked authorship/contributorship predicates; each list is expanded
  # in place with owl:equivalentProperty entailments (same splice-and-
  # skip technique as the STRINGS loop above) and then frozen.
  AUTHOR = [RDF::SAK::PAV.authoredBy, RDF::Vocab::DC.creator,
    RDF::Vocab::DC11.creator, RDF::Vocab::PROV.wasAttributedTo]
  CONTRIB = [RDF::SAK::PAV.contributedBy, RDF::Vocab::DC.contributor,
    RDF::Vocab::DC11.contributor]
  [AUTHOR, CONTRIB].each do |preds|
    i = 0
    loop do
      # splice equivalents in after the current predicate, then skip
      # over the inserted run
      equiv = preds[i].entail(:equivalentProperty) - preds
      preds.insert(i + 1, *equiv) unless equiv.empty?
      i += equiv.length + 1
      break if i >= preds.length
    end

    preds.freeze
  end
|
259
|
+
|
260
|
+
def sanitize_prefixes prefixes, nonnil = false
|
261
|
+
raise ArgumentError, 'prefixes must be a hash' unless
|
262
|
+
prefixes.is_a? Hash or prefixes.respond_to? :to_h
|
263
|
+
prefixes = prefixes.to_h.map do |k, v|
|
264
|
+
[k ? k.to_s.to_sym : nil, v ? v.to_s : nil]
|
265
|
+
end.to_h
|
266
|
+
|
267
|
+
prefixes.reject! { |k, v| k.nil? || v.nil? } if nonnil
|
268
|
+
prefixes
|
269
|
+
end
|
270
|
+
|
271
|
+
def assert_uri_coercion coerce
|
272
|
+
if coerce
|
273
|
+
coerce = coerce.to_s.to_sym if coerce.respond_to? :to_s
|
274
|
+
raise 'coerce must be either :uri or :rdf' unless
|
275
|
+
%i[uri rdf].include?(coerce)
|
276
|
+
end
|
277
|
+
coerce
|
278
|
+
end
|
279
|
+
|
280
|
+
def assert_xml_node node
|
281
|
+
raise 'Argument must be a Nokogiri::XML::Element' unless
|
282
|
+
node.is_a? Nokogiri::XML::Element
|
283
|
+
node
|
284
|
+
end
|
285
|
+
|
286
|
+
  # Resolve the RDFa subject of an XML/HTML element, recursing up the
  # tree (via is_ancestor) when the element itself carries no
  # subject-bearing attribute.
  #
  # @param node [Nokogiri::XML::Element] the element under inspection
  # @param prefixes [Hash, nil] CURIE prefix map; derived from node when nil
  # @param base [RDF::URI, URI, nil] base URI; derived from node when nil
  # @param coerce [nil, :uri, :rdf] output coercion for the result
  # @param is_ancestor [true, false] true on recursive upward calls,
  #  which reverses attribute precedence (@resource/@href/@src first)
  # @return [RDF::Resource, nil] the subject, if one was found
  def internal_subject_for node, prefixes: nil, base: nil, coerce: nil,
      is_ancestor: false

    # note we assign these AFTER the literal check or it will be wrong
    prefixes ||= get_prefixes node

    base ||= get_base node
    # NOTE(review): this only runs when base is still falsy, so
    # coerce_resource receives nil — looks like the condition may be
    # inverted (`if base`?); confirm against coerce_resource's contract
    base = coerce_resource base, as: :uri unless base

    # answer a bunch of helpful questions about this element
    subject = nil
    parent = node.parent
    ns_href = node.namespace.href if node.namespace
    # an element with @rel/@rev is not a subject source for descendants
    up_ok = %i[rel rev].none? { |a| node.key? a }
    # NOTE(review): `x = !parent or parent.document?` parses as
    # `(x = !parent) or ...` — is_root never reflects parent.document?;
    # likely should be ||. Left as-is pending confirmation.
    is_root = !parent or parent.document?
    # NOTE(review): same low-precedence hazard with `and` here — only
    # the regex match is assigned to `special`
    special = /^(?:[^:]+:)?(?:head|body)$/i === node.name and
      (ns_href == 'http://www.w3.org/1999/xhtml' or
      /^(?:[^:]+:)?html$/xi === parent.name)

    # if the node is being inspected as an ancestor to the
    # original node, we have to check it backwards.
    if is_ancestor
      # ah right @resource gets special treatment
      if subject = node[:resource]
        subject = resolve_curie subject,
          prefixes: prefixes, base: base, scalar: true
      else
        # then check @href and @src
        %i[href src].each do |attr|
          if node.key? attr
            # merge with the root and return it
            subject = base + node[attr]
            break
          end
        end
      end

      return coerce_resource subject, as: coerce if subject

      # note if we are being called with is_ancestor, that means
      # the original node (or indeed any of the nodes previously
      # tested) have anything resembling a resource in them. this
      # means @rel/@rev should be ignored, and we should keep
      # looking for a subject.
    end

    if node[:about]

      subject = resolve_curie node[:about],
        prefixes: prefixes, base: base, scalar: true

      # ignore coercion
      return subject if subject.is_a? RDF::Node

    elsif is_root
      subject = base
    elsif special
      # head/body borrow the subject of the html element
      # NOTE(review): calls subject_for_internal — a sibling defined
      # elsewhere in this module; confirm it is the intended recursion
      # target for this method
      subject = subject_for_internal parent
    elsif node[:resource]
      # XXX resolve @about against potential curie
      subject = resolve_curie node[:resource], prefixes: prefixes, base: base
    elsif node[:href]
      subject = base + node[:href]
    elsif node[:src]
      subject = base + node[:src]
    elsif node[:typeof]
      # bnode the typeof attr
      # note we return bnodes irrespective of the rdf flag
      return RDF::Node('id-%016x' % node.attributes['typeof'].pointer_id)
    elsif node[:inlist]
      # bnode the inlist attr
      return RDF::Node('id-%016x' % node.attributes['inlist'].pointer_id)
    elsif (parent[:inlist] && %i[href src].none? { |a| parent.key? a }) ||
        (is_ancestor && !up_ok)
      # bnode the element
      return RDF::Node('id-%016x' % node.pointer_id)
    # elsif node[:id]
    else
      # nothing on this element; climb to the parent
      subject = subject_for_internal parent, is_ancestor: true
    end

    coerce_resource subject, as: coerce if subject
  end
|
370
|
+
|
371
|
+
  # Per-element fixups that upgrade legacy class-based markup to HTML5
  # sectioning/figure elements; keys correspond to hits from
  # XPATH[:modernize]. The table/img lambdas call `markup`, a helper
  # defined elsewhere in this module, to wrap the element in a <figure>.
  MODERNIZE = {
    div: -> e {
      # div.figure => figure (unless already inside one); otherwise
      # treat the div as a section
      if e.classes.include? 'figure'
        e.remove_class 'figure'
        e.name = 'figure' unless e.parent.name == 'figure'
      else
        e.remove_class 'section'
        e.name = 'section'
      end
    },
    blockquote: -> e {
      # blockquote.note => aside[role=note]
      e.remove_class 'note'
      e.name = 'aside'
      e['role'] = 'note'
    },
    table: -> e {
      # table.figure => table wrapped in a new figure element
      e.remove_class 'figure'
      unless e.parent.name == 'figure'
        inner = e.dup
        markup replace: e, spec: { [inner] => :figure }
      end
    },
    img: -> e {
      # img.figure => img wrapped in a new figure element
      e.remove_class 'figure'
      unless e.parent.name == 'figure'
        inner = e.dup
        markup replace: e, spec: { [inner] => :figure }
      end
    },
  }
|
401
|
+
|
402
|
+
  # rdf term type tests: spec symbol => RDF::Term predicate method
  NTESTS = { uri: :"uri?", blank: :"node?", literal: :"literal?" }.freeze
  # alias map folding :iri/:bnode onto the canonical spec symbols,
  # with the canonical symbols mapping to themselves
  NMAP = ({ iri: :uri, bnode: :blank }.merge(
    [:uri, :blank, :literal].map { |x| [x, x] }.to_h)).freeze
|
406
|
+
|
407
|
+
public
|
408
|
+
|
409
|
+
def coerce_node_spec spec, rev: false
|
410
|
+
spec = [spec] unless spec.respond_to? :to_a
|
411
|
+
spec = spec - [:resource] + [:uri, :blank] if spec.include? :resource
|
412
|
+
raise 'Subjects are never literals' if rev and spec.include? :literal
|
413
|
+
|
414
|
+
spec = NMAP.values_at(*spec).reject(&:nil?).uniq
|
415
|
+
spec = NTESTS.keys if spec.empty?
|
416
|
+
spec.delete :literal if rev
|
417
|
+
spec.uniq
|
418
|
+
end
|
419
|
+
|
420
|
+
def node_matches? node, spec
|
421
|
+
spec.any? { |k| node.send NTESTS[k] }
|
422
|
+
end
|
423
|
+
|
424
|
+
# Obtain all and only the rdf:types directly asserted on the subject.
|
425
|
+
#
|
426
|
+
# @param repo [RDF::Queryable]
|
427
|
+
# @param subject [RDF::Resource]
|
428
|
+
# @param type [RDF::Term, :to_a]
|
429
|
+
#
|
430
|
+
# @return [Array]
|
431
|
+
def self.asserted_types repo, subject, type = nil
|
432
|
+
asserted = nil
|
433
|
+
|
434
|
+
if type
|
435
|
+
type = type.respond_to?(:to_a) ? type.to_a : [type]
|
436
|
+
asserted = type.select { |t| t.is_a? RDF::Value }.map do |t|
|
437
|
+
RDF::Vocabulary.find_term t
|
438
|
+
end
|
439
|
+
end
|
440
|
+
|
441
|
+
asserted ||= repo.query([subject, RDF.type, nil]).objects.map do |o|
|
442
|
+
RDF::Vocabulary.find_term o
|
443
|
+
end.compact
|
444
|
+
|
445
|
+
asserted.select { |t| t && t.uri? }.uniq
|
446
|
+
end
|
447
|
+
|
448
|
+
  # Obtain a stack of types for an asserted initial type or set
  # thereof. Returns an array of arrays, where the first is the
  # asserted types and their inferred equivalents, and subsequent
  # elements are immediate superclasses and their equivalents. A
  # given URI will only appear once in the entire structure.
  #
  # @param rdftype [RDF::Term, :to_a]
  #
  # @return [Array] breadth-first strata, most-specific first
  #
  def type_strata rdftype
    # first we coerce this to an array
    if rdftype.respond_to? :to_a
      rdftype = rdftype.to_a
    else
      rdftype = [rdftype]
    end

    # now squash and coerce
    rdftype = rdftype.uniq.map { |t| RDF::Vocabulary.find_term t }.compact

    # bail out early
    return [] if rdftype.empty?

    # essentially what we want to do is construct a layer of
    # asserted classes and their inferred equivalents, then probe
    # the classes in the first layer for subClassOf assertions,
    # which will form the second layer, and so on.

    queue = [rdftype]
    strata = []
    seen = Set.new

    while qin = queue.shift
      qwork = []

      # expand the current layer with equivalent classes
      qin.each do |q|
        qwork << q # entail doesn't include q
        qwork += q.entail(:equivalentClass) if q.uri?
      end

      # grep and flatten: keep only resolvable vocabulary terms that
      # we have not emitted in an earlier (more specific) stratum
      qwork = qwork.map do |t|
        next t if t.is_a? RDF::Vocabulary::Term
        RDF::Vocabulary.find_term t
      end.compact.uniq - seen.to_a
      seen |= qwork

      # warn "qwork == #{qwork.inspect}"

      # push current layer out
      strata.push qwork.dup unless qwork.empty?

      # now deal with subClassOf
      qsuper = []
      qwork.each { |q| qsuper += q.subClassOf }

      # grep and flatten this too
      qsuper = qsuper.map do |t|
        next t if t.is_a? RDF::Vocabulary::Term
        RDF::Vocabulary.find_term t
      end.compact.uniq - seen.to_a
      # do not append qsuper to seen! (superclasses still need their
      # equivalents expanded on the next pass)

      # warn "qsuper == #{qsuper.inspect}"

      # same deal, conditionally push the input queue
      queue.push qsuper.dup unless qsuper.empty?
    end

    # voila
    strata
  end
|
521
|
+
|
522
|
+
  # Compute the entailment closure of a set of predicates: the inputs
  # plus their owl:equivalentProperty entailments, plus (recursively)
  # all of their subproperties and *their* equivalents.
  # (The previous comment here described objects_for; corrected.)
  #
  # @param predicates [RDF::URI, Set, #to_set] seed predicate(s)
  # @param seen [Set] accumulator guarding against infinite recursion
  #
  # @return [Set] the expanded predicate set
  #
  def predicate_set predicates, seen: Set.new
    predicates = Set[predicates] if predicates.is_a? RDF::URI
    unless predicates.is_a? Set
      raise "predicates must be a set" unless predicates.respond_to? :to_set
      predicates = predicates.to_set
    end

    # shortcut
    return predicates if predicates.empty?

    raise 'predicates must all be RDF::URI' unless predicates.all? do |p|
      p.is_a? RDF::URI
    end

    # first we generate the set of equivalent properties for the given
    # properties
    predicates += predicates.map do |p|
      p.entail :equivalentProperty
    end.flatten.to_set

    # then we take the resulting set of properties and
    # compute their subproperties
    subp = Set.new
    (predicates - seen).each do |p|
      subp += p.subProperty.flatten.to_set
    end

    # uhh this whole "seen" business might not be necessary
    predicates + predicate_set(subp - predicates - seen, seen: predicates)
  end
|
562
|
+
|
563
|
+
  # Returns subjects from the graph with entailment.
  #
  # @param repo [RDF::Queryable] the repository to search
  # @param predicate [RDF::URI, #to_a] predicate(s) to match
  # @param object [RDF::Term] the object to match
  # @param entail [true, false] expand predicates via predicate_set
  # @param only [Symbol, Array<Symbol>] node-type filter
  #  (see coerce_node_spec)
  #
  # @return [Array<RDF::Resource>] the matching subjects; when a block
  #  is given, yields (subject, forward-preds, reverse-preds) and
  #  returns the block results instead
  #
  def self.subjects_for repo, predicate, object, entail: true, only: []
    raise 'Object must be a Term' unless object.is_a? RDF::Term
    predicate = predicate.respond_to?(:to_a) ? predicate.to_a : [predicate]
    raise 'Predicate must be some kind of term' unless
      predicate.all? { |p| p.is_a? RDF::URI }

    only = coerce_node_spec only, rev: true

    predicate = predicate.map { |x| RDF::Vocabulary.find_term x }.compact
    predicate = predicate_set predicate if entail

    out = {}
    revp = Set.new
    predicate.each do |p|
      repo.query([nil, p, object]).subjects.each do |s|
        next unless node_matches? s, only

        # per-subject entry: [forward predicates, reverse predicates]
        entry = out[s] ||= [Set.new, Set.new]
        entry[0] << p
      end

      # do this here while we're at it: collect inverse/symmetric
      # predicates so we can also find subjects via reverse statements
      unless object.literal?
        revp += p.inverseOf.to_set
        revp << p if p.type.include? RDF::OWL.SymmetricProperty
      end
    end

    unless object.literal?
      revp = predicate_set revp if entail

      # a statement (object, inverse-p, s) also makes s a match
      revp.each do |p|
        repo.query([object, p, nil]).objects.each do |o|
          next unless node_matches? o, only

          entry = out[o] ||= [Set.new, Set.new]
          entry[1] << p
        end
      end
    end

    # run this through a block to get access to the predicates
    return out.map { |p, v| yield p, *v } if block_given?

    out.keys
  end
|
619
|
+
|
620
|
+
  # Returns objects from the graph with entailment.
  #
  # @param repo [RDF::Queryable] the repository to search
  # @param subject [RDF::Resource] the subject to match
  # @param predicate [RDF::URI, #to_a] predicate(s) to match
  # @param entail [true, false] expand predicates via predicate_set
  # @param only [Symbol, Array<Symbol>] node-type filter
  # @param datatype [RDF::URI, #to_a, nil] restrict literal results to
  #  these datatypes (non-literals are unaffected)
  #
  # @return [Array<RDF::Term>] the matching objects; when a block is
  #  given, yields (object, forward-preds, reverse-preds) and returns
  #  the block results instead
  #
  def self.objects_for repo, subject, predicate,
      entail: true, only: [], datatype: nil
    raise "Subject must be a resource, not #{subject.inspect}" unless
      subject.is_a? RDF::Resource
    predicate = predicate.respond_to?(:to_a) ? predicate.to_a : [predicate]
    raise "Predicate must be a term, not #{predicate.first.class}" unless
      predicate.all? { |p| p.is_a? RDF::URI }

    predicate = predicate.map { |x| RDF::Vocabulary.find_term x }.compact

    only = coerce_node_spec only

    datatype = (
      datatype.respond_to?(:to_a) ? datatype.to_a : [datatype]).compact
    raise 'Datatype must be some kind of term' unless
      datatype.all? { |p| p.is_a? RDF::URI }

    # fluff this out
    predicate = predicate_set predicate if entail

    out = {}
    predicate.each do |p|
      repo.query([subject, p, nil]).objects.each do |o|

        # make sure it's in the spec
        next unless node_matches? o, only

        # constrain output (datatype filter applies to literals only)
        next if o.literal? and
          !(datatype.empty? or datatype.include?(o.datatype))

        # per-object entry: [forward predicates, reverse predicates]
        entry = out[o] ||= [Set.new, Set.new]
        entry.first << p
      end
    end

    # now we do the reverse (pointless when only literals are wanted,
    # since reverse matches are always resources)
    unless only == [:literal]
      # generate reverse predicates
      revp = Set.new
      predicate.each do |p|
        revp += p.inverseOf.to_set
        revp << p if p.type.include? RDF::OWL.SymmetricProperty
      end
      revp = predicate_set revp if entail

      # now scan 'em
      revp.each do |p|
        repo.query([nil, p, subject]).subjects.each do |s|
          next unless node_matches? s, only
          # no need to check datatype; subject is never a literal

          entry = out[s] ||= [Set.new, Set.new]
          entry.last << p
        end
      end
    end

    # run this through a block to get access to the predicates
    return out.map { |p, v| yield p, *v } if block_given?

    out.keys
  end
|
694
|
+
|
695
|
+
  # Obtain the canonical UUID for the given URI
  #
  # @param repo [RDF::Queryable]
  # @param uri [RDF::URI, URI, to_s] the subject of the inquiry
  # @param unique [true, false] return a single resource/nil or an array
  # @param published [true, false] whether to restrict to published docs
  # @param scache [Hash] subject-presence cache (uri => boolean)
  # @param ucache [Hash] memo of previously resolved inputs
  # @param base [RDF::URI, nil] base for resolving relative input
  #
  # @return [RDF::URI, Array]
  #
  def self.canonical_uuid repo, uri, unique: true, published: false,
      scache: {}, ucache: {}, base: nil
    # make sure this is actually a uri
    orig = uri = coerce_resource uri, base
    unless uri.is_a? RDF::Node
      tu = URI(uri_pp(uri).to_s).normalize

      # a bare /<uuid> path is rewritten to its urn:uuid: form
      if tu.path && !tu.fragment &&
          UUID_RE.match?(uu = tu.path.delete_prefix(?/))
        tu = URI('urn:uuid:' + uu.downcase)
      end

      # unconditionally overwrite uri
      uri = RDF::URI(tu.to_s)

      # now check if it's a uuid (URI::UUID instances respond to #uuid)
      if tu.respond_to? :uuid
        # warn "lol uuid #{orig}"
        # if it's a uuid, check that we have it as a subject
        # if we have it as a subject, return it
        return uri if scache[uri] ||= repo.has_subject?(uri)
        # note i don't want to screw around right now dealing with the
        # case that a UUID might not itself be canonical
      end
    end

    # spit up the cache if present
    if out = ucache[orig]
      # warn "lol cached #{orig}"
      return unique ? out.first : out
    end

    # otherwise we proceed:

    # goal: return the most "appropriate" UUID for the given URI

    # it is so lame i have to do this
    # NOTE(review): this local `bits` duplicates what the later code
    # refers to as the constant BITS (not visible in this chunk) —
    # confirm BITS exists at module level, otherwise the slug and sort
    # branches below raise NameError
    bits = { nil => 0, false => 0, true => 1 }

    # rank (0 is higher):
    # * (00) exact & canonical == 0,
    # * (01) exact == 1,
    # * (10) inexact & canonical == 2,
    # * (11) inexact == 3.

    # warn "WTF URI #{uri}"

    # handle path parameters by generating a bunch of candidates:
    # /a/b;p1;p2 yields /a/b;p1;p2, /a/b;p1, /a/b, most-specific first
    uris = if uri.respond_to? :path and uri.path.start_with? ?/
      # split any path parameters off
      uu, *pp = split_pp uri
      if pp.empty?
        [uri] # no path parameters
      else
        uu = RDF::URI(uu.to_s)
        bp = uu.path # base path
        (0..pp.length).to_a.reverse.map do |i|
          u = uu.dup
          u.path = ([bp] + pp.take(i)).join(';')
          u
        end
      end
    else
      [uri] # not a pathful URI
    end

    # collect the candidates by URI; stop at the first path-parameter
    # truncation that yields anything
    sa = predicate_set [RDF::SAK::CI.canonical,
      RDF::SAK::CI.alias, RDF::OWL.sameAs]
    candidates = nil
    uris.each do |u|
      candidates = subjects_for(repo, sa, u, entail: false) do |s, f|
        # there is no #to_i for booleans and also we xor this number
        [s, { rank: bits[f.include?(RDF::SAK::CI.canonical)] ^ 1,
          published: published?(repo, s),
          mtime: dates_for(repo, s).last || DateTime.new }]
      end.compact.to_h
      break unless candidates.empty?
    end

    # now collect by slug
    slug = terminal_slug uri, base: base
    if slug and !slug.empty?
      exact = uri == coerce_resource(slug, base) # slug represents exact match
      sl = [RDF::SAK::CI['canonical-slug'], RDF::SAK::CI.slug]
      [RDF::XSD.string, RDF::XSD.token].each do |t|
        subjects_for(repo, sl, RDF::Literal(slug, datatype: t)) do |s, f|
          # default to lowest rank if this candidate is new
          entry = candidates[s] ||= {
            published: published?(repo, s, base: base),
            rank: 0b11, mtime: dates_for(repo, s).last || DateTime.new }
          # true is 1 and false is zero so we xor this too
          rank = (BITS[exact] << 1 | BITS[f.include?(sl[0])]) ^ 0b11
          # now amend the rank if we have found a better one
          entry[:rank] = rank if rank < entry[:rank]
        end
      end
    end

    # only urn:uuid: subjects are viable answers
    candidates.delete_if { |s, _| !/^urn:uuid:/.match?(s.to_s) }

    # scan all the candidates for replacements and remove any
    # candidates that have been replaced
    candidates.to_a.each do |k, v|
      # note that
      reps = replacements_for(repo, k, published: published) - [k]
      unless reps.empty?
        v[:replaced] = true
        reps.each do |r|
          c = candidates[r] ||= { rank: v[:rank],
            published: published?(repo, r),
            mtime: dates_for(repo, r).last || v[:mtime] || DateTime.new }
          # we give the replacement the rank and mtime of the
          # resource being replaced if it scores better
          c[:rank] = v[:rank] if v[:rank] < c[:rank]
          c[:mtime] = v[:mtime] if v[:mtime] > c[:mtime]
        end
      end
    end

    # now we can remove all unpublished candidates if the context is
    # published
    candidates.select! do |_, v|
      !v[:replaced] && (published ? v[:published] : true)
    end

    # now we sort by rank and date; the highest-ranking newest
    # candidate is the one

    out = candidates.sort do |a, b|
      _, va = a
      _, vb = b
      cb = published ? BITS[vb[:published]] <=> BITS[va[:published]] : 0
      cr = va[:rank] <=> vb[:rank]
      cb == 0 ? cr == 0 ? vb[:mtime] <=> va[:mtime] : cr : cb
    end.map { |x| x.first }.compact

    # set cache
    ucache[orig] = out

    #warn "lol not cached #{orig}"

    unique ? out.first : out

    # an exact match is better than an inexact one

    # a canonical match is better than non-canonical

    # note this is four bits: exact, canon(exact), inexact, canon(inexact)
    # !canon(exact) should rank higher than canon(inexact)

    # unreplaced is better than replaced

    # newer is better than older (though no reason an older item
    # can't replace a newer one)

    # published is better than not, unless the context is
    # unpublished and an unpublished document replaces a published one
  end
|
863
|
+
|
864
|
+
# Preference order for URI schemes when ranking candidate URIs:
# lower is better; any other scheme ranks below these (see #cmp_resource).
# Frozen so the constant cannot be mutated at a distance.
SCHEME_RANK = { https: 0, http: 1 }.freeze
|
865
|
+
|
866
|
+
# Compare two RDF values for canonical-URI preference: URIs beat
# non-URIs; https beats http beats other schemes; optionally prefers
# presence/absence of a leading +www.+ in the authority; ties are
# broken lexically on the path/query/fragment, then on the terms.
#
# @param a [RDF::Value] left-hand comparand
# @param b [RDF::Value] right-hand comparand
# @param www [nil, true, false] prefer www (true) or no-www (false);
#  nil disables the www/path comparison entirely
# @raise [RuntimeError] if either comparand is not an RDF::Value
# @return [Integer] -1, 0, or 1
def cmp_resource a, b, www: nil
  raise 'Comparands must be instances of RDF::Value' unless
    [a, b].all? { |x| x.is_a? RDF::Value }

  # URI beats non-URI
  if a.uri?
    if b.uri?
      # https beats http beats other (to_s guards a nil scheme,
      # which the original would have crashed on with .downcase)
      as = a.scheme.to_s.downcase.to_sym
      bs = b.scheme.to_s.downcase.to_sym
      cmp = SCHEME_RANK.fetch(as, 2) <=> SCHEME_RANK.fetch(bs, 2)

      # bail out early
      return cmp unless cmp == 0

      # this would have returned if the schemes were different, as
      # such we only need to test one of them
      if %i[http https].any?(as) and not www.nil?
        # if www is non-nil, prefer www or no-www depending on
        # truthiness of `www` parameter
        pref = [false, true].zip(www ? [1, 0] : [0, 1]).to_h
        re = /^(?:(www)\.)?(.*?)$/

        ah = re.match(a.host.to_s.downcase)[1, 2]
        bh = re.match(b.host.to_s.downcase)[1, 2]

        # compare hosts sans www
        cmp = ah[1] <=> bh[1]
        return cmp unless cmp == 0

        # now compare presence of www
        cmp = pref[ah[0] == 'www'] <=> pref[bh[0] == 'www']
        return cmp unless cmp == 0

        # if we're still here, compare the path/query/fragment.
        # BUGFIX: the original did re.match(...)[1] which raises
        # NoMethodError for URIs with no path (e.g. "http://host");
        # treat a non-match as the empty string instead.
        re = /^.*?\/\/.*?(\/.*)$/
        al = (m = re.match(a.to_s)) ? m[1].to_s : ''
        bl = (m = re.match(b.to_s)) ? m[1].to_s : ''

        return al <=> bl
      end

      return a <=> b
    else
      return -1
    end
  elsif b.uri?
    return 1
  else
    return a <=> b
  end
end
|
918
|
+
|
919
|
+
# Compare two subjects by their labels, consulting (and optionally
# populating) a +labels+ cache via +label_for+, and falling back to
# the terms themselves when no label can be found.
#
# @param repo [RDF::Queryable]
# @param a [RDF::Value] left-hand comparand
# @param b [RDF::Value] right-hand comparand
# @param labels [nil, Hash] cache mapping terms to [predicate, label]
# @param supplant [true, false] whether to look up missing labels
# @param reverse [false, true] whether to invert the comparison
# @return [Integer] -1, 0, or 1
def self.cmp_label repo, a, b, labels: nil, supplant: true, reverse: false
  labels ||= {}

  # resolve each comparand to its display value (cached label,
  # freshly-fetched label, or the term itself)
  pair = [a, b].map do |term|
    if cached = labels[term]
      cached[1]
    elsif supplant && (found = label_for(repo, term))
      labels[term] = found
      found[1]
    else
      term
    end
  end

  pair.reverse! if reverse

  pair.first.to_s <=> pair.last.to_s
end
|
938
|
+
|
939
|
+
# Obtain the "best" dereferenceable URI for the subject.
# Optionally returns all candidates.
#
# @param repo [RDF::Queryable]
# @param subject [RDF::Resource]
# @param base [nil, URI, RDF::URI] base URI; required to resolve
#  slugs and to mint /uuid paths
# @param unique [true, false] flag for unique return value
# @param rdf [true, false] flag to specify RDF::URI vs URI
# @param slugs [true, false] flag to include slugs
# @param fragment [true, false] flag to include fragment URIs
#
# @return [RDF::URI, URI, Array]
#
def self.canonical_uri repo, subject, base: nil,
    unique: true, rdf: true, slugs: false, fragment: false
  subject = coerce_resource subject, base

  # try to find it first: explicit ci:canonical / owl:sameAs
  # objects that are themselves subjects in the graph
  out = objects_for(repo, subject, [RDF::SAK::CI.canonical, RDF::OWL.sameAs],
    entail: false, only: :resource).select do |o|
    # only consider the subjects
    repo.has_subject? o
  end.sort { |a, b| cmp_resource a, b }

  # try to generate in lieu
  if subject.uri? and (out.empty? or slugs)

    # BUGFIX: slugs are relative, so they can only be resolved
    # against a supplied base; the original crashed on `base +
    # o.value` when base was nil
    out += objects_for(repo, subject,
      [RDF::SAK::CI['canonical-slug'], RDF::SAK::CI.slug],
      only: :literal).map do |o|
      base + o.value
    end if slugs and base

    uri = URI(uri_pp(subject.to_s))
    if base and uri.respond_to? :uuid
      # mint a https://base/<uuid> URI for urn:uuid: subjects
      b = base.clone
      b.query = b.fragment = nil
      b.path = '/' + uri.uuid
      out << RDF::URI.new(b.to_s)
    else
      out << subject
    end
  end

  # remove all URIs with fragments unless specified
  unless fragment
    tmp = out.reject(&:fragment)
    out = tmp unless tmp.empty?
  end

  # coerce to URI objects if specified
  out.map! { |u| URI(uri_pp u.to_s) } unless rdf

  unique ? out.first : out.uniq
end
|
994
|
+
|
995
|
+
# Determine whether the URI represents a published document, i.e.
# whether it carries bibo:status bibo:status/published (or,
# optionally, ci:circulated).
#
# @param repo [RDF::Queryable]
# @param uri [RDF::Resource, #to_s]
# @param circulated [false, true] also accept ci:circulated
# @param base [nil, URI, RDF::URI] base for URI coercion
#
# @return [true, false]
def self.published? repo, uri, circulated: false, base: nil
  subject = coerce_resource uri, base
  statuses = objects_for(
    repo, subject, RDF::Vocab::BIBO.status, only: :resource).to_set

  acceptable = Set[RDF::Vocab::BIBO['status/published']]
  acceptable << RDF::SAK::CI.circulated if circulated

  # published iff any asserted status is in the acceptable set
  statuses.intersect? acceptable
end
|
1013
|
+
|
1014
|
+
# Obtain a key-value structure for the given subject, optionally
# constraining the result by node type (:resource, :uri/:iri,
# :blank/:bnode, :literal)
#
# @param repo [RDF::Queryable]
# @param subject of the inquiry
# @param base [nil, URI, RDF::URI] base for UUID coercion
# @param rev map in reverse (subjects pointing at +subject+)
# @param only one or more node types
# @param uuids coerce resources to UUIDs if possible
# @param canon coerce resources to canonical URIs
# @param ucache [Hash] shared UUID lookup cache
# @param scache [Hash] shared subject cache
#
# @return [Hash] predicates mapped to sorted, deduplicated node arrays
#
def self.struct_for repo, subject, base: nil,
    rev: false, only: [], uuids: false, canon: false, ucache: {}, scache: {}
  only = coerce_node_spec only

  # coerce the subject to a uuid when asked, falling back to itself
  subject = canonical_uuid(repo, subject,
    base: base, scache: scache, ucache: ucache) || subject if uuids

  struct  = {}
  pattern = rev ? [nil, nil, subject] : [subject, nil, nil]
  repo.query(pattern) do |stmt|
    # this will skip over any term not matching the type
    node = rev ? stmt.subject : stmt.object
    next unless node_matches? node, only

    # coerce the node to uuid if told to
    if node.resource?
      if uuids
        # NOTE(review): the ucache check guards only the lookup, not
        # the assignment; when ucache already holds the node, uu is
        # nil and we fall through to canon/identity — confirm intended
        uu = canonical_uuid(repo, node, scache: scache, ucache: ucache) unless
          ucache.key? node
        node = uu || (canon ? canonical_uri(repo, node) : node)
      elsif canon
        node = canonical_uri(repo, node)
      end
    end

    pred = RDF::Vocabulary.find_term(stmt.predicate) || stmt.predicate
    objs = struct[pred] ||= []
    objs.push node if node # may be nil (e.g. a failed uuid coercion)
  end

  # XXX in here we can do fun stuff like filter/sort by language/datatype
  struct.values.each { |v| v.sort!.uniq! }

  struct
end
|
1062
|
+
|
1063
|
+
# Obtain the most appropriate label(s) for the subject's type(s).
# Returns one or more (depending on the `unique` flag)
# predicate-object pairs in order of preference.
#
# @param repo [RDF::Queryable]
# @param subject [RDF::Resource]
# @param candidates [nil, Hash] pre-fetched literal struct for subject
# @param unique [true, false] only return the first pair
# @param type [RDF::Term, Array] supply asserted types if already retrieved
# @param lang [nil] not currently implemented (will be conneg)
# @param desc [false, true] retrieve description instead of label
# @param alt [false, true] retrieve alternate instead of main
#
# @return [Array] either a predicate-object pair or an array of pairs.
#
def self.label_for repo, subject, candidates: nil, unique: true, type: nil,
    lang: nil, desc: false, alt: false, base: nil
  raise ArgumentError, 'no repo!' unless repo.is_a? RDF::Queryable
  return unless subject.is_a? RDF::Value and subject.resource?

  asserted = asserted_types repo, subject, type

  # get all the inferred types by layer; add the default class
  # (rdfs:Resource) as the last resort
  strata = type_strata asserted
  strata.push [RDF::RDFS.Resource] if
    strata.empty? or not strata[-1].include?(RDF::RDFS.Resource)

  # get the key-value pairs for the subject
  candidates ||= struct_for repo, subject, only: :literal

  # walk the strata from most to least specific, accumulating
  # (predicate, value) pairs in preference order without duplicates
  seen  = {}
  accum = []
  strata.each do |stratum|
    stratum.each do |cls|
      next unless STRINGS[cls] and
        preds = STRINGS[cls][desc ? :desc : :label][alt ? 1 : 0]
      preds.each do |p|
        next unless vals = candidates[p]
        vals.each do |v|
          pair = [p, v]
          unless seen[pair]
            accum << pair
            seen[pair] = true
          end
        end
      end
    end
  end

  unique ? accum.first : accum.uniq

  # XXX eventually: filter by desired language(s); note we will
  # probably want to return the predicate as well
end
|
1125
|
+
|
1126
|
+
# Assuming the subject is a thing that has authors, return the
# list of authors. Try bibo:authorList first for an explicit
# ordering, then continue to the various other predicates.
#
# @param repo [RDF::Queryable]
# @param subject [RDF::Resource]
# @param unique [false, true] only return the first author
# @param contrib [false, true] return contributors instead of authors
#
# @return [RDF::Value, Array]
#
def authors_for repo, subject, unique: false, contrib: false, base: nil
  authors = []

  # try the author list
  lp = [RDF::Vocab::BIBO[contrib ? :contributorList : :authorList]]
  lp += lp.first.entail(:equivalentProperty) # XXX cache this
  lp.each do |pred|
    o = repo.first_object([subject, pred, nil])
    next unless o
    # note this use of RDF::List is not particularly well-documented
    authors += RDF::List.from(repo, o).to_a
  end

  # now try various permutations of the author/contributor predicate
  unsorted = []
  preds = contrib ? CONTRIB : AUTHOR
  preds.each do |pred|
    unsorted += repo.query([subject, pred, nil]).objects
  end

  # BUGFIX: prefetch labels for the *unsorted* authors. The original
  # prefetched labels for `authors` (the already-ordered list) and
  # then sorted `unsorted` with them, so every lookup was nil and
  # the sort never did anything.
  labels = unsorted.uniq.map { |a| [a, label_for(repo, a)] }.to_h

  # sort the unlisted authors by label text, tolerating missing labels
  authors += unsorted.uniq.sort do |a, b|
    labels[a].to_a.last.to_s <=> labels[b].to_a.last.to_s
  end

  unique ? authors.first : authors.uniq
end
|
1164
|
+
|
1165
|
+
# Find the terminal replacements for the given subject, if any exist.
#
# @param repo [RDF::Queryable]
# @param subject [RDF::Resource]
# @param published [true, false] indicate the context is published
#
# @return [Array] of terminal replacement resources
#
def self.replacements_for repo, subject, published: true, base: nil
  subject = coerce_resource subject, base

  # `seen` is a hash mapping resources to publication status and
  # subsequent replacements. it collects all the resources in the
  # replacement chain in :fwd (replaces) and :rev (replaced-by)
  # members, along with a boolean :pub. `seen` also performs a
  # duty as cycle-breaking sentinel.

  seen  = {}
  queue = [subject]
  while (test = queue.shift)
    # fwd is "replaces", rev is "replaced by"
    entry = seen[test] ||= {
      pub: published?(repo, test), fwd: Set.new, rev: Set.new }
    # BUGFIX: the original queried `subject` here instead of `test`,
    # so the traversal stopped after a single hop; query the node
    # currently being visited so the whole chain gets walked
    queue += (
      subjects_for(repo, RDF::Vocab::DC.replaces, test) +
      objects_for(repo, test, RDF::Vocab::DC.isReplacedBy,
        only: :resource)
    ).uniq.map do |r| # r = replacement
      next if seen.include? r
      seen[r] ||= { pub: published?(repo, r), fwd: Set.new, rev: Set.new }
      seen[r][:fwd] << test
      entry[:rev] << r
      r
    end.compact.uniq
  end

  # the terminal replacements are those that are themselves not
  # replaced by anything (minus the original subject)
  out = seen.map { |k, v| v[:rev].empty? ? k : nil }.compact - [subject]

  # if we're calling from a published context, we return the
  # (topologically) last *published* resource(s), even if they are
  # replaced ultimately by unpublished resources
  if published
    pubout = out.select { |o| seen[o][:pub] }
    # if there is anything left after this, return it
    return pubout unless pubout.empty?
    # otherwise walk backwards through the graph we just built
    # (union of :fwd members) until published resources turn up
    loop do
      # XXX THIS NEEDS A TEST CASE
      out = seen.values_at(*out).map { |v| v[:fwd] }.reduce(:+).to_a
      break if out.empty?
      pubout = out.select { |o| seen[o][:pub] }
      return pubout unless pubout.empty?
    end
  end

  out
end
|
1231
|
+
|
1232
|
+
# Obtain dates for the subject as instances of Date(Time). This is
# just shorthand for a common application of `objects_for`.
#
# @param repo [RDF::Queryable]
# @param subject [RDF::Resource]
# @param predicate [RDF::URI] defaults to dct:date
# @param datatype [Array<RDF::URI>] acceptable literal datatypes
#
# @return [Array] of dates, sorted and deduplicated
#
def self.dates_for repo, subject, predicate: RDF::Vocab::DC.date,
    datatype: [RDF::XSD.date, RDF::XSD.dateTime]
  found = objects_for(repo, subject, predicate,
    only: :literal, datatype: datatype) { |o| o.object }
  found.sort.uniq
end
|
1249
|
+
|
1250
|
+
# Obtain any specified MIME types for the subject. Just shorthand
# for a common application of `objects_for`.
#
# @param repo [RDF::Queryable]
# @param subject [RDF::Resource]
# @param predicate [RDF::URI] defaults to dct:format
# @param datatype [Array<RDF::URI>] acceptable literal datatypes
#
# @return [Array] of internet media types
#
def formats_for repo, subject, predicate: RDF::Vocab::DC.format,
    datatype: [RDF::XSD.token]
  types = objects_for(repo, subject, predicate,
    only: :literal, datatype: datatype) do |o|
    val = o.object
    # only values shaped like type/subtype count as media types
    val =~ /\// ? RDF::SAK::MimeMagic.new(val.to_s.downcase) : nil
  end
  types.compact.sort.uniq
end
|
1268
|
+
|
1269
|
+
# Obtain the base URI in effect for the given XML node: an absolute
# html <base> (for (X)HTML documents) or xml:base declaration when
# present, otherwise the supplied default.
#
# @param xmlnode [Nokogiri::XML::Node] the context node
# @param base [URI, #to_s] the fallback base URI
#
# @return [URI]
def self.base_for xmlnode, base
  base = URI(base.to_s) unless base.is_a? URI
  out  = base

  if xmlnode.at_xpath('self::html:*|/html', XPATHNS)
    # (X)HTML documents declare their base via the <base> element
    candidate = URI(xmlnode.at_xpath(XPATH[:htmlbase], XPATHNS).to_s.strip)
    out = candidate if candidate.absolute?
  elsif candidate = xmlnode.root.at_xpath(XPATH[:xmlbase])
    # everything else may carry xml:base
    candidate = URI(candidate.to_s.strip)
    out = candidate if candidate.absolute?
  end

  out
end
|
1284
|
+
|
1285
|
+
# Traverse links based on content type.
#
# @param node [Nokogiri::XML::Node] the context node
# @param type [String] content type governing which links to select
# @yield [node] each matching link node
# @return [Enumerator] when no block is given
def self.traverse_links node, type: 'application/xhtml+xml', &block
  # BUGFIX: the original computed enum_for but failed to `return`
  # it, so calling without a block silently did nothing
  return enum_for :traverse_links, node, type: type unless block
  # strip any media-type parameters (e.g. ";charset=utf-8")
  type  = type.strip.downcase.gsub(/\s*;.*/, '')
  xpath = LINK_MAP.fetch type, XPATH[:xlinks]
  node.xpath(xpath, XPATHNS).each { |n| block.call n }
end
|
1292
|
+
|
1293
|
+
|
1294
|
+
# XXX OTHER STUFF
|
1295
|
+
|
1296
|
+
# Isolate an element into a new document.
#
# @param doc [Nokogiri::XML::Document] the source document
# @param xpath [String] selector for the element to isolate
# @param reindent [true, false] whether to reindent the result
# @param prefixes [Hash] namespace prefixes for the xpath
#
# @return [Nokogiri::XML::Document, Nokogiri::XML::Node, nil]
def subtree doc, xpath = '/*', reindent: true, prefixes: {}
  # at this time we shouldn't try to do anything cute with the xpath
  # even though it is attractive to want to prune out prefixes

  # the whole root is a noop
  return doc.root.dup if xpath == '/*'

  begin
    found = doc.xpath xpath, prefixes
    return unless
      found and found.is_a?(Nokogiri::XML::NodeSet) and !found.empty?
    isolated = Nokogiri::XML::Document.new
    isolated << found.first.dup
    reindent isolated.root if reindent
    isolated
  rescue Nokogiri::SyntaxError
    return
  end
end
|
1316
|
+
|
1317
|
+
# Reindent the text nodes under +node+ so sibling elements line up
# at +depth+ levels of +indent+. Mutates and returns +node+.
#
# @param node [Nokogiri::XML::Node] the node whose children to reindent
# @param depth [Integer] current nesting depth
# @param indent [String] one level's worth of indentation
# @return [Nokogiri::XML::Node] the same node, reindented
def reindent node, depth = 0, indent = ' '
  children = node.children
  if children and child = children.first
    loop do
      if child.element?
        # recurse into the element
        reindent child, depth + 1, indent
      elsif child.text?
        text = child.content || ''

        # strip optional horizontal whitespace followed by at least
        # one newline (any kind), followed by optional horizontal or
        # vertical whitespace; remember whether anything was stripped
        preamble = !!text.gsub!(/\A[ \t]*[\r\n]+\s*/, '')

        # normalize away carriage returns in the middle; they can
        # always be put back later if absolutely necessary
        text.gsub!(/\r+/, '')

        # strip any whitespace followed by at least one newline and
        # optional horizontal whitespace at the end of the string
        epilogue = !!text.gsub!(/\s*[\r\n]+[ \t]*\z/, '')

        # if the remainder is non-empty we *prepend* indentation
        # (one level deeper unless this is the first child)
        if preamble and !text.empty?
          d = depth + (child.previous ? 1 : 0)
          text = "\n" + (indent * d) + text
        end

        # then we unconditionally *append* indentation — provided
        # the original had a newline at all — one level deeper
        # unless this is the last child
        if preamble or epilogue
          d = depth + (child.next ? 1 : 0)
          text << "\n" + (indent * d)
        end

        child.content = text
      end

      break unless child = child.next
    end
  end

  node
end
|
1373
|
+
|
1374
|
+
# XHTML element namespace
XHTMLNS = 'http://www.w3.org/1999/xhtml'.freeze
# XHTML vocabulary (RDFa link relations etc.)
XHV = 'http://www.w3.org/1999/xhtml/vocab#'.freeze

# namespace bindings handed to Nokogiri xpath queries
XPATHNS = {
  html:  XHTMLNS,
  svg:   'http://www.w3.org/2000/svg',
  atom:  'http://www.w3.org/2005/Atom',
  xlink: 'http://www.w3.org/1999/xlink',
}.freeze
|
1382
|
+
|
1383
|
+
######## URI STUFF ########
|
1384
|
+
|
1385
|
+
# Preprocess a URI string so that it can be handed to +URI.parse+
# without crashing.
#
# @param uri [#to_s] The URI string in question
# @param extra [#to_s] Character class of any extra characters to escape
# @return [String] The sanitized (appropriately escaped) URI string
def uri_pp uri, extra = ''
  # take care of malformed escapes first
  str = uri.to_s.b.gsub(/%(?![0-9A-Fa-f]{2})/n, '%25')

  # percent-encode any caller-supplied extra characters
  str.gsub!(/([#{Regexp.quote extra}])/) do |c|
    sprintf('%%%02X', c.ord)
  end unless extra.empty?

  # we want the minimal amount of escaping so we split out the
  # separators and escape each component on its own
  out   = ''
  parts = RFC3986.match(str).captures
  parts.each_index do |i|
    part = parts[i]
    next if part.nil?
    out << SEPS[i].first
    out << part.b.gsub(SF) { |c| sprintf('%%%02X', c.ord) }
    out << SEPS[i].last
  end

  # make sure escaped hex is upper case like the rfc says
  out.gsub(/(%[0-9A-Fa-f]{2})/) { |hex| hex.upcase }
end
|
1412
|
+
|
1413
|
+
# Given a URI as input, split any query parameters into an array of
# key-value pairs. If +:only+ is true, this will just return the
# pairs. Otherwise it will prepend the query-less URI to the array,
# and can be captured with an idiom like +uri, *qp = split_qp uri+.
#
# @param uri [URI,#to_s] The URI to extract parameters from
# @param only [false, true] whether to only return the parameters
# @return [Array] (See description)
#
def split_qp uri, only: false
  uri = URI(uri_pp uri.to_s) unless uri.is_a? URI
  # BUGFIX: the original passed a nil query straight to
  # decode_www_form, which raises; a URI with no query string
  # simply has no parameters
  qp = uri.query ? URI.decode_www_form(uri.query) : []
  return qp if only
  uri.query = nil
  [uri] + qp
end
|
1429
|
+
|
1430
|
+
# Given a URI as input, split any path parameters out of the last
# path segment. Works the same way as #split_qp.
#
# @param uri [URI,#to_s] The URI to extract parameters from
# @param only [false, true] whether to only return the parameters
# @return [Array] (See description)
#
def split_pp uri, only: false
  begin
    u = (uri.is_a?(URI) ? uri : URI(uri_pp uri.to_s)).normalize
  rescue URI::InvalidURIError => e
    # these stock error messages don't even tell you what the uri is
    raise URI::InvalidURIError, "#{e.message} (#{uri.to_s})"
  end

  return only ? [] : [uri] unless u.path
  uri = u

  segments = uri.path.split '/', -1
  params   = segments.pop.split ';', -1
  basepath = (segments + [params.shift]).join '/'
  uri      = uri.dup

  begin
    uri.path = basepath
  rescue URI::InvalidURIError => e
    # these stock error messages don't even tell you what the uri is
    raise URI::InvalidURIError, "#{e.message} (#{uri.to_s}, #{basepath})"
  end

  return params if only
  [uri] + params
end
|
1465
|
+
|
1466
|
+
# Split path parameters out of the final segment of a bare path
# string (no URI object involved). Works the same way as #split_pp.
#
# @param path [#to_s] the path to split
# @param only [false, true] whether to only return the parameters
# @return [Array] the base path (unless +only+) plus any parameters
def split_pp2 path, only: false
  # ''.split returns [], so the empty path needs a special case
  return only ? [] : [''] if !path or path.empty?

  segments = path.to_s.split ?/, -1        # path segments
  params   = segments.pop.to_s.split ?;, -1 # path parameters
  basepath = (segments + [params.shift]).join ?/ # base path

  only ? params : [basepath] + params
end
|
1476
|
+
|
1477
|
+
# Coerce a stringlike argument into a URI. Raises an exception if
# the string can't be turned into a valid URI. Optionally resolves
# against a +base+, and the coercion can be tuned to either URI or
# RDF::URI via +:as+.
#
# @param arg [URI, RDF::URI, #to_s] The input string
# @param base [URI, RDF::URI, #to_s] The optional base URI
# @param as [:rdf, :uri, nil] The optional coercion type
# @return [URI, RDF::URI, String]
#
def coerce_resource arg, base = nil, as: :rdf
  as = assert_uri_coercion as
  return arg if as and arg.is_a?({ uri: URI, rdf: RDF::URI }[as])
  raise ArgumentError, 'arg must be stringable' unless arg.respond_to? :to_s

  arg = arg.to_s.strip

  if arg.start_with? '_:' and as
    # blank nodes can only be represented as RDF, so override
    as = :rdf
  elsif base
    begin
      resolver = base.is_a?(URI) ? base : URI(uri_pp base.to_s.strip)
      arg = resolver.merge arg
    rescue URI::InvalidURIError => e
      warn "attempted to coerce #{arg} which turned out to be invalid: #{e}"
      return
    end
  end

  URI_COERCIONS[as].call arg
end
|
1508
|
+
|
1509
|
+
# Coerce a stringlike argument into a UUID URN, converting an
# NCName-encoded form if necessary. Raises unless the result is a
# urn:uuid:.
#
# @param arg [#to_s, URI, RDF::URI] the UUID, NCName form, or URN
# @param base [nil, URI, RDF::URI] base for the final coercion
# @return [RDF::URI] the urn:uuid: resource
def coerce_uuid_urn arg, base = nil
  # if this is an ncname then change it
  if ([URI, RDF::URI] & arg.class.ancestors).empty? &&
      arg.respond_to?(:to_s)
    arg = arg.to_s

    # coerce ncname to uuid
    arg = UUID::NCName::from_ncname(arg, version: 1) if arg =~
      /^[A-P](?:[0-9A-Z_-]{20}|[2-7A-Z]{24})[A-P]$/i

    # now the string is either a UUID or it isn't
    arg = "urn:uuid:#{arg}" unless arg.start_with? 'urn:uuid:'
  else
    # downcase an existing URI object unless already lowercase
    arg = arg.class.new arg.to_s.downcase unless arg == arg.to_s.downcase
  end

  # note this 8-4-4-4-(4+8) pattern is equivalent to the canonical
  # 8-4-4-4-12 grouping
  raise ArgumentError, 'not a UUID' unless
    arg.to_s =~ /^urn:uuid:[0-9a-f]{8}(?:-[0-9a-f]{4}){4}[0-9a-f]{8}$/

  coerce_resource arg, base
end
|
1531
|
+
|
1532
|
+
# Get the last non-empty path segment of the URI, sans any path
# parameters.
#
# @param uri [URI, RDF::URI, #to_s]
# @param base [nil, URI, RDF::URI] base for coercion
#
# @return [String, nil] the slug ('' when the path has no segments)
def terminal_slug uri, base: nil
  uri = coerce_resource uri, base
  return unless uri.respond_to? :path
  if path = uri.path
    if trimmed = /^\/+(.*?)\/*$/.match(path)
      if last = trimmed[1].split(/\/+/).last
        # we need to escape colons or it will think it's absolute
        return uri_pp(last.split(/;+/).first || '', ':')
      end
    end
  end
  ''
end
|
1550
|
+
|
1551
|
+
# Resolve a string or array or attribute node containing one or more
# terms/CURIEs against a set of prefixes. The CURIE can be a string,
# Nokogiri::XML::Attr, or an array thereof. Strings are stripped and
# split on whitespace. +:prefixes+ and +:base+ can be supplied or
# gleaned from +:refnode+, which itself can be gleaned if +curie+ is
# a Nokogiri::XML::Attr. Returns an array of (attempted) resolved
# terms unless +:scalar+ is true, in which case only the first URI
# is returned. When +:noop+ is true, this method will always return
# a value. Can coerce results to either RDF::URI or URI objects.
#
# @note +:vocab+ overrides, and is the same as supplying
#  +prefix[nil]+. It is only meaningful when +:term+ (i.e., when we
#  expect the input to be an RDFa term) is true.
#
# @param curie [#to_s, Nokogiri::XML::Attr,Array] One or more CURIEs
# @param prefixes [#to_h] The hash of prefixes (nil key is equivalent
#  to vocab)
# @param vocab [nil,#to_s] An optional base URI
# @param refnode [nil, Nokogiri::XML::Element] A reference node for resolution
# @param term [false, true] Whether to treat the input as an RDFa _term_
# @param noop [true, false] Whether to skip if the CURIE can't be resolved
# @param scalar [false, true] Whether to return a scalar value
# @param coerce [nil, :rdf, :uri] Desired type coercion for the output
#
# @return [nil,URI,RDF::URI,Array<nil,URI,RDF::URI>]
#
def resolve_curie curie, prefixes: {}, vocab: nil, base: nil,
    refnode: nil, term: false, noop: true, scalar: false, coerce: nil
  prefixes = sanitize_prefixes prefixes

  raise 'coerce must be either :uri or :rdf' if coerce and
    not %i[uri rdf].include?(coerce)

  # normalize curie to an array of strings; glean refnode from an
  # attribute node when possible
  if curie.is_a? Nokogiri::XML::Attr
    refnode ||= curie.parent
    curie = curie.value.strip.split
  elsif curie.respond_to? :to_a
    curie = curie.to_a
    raise ArgumentError,
      'if curie is an array, it has to be all strings' unless
      curie.all? { |x| x.respond_to? :to_s }
    curie = curie.map { |x| x.to_s.strip.split }.flatten
  else
    raise ArgumentError, 'curie must be stringable' unless
      curie.respond_to? :to_s
    curie = curie.to_s.strip.split
  end

  if refnode
    raise ArgumentError, 'refnode must be an element' unless
      refnode.is_a? Nokogiri::XML::Element
    prefixes = get_prefixes refnode if prefixes.empty?
  end

  # an explicit vocab overwrites the nil prefix
  if vocab
    raise ArgumentError, 'vocab must be stringable' unless
      vocab.respond_to? :to_s
    prefixes[nil] = vocab.to_s.strip
  end

  resolved = curie.map do |c|
    prefix, slug = /^\[?(?:([^:]+):)?(.*?)\]?$/.match(c).captures
    prefix = prefix.to_sym if prefix
    expansion = if prefixes[prefix]
                  prefixes[prefix] + slug
                else
                  # unresolvable: pass through or drop per noop
                  noop ? c : nil
                end
    expansion && coerce ? URI_COERCIONS[coerce].call(expansion) : expansion
  end

  scalar ? resolved.first : resolved
end
|
1626
|
+
|
1627
|
+
# Abbreviate one or more URIs into one or more CURIEs if we can.
# Will pass terms through if +noop:+ is true, or otherwise return
# nil for any URI that can't be abbreviated. Takes a hash of
# prefix-URI mappings where the keys are assumed to be symbols or
# +nil+ to express the current vocabulary, which can be overridden
# via +vocab:+.
#
# @note Only +noop: true+ can be guaranteed to return a value.
#
# @param term [Array<#to_s>, #to_s] the term(s)
# @param prefixes [Hash<Symbol,nil>, #to_h] the prefix mappings
# @param vocab [#to_s] current vocabulary, overrides +prefixes[nil]+
# @param noop [true, false] whether or not to pass terms through
# @param sort [true, false] whether or not to sort (only if +noop:+)
# @return [String, nil, Array<String,nil>] the (maybe) abbreviated term(s)
#
def abbreviate term, prefixes: {}, vocab: nil, noop: true, sort: true
  # this returns a duplicate that we can mess with
  prefixes = sanitize_prefixes prefixes

  # sanitize vocab
  raise ArgumentError, 'vocab must be nil or stringable' unless
    vocab.nil? or vocab.respond_to? :to_s
  prefixes[nil] = vocab.to_s if vocab

  scalar = true
  term = if term.respond_to? :to_a
           scalar = false
           term.to_a
         else
           [term]
         end

  # invert so we can look prefixes up by their namespace URI
  rev = prefixes.invert

  term.map! do |t|
    t = t.to_s
    slug = nil # nil signals "no match" when noop is false

    # try matching each prefix URI from longest to shortest
    rev.sort { |a, b| b.first.length <=> a.first.length }.each do |uri, pfx|
      slug = t.delete_prefix uri
      if slug == t or pfx.nil? && slug.empty?
        # the URI either doesn't match or abbreviates to ""
        slug = nil
      else
        # prepend the prefix if there is one
        slug = '%s:%s' % [pfx, slug] unless pfx.nil?
        break # we have our match
      end
    end

    # at this point slug is either an abbreviated term or nil, so:
    slug ||= t if noop
    slug
  end

  # only sort if noop is set
  term.sort! if noop && sort

  scalar ? term.first : term
end
|
1687
|
+
|
1688
|
+
######## RDFA/XML STUFF ########
|
1689
|
+
|
1690
|
+
# Returns the base URI from the perspective of the given element.
|
1691
|
+
# Can optionally be coerced into either a URI or RDF::URI. Also
|
1692
|
+
# takes a default value.
|
1693
|
+
#
|
1694
|
+
# @param elem [Nokogiri::XML::Node] the context element
|
1695
|
+
# @param default [nil, #to_s] the default URI
|
1696
|
+
# @param coerce [nil, :uri, :rdf] the coercion scheme, if any
|
1697
|
+
# @return [nil, String, URI, RDF::URI] the context's base URI
|
1698
|
+
def get_base elem, default: nil, coerce: nil
|
1699
|
+
assert_uri_coercion coerce
|
1700
|
+
|
1701
|
+
if elem.document?
|
1702
|
+
elem = elem.root
|
1703
|
+
return unless elem
|
1704
|
+
end
|
1705
|
+
|
1706
|
+
# get the xpath
|
1707
|
+
xpath = (elem.namespace && elem.namespace.href == XHTMLNS or
|
1708
|
+
elem.at_xpath('/html')) ? :htmlbase : :xmlbase
|
1709
|
+
|
1710
|
+
# now we go looking for the attribute
|
1711
|
+
if base = elem.at_xpath(XPATH[xpath], XPATHNS)
|
1712
|
+
base = base.value.strip
|
1713
|
+
else
|
1714
|
+
base = default.to_s.strip if default
|
1715
|
+
end
|
1716
|
+
|
1717
|
+
# clear it out if it's the empty string
|
1718
|
+
base = nil if base and base.empty?
|
1719
|
+
|
1720
|
+
# eh that's about all the input sanitation we're gonna get
|
1721
|
+
base && coerce ? URI_COERCIONS[coerce].call(base) : base
|
1722
|
+
end
|
1723
|
+
|
1724
|
+
# Given an X(HT)ML element, returns a hash of prefixes of the form
|
1725
|
+
# +{ prefix: "vocab" }+, where the current +@vocab+ is represented
|
1726
|
+
# by the +nil+ key. An optional +:traverse+ parameter can be set to
|
1727
|
+
# +false+ to prevent ascending the node tree. Any XML namespace
|
1728
|
+
# declarations are superseded by the +@prefix+ attribute. Returns
|
1729
|
+
# any +@vocab+ declaration found as the +nil+ key.
|
1730
|
+
#
|
1731
|
+
# @note The +descend: true+ parameter assumes we are trying to
|
1732
|
+
# collect all the namespaces in use in the entire subtree, rather
|
1733
|
+
# than resolve any particular CURIE. As such, the _first_ prefix
|
1734
|
+
# mapping in document order is preserved over subsequent/descendant
|
1735
|
+
# ones.
|
1736
|
+
#
|
1737
|
+
# @param elem [Nokogiri::XML::Node] The context element
|
1738
|
+
# @param traverse [true, false] whether or not to traverse the tree
|
1739
|
+
# @param coerce [nil, :rdf, :uri] a type coercion for the URIs, if any
|
1740
|
+
# @param descend [false, true] go _down_ the tree instead of up
|
1741
|
+
# @return [Hash] Depending on +:traverse+, either all prefixes
|
1742
|
+
# merged, or just the ones asserted in the element.
|
1743
|
+
def get_prefixes elem, traverse: true, coerce: nil, descend: false
|
1744
|
+
coerce = assert_uri_coercion coerce
|
1745
|
+
|
1746
|
+
# deal with a common phenomenon
|
1747
|
+
elem = elem.root if elem.is_a? Nokogiri::XML::Document
|
1748
|
+
|
1749
|
+
# get namespace definitions first
|
1750
|
+
prefix = elem.namespaces.reject do |k, _| k == 'xmlns'
|
1751
|
+
end.transform_keys { |k| k.split(?:)[1].to_sym }
|
1752
|
+
|
1753
|
+
# now do the prefix attribute
|
1754
|
+
if elem.key? 'prefix'
|
1755
|
+
# XXX note this assumes largely that the input is clean
|
1756
|
+
elem['prefix'].strip.split.each_slice(2) do |k, v|
|
1757
|
+
pfx = k.split(?:)[0] or next # otherwise error
|
1758
|
+
prefix[pfx.to_sym] = v
|
1759
|
+
end
|
1760
|
+
end
|
1761
|
+
|
1762
|
+
# encode the vocab as the null prefix
|
1763
|
+
if vocab = elem['vocab']
|
1764
|
+
vocab.strip!
|
1765
|
+
# note that a specified but empty @vocab means kill any existing vocab
|
1766
|
+
prefix[nil] = vocab.empty? ? nil : vocab
|
1767
|
+
end
|
1768
|
+
|
1769
|
+
# don't forget we can coerce
|
1770
|
+
prefix.transform_values! { |v| COERCIONS[coerce].call v } if coerce
|
1771
|
+
|
1772
|
+
# don't proceed if `traverse` is false
|
1773
|
+
return prefix unless traverse
|
1774
|
+
|
1775
|
+
# save us having to recurse in ruby by using xpath implemented in c
|
1776
|
+
xpath = '%s::*[namespace::*|@prefix|@vocab]' %
|
1777
|
+
(descend ? :descendant : :ancestor)
|
1778
|
+
elem.xpath(xpath).each do |e|
|
1779
|
+
# this will always merge our prefix on top irrespective of direction
|
1780
|
+
prefix = get_prefix(e, traverse: false, coerce: coerce).merge prefix
|
1781
|
+
end
|
1782
|
+
|
1783
|
+
prefix
|
1784
|
+
end
|
1785
|
+
|
1786
|
+
# Given an X(HT)ML element, return the nearest RDFa _subject_.
|
1787
|
+
# Optionally takes +:prefix+ and +:base+ parameters which override
|
1788
|
+
# anything found in the document tree.
|
1789
|
+
#
|
1790
|
+
# @param node [Nokogiri::XML::Element] the node
|
1791
|
+
# @param prefixes [Hash] Prefix mapping. Overrides derived values.
|
1792
|
+
# @param base [#to_s,URI,RDF::URI] Base URI, overrides as well.
|
1793
|
+
# @param coerce [nil, :rdf, :uri] the coercion regime
|
1794
|
+
#
|
1795
|
+
# @return [URI,RDF::URI,String] the subject
|
1796
|
+
#
|
1797
|
+
def subject_for node, prefixes: nil, base: nil, coerce: :rdf
|
1798
|
+
assert_xml_node node
|
1799
|
+
coerce = assert_uri_coercion coerce
|
1800
|
+
|
1801
|
+
if n = node.at_xpath(XPATH[:literal])
|
1802
|
+
return internal_subject_for n,
|
1803
|
+
prefixes: prefixes, base: base, coerce: coerce
|
1804
|
+
end
|
1805
|
+
|
1806
|
+
internal_subject_for node, prefixes: prefixes, base: base, coerce: coerce
|
1807
|
+
end
|
1808
|
+
|
1809
|
+
    # Normalize legacy markup in a document, in place. Every element
    # matching +XPATH[:modernize]+ is dispatched to the handler block
    # registered under its tag name in the +MODERNIZE+ table.
    #
    # @param doc [Nokogiri::XML::Document] the document to fix up
    # @return [Object] the scanned node set (return value incidental;
    #  the document is modified in place)
    def modernize doc
      doc.xpath(XPATH[:modernize], XPATHNS).each do |e|
        # gotta instance_exec because `markup` is otherwise unbound
        instance_exec e, &MODERNIZE[e.name.to_sym]
      end
    end
|
1815
|
+
|
1816
|
+
# Strip all the links surrounding and RDFa attributes off
|
1817
|
+
# +dfn+/+abbr+/+span+ tags. Assuming a construct like +<a
|
1818
|
+
# rel="some:relation" href="#..." typeof="skos:Concept"><dfn
|
1819
|
+
# property="some:property">Term</dfn></a>+ is a link to a glossary
|
1820
|
+
# entry, this method returns the term back to an undecorated state
|
1821
|
+
# (+<dfn>Term</dfn>+).
|
1822
|
+
|
1823
|
+
def dehydrate doc
|
1824
|
+
doc.xpath(XPATH[:dehydrate], XPATHNS).each do |e|
|
1825
|
+
e = e.replace e.elements.first.dup
|
1826
|
+
%w[about resource typeof rel rev property datatype].each do |a|
|
1827
|
+
e.delete a if e.key? a
|
1828
|
+
end
|
1829
|
+
end
|
1830
|
+
end
|
1831
|
+
|
1832
|
+
# Scan all the +dfn+/+abbr+/+span+ tags in the document that are not
|
1833
|
+
# already wrapped in a link. This method scans the text (or
|
1834
|
+
# +@content+) of each element and compares it to the contents of the
|
1835
|
+
# graph. If the process locates a subject, it will use that subject
|
1836
|
+
# as the basis of a link. if there are zero subjects, or more than
|
1837
|
+
# one, then the method executes a block which can be used to pick
|
1838
|
+
# (e.g., via user interface) a definite subject or otherwise add one.
|
1839
|
+
|
1840
|
+
# (maybe add +code+/+kbd+/+samp+/+var+/+time+ one day too)
|
1841
|
+
|
1842
|
+
def rehydrate doc, graph, &block
|
1843
|
+
doc.xpath(XPATH[:rehydrate], XPATHNS).each do |e|
|
1844
|
+
lang = e.xpath(XPATH[:lang]).to_s.strip
|
1845
|
+
# dt = e['datatype'] # XXX no datatype rn
|
1846
|
+
text = (e['content'] || e.xpath('.//text()').to_a.join).strip
|
1847
|
+
|
1848
|
+
# now we have the literal
|
1849
|
+
lit = [RDF::Literal(text)]
|
1850
|
+
lit.unshift RDF::Literal(text, language: lang) unless lang.empty?
|
1851
|
+
|
1852
|
+
# candidates
|
1853
|
+
cand = {}
|
1854
|
+
lit.map do |t|
|
1855
|
+
graph.query(object: t).to_a
|
1856
|
+
end.flatten.each do |x|
|
1857
|
+
y = cand[x.subject] ||= {}
|
1858
|
+
(y[:stmts] ||= []) << x
|
1859
|
+
y[:types] ||= graph.query([x.subject, RDF.type, nil]).objects.sort
|
1860
|
+
end
|
1861
|
+
|
1862
|
+
# if there's only one candidate, this is basically a noop
|
1863
|
+
chosen = cand.keys.first if cand.size == 1
|
1864
|
+
|
1865
|
+
# call the block to reconcile any gaps or conflicts
|
1866
|
+
if block_given? and cand.size != 1
|
1867
|
+
# the block is expected to return one of the candidates or
|
1868
|
+
# nil. we call the block with the graph so that the block can
|
1869
|
+
# manipulate its contents.
|
1870
|
+
chosen = block.call cand, graph
|
1871
|
+
raise ArgumentError, 'block must return nil or a term' unless
|
1872
|
+
chosen.nil? or chosen.is_a? RDF::Term
|
1873
|
+
end
|
1874
|
+
|
1875
|
+
if chosen
|
1876
|
+
# we assume this has been retrieved from the graph
|
1877
|
+
cc = cand[chosen]
|
1878
|
+
unless cc
|
1879
|
+
cc = cand[chosen] = {}
|
1880
|
+
cc[:stmts] = graph.query([chosen, nil, lit[0]]).to_a.sort
|
1881
|
+
cc[:types] = graph.query([chosen, RDF.type, nil]).objects.sort
|
1882
|
+
# if either of these are empty then the graph was not
|
1883
|
+
# appropriately populated
|
1884
|
+
raise 'Missing a statement relating #{chosen} to #{text}' if
|
1885
|
+
cc[:stmts].empty?
|
1886
|
+
end
|
1887
|
+
|
1888
|
+
# we should actually probably move any prefix/vocab/xmlns
|
1889
|
+
# declarations from the inner node to the outer one (although
|
1890
|
+
# in practice this will be an unlikely configuration)
|
1891
|
+
pfx = get_prefixes e
|
1892
|
+
|
1893
|
+
# here we have pretty much everything except for the prefixes
|
1894
|
+
# and wherever we want to actually link to.
|
1895
|
+
|
1896
|
+
inner = e.dup
|
1897
|
+
spec = { [inner] => :a, href: '' }
|
1898
|
+
# we should have types
|
1899
|
+
spec[:typeof] = abbreviate cc[:types], prefixes: pfx unless
|
1900
|
+
cc[:types].empty?
|
1901
|
+
|
1902
|
+
markup replace: e, spec: spec
|
1903
|
+
end
|
1904
|
+
end
|
1905
|
+
# return maybe the elements that did/didn't get changed?
|
1906
|
+
end
|
1907
|
+
|
1908
|
+
######## RENDERING STUFF ########
|
1909
|
+
|
1910
|
+
# Given a structure of the form +{ predicate => [objects] }+,
|
1911
|
+
# rearrange the structure into one more amenable to rendering
|
1912
|
+
# RDFa. Returns a hash of the form +{ resources: { r1 => Set[p1, pn]
|
1913
|
+
# }, literals: { l1 => Set[p2, pm] }, types: Set[t1, tn], datatypes:
|
1914
|
+
# Set[d1, dn] }+. This inverted structure can then be conveniently
|
1915
|
+
# traversed to generate the RDFa. An optional block lets us examine
|
1916
|
+
# the predicate-object pairs as they go by.
|
1917
|
+
#
|
1918
|
+
# @param struct [Hash] The struct of the designated form
|
1919
|
+
# @yield [p, o] An optional block is given the predicate-object pair
|
1920
|
+
# @return [Hash] The inverted structure, as described.
|
1921
|
+
#
|
1922
|
+
def prepare_collation struct, &block
|
1923
|
+
resources = {}
|
1924
|
+
literals = {}
|
1925
|
+
datatypes = Set.new
|
1926
|
+
types = Set.new
|
1927
|
+
|
1928
|
+
struct.each do |p, v|
|
1929
|
+
v.each do |o|
|
1930
|
+
block.call p, o if block
|
1931
|
+
|
1932
|
+
if o.literal?
|
1933
|
+
literals[o] ||= Set.new
|
1934
|
+
literals[o].add p
|
1935
|
+
# collect the datatype
|
1936
|
+
datatypes.add o.datatype if o.has_datatype?
|
1937
|
+
else
|
1938
|
+
if p == RDF::RDFV.type
|
1939
|
+
# separate the type
|
1940
|
+
types.add o
|
1941
|
+
else
|
1942
|
+
# collect the resource
|
1943
|
+
resources[o] ||= Set.new
|
1944
|
+
resources[o].add p
|
1945
|
+
end
|
1946
|
+
end
|
1947
|
+
end
|
1948
|
+
end
|
1949
|
+
|
1950
|
+
{ resources: resources, literals: literals,
|
1951
|
+
datatypes: datatypes, types: types }
|
1952
|
+
end
|
1953
|
+
|
1954
|
+
# Given a hash of prefixes and an array of nodes, obtain the the
|
1955
|
+
# subset of prefixes that abbreviate the nodes. Scans RDF URIs as
|
1956
|
+
# well as RDF::Literal datatypes.
|
1957
|
+
#
|
1958
|
+
# @param prefixes [#to_h] The prefixes, of the form +{ k: "v" }+
|
1959
|
+
# @param nodes [Array<RDF::Term>] The nodes to supply
|
1960
|
+
# @return [Hash] The prefix subset
|
1961
|
+
def prefix_subset prefixes, nodes
|
1962
|
+
prefixes = sanitize_prefixes prefixes, true
|
1963
|
+
|
1964
|
+
raise 'nodes must be arrayable' unless nodes.respond_to? :to_a
|
1965
|
+
|
1966
|
+
# sniff out all the URIs and datatypes
|
1967
|
+
resources = Set.new
|
1968
|
+
nodes.each do |n|
|
1969
|
+
next unless n.is_a? RDF::Term
|
1970
|
+
if n.literal? && n.datatype?
|
1971
|
+
resources << n.datatype
|
1972
|
+
elsif n.uri?
|
1973
|
+
resources << n
|
1974
|
+
end
|
1975
|
+
end
|
1976
|
+
|
1977
|
+
# now we abbreviate all the resources
|
1978
|
+
pfx = abbreviate(resources.to_a,
|
1979
|
+
prefixes: prefixes, noop: false, sort: false).uniq.compact.map do |p|
|
1980
|
+
p.split(?:).first.to_sym
|
1981
|
+
end.uniq.to_set
|
1982
|
+
|
1983
|
+
# now we return the subset
|
1984
|
+
prefixes.select { |k, _| pfx.include? k.to_sym }
|
1985
|
+
end
|
1986
|
+
|
1987
|
+
# turns any data structure into a set of nodes
|
1988
|
+
def smush_struct struct
|
1989
|
+
out = Set.new
|
1990
|
+
|
1991
|
+
if struct.is_a? RDF::Term
|
1992
|
+
out << struct
|
1993
|
+
elsif struct.respond_to? :to_a
|
1994
|
+
out |= struct.to_a.map { |s| smush_struct(s).to_a }.flatten.to_set
|
1995
|
+
end
|
1996
|
+
|
1997
|
+
out
|
1998
|
+
end
|
1999
|
+
|
2000
|
+
def invert_struct struct
|
2001
|
+
nodes = {}
|
2002
|
+
|
2003
|
+
struct.each do |p, v|
|
2004
|
+
v.each do |o|
|
2005
|
+
nodes[o] ||= Set.new
|
2006
|
+
nodes[o] << p
|
2007
|
+
end
|
2008
|
+
end
|
2009
|
+
|
2010
|
+
nodes
|
2011
|
+
end
|
2012
|
+
|
2013
|
+
    # Build a tag spec for a document title of the form
    # +{ '#title' => text, property: ..., lang: ..., datatype: ... }+,
    # suitable for handing to the markup generator.
    #
    # @param predicates [Array] predicates abbreviated into +property=+
    # @param content [#to_s] the title content; presumably an
    #  RDF::Literal, since we interrogate +language?+/+datatype+ —
    #  TODO confirm against callers
    # @param prefixes [Hash] prefix mappings for abbreviation
    # @param vocab [#to_s, nil] vocabulary override for abbreviation
    # @param lang [String, nil] language of the surrounding body
    # @param xhtml [true, false] whether to also emit +xml:lang+
    # @return [Hash] the tag spec
    def title_tag predicates, content,
        prefixes: {}, vocab: nil, lang: nil, xhtml: true

      # begin with the tag
      tag = { '#title' => content.to_s,
        property: abbreviate(predicates, prefixes: prefixes, vocab: vocab) }

      # we set the language if it exists and is different from the
      # body OR if it is xsd:string we set it to the empty string
      # (an empty lang attribute unsets the inherited language)
      lang = (content.language? && content.language != lang ?
        content.language : nil) || (content.datatype == RDF::XSD.string &&
        lang ? '' : nil)
      if lang
        # xml:lang only belongs on xhtml output
        tag['xml:lang'] = lang if xhtml
        tag[:lang] = lang
      end
      # a non-string datatype gets abbreviated onto the tag too
      if content.datatype? && content.datatype != RDF::XSD.string
        tag[:datatype] = abbreviate(content.datatype,
          prefixes: prefixes, vocab: vocab)
      end

      tag
    end
|
2036
|
+
|
2037
|
+
######## MISC STUFF ########
|
2038
|
+
|
2039
|
+
# Obtain everything that is an owl:equivalentClass or
|
2040
|
+
# rdfs:subClassOf the given type.
|
2041
|
+
#
|
2042
|
+
# @param rdftype [RDF::Term]
|
2043
|
+
#
|
2044
|
+
# @return [Array]
|
2045
|
+
|
2046
|
+
def all_related rdftype
|
2047
|
+
t = RDF::Vocabulary.find_term(rdftype) or raise "No type #{rdftype.to_s}"
|
2048
|
+
q = [t] # queue
|
2049
|
+
c = {} # cache
|
2050
|
+
|
2051
|
+
while term = q.shift
|
2052
|
+
# add term to cache
|
2053
|
+
c[term] = term
|
2054
|
+
|
2055
|
+
# keep this from tripping up
|
2056
|
+
next unless term.uri? and term.respond_to? :class?
|
2057
|
+
|
2058
|
+
# entail equivalent classes
|
2059
|
+
term.entail(:equivalentClass).each do |ec|
|
2060
|
+
# add equivalent classes to queue (if not already cached)
|
2061
|
+
q.push ec unless c[ec]
|
2062
|
+
c[ec] = ec unless ec == term
|
2063
|
+
end
|
2064
|
+
|
2065
|
+
# entail subclasses
|
2066
|
+
term.subClass.each do |sc|
|
2067
|
+
# add subclasses to queue (if not already cached)
|
2068
|
+
q.push sc unless c[sc]
|
2069
|
+
c[sc] = sc unless sc == term
|
2070
|
+
end
|
2071
|
+
end
|
2072
|
+
|
2073
|
+
# smush the result
|
2074
|
+
c.keys
|
2075
|
+
end
|
2076
|
+
|
2077
|
+
|
2078
|
+
|
2079
|
+
# duplicate instance methods as module methods
|
2080
|
+
extend self
|
2081
|
+
end
|