rdf-sak 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +16 -0
- data/.rspec +2 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/LICENSE +202 -0
- data/README.md +268 -0
- data/Rakefile +13 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/example/cleanup.xsl +14 -0
- data/example/matches.xhtml +11 -0
- data/example/transforms.ttl +58 -0
- data/lib/rdf-sak.rb +1 -0
- data/lib/rdf/sak.rb +2506 -0
- data/lib/rdf/sak/ci.rb +827 -0
- data/lib/rdf/sak/cli.rb +35 -0
- data/lib/rdf/sak/docstats.rb +188 -0
- data/lib/rdf/sak/document.rb +772 -0
- data/lib/rdf/sak/ibis.rb +248 -0
- data/lib/rdf/sak/mimemagic.rb +73 -0
- data/lib/rdf/sak/pav.rb +479 -0
- data/lib/rdf/sak/qb.rb +280 -0
- data/lib/rdf/sak/scovo.rb +51 -0
- data/lib/rdf/sak/tfo.rb +301 -0
- data/lib/rdf/sak/transform.rb +1172 -0
- data/lib/rdf/sak/urlrunner.rb +602 -0
- data/lib/rdf/sak/util.rb +2081 -0
- data/lib/rdf/sak/version.rb +5 -0
- data/rdf-sak.gemspec +60 -0
- metadata +366 -0
data/Rakefile
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
2
|
+
require "rspec/core/rake_task"
|
3
|
+
|
4
|
+
RSpec::Core::RakeTask.new(:spec)
|
5
|
+
|
6
|
+
task :default => :spec
|
7
|
+
|
8
|
+
#desc 'Generate Vocabularies'
|
9
|
+
#task :gen_vocabs => %w(ci).map { |v| "lib/rdf/sak/#{v}.rb" }
|
10
|
+
|
11
|
+
# XXX turn this into a rake task at some point :P
|
12
|
+
|
13
|
+
# rdf serialize --uri 'https://privatealpha.com/ontology/content-inventory/1#' --output-format vocabulary --module-name RDF::SAK --class-name CI -o lib/rdf/sak/ci.rb --strict 'https://privatealpha.com/ontology/content-inventory/1#'
|
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
require "rdf/sak"
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
|
13
|
+
require "irb"
|
14
|
+
IRB.start
|
data/bin/setup
ADDED
data/example/cleanup.xsl
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
<?xml version="1.0" encoding="utf-8"?>
|
2
|
+
<xsl:stylesheet version="1.0"
|
3
|
+
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
4
|
+
xmlns:html="http://www.w3.org/1999/xhtml"
|
5
|
+
xmlns="http://www.w3.org/1999/xhtml"
|
6
|
+
exclude-result-prefixes="html">
|
7
|
+
|
8
|
+
<xsl:key name="main" match="html:main" use="''"/>
|
9
|
+
|
10
|
+
<xsl:template match="/html:*">
|
11
|
+
<xsl:copy-of select="key('main', '')[1]"/>
|
12
|
+
</xsl:template>
|
13
|
+
|
14
|
+
</xsl:stylesheet>
|
@@ -0,0 +1,58 @@
|
|
1
|
+
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
|
2
|
+
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
|
3
|
+
@prefix owl: <http://www.w3.org/2002/07/owl#> .
|
4
|
+
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
|
5
|
+
@prefix dct: <http://purl.org/dc/terms/> .
|
6
|
+
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
|
7
|
+
@prefix ci: <https://privatealpha.com/ontology/content-inventory/1#> .
|
8
|
+
@prefix tfo: <https://privatealpha.com/ontology/transformation/1#> .
|
9
|
+
@prefix xf: <tag:makethingsmakesense.com,2020:transform/> .
|
10
|
+
|
11
|
+
xf:prefix a tfo:Parameter ;
|
12
|
+
skos:prefLabel "Prefix"@en ;
|
13
|
+
rdfs:comment "A compact prefix declaration of the form prefix:url"@en ;
|
14
|
+
dct:identifier "prefix"^^xsd:token ;
|
15
|
+
rdfs:range xsd:token .
|
16
|
+
|
17
|
+
xf:xpath a tfo:Parameter ;
|
18
|
+
skos:prefLabel "XPath"@en ;
|
19
|
+
rdfs:comment "An XPath expression"@en ;
|
20
|
+
dct:identifier "xpath"^^xsd:token ;
|
21
|
+
owl:cardinality 1 ;
|
22
|
+
rdfs:range xsd:string .
|
23
|
+
|
24
|
+
xf:reindent a tfo:Parameter ;
|
25
|
+
skos:prefLabel "Reindent"@en ;
|
26
|
+
rdfs:comment "Reindent the XML tree"@en ;
|
27
|
+
dct:identifier "reindent"^^xsd:token ;
|
28
|
+
tfo:default true ;
|
29
|
+
owl:cardinality 1 ;
|
30
|
+
rdfs:range xsd:boolean .
|
31
|
+
|
32
|
+
xf:subtree a tfo:Transform ;
|
33
|
+
skos:prefLabel "Subtree"@en ;
|
34
|
+
rdfs:comment "Isolate an X(HT)ML node using XPath."@en ;
|
35
|
+
tfo:implementation <urn:x-ruby:RDF::SAK::Transform::XPath> ;
|
36
|
+
tfo:accepts "application/xml"^^tfo:content-type ;
|
37
|
+
tfo:returns "application/xml"^^tfo:content-type ;
|
38
|
+
tfo:parameter xf:xpath, xf:prefix, xf:reindent ;
|
39
|
+
tfo:parameter-list ( xf:xpath xf:prefix xf:reindent ) .
|
40
|
+
|
41
|
+
xf:cleanup a tfo:Transform ;
|
42
|
+
skos:prefLabel "Cleanup"@en ;
|
43
|
+
rdfs:comment "Apply cleanup.xsl to the input."@en ;
|
44
|
+
tfo:implementation <file:example/cleanup.xsl> ;
|
45
|
+
tfo:accepts "application/xml"^^tfo:content-type ;
|
46
|
+
tfo:returns "application/xml"^^tfo:content-type .
|
47
|
+
|
48
|
+
<urn:uuid:78e6d8ce-a88a-4be0-8bfa-079136945816> a tfo:Partial ;
|
49
|
+
tfo:transform xf:subtree ;
|
50
|
+
xf:xpath "//html:main[1]"^^xsd:string ;
|
51
|
+
xf:prefix "html:http://www.w3.org/1999/xhtml"^^xsd:token .
|
52
|
+
|
53
|
+
<urn:uuid:4498eef5-1ca6-4034-937a-d50033dd6693> a tfo:Application ;
|
54
|
+
tfo:input <ni:///sha-256;0GHHmDtxh9CRZttXdr-cX78u72auS2P-O6tDXxvz2kU> ;
|
55
|
+
tfo:output <ni:///sha-256;_BbLbNSZl0TcQcjz-v3qF5fa5VL11rdha7c24K44pTc> ;
|
56
|
+
tfo:transform xf:subtree ;
|
57
|
+
xf:xpath "//html:main[1]"^^xsd:string ;
|
58
|
+
xf:prefix "html:http://www.w3.org/1999/xhtml"^^xsd:token .
|
data/lib/rdf-sak.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'rdf/sak'
|
data/lib/rdf/sak.rb
ADDED
@@ -0,0 +1,2506 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require 'rdf/sak/version'
|
3
|
+
|
4
|
+
# basic stuff
|
5
|
+
require 'stringio'
|
6
|
+
require 'pathname'
|
7
|
+
require 'tempfile'
|
8
|
+
|
9
|
+
# rdf stuff
|
10
|
+
require 'uri'
|
11
|
+
require 'uri/urn'
|
12
|
+
require 'rdf'
|
13
|
+
require 'rdf/reasoner'
|
14
|
+
require 'linkeddata'
|
15
|
+
|
16
|
+
# my stuff
|
17
|
+
require 'xml-mixup'
|
18
|
+
require 'md-noko'
|
19
|
+
require 'uuid-ncname'
|
20
|
+
require 'rdf/sak/mimemagic'
|
21
|
+
require 'rdf/sak/util'
|
22
|
+
|
23
|
+
# ontologies, mine in particular
|
24
|
+
require 'rdf/sak/ci'
|
25
|
+
require 'rdf/sak/ibis'
|
26
|
+
# others not included in rdf.rb
|
27
|
+
require 'rdf/sak/pav'
|
28
|
+
require 'rdf/sak/qb'
|
29
|
+
|
30
|
+
module RDF::SAK
|
31
|
+
|
32
|
+
class Context
|
33
|
+
include XML::Mixup
|
34
|
+
include Util
|
35
|
+
|
36
|
+
private
|
37
|
+
|
38
|
+
# RDF::Reasoner.apply(:rdfs, :owl)
|
39
|
+
|
40
|
+
G_OK = [RDF::Repository, RDF::Dataset, RDF::Graph].freeze
|
41
|
+
C_OK = [Pathname, IO, String].freeze
|
42
|
+
|
43
|
+
def coerce_to_path_or_io obj
|
44
|
+
return obj if obj.is_a? IO
|
45
|
+
return obj.expand_path if obj.is_a? Pathname
|
46
|
+
raise "#{obj.inspect} is not stringable" unless obj.respond_to? :to_s
|
47
|
+
Pathname(obj.to_s).expand_path
|
48
|
+
end
|
49
|
+
|
50
|
+
def coerce_graph graph = nil, type: nil
|
51
|
+
# begin with empty graph
|
52
|
+
out = RDF::Repository.new
|
53
|
+
|
54
|
+
return out unless graph
|
55
|
+
return graph if G_OK.any? { |c| graph.is_a? c }
|
56
|
+
|
57
|
+
# now turn into an array
|
58
|
+
graph = [graph] unless graph.is_a? Array
|
59
|
+
|
60
|
+
graph.each do |g|
|
61
|
+
raise 'Graph must be some kind of RDF::Graph or RDF data file' unless
|
62
|
+
C_OK.any? { |c| g.is_a? c } || g.respond_to?(:to_s)
|
63
|
+
|
64
|
+
opts = {}
|
65
|
+
opts[:content_type] = type if type
|
66
|
+
|
67
|
+
if g.is_a? Pathname
|
68
|
+
opts[:filename] = g.expand_path.to_s
|
69
|
+
g = g.open
|
70
|
+
elsif g.is_a? File
|
71
|
+
opts[:filename] = g.path
|
72
|
+
end
|
73
|
+
|
74
|
+
g = StringIO.new(g.to_s) unless g.is_a? IO
|
75
|
+
reader = RDF::Reader.for(opts) do
|
76
|
+
g.rewind
|
77
|
+
sample = g.read 1000
|
78
|
+
g.rewind
|
79
|
+
sample
|
80
|
+
end or raise "Could not find an RDF::Reader for #{opts[:content_type]}"
|
81
|
+
|
82
|
+
reader = reader.new g, **opts
|
83
|
+
reader.each_statement do |stmt|
|
84
|
+
out << stmt
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
out
|
89
|
+
end
|
90
|
+
|
91
|
+
def normalize_hash h
|
92
|
+
return h unless h.is_a? Hash
|
93
|
+
out = {}
|
94
|
+
h.each do |k, v|
|
95
|
+
out[k.to_s.to_sym] = v.is_a?(Hash) ? normalize_hash(v) :
|
96
|
+
v.respond_to?(:to_a) ? v.to_a.map { |x| normalize_hash x } : v
|
97
|
+
end
|
98
|
+
out
|
99
|
+
end
|
100
|
+
|
101
|
+
def coerce_config config
|
102
|
+
# config must either be a hash or a file name/pathname/io object
|
103
|
+
unless config.respond_to? :to_h
|
104
|
+
# when in rome
|
105
|
+
require 'yaml'
|
106
|
+
config = if config.is_a? IO
|
107
|
+
YAML.load config
|
108
|
+
else
|
109
|
+
YAML.load_file Pathname.new(config).expand_path
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
config = normalize_hash config
|
114
|
+
|
115
|
+
# config MUST have source and target dirs
|
116
|
+
raise 'Config must have :source, :target, and :private directories' unless
|
117
|
+
([:source, :target, :private] - config.keys).empty?
|
118
|
+
[:source, :target].each do |path|
|
119
|
+
dir = config[path] = Pathname.new(config[path]).expand_path
|
120
|
+
raise "#{dir} is not a readable directory" unless
|
121
|
+
dir.directory? && dir.readable?
|
122
|
+
end
|
123
|
+
raise "Target directory #{config[:target]} is not writable" unless
|
124
|
+
config[:target].writable?
|
125
|
+
raise "Source and target directories are the same: #{config[:source]}" if
|
126
|
+
config[:source] == config[:target]
|
127
|
+
|
128
|
+
# we try to create the private directory
|
129
|
+
config[:private] = config[:target] + config[:private]
|
130
|
+
if config[:private].exist?
|
131
|
+
raise "#{config[:private]} is not a readable/writable directory" unless
|
132
|
+
[:directory?, :readable?, :writable?].all? do |m|
|
133
|
+
config[:private].send m
|
134
|
+
end
|
135
|
+
else
|
136
|
+
config[:private].mkpath
|
137
|
+
end
|
138
|
+
|
139
|
+
# config MAY have graph location(s) but we can test this other
|
140
|
+
# ways, same goes for base URI
|
141
|
+
if config[:graph]
|
142
|
+
g = config[:graph]
|
143
|
+
g = [g] unless g.is_a? Array
|
144
|
+
config[:graph] = g.map { |x| Pathname.new(x).expand_path }
|
145
|
+
end
|
146
|
+
|
147
|
+
# deal with prefix map
|
148
|
+
if config[:prefixes]
|
149
|
+
config[:prefixes] = config[:prefixes].transform_values do |p|
|
150
|
+
# we have to wrap this in case it fails
|
151
|
+
begin
|
152
|
+
RDF::Vocabulary.find_term(p) || RDF::URI(p)
|
153
|
+
rescue
|
154
|
+
RDF::URI(p)
|
155
|
+
end
|
156
|
+
end
|
157
|
+
end
|
158
|
+
|
159
|
+
if dups = config[:duplicate]
|
160
|
+
pfx = config[:prefixes] || {}
|
161
|
+
base = URI(uri_pp config[:base])
|
162
|
+
if dups.is_a? Hash
|
163
|
+
config[:duplicate] = dups.map do |ruri, preds|
|
164
|
+
preds = [preds] unless preds.is_a? Array
|
165
|
+
preds.map! do |p|
|
166
|
+
resolve_curie p, prefixes: pfx, scalar: true, coerce: :rdf
|
167
|
+
end
|
168
|
+
[RDF::URI((base + ruri.to_s).to_s), Set.new(preds)]
|
169
|
+
end.to_h
|
170
|
+
end
|
171
|
+
end
|
172
|
+
|
173
|
+
# rewrite maps
|
174
|
+
config[:maps] = {} unless config[:maps].is_a? Hash
|
175
|
+
%w(rewrite redirect gone).each do |type|
|
176
|
+
config[:maps][type.to_sym] ||= ".#{type}.map"
|
177
|
+
end
|
178
|
+
|
179
|
+
config
|
180
|
+
end
|
181
|
+
|
182
|
+
def cmp_label a, b, labels: nil, supplant: true, reverse: false
|
183
|
+
labels ||= {}
|
184
|
+
|
185
|
+
# try supplied label or fall back
|
186
|
+
pair = [a, b].map do |x|
|
187
|
+
if labels[x]
|
188
|
+
labels[x][1]
|
189
|
+
elsif supplant and y = label_for(x)
|
190
|
+
labels[x] = y
|
191
|
+
y[1]
|
192
|
+
else
|
193
|
+
x
|
194
|
+
end
|
195
|
+
end
|
196
|
+
|
197
|
+
pair.reverse! if reverse
|
198
|
+
# warn "#{pair[0]} <=> #{pair[1]}"
|
199
|
+
pair[0].to_s <=> pair[1].to_s
|
200
|
+
end
|
201
|
+
|
202
|
+
def term_list terms
|
203
|
+
return [] if terms.nil?
|
204
|
+
terms = terms.respond_to?(:to_a) ? terms.to_a : [terms]
|
205
|
+
terms.uniq.map { |t| RDF::Vocabulary.find_term t }.compact
|
206
|
+
end
|
207
|
+
|
208
|
+
def coerce_resource arg
|
209
|
+
super arg, @base
|
210
|
+
end
|
211
|
+
|
212
|
+
def coerce_uuid_urn arg
|
213
|
+
super arg, @base
|
214
|
+
end
|
215
|
+
|
216
|
+
public
|
217
|
+
|
218
|
+
attr_reader :config, :graph, :base
|
219
|
+
|
220
|
+
# Initialize a context.
|
221
|
+
#
|
222
|
+
# @param graph
|
223
|
+
# @param base
|
224
|
+
# @param config
|
225
|
+
# @param type
|
226
|
+
#
|
227
|
+
# @return [RDF::SAK::Context] the new context object.
|
228
|
+
|
229
|
+
def initialize graph: nil, base: nil, config: nil, type: nil
|
230
|
+
# RDF::Reasoner.apply(:rdfs, :owl)
|
231
|
+
|
232
|
+
@config = coerce_config config
|
233
|
+
|
234
|
+
graph ||= @config[:graph] if @config[:graph]
|
235
|
+
base ||= @config[:base] if @config[:base]
|
236
|
+
|
237
|
+
@graph = coerce_graph graph, type: type
|
238
|
+
@base = RDF::URI.new base.to_s if base
|
239
|
+
@ucache = RDF::Util::Cache.new(-1)
|
240
|
+
@scache = {} # wtf rdf util cache doesn't like booleans
|
241
|
+
end
|
242
|
+
|
243
|
+
# Get the prefix mappings from the configuration.
|
244
|
+
#
|
245
|
+
# @return [Hash]
|
246
|
+
|
247
|
+
def prefixes
|
248
|
+
@config[:prefixes] || {}
|
249
|
+
end
|
250
|
+
|
251
|
+
# Abbreviate a set of terms against the registered namespace
|
252
|
+
# prefixes and optional default vocabulary, or otherwise return a
|
253
|
+
# string representation of the original URI.
|
254
|
+
|
255
|
+
# @param term [RDF::Term]
|
256
|
+
# @param prefixes [Hash]
|
257
|
+
#
|
258
|
+
# @return [String]
|
259
|
+
#
|
260
|
+
def abbreviate term, prefixes: @config[:prefixes],
|
261
|
+
vocab: nil, noop: true, sort: true
|
262
|
+
super term, prefixes: prefixes || {}, vocab: vocab, noop: noop, sort: sort
|
263
|
+
end
|
264
|
+
|
265
|
+
# Obtain a key-value structure for the given subject, optionally
|
266
|
+
# constraining the result by node type (:resource, :uri/:iri,
|
267
|
+
# :blank/:bnode, :literal)
|
268
|
+
#
|
269
|
+
# @param subject of the inquiry
|
270
|
+
# @param rev map in reverse
|
271
|
+
# @param only one or more node types
|
272
|
+
# @param uuids coerce resources to if possible
|
273
|
+
#
|
274
|
+
# @return [Hash]
|
275
|
+
#
|
276
|
+
def struct_for subject, rev: false, only: [], uuids: false, canon: false
|
277
|
+
Util.struct_for @graph, subject,
|
278
|
+
rev: rev, only: only, uuids: uuids, canon: canon
|
279
|
+
end
|
280
|
+
|
281
|
+
# Obtain everything in the graph that is an `rdf:type` of something.
|
282
|
+
#
|
283
|
+
# @return [Array]
|
284
|
+
#
|
285
|
+
def all_types
|
286
|
+
@graph.query([nil, RDF.type, nil]).objects.uniq
|
287
|
+
end
|
288
|
+
|
289
|
+
# Obtain every subject that is rdf:type the given type or its subtypes.
|
290
|
+
#
|
291
|
+
# @param rdftype [RDF::Term]
|
292
|
+
#
|
293
|
+
# @return [Array]
|
294
|
+
#
|
295
|
+
def all_of_type rdftype, exclude: []
|
296
|
+
exclude = term_list exclude
|
297
|
+
t = RDF::Vocabulary.find_term(rdftype) or raise "No type #{rdftype.to_s}"
|
298
|
+
out = []
|
299
|
+
(all_types & all_related(t) - exclude).each do |type|
|
300
|
+
out += @graph.query([nil, RDF.type, type]).subjects
|
301
|
+
end
|
302
|
+
|
303
|
+
out.uniq
|
304
|
+
end
|
305
|
+
|
306
|
+
# Obtain all and only the rdf:types directly asserted on the subject.
|
307
|
+
#
|
308
|
+
# @param subject [RDF::Resource]
|
309
|
+
# @param type [RDF::Term, :to_a]
|
310
|
+
#
|
311
|
+
# @return [Array]
|
312
|
+
#
|
313
|
+
def asserted_types subject, type = nil
|
314
|
+
Util.asserted_types @graph, subject, type
|
315
|
+
end
|
316
|
+
|
317
|
+
# Obtain the canonical UUID for the given URI
|
318
|
+
#
|
319
|
+
# @param uri [RDF::URI, URI, to_s] the subject of the inquiry
|
320
|
+
# @param unique [true, false] return a single resource/nil or an array
|
321
|
+
# @param published [true, false] whether to restrict to published docs
|
322
|
+
#
|
323
|
+
# @return [RDF::URI, Array]
|
324
|
+
#
|
325
|
+
def canonical_uuid uri, unique: true, published: false
|
326
|
+
Util.canonical_uuid @graph, uri, unique: unique,
|
327
|
+
published: published, scache: @scache, ucache: @ucache, base: @base
|
328
|
+
end
|
329
|
+
|
330
|
+
# Obtain the "best" dereferenceable URI for the subject.
|
331
|
+
# Optionally returns all candidates.
|
332
|
+
#
|
333
|
+
# @param subject [RDF::Resource]
|
334
|
+
# @param unique [true, false] flag for unique return value
|
335
|
+
# @param rdf [true, false] flag to specify RDF::URI vs URI
|
336
|
+
# @param slugs [true, false] flag to include slugs
|
337
|
+
# @param fragment [true, false] flag to include fragment URIs
|
338
|
+
#
|
339
|
+
# @return [RDF::URI, URI, Array]
|
340
|
+
#
|
341
|
+
def canonical_uri subject,
|
342
|
+
unique: true, rdf: true, slugs: false, fragment: false
|
343
|
+
Util.canonical_uri @graph, subject, base: @base,
|
344
|
+
unique: unique, rdf: rdf, slugs: slugs, fragment: fragment
|
345
|
+
end
|
346
|
+
|
347
|
+
# Returns subjects from the graph with entailment.
|
348
|
+
#
|
349
|
+
# @param predicate
|
350
|
+
# @param object
|
351
|
+
# @param entail
|
352
|
+
# @param only
|
353
|
+
#
|
354
|
+
# @return [RDF::Resource]
|
355
|
+
#
|
356
|
+
def subjects_for predicate, object, entail: true, only: []
|
357
|
+
Util.subjects_for @graph, predicate, object, entail: entail, only: only
|
358
|
+
end
|
359
|
+
|
360
|
+
# Returns objects from the graph with entailment.
|
361
|
+
#
|
362
|
+
# @param subject
|
363
|
+
# @param predicate
|
364
|
+
# @param entail
|
365
|
+
# @param only
|
366
|
+
# @param datatype
|
367
|
+
#
|
368
|
+
# @return [RDF::Term]
|
369
|
+
#
|
370
|
+
def objects_for subject, predicate, entail: true, only: [], datatype: nil
|
371
|
+
Util.objects_for @graph, subject, predicate,
|
372
|
+
entail: entail, only: only, datatype: datatype
|
373
|
+
end
|
374
|
+
|
375
|
+
# Find the terminal replacements for the given subject, if any exist.
|
376
|
+
#
|
377
|
+
# @param subject
|
378
|
+
# @param published indicate the context is published
|
379
|
+
#
|
380
|
+
# @return [Set]
|
381
|
+
#
|
382
|
+
def replacements_for subject, published: true
|
383
|
+
Util.replacements_for @graph, subject, published: published
|
384
|
+
end
|
385
|
+
|
386
|
+
# Obtain dates for the subject as instances of Date(Time). This is
|
387
|
+
# just shorthand for a common application of `objects_for`.
|
388
|
+
#
|
389
|
+
# @param subject
|
390
|
+
# @param predicate
|
391
|
+
# @param datatype
|
392
|
+
#
|
393
|
+
# @return [Array] of dates
|
394
|
+
def dates_for subject, predicate: RDF::Vocab::DC.date,
|
395
|
+
datatype: [RDF::XSD.date, RDF::XSD.dateTime]
|
396
|
+
Util.dates_for @graph, subject, predicate: predicate, datatype: datatype
|
397
|
+
end
|
398
|
+
|
399
|
+
# Obtain any specified MIME types for the subject. Just shorthand
|
400
|
+
# for a common application of `objects_for`.
|
401
|
+
#
|
402
|
+
# @param subject
|
403
|
+
# @param predicate
|
404
|
+
# @param datatype
|
405
|
+
#
|
406
|
+
# @return [Array] of internet media types
|
407
|
+
#
|
408
|
+
def formats_for subject, predicate: RDF::Vocab::DC.format,
|
409
|
+
datatype: [RDF::XSD.token]
|
410
|
+
Util.objects_for @graph, subject, predicate: predicate, datatype: datatype
|
411
|
+
end
|
412
|
+
|
413
|
+
# Assuming the subject is a thing that has authors, return the
|
414
|
+
# list of authors. Try bibo:authorList first for an explicit
|
415
|
+
# ordering, then continue to the various other predicates.
|
416
|
+
#
|
417
|
+
# @param subject [RDF::Resource]
|
418
|
+
# @param unique [false, true] only return the first author
|
419
|
+
# @param contrib [false, true] return contributors instead of authors
|
420
|
+
#
|
421
|
+
# @return [RDF::Value, Array]
|
422
|
+
#
|
423
|
+
def authors_for subject, unique: false, contrib: false
|
424
|
+
Util.authors_for @graph, subject, unique: unique, contrib: contrib
|
425
|
+
end
|
426
|
+
|
427
|
+
# Obtain the most appropriate label(s) for the subject's type(s).
|
428
|
+
# Returns one or more (depending on the `unique` flag)
|
429
|
+
# predicate-object pairs in order of preference.
|
430
|
+
#
|
431
|
+
# @param subject [RDF::Resource]
|
432
|
+
# @param unique [true, false] only return the first pair
|
433
|
+
# @param type [RDF::Term, Array] supply asserted types if already retrieved
|
434
|
+
# @param lang [nil] not currently implemented (will be conneg)
|
435
|
+
# @param desc [false, true] retrieve description instead of label
|
436
|
+
# @param alt [false, true] retrieve alternate instead of main
|
437
|
+
#
|
438
|
+
# @return [Array] either a predicate-object pair or an array of pairs.
|
439
|
+
#
|
440
|
+
def label_for subject, candidates: nil, unique: true, type: nil,
|
441
|
+
lang: nil, desc: false, alt: false
|
442
|
+
Util.label_for @graph, subject, candidates: candidates,
|
443
|
+
unique: unique, type: type, lang: lang, desc: desc, alt: alt
|
444
|
+
end
|
445
|
+
|
446
|
+
SKOS_HIER = [
|
447
|
+
{
|
448
|
+
element: :subject,
|
449
|
+
pattern: -> c, p { [nil, p, c] },
|
450
|
+
preds: [RDF::Vocab::SKOS.broader, RDF::Vocab::SKOS.broaderTransitive],
|
451
|
+
},
|
452
|
+
{
|
453
|
+
element: :object,
|
454
|
+
pattern: -> c, p { [c, p, nil] },
|
455
|
+
preds: [RDF::Vocab::SKOS.narrower, RDF::Vocab::SKOS.narrowerTransitive],
|
456
|
+
}
|
457
|
+
]
|
458
|
+
SKOS_HIER.each do |struct|
|
459
|
+
# lol how many times are we gonna cart this thing around
|
460
|
+
preds = struct[:preds]
|
461
|
+
i = 0
|
462
|
+
loop do
|
463
|
+
equiv = preds[i].entail(:equivalentProperty) - preds
|
464
|
+
preds.insert(i + 1, *equiv) unless equiv.empty?
|
465
|
+
i += equiv.length + 1;
|
466
|
+
break if i >= preds.length
|
467
|
+
end
|
468
|
+
end
|
469
|
+
|
470
|
+
def sub_concepts concept, extra: []
|
471
|
+
raise 'Concept must be exactly one concept' unless
|
472
|
+
concept.is_a? RDF::Resource
|
473
|
+
extra = term_list extra
|
474
|
+
|
475
|
+
# we need an array for a queue, and a set to accumulate the
|
476
|
+
# output as well as a separate 'seen' set
|
477
|
+
queue = [concept]
|
478
|
+
seen = Set.new queue.dup
|
479
|
+
out = seen.dup
|
480
|
+
|
481
|
+
# it turns out that the main SKOS hierarchy terms, while not
|
482
|
+
# being transitive themselves, are subproperties of transitive
|
483
|
+
# relations which means they are as good as being transitive.
|
484
|
+
|
485
|
+
while c = queue.shift
|
486
|
+
SKOS_HIER.each do |struct|
|
487
|
+
elem, pat, preds = struct.values_at(:element, :pattern, :preds)
|
488
|
+
preds.each do |p|
|
489
|
+
@graph.query(pat.call c, p).each do |stmt|
|
490
|
+
# obtain hierarchical element
|
491
|
+
hierc = stmt.send elem
|
492
|
+
|
493
|
+
# skip any further processing if we have seen this concept
|
494
|
+
next if seen.include? hierc
|
495
|
+
seen << hierc
|
496
|
+
|
497
|
+
next if !extra.empty? and !extra.any? do |t|
|
498
|
+
@graph.has_statement? RDF::Statement.new(hierc, RDF.type, t)
|
499
|
+
end
|
500
|
+
|
501
|
+
queue << hierc
|
502
|
+
out << hierc
|
503
|
+
end
|
504
|
+
end
|
505
|
+
end
|
506
|
+
end
|
507
|
+
|
508
|
+
out.to_a.sort
|
509
|
+
end
|
510
|
+
|
511
|
+
def audiences_for uuid, proximate: false, invert: false
|
512
|
+
p = invert ? CI['non-audience'] : RDF::Vocab::DC.audience
|
513
|
+
return @graph.query([uuid, p, nil]).objects if proximate
|
514
|
+
|
515
|
+
out = []
|
516
|
+
@graph.query([uuid, p, nil]).objects.each do |o|
|
517
|
+
out += sub_concepts o
|
518
|
+
end
|
519
|
+
|
520
|
+
out
|
521
|
+
end
|
522
|
+
|
523
|
+
# Get all "reachable" UUID-identified entities (subjects which are
|
524
|
+
# also objects)
|
525
|
+
def reachable published: false
|
526
|
+
p = published ? -> x { published?(x) } : -> x { true }
|
527
|
+
# now get the subjects which are also objects
|
528
|
+
@graph.subjects.select do |s|
|
529
|
+
s.uri? && s =~ /^urn:uuid:/ && @graph.has_object?(s) && p.call(s)
|
530
|
+
end
|
531
|
+
end
|
532
|
+
|
533
|
+
# holy cow this is actually a lot of stuff:
|
534
|
+
|
535
|
+
# turn markdown into xhtml (via md-noko)
|
536
|
+
|
537
|
+
# turn html into xhtml (trivial)
|
538
|
+
|
539
|
+
# generate triples from ordinary (x)html structure
|
540
|
+
|
541
|
+
# map vanilla (x)html metadata to existing graph (ie to get resource URIs)
|
542
|
+
|
543
|
+
# pull triples from rdfa
|
544
|
+
|
545
|
+
# stuff rdfa into rdfa-less xhtml
|
546
|
+
|
547
|
+
# basic nlp detection of terms + text-level markup (dfn, abbr...)
|
548
|
+
|
549
|
+
# markdown round-tripping (may as well store source in md if possible)
|
550
|
+
|
551
|
+
# add title attribute to all links
|
552
|
+
|
553
|
+
# add alt attribute to all images
|
554
|
+
|
555
|
+
# segmentation of composite documents into multiple files
|
556
|
+
|
557
|
+
# aggregation of simple documents into composites
|
558
|
+
|
559
|
+
# generate backlinks
|
560
|
+
|
561
|
+
# - resource (ie file) generation -
|
562
|
+
|
563
|
+
# generate indexes of people, groups, and organizations
|
564
|
+
|
565
|
+
# generate indexes of books, not-books, and other external links
|
566
|
+
|
567
|
+
def head_links subject, struct: nil, nodes: nil, prefixes: {},
|
568
|
+
ignore: [], uris: {}, labels: {}, vocab: nil
|
569
|
+
|
570
|
+
raise 'ignore must be Array or Set' unless
|
571
|
+
[Array, Set].any? { |c| ignore.is_a? c }
|
572
|
+
|
573
|
+
struct ||= struct_for subject
|
574
|
+
nodes ||= invert_struct struct
|
575
|
+
|
576
|
+
# make sure these are actually URI objects not RDF::URI
|
577
|
+
uris = uris.transform_values { |v| URI(uri_pp v.to_s) }
|
578
|
+
uri = uris[subject] || canonical_uri(subject, rdf: false)
|
579
|
+
|
580
|
+
ignore = ignore.to_set
|
581
|
+
|
582
|
+
# output
|
583
|
+
links = []
|
584
|
+
|
585
|
+
nodes.reject { |n, _| ignore.include?(n) || !n.uri? }.each do |k, v|
|
586
|
+
# first nuke rdf:type, that's never in there
|
587
|
+
v = v.dup.delete RDF::RDFV.type
|
588
|
+
next if v.empty?
|
589
|
+
|
590
|
+
unless uris[k]
|
591
|
+
cu = canonical_uri k
|
592
|
+
uris[k] = cu || uri_pp(k.to_s)
|
593
|
+
end
|
594
|
+
|
595
|
+
# munge the url and make the tag
|
596
|
+
rel = abbreviate v.to_a, vocab: vocab
|
597
|
+
ru = uri.route_to(uris[k])
|
598
|
+
ln = { nil => :link, rel: rel, href: ru.to_s }
|
599
|
+
|
600
|
+
# add the title
|
601
|
+
if lab = labels[k]
|
602
|
+
ln[:title] = lab[1].to_s
|
603
|
+
end
|
604
|
+
|
605
|
+
# add type attribute
|
606
|
+
unless (mts = formats_for k).empty?
|
607
|
+
ln[:type] = mts.first.to_s
|
608
|
+
|
609
|
+
if ln[:type] =~ /(java|ecma)script/i ||
|
610
|
+
!(v.to_set & Set[RDF::Vocab::DC.requires]).empty?
|
611
|
+
ln[:src] = ln.delete :href
|
612
|
+
# make sure we pass in an empty string so there is a closing tag
|
613
|
+
ln.delete nil
|
614
|
+
ln[['']] = :script
|
615
|
+
end
|
616
|
+
end
|
617
|
+
|
618
|
+
# finally add the link
|
619
|
+
links.push ln
|
620
|
+
end
|
621
|
+
|
622
|
+
links.sort! do |a, b|
|
623
|
+
# sort by rel, then by href
|
624
|
+
# warn a.inspect, b.inspect
|
625
|
+
s = 0
|
626
|
+
[nil, :rel, :rev, :href, :title].each do |k|
|
627
|
+
s = a.fetch(k, '').to_s <=> b.fetch(k, '').to_s
|
628
|
+
break if s != 0
|
629
|
+
end
|
630
|
+
s
|
631
|
+
end
|
632
|
+
|
633
|
+
links
|
634
|
+
end
|
635
|
+
|
636
|
+
def head_meta subject, struct: nil, nodes: nil, prefixes: {},
|
637
|
+
ignore: [], meta_names: {}, vocab: nil, lang: nil, xhtml: true
|
638
|
+
|
639
|
+
raise 'ignore must be Array or Set' unless
|
640
|
+
[Array, Set].any? { |c| ignore.is_a? c }
|
641
|
+
|
642
|
+
struct ||= struct_for subject
|
643
|
+
nodes ||= invert_struct struct
|
644
|
+
|
645
|
+
ignore = ignore.to_set
|
646
|
+
|
647
|
+
meta = []
|
648
|
+
nodes.select { |n| n.literal? && !ignore.include?(n) }.each do |k, v|
|
649
|
+
rel = abbreviate v.to_a, vocab: vocab
|
650
|
+
tag = { nil => :meta, property: rel, content: k.to_s }
|
651
|
+
|
652
|
+
lang = (k.language? && k.language != lang ? k.language : nil) ||
|
653
|
+
(k.datatype == RDF::XSD.string && lang ? '' : nil)
|
654
|
+
if lang
|
655
|
+
tag['xml:lang'] = lang if xhtml
|
656
|
+
tag[:lang] = lang
|
657
|
+
end
|
658
|
+
|
659
|
+
tag[:datatype] = abbreviate k.datatype, vocab: XHV if k.datatype?
|
660
|
+
tag[:name] = meta_names[k] if meta_names[k]
|
661
|
+
|
662
|
+
meta << tag
|
663
|
+
end
|
664
|
+
|
665
|
+
meta.sort! do |a, b|
|
666
|
+
s = 0
|
667
|
+
[:about, :property, :datatype, :content, :name].each do |k|
|
668
|
+
# warn a.inspect, b.inspect
|
669
|
+
s = a.fetch(k, '').to_s <=> b.fetch(k, '').to_s
|
670
|
+
break if s != 0
|
671
|
+
end
|
672
|
+
s
|
673
|
+
end
|
674
|
+
|
675
|
+
meta
|
676
|
+
end
|
677
|
+
|
678
|
+
def generate_backlinks subject, published: true, ignore: nil
|
679
|
+
uri = canonical_uri(subject, rdf: false) || URI(uri_pp subject)
|
680
|
+
ignore ||= Set.new
|
681
|
+
raise 'ignore must be amenable to a set' unless ignore.respond_to? :to_set
|
682
|
+
ignore = ignore.to_set
|
683
|
+
nodes = {}
|
684
|
+
labels = {}
|
685
|
+
types = {}
|
686
|
+
@graph.query([nil, nil, subject]).each do |stmt|
|
687
|
+
next if ignore.include?(sj = stmt.subject)
|
688
|
+
preds = nodes[sj] ||= Set.new
|
689
|
+
preds << (pr = stmt.predicate)
|
690
|
+
types[sj] ||= asserted_types sj
|
691
|
+
labels[sj] ||= label_for sj
|
692
|
+
labels[pr] ||= label_for pr
|
693
|
+
end
|
694
|
+
|
695
|
+
# prune out
|
696
|
+
nodes.select! { |k, _| published? k } if published
|
697
|
+
|
698
|
+
return if nodes.empty?
|
699
|
+
|
700
|
+
li = nodes.sort do |a, b|
|
701
|
+
cmp_label a[0], b[0], labels: labels
|
702
|
+
end.map do |rsrc, preds|
|
703
|
+
cu = canonical_uri(rsrc, rdf: false) or next
|
704
|
+
lab = labels[rsrc] || [nil, rsrc]
|
705
|
+
lp = abbreviate(lab[0]) if lab[0]
|
706
|
+
ty = abbreviate(types[rsrc]) if types[rsrc]
|
707
|
+
|
708
|
+
{ [{ [{ [lab[1].to_s] => :span, property: lp }] => :a,
|
709
|
+
href: uri.route_to(cu), typeof: ty, rev: abbreviate(preds) }] => :li }
|
710
|
+
end.compact
|
711
|
+
|
712
|
+
{ [{ li => :ul }] => :nav }
|
713
|
+
end
|
714
|
+
|
715
|
+
def generate_twitter_meta subject
|
716
|
+
# get author
|
717
|
+
author = authors_for(subject, unique: true) or return
|
718
|
+
|
719
|
+
# get author's twitter account
|
720
|
+
twitter = objects_for(author, RDF::Vocab::FOAF.account,
|
721
|
+
only: :resource).select { |t| t.to_s =~ /twitter\.com/
|
722
|
+
}.sort.first or return
|
723
|
+
twitter = URI(twitter.to_s).path.split(/\/+/)[1]
|
724
|
+
twitter = ?@ + twitter unless twitter.start_with? ?@
|
725
|
+
|
726
|
+
# get title
|
727
|
+
title = label_for(subject) or return
|
728
|
+
|
729
|
+
out = [
|
730
|
+
{ nil => :meta, name: 'twitter:card', content: :summary },
|
731
|
+
{ nil => :meta, name: 'twitter:site', content: twitter },
|
732
|
+
{ nil => :meta, name: 'twitter:title', content: title[1].to_s }
|
733
|
+
]
|
734
|
+
|
735
|
+
# get abstract
|
736
|
+
if desc = label_for(subject, desc: true)
|
737
|
+
out.push({ nil => :meta, name: 'twitter:description',
|
738
|
+
content: desc[1].to_s })
|
739
|
+
end
|
740
|
+
|
741
|
+
# get image (foaf:depiction)
|
742
|
+
img = objects_for(subject, RDF::Vocab::FOAF.depiction, only: :resource)
|
743
|
+
unless img.empty?
|
744
|
+
img = img[0].to_s
|
745
|
+
out.push({ nil => :meta, name: 'twitter:image', content: img })
|
746
|
+
out[0][:content] = :summary_large_image
|
747
|
+
end
|
748
|
+
|
749
|
+
# return the appropriate xml-mixup structure
|
750
|
+
out
|
751
|
+
end
|
752
|
+
|
753
|
+
AUTHOR_SPEC = [
|
754
|
+
['By:', [RDF::Vocab::BIBO.authorList, RDF::Vocab::DC.creator]],
|
755
|
+
['With:', [RDF::Vocab::BIBO.contributorList, RDF::Vocab::DC.contributor]],
|
756
|
+
['Edited by:', [RDF::Vocab::BIBO.editorList, RDF::Vocab::BIBO.editor]],
|
757
|
+
['Translated by:', [RDF::Vocab::BIBO.translator]],
|
758
|
+
].freeze
|
759
|
+
|
760
|
+
# Generate an (X)HTML bibliography document for the given subject:
# each dct:hasPart of the subject that is referenced (dct:references,
# traversed via rdfs:seeAlso / owl:sameAs / ci:canonical aliases) by at
# least one other resource becomes a <section> with its title,
# contributor lists (per AUTHOR_SPEC) and a "Referenced in:" list.
#
# @param id [RDF::Resource] the bibliography subject (coerced to UUID)
# @param published [true, false] restrict referents to published docs
#
# @return [Nokogiri::XML::Document] the rendered XHTML document
def generate_bibliography id, published: true
  id = canonical_uuid id
  uri = canonical_uri id
  struct = struct_for id
  # `nodes` accumulates every term we touch, for prefix computation
  nodes = Set[id] + smush_struct(struct)
  bodynodes = Set.new
  parts = {}
  referents = {}
  labels = { id => label_for(id, candidates: struct) }
  canon = {}

  # uggh put these somewhere
  preds = {
    hp: predicate_set(RDF::Vocab::DC.hasPart),
    sa: predicate_set(RDF::RDFS.seeAlso),
    canon: predicate_set([RDF::OWL.sameAs, CI.canonical]),
    ref: predicate_set(RDF::Vocab::DC.references),
    al: predicate_set(RDF::Vocab::BIBO.contributorList),
    cont: predicate_set(RDF::Vocab::DC.contributor),
  }

  # collect up all the parts (as in dct:hasPart)
  objects_for(id, preds[:hp], entail: false, only: :resource).each do |part|
    bodynodes << part

    # gather up all the possible alias urls this thing can have
    sa = ([part] + objects_for(part,
      preds[:sa], only: :uri, entail: false)).map do |x|
      [x] + subjects_for(preds[:canon], x, only: :uri, entail: false)
    end.flatten.uniq

    # collect all the referents: subject => set of referring predicates
    reftmp = {}
    sa.each do |u|
      subjects_for preds[:ref], u, only: :uri, entail: false do |s, *p|
        reftmp[s] ||= Set.new
        reftmp[s] += p[0].to_set
      end
    end

    # if we are producing a list of references identified by only
    # published resources, prune out all the unpublished referents
    reftmp.select! { |x, _| published? x } if published

    # unconditionally skip this item if nothing references it
    next if reftmp.empty?

    referents[part] = reftmp

    # memoize labels and canonical URIs of the referring resources
    reftmp.each do |r, _|
      labels[r] ||= label_for r
      canon[r] ||= canonical_uri r
    end

    # collect all the authors and author lists

    objects_for(part, preds[:al], only: :resource, entail: false) do |o|
      RDF::List.new(subject: o, graph: @graph).each do |a|
        labels[a] ||= label_for a
      end
    end

    objects_for(part, preds[:cont], only: :uri, entail: false) do |a|
      labels[a] ||= label_for a
    end

    ps = struct_for part
    labels[part] = label_for part, candidates: ps
    nodes |= smush_struct ps

    parts[part] = ps
  end

  # collation over the *subject's* struct, used for the section rel=
  bmap = prepare_collation struct
  pf = -> x { abbreviate bmap[x.literal? ? :literals : :resources][x] }

  body = []
  parts.sort { |a, b| cmp_label a[0], b[0], labels: labels }.each do |k, v|
    # per-part collation closure, abbreviating whichever predicates
    # attach the given (literal or resource) value
    mapping = prepare_collation v
    p = -> x {
      abbreviate mapping[x.literal? ? :literals : :resources][x] }
    t = abbreviate mapping[:types]

    lp = label_for k, candidates: v
    h2c = [lp[1].to_s]
    h2 = { h2c => :h2 }
    cu = canonical_uri k
    rel = nil
    # non-http(s) canonical URI: fall back to rdfs:seeAlso, or no link
    unless cu.scheme.downcase.start_with? 'http'
      if sa = v[RDF::RDFS.seeAlso]
        rel = p.call sa[0]
        cu = canonical_uri sa[0]
      else
        cu = nil
      end
    end

    # linkable: wrap the heading text in an <a>; otherwise put the
    # RDFa property on the <h2> itself
    if cu
      h2c[0] = { [lp[1].to_s] => :a, rel: rel,
        property: p.call(lp[1]), href: cu.to_s }
    else
      h2[:property] = p.call(lp[1])
    end

    # authors &c
    # authors contributors editors translators
    al = []
    AUTHOR_SPEC.each do |label, pl|
      dd = []
      seen = Set.new
      pl.each do |pred|
        # first check if the struct has the predicate
        next unless v[pred]
        li = []
        ul = { li => :ul, rel: abbreviate(pred) }
        v[pred].sort { |a, b| cmp_label a, b, labels: labels }.each do |o|
          # check if this is a list
          tl = RDF::List.new subject: o, graph: @graph
          if tl.empty? and !seen.include? o
            seen << o
            lab = labels[o] ? { [labels[o][1]] => :span,
              property: abbreviate(labels[o][0]) } : o
            li << { [lab] => :li, resource: o }
          else
            # XXX this will actually not be right if there are
            # multiple lists but FINE FOR NOW
            ul[:inlist] ||= ''
            tl.each do |a|
              seen << a
              lab = labels[a] ? { [labels[a][1]] => :span,
                property: abbreviate(labels[a][0]) } : a
              li << { [lab] => :li, resource: a }
            end
          end
        end
        dd << ul unless li.empty?
      end
      al += [{ [label] => :dt }, { dd => :dd }] unless dd.empty?
    end

    # ref list
    rl = referents[k].sort do |a, b|
      cmp_label a[0], b[0], labels: labels
    end.map do |ref, pset|
      lab = labels[ref] ? { [labels[ref][1]] => :span,
        property: abbreviate(labels[ref][0]) } : ref

      { [{ [lab] => :a, rev: abbreviate(pset), href: canon[ref] }] => :li }
    end

    contents = [h2, {
      al + [{ ['Referenced in:'] => :dt },
        { [{ rl => :ul }] => :dd }] => :dl }]

    body << { contents => :section,
      rel: pf.call(k), resource: k.to_s, typeof: t }
  end

  # prepend abstract to body if it exists
  abs = label_for id, candidates: struct, desc: true
  if abs
    tag = { '#p' => abs[1], property: abbreviate(abs[0]) }
    body.unshift tag
  end

  # add labels to nodes
  nodes += smush_struct labels

  # get prefixes
  pfx = prefix_subset prefixes, nodes

  # get title tag
  title = title_tag labels[id][0], labels[id][1],
    prefixes: prefixes, lang: 'en'

  # get links
  link = head_links id,
    struct: struct, ignore: bodynodes, labels: labels, vocab: XHV

  # get metas
  mn = {}
  mn[abs[1]] = :description if abs
  mi = Set.new
  mi << labels[id][1] if labels[id]
  meta = head_meta id,
    struct: struct, lang: 'en', ignore: mi, meta_names: mn, vocab: XHV

  meta += generate_twitter_meta(id) || []

  xhtml_stub(base: uri, prefix: pfx, lang: 'en', title: title, vocab: XHV,
    link: link, meta: meta, transform: @config[:transform],
    body: { body => :body, about: '',
      typeof: abbreviate(struct[RDF::RDFV.type] || []) }).document
end
|
954
|
+
|
955
|
+
# generate skos concept schemes
|
956
|
+
|
957
|
+
# All classes related to skos:Concept (per Util.all_related), used by
# generate_audience_csv to decide whether a resource counts as a concept.
CONCEPTS = Util.all_related(RDF::Vocab::SKOS.Concept).to_set
|
958
|
+
|
959
|
+
# Generate a CSV of all internal documents with their audience,
# non-audience and concept associations, one row per document plus a
# header row; rows are sorted by the Created column.
#
# @param file [Pathname, IO, nil] optional destination; opened and
#  written to when given (IO objects are used as-is)
# @param published [true, false] restrict to published documents
#
# @return [Array<Array<String>>] the rows, header first
def generate_audience_csv file = nil, published: true
  require 'csv'
  file = coerce_to_path_or_io file if file
  # label memo shared across all rows (keyed by resource)
  lab = {}

  out = all_internal_docs(published: published,
    exclude: RDF::Vocab::FOAF.Image).map do |s|
    u = canonical_uri s
    x = struct_for s
    c = x[RDF::Vocab::DC.created] ? x[RDF::Vocab::DC.created][0] : nil
    _, t = label_for s, candidates: x
    _, d = label_for s, candidates: x, desc: true

    # # audience(s)
    # a = objects_for(s, RDF::Vocab::DC.audience).map do |au|
    #   next lab[au] if lab[au]
    #   _, al = label_for au
    #   lab[au] = al
    # end.map(&:to_s).sort.join '; '

    # # explicit non-audience(s)
    # n = objects_for(s, RDF::SAK::CI['non-audience']).map do |au|
    #   next lab[au] if lab[au]
    #   _, al = label_for au
    #   lab[au] = al
    # end.map(&:to_s).sort.join '; '

    # audience and non-audience, as '; '-joined sorted label strings
    a, n = [RDF::Vocab::DC.audience, CI['non-audience']].map do |ap|
      objects_for(s, ap).map do |au|
        next lab[au] if lab[au]
        _, al = label_for au
        lab[au] = al
      end.map(&:to_s).sort.join '; '
    end

    # concepts: one joined column per predicate, keeping only objects
    # typed as (a subclass of) skos:Concept
    concepts = [RDF::Vocab::DC.subject, CI.introduces,
      CI.assumes, CI.mentions].map do |pred|
      objects_for(s, pred, only: :resource).map do |o|
        con = self.objects_for(o, RDF.type).to_set & CONCEPTS
        next if con.empty?
        next lab[o] if lab[o]
        _, ol = label_for o
        lab[o] = ol
      end.compact.map(&:to_s).sort.join '; '
    end

    [s, u, c, t, d, a, n].map(&:to_s) + concepts
  end.sort { |a, b| a[2] <=> b[2] }
  # ^ sort on index 2, the Created column (string comparison of
  # xsd:dateTime lexical forms, which sorts chronologically)

  out.unshift ['ID', 'URL', 'Created', 'Title', 'Description', 'Audience',
    'Non-Audience', 'Subject', 'Introduces', 'Assumes', 'Mentions']

  if file
    # don't open until now
    file = file.expand_path.open('wb') unless file.is_a? IO

    csv = CSV.new file
    out.each { |x| csv << x }
    file.flush
  end

  out
end
|
1024
|
+
|
1025
|
+
# Mapping from ingest_csv column symbols to the RDF predicates asserted
# into the graph for each parsed value. Frozen for consistency with
# AUTHOR_SPEC and to prevent accidental mutation of a shared constant.
CSV_PRED = {
  audience: RDF::Vocab::DC.audience,
  nonaudience: CI['non-audience'],
  subject: RDF::Vocab::DC.subject,
  assumes: CI.assumes,
  introduces: CI.introduces,
  mentions: CI.mentions,
}.freeze
|
1033
|
+
|
1034
|
+
# Ingest a CSV (as produced by generate_audience_csv) and assert its
# audience/concept columns back into the graph. Cell values may be
# URIs/CURIE-ish tokens or labels; labels are resolved case-insensitively
# against all ci:Audience and skos:Concept resources in the graph.
#
# @param file [Pathname, IO, :to_s] the CSV to read
#
# @return [Hash] map of subject => parsed row hash for each ingested row
def ingest_csv file
  file = coerce_to_path_or_io file

  require 'csv'

  # key mapper: normalize alternative header spellings
  km = { uuid: :id, url: :uri }
  kt = -> (k) { km[k] || k }

  # grab all the concepts and audiences

  # audiences: downcased label => Set of audience resources; all labels
  # of one resource deliberately share a single Set object
  audiences = {}
  all_of_type(CI.Audience).map do |c|
    s = struct_for c

    # homogenize the labels
    lab = [false, true].map do |b|
      label_for(c, candidates: s, unique: false, alt: b).map { |x| x[1] }
    end.flatten.map { |x| x.to_s.strip.downcase }

    # we want all the keys to share the same set
    set = nil
    lab.each { |t| set = audiences[t] ||= set || Set.new }
    set << c
  end

  # concepts: same shape as `audiences`, for skos:Concept resources
  concepts = {}
  all_of_type(RDF::Vocab::SKOS.Concept).map do |c|
    s = struct_for c

    # homogenize the labels
    lab = [false, true].map do |b|
      label_for(c, candidates: s, unique: false, alt: b).map { |x| x[1] }
    end.flatten.map { |x| x.to_s.strip.downcase }

    # we want all the keys to share the same set
    set = nil
    lab.each { |t| set = concepts[t] ||= set || Set.new }
    set << c
  end

  data = CSV.read(file, headers: true,
    header_converters: :symbol).map do |o|
    o = o.to_h.transform_keys(&kt)
    # skip rows whose :id does not resolve to a known UUID
    s = canonical_uuid(o.delete :id) or next

    # LOLOL wtf

    # handle audience: split on ;/, then resolve each token either as
    # a URI (scheme:rest) or as a known audience label
    [:audience, :nonaudience].each do |a|
      if o[a]
        o[a] = o[a].strip.split(/\s*[;,]+\s*/, -1).map do |t|
          if t =~ /^[a-z+-]+:[^[:space:]]+$/
            u = RDF::URI(t)
            canonical_uuid(u) || u
          elsif audiences[t.downcase]
            audiences[t.downcase].to_a
          end
        end.flatten.compact.uniq
      else
        o[a] = []
      end
    end

    # handle concepts: same resolution strategy against `concepts`
    [:subject, :introduces, :assumes, :mentions].each do |a|
      if o[a]
        o[a] = o[a].strip.split(/\s*[;,]+\s*/, -1).map do |t|
          if t =~ /^[a-z+-]+:[^[:space:]]+$/
            u = RDF::URI(t)
            canonical_uuid(u) || u
          elsif concepts[t.downcase]
            concepts[t.downcase].to_a
          end
        end.flatten.compact.uniq
      else
        o[a] = []
      end

    end

    # assert every resolved object into the graph under its predicate
    CSV_PRED.each do |sym, pred|
      o[sym].each do |obj|
        @graph << [s, pred, obj]
      end
    end

    [s, o]
  end.compact.to_h
  data
end
|
1125
|
+
|
1126
|
+
# Generate a sitemap.org urlset document covering the Atom feeds
# (dcat:Distribution resources, with lastmod pulled from the generated
# feed's atom:updated) and all internal documents on the site host.
#
# @param published [true, false] restrict to published documents
#
# @return [Nokogiri::XML::Document] the sitemap XML
def generate_sitemap published: true
  urls = {}

  # do feeds separately
  feeds = all_of_type RDF::Vocab::DCAT.Distribution
  #feeds.select! { |f| published? f } if published
  feeds.each do |f|
    uri = canonical_uri(f)
    # generate the feed just to harvest its atom:updated timestamp
    f = generate_atom_feed f, published: published, related: feeds
    mt = f.at_xpath('/atom:feed/atom:updated[1]/text()',
      { atom: 'http://www.w3.org/2005/Atom' })
    urls[uri] = { [{ [uri.to_s] => :loc }, { [mt] => :lastmod }] => :url }
  end

  # build up hash of urls
  all_internal_docs(published: published).each do |doc|
    next if asserted_types(doc).include? RDF::Vocab::FOAF.Image
    uri = canonical_uri(doc)
    # only include URLs on our own authority
    next unless uri.authority && @base && uri.authority == base.authority
    mods = objects_for(doc, [RDF::Vocab::DC.created,
      RDF::Vocab::DC.modified, RDF::Vocab::DC.issued],
      datatype: RDF::XSD.dateTime).sort
    nodes = [{ [uri.to_s] => :loc }]
    # lastmod is the latest of created/modified/issued, when present
    nodes << { [mods[-1].to_s] => :lastmod } unless mods.empty?
    urls[uri] = { nodes => :url }
  end

  # sort by URI for a stable output order
  urls = urls.sort.map { |_, v| v }

  markup(spec: { urls => :urlset,
    xmlns: 'http://www.sitemaps.org/schemas/sitemap/0.9' }).document
end
|
1158
|
+
|
1159
|
+
# Generate the sitemap and write it under the configured target (or
# private) directory, at @config[:sitemap] or .well-known/sitemap.xml.
#
# @param published [true, false] restrict to published documents
#
# @return [nil]
def write_sitemap published: true
  sitemap = generate_sitemap published: published
  file = @config[:sitemap] || '.well-known/sitemap.xml'
  target = @config[published ? :target : :private]
  target.mkpath unless target.directory?

  # block form guarantees the handle is closed even if write_to raises
  # (the original leaked the handle on exception)
  (target + file).open(?w) { |fh| sitemap.write_to fh }
  nil
end
|
1169
|
+
|
1170
|
+
# generate atom feed
|
1171
|
+
|
1172
|
+
#
|
1173
|
+
# Collect the UUID URNs of all internal foaf:Document resources,
# optionally restricted to those whose bibo:status is "published".
#
# @param published [true, false] keep only published documents
# @param exclude [Array] types to exclude from the type query
#
# @return [Array] the matching document subjects
def all_internal_docs published: true, exclude: []
  # internal documents are the foaf:Document instances addressed by a
  # urn:uuid: identifier
  found = all_of_type(RDF::Vocab::FOAF.Document,
    exclude: exclude).select { |x| x =~ /^urn:uuid:/ }

  return found unless published

  # prune out everything lacking an explicit published status
  pred = RDF::Vocab::BIBO.status
  obj  = RDF::Vocabulary.find_term(
    'http://purl.org/ontology/bibo/status/published')
  found.select { |s| @graph.has_statement? RDF::Statement(s, pred, obj) }
end
|
1190
|
+
|
1191
|
+
# Generate an Atom feed document for the given feed resource. Selects
# internal documents by audience overlap with the feed's audience /
# non-audience sets, skips documents flagged ci:indexed false, sorts
# entries by published/updated/title, and links any related feeds.
#
# @param id [RDF::Resource] the feed subject
# @param published [true, false] restrict to published documents
# @param related [Array<RDF::Resource>] sibling feeds to link rel=related
#
# @return [Nokogiri::XML::Document] the Atom XML document
#
# @raise [RuntimeError] when id is not a resource or related not an array
def generate_atom_feed id, published: true, related: []
  raise 'ID must be a resource' unless id.is_a? RDF::Resource

  # prepare relateds; a feed never relates to itself
  raise 'related must be an array' unless related.is_a? Array
  related -= [id]

  # feed = struct_for id

  # the feed's effective audience is audience minus non-audience
  faudy = audiences_for id
  faudn = audiences_for id, invert: true
  faudy -= faudn

  docs = all_internal_docs published: published

  # now we create a hash keyed by uuid containing the metadata
  authors = {}
  titles = {}
  dates = {}
  entries = {}
  latest = nil
  docs.each do |uu|
    # basically make a jsonld-like structure
    #rsrc = struct_for uu

    # an explicit ci:indexed false disqualifies the document
    indexed = objects_for uu, RDF::SAK::CI.indexed, only: :literal
    next if !indexed.empty? and indexed.any? { |f| f == false }

    # get id (got it already duh)

    # get audiences
    audy = audiences_for uu, proximate: true
    audn = audiences_for uu, proximate: true, invert: true

    #warn "#{faudy.to_s} & #{faud"

    skip = false
    if audy.empty?
      # an unspecified audience implies "everybody", but if the
      # feed's audience *is* specified, then it's not for everybody
      skip = true unless faudy.empty?
    else
      # if document audience matches feed non-audience, disqualify
      skip = true unless (faudn & audy).empty?

      # absence of an explicit feed audience implies "everybody"
      if faudy.empty?
        # if document audience minus feed non-audience has
        # members, re-qualify
        skip = false unless (audy - faudn).empty?
      else
        # if document audience matches feed audience, re-qualify
        skip = false unless (faudy & audy).empty?
      end
    end

    # if document non-audience matches feed audience, re-disqualify
    skip = true if !(audn.empty? || faudy.empty?) && !(faudy & audn).empty?

    next if skip

    canon = URI.parse(canonical_uri(uu).to_s)

    xml = { '#entry' => [
      { '#link' => nil, rel: :alternate, href: canon, type: 'text/html' },
      { '#id' => uu.to_s }
    ] }

    # get published date first
    # NOTE(review): this shadows the `published:` keyword argument for
    # the rest of the loop body — intentional here, but fragile
    published = (objects_for uu,
      [RDF::Vocab::DC.issued, RDF::Vocab::DC.created],
      datatype: RDF::XSD.dateTime)[0]

    # get latest updated date
    updated = (objects_for uu, RDF::Vocab::DC.modified,
      datatype: RDF::XSD.dateTime).sort[-1]
    updated ||= published || RDF::Literal::DateTime.new(DateTime.now)
    updated = Time.parse(updated.to_s).utc
    latest = updated if !latest or latest < updated

    xml['#entry'].push({ '#updated' => updated.iso8601 })

    if published
      published = Time.parse(published.to_s).utc
      xml['#entry'].push({ '#published' => published.iso8601 })
      dates[uu] = [published, updated]
    else
      dates[uu] = [updated, updated]
    end

    # get author(s); memoized in `authors` across all entries
    al = []
    authors_for(uu).each do |a|
      unless authors[a]
        n = label_for a
        x = authors[a] = { '#author' => [{ '#name' => n[1].to_s }] }

        # prefer foaf:homepage for the author URI, falling back to the
        # canonical URI of the author resource
        hp = @graph.first_object [a, RDF::Vocab::FOAF.homepage, nil]
        hp ||= canonical_uri a

        x['#author'].push({ '#uri' => hp.to_s }) if hp
      end

      al.push authors[a]
    end

    xml['#entry'] += al unless al.empty?

    # get title (note unshift)
    if (t = label_for uu)
      titles[uu] = t[1].to_s
      xml['#entry'].unshift({ '#title' => t[1].to_s })
    else
      titles[uu] = uu.to_s
    end

    # get abstract
    if (d = label_for uu, desc: true)
      xml['#entry'].push({ '#summary' => d[1].to_s })
    end

    entries[uu] = xml
  end

  # note we overwrite the entries hash here with a sorted array
  entrycmp = -> a, b {
    # first we sort by published date
    p = dates[a][0] <=> dates[b][0]
    # if the published dates are the same, sort by updated date
    u = dates[a][1] <=> dates[b][1]
    # to break any ties, finally sort by title
    p == 0 ? u == 0 ? titles[a] <=> titles[b] : u : p }
  entries = entries.values_at(
    *entries.keys.sort { |a, b| entrycmp.call(a, b) })
  # ugggh god forgot the asterisk and lost an hour

  # now we punt out the doc

  preamble = [
    { '#id' => id.to_s },
    { '#updated' => latest.iso8601 },
    { '#generator' => 'RDF::SAK', version: RDF::SAK::VERSION,
      uri: "https://github.com/doriantaylor/rb-rdf-sak" },
    { nil => :link, rel: :self, type: 'application/atom+xml',
      href: canonical_uri(id) },
    { nil => :link, rel: :alternate, type: 'text/html',
      href: @base },
  ] + related.map do |r|
    { nil => :link, rel: :related, type: 'application/atom+xml',
      href: canonical_uri(r) }
  end

  if (t = label_for id)
    preamble.unshift({ '#title' => t[1].to_s })
  end

  if (r = @graph.first_literal [id, RDF::Vocab::DC.rights, nil])
    rh = { '#rights' => r.to_s, type: :text }
    rh['xml:lang'] = r.language if r.has_language?
    preamble.push rh
  end

  markup(spec: { '#feed' => preamble + entries,
    xmlns: 'http://www.w3.org/2005/Atom' }).document
end
|
1356
|
+
|
1357
|
+
# Generate and write one Atom feed file per resource of the given type,
# named <uuid>.xml under the target (or private) directory.
#
# @param type [RDF::Resource] the feed class to enumerate
# @param published [true, false] restrict entries to published docs
#
# @return [Array] the enumerated feed resources
def write_feeds type: RDF::Vocab::DCAT.Distribution, published: true
  feeds = all_of_type type
  target = @config[published ? :target : :private]
  feeds.each do |feed|
    tu = URI(feed.to_s)
    doc = generate_atom_feed feed, published: published, related: feeds
    # block form guarantees the handle is closed even if write_to
    # raises (the original leaked the handle on exception)
    (target + "#{tu.uuid}.xml").open('w') { |fh| doc.write_to fh }
  end
end
|
1368
|
+
|
1369
|
+
# generate sass palettes
|
1370
|
+
|
1371
|
+
# generate rewrite map(s)
|
1372
|
+
# Build the rewrite map: canonical request path (sans leading slash)
# => document UUID, for every internal document whose canonical URI is
# on our host. Suitable for an Apache RewriteMap-style lookup file.
#
# @param published [true, false] passed through to #reachable
# @param docs [Array, nil] document set; defaults to reachable docs
#
# @return [Hash{String=>String}] path => uuid
def generate_rewrite_map published: false, docs: nil
  docs ||= reachable published: published
  base = URI(@base.to_s)
  rwm = {}
  docs.each do |doc|
    tu = URI(doc.to_s)
    cu = canonical_uri doc, rdf: false
    # only urn:uuid: subjects with an http(s)-ish canonical URI qualify
    next unless tu.respond_to?(:uuid) and cu.respond_to?(:request_uri)

    # skip external links obvs
    next unless base.route_to(cu).relative?

    # skip /uuid form
    cp = cu.request_uri.delete_prefix '/'
    next if cu.host == base.host and tu.uuid == cp

    rwm[cp] = tu.uuid
  end

  rwm
end
|
1393
|
+
|
1394
|
+
# give me all UUIDs of all documents, filter for published if
|
1395
|
+
# applicable
|
1396
|
+
#
|
1397
|
+
# find the "best" (relative) URL for the UUID and map the pair
|
1398
|
+
# together
|
1399
|
+
# Build the UUID redirect map: document UUID => absolute canonical URL,
# so requests addressed by /uuid can be redirected to the "best" slug.
#
# @param published [true, false] passed through to #reachable
# @param docs [Array, nil] document set; defaults to reachable docs
#
# @return [Hash{String=>String}] uuid => absolute URL
def generate_uuid_redirect_map published: false, docs: nil
  docs ||= reachable published: published

  base = URI(@base.to_s)

  # keys are /uuid, values are the canonical URL to redirect to
  out = {}
  docs.each do |doc|
    tu = URI(doc.to_s)
    cu = canonical_uri doc, rdf: false
    # only urn:uuid: subjects with an http(s)-ish canonical URI qualify
    next unless tu.respond_to?(:uuid) and cu.respond_to?(:request_uri)

    # skip /uuid form (already canonical; nothing to redirect)
    cp = cu.request_uri.delete_prefix '/'
    next if cu.host == base.host && tu.uuid == cp

    # all redirect links are absolute
    out[tu.uuid] = cu.to_s
  end
  out
end
|
1420
|
+
|
1421
|
+
# find all URIs/slugs that are *not* canonical, map them to slugs
|
1422
|
+
# that *are* canonical
|
1423
|
+
# Build the slug redirect map: every non-canonical local slug of a
# document => its canonical URL. The first URI returned by
# canonical_uri(..., unique: false) is treated as canonical; all other
# local, non-UUID request paths become redirect keys.
#
# @param published [true, false] passed through to #reachable and
#  canonical_uuid
# @param docs [Array, nil] document set; defaults to reachable docs
#
# @return [Hash{String=>String}] non-canonical path => canonical URL
def generate_slug_redirect_map published: false, docs: nil
  docs ||= reachable published: published
  base = URI(@base.to_s)

  # for redirects we collect all the docs, plus all their URIs,
  # separate canonical from the rest

  # actually an easy way to do this is just harvest all the
  # multi-addressed docs, remove the first one, then ask for the
  # canonical uuid back,

  fwd = {}
  rev = {}
  out = {}

  docs.each do |doc|
    uris = canonical_uri doc, unique: false, rdf: false
    canon = uris.shift
    next unless canon.respond_to? :request_uri

    # cache the forward direction
    fwd[doc] = canon

    unless uris.empty?
      uris.each do |uri|
        next unless uri.respond_to? :request_uri
        next if canon == uri
        next unless base.route_to(uri).relative?

        # warn "#{canon} <=> #{uri}"

        requri = uri.request_uri.delete_prefix '/'
        # skip empty paths and bare-UUID paths (handled elsewhere);
        # the regex matches the full 8-4-4-4-12 UUID shape
        next if requri == '' ||
          requri =~ /^[0-9a-f]{8}(?:-[0-9a-f]{4}){4}[0-9a-f]{8}$/

        # cache the reverse direction
        rev[uri] = requri
      end
    end
  end

  # only emit a redirect when the alias resolves back to a document
  # whose canonical URI differs from the alias itself
  rev.each do |uri, requri|
    if (doc = canonical_uuid(uri, published: published)) and
        fwd[doc] and fwd[doc] != uri
      out[requri] = fwd[doc].to_s
    end
  end

  out
end
|
1473
|
+
|
1474
|
+
# you know what, it's entirely possible that these ought never be
|
1475
|
+
# called individually and the work to get one would duplicate the
|
1476
|
+
# work of getting the other, so maybe just do 'em both at once
|
1477
|
+
|
1478
|
+
# Combine the UUID and slug redirect maps into a single hash; slug
# entries win on key collision (merge semantics, same as before).
#
# @param published [true, false] forwarded to both generators
# @param docs [Array, nil] forwarded to both generators
#
# @return [Hash{String=>String}] request path/uuid => canonical URL
def generate_redirect_map published: false, docs: nil
  uuids = generate_uuid_redirect_map published: published, docs: docs
  slugs = generate_slug_redirect_map published: published, docs: docs
  uuids.merge slugs
end
|
1482
|
+
|
1483
|
+
# Build the "gone" (HTTP 410) map: local request paths of documents
# whose bibo:status is ci:retired. Keys and values are identical since
# only the key set matters downstream.
#
# @param published [true, false] no-op; retired docs are by definition
#  not published
# @param docs [Array, nil] document set; defaults to reachable docs
#
# @return [Hash{String=>String}] path => path
def generate_gone_map published: false, docs: nil
  # published is a no-op for this one because these docs are by
  # definition not published
  docs ||= reachable published: false
  p = RDF::Vocab::BIBO.status
  base = URI(@base.to_s)
  out = {}
  docs.select { |s|
    @graph.has_statement? RDF::Statement(s, p, CI.retired) }.each do |doc|
    canon = canonical_uri doc, rdf: false
    # local resources only
    next unless base.route_to(canon).relative?
    canon = canon.request_uri.delete_prefix '/'
    # value of the gone map doesn't matter
    out[canon] = canon
  end

  out
end
|
1501
|
+
|
1502
|
+
# private?
|
1503
|
+
|
1504
|
+
# Resolve the on-disk location for one of the generated maps
# (:rewrite, :redirect, :gone) from the configuration.
#
# @param type [Symbol] which map
#
# @return [Pathname, nil] target path, or nil when not configured
def map_location type
  # look up the file name configured for this map type
  filename = @config[:maps][type]
  return unless filename

  # resolve it against the target directory
  @config[:target] + filename
end
|
1511
|
+
|
1512
|
+
# private?
|
1513
|
+
|
1514
|
+
# Write a map hash to disk as tab-separated key/value lines, sorted by
# key (the format consumed by e.g. Apache RewriteMap).
#
# @param location [Pathname, String, nil] destination path; a nil
#  location (unconfigured map) is a silent no-op rather than a crash
# @param data [Hash] key => value pairs to write
#
# @return [nil] when location is nil; otherwise the sorted pairs
def write_map_file location, data
  # map_location returns nil when the map is not configured; the
  # original crashed on File.new(nil)
  return unless location
  # block form guarantees the handle is closed even if a write raises
  File.open(location, 'w') do |fh|
    data.sort.each { |k, v| fh.write "#{k}\t#{v}\n" }
  end
end
|
1520
|
+
|
1521
|
+
# public again
|
1522
|
+
|
1523
|
+
# Generate the rewrite map and write it to its configured location.
#
# @param published [true, false] forwarded to the generator
# @param docs [Array, nil] forwarded to the generator
def write_rewrite_map published: false, docs: nil
  write_map_file map_location(:rewrite),
    generate_rewrite_map(published: published, docs: docs)
end
|
1528
|
+
|
1529
|
+
# Generate the redirect map and write it to its configured location.
#
# @param published [true, false] forwarded to the generator
# @param docs [Array, nil] forwarded to the generator
def write_redirect_map published: false, docs: nil
  write_map_file map_location(:redirect),
    generate_redirect_map(published: published, docs: docs)
end
|
1534
|
+
|
1535
|
+
# Generate the gone (410) map and write it to its configured location.
#
# @param published [true, false] forwarded to the generator
# @param docs [Array, nil] forwarded to the generator
def write_gone_map published: false, docs: nil
  write_map_file map_location(:gone),
    generate_gone_map(published: published, docs: docs)
end
|
1540
|
+
|
1541
|
+
# Write all three server maps from a single document enumeration.
#
# @param published [true, false] currently unused by the writers below
# @param docs [Array, nil] document set; defaults to all reachable docs
#
# @return [true]
def write_maps published: true, docs: nil
  docs ||= reachable published: false
  write_rewrite_map docs: docs   # slug to uuid (internal)
  write_redirect_map docs: docs  # uuid/slug to canonical slug (308)
  write_gone_map docs: docs      # retired slugs/uuids (410)
  true
end
|
1551
|
+
|
1552
|
+
# whoops lol we forgot the book list
|
1553
|
+
|
1554
|
+
# Enumerate all sioct:ReadingList resources, optionally restricted to
# published ones.
#
# @param published [true, false] filter to published lists
#
# @return [Array] the reading-list subjects
def reading_lists published: true
  lists = all_of_type RDF::Vocab::SiocTypes.ReadingList
  published ? lists.select { |r| published? r } : lists
end
|
1559
|
+
|
1560
|
+
# Generate an XHTML reading list for the given subject.
#
# NOTE(review): this is an unimplemented stub — the body is only the
# plan below and the method currently returns nil, which will break
# write_reading_lists when it tries to write the result.
def generate_reading_list subject, published: true
  # struct = struct_for subject

  # find all the books, sort them by title

  # for each book, give title, authors, inbound references

  # punt out xhtml
end
|
1569
|
+
|
1570
|
+
# Generate and write one reading-list document per sioct:ReadingList,
# named <uuid>.xml under the target (or private) directory.
#
# @param published [true, false] restrict to published reading lists
#
# @return [Array] the enumerated reading-list resources
def write_reading_lists published: true
  # the original referenced a bare `target` local that was never
  # assigned (NameError at runtime); resolve it from the config the
  # same way write_feeds does
  target = @config[published ? :target : :private]
  reading_lists(published: published).each do |rl|
    tu = URI(rl.to_s)
    doc = generate_reading_list rl, published: published
    # block form guarantees the handle is closed even if write_to raises
    (target + "#{tu.uuid}.xml").open('w') { |fh| doc.write_to fh }
  end
end
|
1579
|
+
|
1580
|
+
# Ordered ci: property keys for the per-document statistics columns
# emitted by generate_stats.
DSD_SEQ = %i[characters words blocks sections
  min low-quartile median high-quartile max mean sd].freeze
# Table header cells (XML::Mixup specs) in the same column order as the
# stats rows. Frozen for consistency with DSD_SEQ — the original left
# this shared constant mutable.
TH_SEQ = %w[Document Abstract Created Modified Characters Words Blocks
  Sections Min Q1 Median Q3 Max Mean SD].map { |t| { [t] => :th } }.freeze
|
1584
|
+
|
1585
|
+
# Generate one XHTML statistics table per qb:DataSet: each observation
# row links its ci:document, shows created/modified timestamps and the
# DSD_SEQ measurements, sorted newest-first.
#
# @param published [true, false] restrict rows to published documents
#
# @return [Hash] dataset subject => Nokogiri::XML::Document
def generate_stats published: true
  out = {}
  all_of_type(QB.DataSet).map do |s|
    base = canonical_uri s, rdf: false
    types = abbreviate asserted_types(s)
    title = if t = label_for(s)
      [t[1].to_s, abbreviate(t[0])]
    end
    # cache: observation => { doc:, stime:, struct:, title:, abstract:,
    # ctime:, mtime: } — stime is the sort key, defaulting to epoch
    cache = {}
    subjects_for(QB.dataSet, s, only: :resource).each do |o|
      if d = objects_for(o, CI.document, only: :resource).first
        if !published or published?(d)
          # include a "sort" time that defaults to epoch zero
          c = cache[o] ||= {
            doc: d, stime: Time.at(0).getgm, struct: struct_for(o) }

          if t = label_for(d)
            c[:title] = t
          end
          if a = label_for(d, desc: true)
            c[:abstract] = a
          end
          if ct = objects_for(d,
            RDF::Vocab::DC.created, datatype: RDF::XSD.dateTime).first
            c[:stime] = c[:ctime] = ct.object.to_time.getgm
          end
          if mt = objects_for(d,
            RDF::Vocab::DC.modified, datatype:RDF::XSD.dateTime)
            c[:mtime] = mt.map { |m| m.object.to_time.getgm }.sort
            # latest modification wins as the sort time
            c[:stime] = c[:mtime].last unless mt.empty?
          end
        end
      end
    end

    # sort lambda closure: newest stime first, then newest ctime, then
    # title as the final tiebreak
    sl = -> a, b do
      x = cache[b][:stime] <=> cache[a][:stime]
      return x unless x == 0
      x = cache[b][:ctime] <=> cache[a][:ctime]
      return x unless x == 0
      ta = cache[a][:title] || Array.new(2, cache[a][:uri])
      tb = cache[b][:title] || Array.new(2, cache[b][:uri])
      ta[1].to_s <=> tb[1].to_s
    end

    rows = []
    cache.keys.sort(&sl).each do |k|
      c = cache[k]
      href = base.route_to canonical_uri(c[:doc], rdf: false)
      dt = abbreviate asserted_types(c[:doc])
      uu = URI(k.to_s).uuid
      # NCName form of the UUID doubles as the row's fragment id
      nc = UUID::NCName.to_ncname uu, version: 1
      tp, tt = c[:title] || []
      ab = if c[:abstract]
        { [c[:abstract][1].to_s] => :th, about: href,
          property: abbreviate(c[:abstract].first) }
      else
        { [] => :th }
      end

      td = [{ { { [tt.to_s] => :span, property: abbreviate(tp) } => :a,
          rel: 'ci:document', href: href } => :th },
        ab,
        { [c[:ctime].iso8601] => :th, property: 'dct:created',
          datatype: 'xsd:dateTime', about: href, typeof: dt },
        { c[:mtime].reverse.map { |m| { [m.iso8601] => :span,
          property: 'dct:modified', datatype: 'xsd:dateTime' } } => :th,
          about: href
        },
      ] + DSD_SEQ.map do |f|
        # one <td> per measurement; empty when the observation lacks it
        h = []
        x = { h => :td }
        p = CI[f]
        if y = c[:struct][p] and !y.empty?
          h << y = y.first
          x[:property] = abbreviate p
          x[:datatype] = abbreviate y.datatype if y.datatype?
        end
        x
      end
      rows << { td => :tr, id: nc, about: "##{nc}",
        typeof: 'qb:Observation' }
    end

    out[s] = xhtml_stub(base: base, title: title,
      transform: config[:transform], attr: { about: '', typeof: types },
      prefix: prefixes, content: {
        [{ [{ [{ ['About'] => :th, colspan: 4 },
          { ['Counts'] => :th, colspan: 4 },
          { ['Words per Block'] => :th, colspan: 7 }] => :tr },
          { TH_SEQ => :tr } ] => :thead },
        { rows => :tbody, rev: 'qb:dataSet' }] => :table }).document
  end

  out
end
|
1682
|
+
|
1683
|
+
def write_stats published: true
|
1684
|
+
target = @config[published ? :target : :private]
|
1685
|
+
target.mkpath unless target.directory?
|
1686
|
+
generate_stats(published: published).each do |uu, doc|
|
1687
|
+
bn = URI(uu.to_s).uuid + '.xml'
|
1688
|
+
fh = (target + bn).open ?w
|
1689
|
+
doc.write_to fh
|
1690
|
+
fh.flush
|
1691
|
+
fh.close
|
1692
|
+
end
|
1693
|
+
end
|
1694
|
+
|
1695
|
+
# - io stuff -
|
1696
|
+
|
1697
|
+
# Locate the file in the source directory associated with the given URI.
|
1698
|
+
#
|
1699
|
+
# @param [RDF::URI, URI, :to_s] the URI requested
|
1700
|
+
#
|
1701
|
+
# @return [Pathname] of the corresponding file or nil if no file was found.
|
1702
|
+
|
1703
|
+
def locate uri
|
1704
|
+
uri = coerce_resource uri
|
1705
|
+
|
1706
|
+
base = URI(@base.to_s)
|
1707
|
+
|
1708
|
+
tu = URI(uri) # copy of uri for testing content
|
1709
|
+
unless tu.scheme == 'urn' and tu.nid == 'uuid'
|
1710
|
+
raise "could not find UUID for #{uri}" unless uuid = canonical_uuid(uri)
|
1711
|
+
tu = URI(uri = uuid)
|
1712
|
+
end
|
1713
|
+
|
1714
|
+
# xxx bail if the uri isn't a subject in the graph
|
1715
|
+
|
1716
|
+
candidates = [@config[:source] + tu.uuid]
|
1717
|
+
|
1718
|
+
# try all canonical URIs
|
1719
|
+
(canonical_uri uri, unique: false, slugs: true).each do |u|
|
1720
|
+
u = URI(u.to_s)
|
1721
|
+
next unless u.hostname == base.hostname
|
1722
|
+
p = URI.unescape u.path[/^\/*(.*?)$/, 1]
|
1723
|
+
candidates.push(@config[:source] + p)
|
1724
|
+
end
|
1725
|
+
|
1726
|
+
# warn candidates
|
1727
|
+
|
1728
|
+
files = candidates.uniq.map do |c|
|
1729
|
+
Pathname.glob(c.to_s + '{,.*,/index{,.*}}')
|
1730
|
+
end.reduce(:+).reject do |x|
|
1731
|
+
x.directory? or RDF::SAK::MimeMagic.by_path(x).to_s !~
|
1732
|
+
/.*(?:markdown|(?:x?ht|x)ml).*/i
|
1733
|
+
end.uniq
|
1734
|
+
|
1735
|
+
#warn files
|
1736
|
+
|
1737
|
+
# XXX implement negotiation algorithm
|
1738
|
+
return files[0]
|
1739
|
+
|
1740
|
+
# return the filename from the source
|
1741
|
+
# nil
|
1742
|
+
end
|
1743
|
+
|
1744
|
+
# Visit (open) the document at the given URI.
|
1745
|
+
#
|
1746
|
+
# @param uri [RDF::URI, URI, :to_s]
|
1747
|
+
#
|
1748
|
+
# @return [RDF::SAK::Context::Document] or nil
|
1749
|
+
|
1750
|
+
def visit uri
|
1751
|
+
uri = canonical_uuid uri
|
1752
|
+
path = locate uri
|
1753
|
+
return unless path
|
1754
|
+
Document.new self, uri, uri: canonical_uri(uri), doc: path
|
1755
|
+
end
|
1756
|
+
|
1757
|
+
# resolve documents from source
|
1758
|
+
def resolve_documents
|
1759
|
+
src = @config[:source]
|
1760
|
+
out = []
|
1761
|
+
src.find do |f|
|
1762
|
+
Find.prune if f.basename.to_s[0] == ?.
|
1763
|
+
next if f.directory?
|
1764
|
+
out << f
|
1765
|
+
end
|
1766
|
+
|
1767
|
+
out
|
1768
|
+
end
|
1769
|
+
|
1770
|
+
    # Resolve a source file back to the UUID of the resource it
    # embodies: reconstruct the file's URI under the site base, then
    # iteratively strip file extensions (collapsing a trailing "index"
    # into its directory) until the graph yields a UUID.
    #
    # @param path [Pathname] a file under the configured source root
    #
    # @return [RDF::URI, nil] the canonical UUID, or nil if none found
    def resolve_file path
      return unless path.file?
      # re-root the path at '/' relative to the source directory
      path = Pathname('/') + path.relative_path_from(@config[:source])
      base = URI(@base.to_s)
      uri = base + path.to_s

      #warn "trying #{uri}"

      until (out = canonical_uuid uri)
        # iteratively strip off
        break if uri.path.end_with? '/'

        dn = path.dirname
        bn = path.basename '.*'

        # try index first
        if bn.to_s == 'index'
          # an index file maps to its directory URI (with trailing slash)
          p = dn.to_s
          p << '/' unless p.end_with? '/'
          uri = base + p
        elsif bn == path.basename
          # no extension left to strip; give up (out remains nil)
          break
        else
          # drop one extension and retry
          path = dn + bn
          uri = base + path.to_s
        end

        # warn "trying #{uri}"
      end

      out
    end
|
1802
|
+
|
1803
|
+
# Determine whether the URI represents a published document.
|
1804
|
+
#
|
1805
|
+
# @param uri
|
1806
|
+
#
|
1807
|
+
# @return [true, false]
|
1808
|
+
def published? uri, circulated: false
|
1809
|
+
RDF::SAK::Util.published? @graph, uri,
|
1810
|
+
circulated: circulated, base: @base
|
1811
|
+
end
|
1812
|
+
|
1813
|
+
# Find a destination pathname for the document
|
1814
|
+
#
|
1815
|
+
# @param uri
|
1816
|
+
# @param published
|
1817
|
+
#
|
1818
|
+
# @return [Pathname]
|
1819
|
+
def target_for uri, published: false
|
1820
|
+
uri = coerce_resource uri
|
1821
|
+
uri = canonical_uuid uri
|
1822
|
+
target = @config[published?(uri) && published ? :target : :private]
|
1823
|
+
|
1824
|
+
# target is a pathname so this makes a pathname
|
1825
|
+
target + "#{URI(uri.to_s).uuid}.xml"
|
1826
|
+
end
|
1827
|
+
|
1828
|
+
# read from source
|
1829
|
+
|
1830
|
+
# write (manipulated (x|x?ht)ml) back to source
|
1831
|
+
|
1832
|
+
    # write public and private variants to target

    # Write public and private (X)HTML variants to the target.
    #
    # NOTE(review): unimplemented stub — returns nil and has no effect.
    def write_xhtml published: true
    end
|
1836
|
+
|
1837
|
+
# write modified rdf
|
1838
|
+
|
1839
|
+
# - internet stuff -
|
1840
|
+
|
1841
|
+
# verify external links for upness
|
1842
|
+
|
1843
|
+
# collect triples for external links
|
1844
|
+
|
1845
|
+
# fetch references for people/companies/concepts/etc from dbpedia/wikidata
|
1846
|
+
|
1847
|
+
# - document context class -
|
1848
|
+
|
1849
|
+
class Document
|
1850
|
+
include XML::Mixup
|
1851
|
+
include Util
|
1852
|
+
|
1853
|
+
private
|
1854
|
+
|
1855
|
+
C_OK = [Nokogiri::XML::Node, IO, Pathname].freeze
|
1856
|
+
|
1857
|
+
public
|
1858
|
+
|
1859
|
+
attr_reader :doc, :uuid, :uri
|
1860
|
+
|
1861
|
+
def initialize context, uuid, doc: nil, uri: nil, mtime: nil
|
1862
|
+
raise 'context must be a RDF::SAK::Context' unless
|
1863
|
+
context.is_a? RDF::SAK::Context
|
1864
|
+
raise 'uuid must be an RDF::URI' unless
|
1865
|
+
uuid.is_a? RDF::URI and uuid.to_s.start_with? 'urn:uuid:'
|
1866
|
+
|
1867
|
+
doc ||= context.locate uuid
|
1868
|
+
raise 'doc must be Pathname, IO, or Nokogiri node' unless
|
1869
|
+
C_OK.any? { |c| doc.is_a? c } || doc.respond_to?(:to_s)
|
1870
|
+
|
1871
|
+
# set some instance variables
|
1872
|
+
@context = context
|
1873
|
+
@uuid = uuid
|
1874
|
+
@mtime = mtime || doc.respond_to?(:mtime) ? doc.mtime : Time.now
|
1875
|
+
@target = context.target_for uuid
|
1876
|
+
|
1877
|
+
# now process the document
|
1878
|
+
|
1879
|
+
# turn the document into an XML::Document
|
1880
|
+
if doc.is_a? Nokogiri::XML::Node
|
1881
|
+
# a node that is not a document should be wrapped with one
|
1882
|
+
unless doc.is_a? Nokogiri::XML::Document
|
1883
|
+
d = doc.dup 1
|
1884
|
+
doc = Nokogiri::XML::Document.new
|
1885
|
+
doc << d
|
1886
|
+
end
|
1887
|
+
else
|
1888
|
+
type = nil
|
1889
|
+
|
1890
|
+
# pathnames turned into IO objects
|
1891
|
+
if doc.is_a? Pathname
|
1892
|
+
type = RDF::SAK::MimeMagic.by_path doc
|
1893
|
+
doc = doc.open # this may raise if the file isn't there
|
1894
|
+
end
|
1895
|
+
|
1896
|
+
# squash everything else to a string
|
1897
|
+
doc = doc.to_s unless doc.is_a? IO
|
1898
|
+
|
1899
|
+
# check type by content
|
1900
|
+
type ||= RDF::SAK::MimeMagic.by_magic(doc)
|
1901
|
+
|
1902
|
+
# can you believe there is a special bookmarks mime type good grief
|
1903
|
+
type = 'text/html' if type == 'application/x-mozilla-bookmarks'
|
1904
|
+
|
1905
|
+
# now we try to parse the blob
|
1906
|
+
if type.to_s =~ /xml/i
|
1907
|
+
doc = Nokogiri.XML doc
|
1908
|
+
elsif type == 'text/html'
|
1909
|
+
# if the detected type is html, try it as strict xml first
|
1910
|
+
attempt = nil
|
1911
|
+
begin
|
1912
|
+
attempt = Nokogiri.XML doc, nil, nil, (1 << 11) # NONET
|
1913
|
+
rescue Nokogiri::XML::SyntaxError
|
1914
|
+
# do not wrap this a second time; let it fail if it's gonna
|
1915
|
+
tmp = Nokogiri.HTML doc
|
1916
|
+
attempt = Nokogiri::XML::Document.new
|
1917
|
+
attempt << tmp.root.dup(1)
|
1918
|
+
end
|
1919
|
+
doc = attempt
|
1920
|
+
elsif type.to_s =~ /^text\/(?:plain|(?:x-)?markdown)/i
|
1921
|
+
# just assume plain text is markdown
|
1922
|
+
doc = ::MD::Noko.new.ingest doc
|
1923
|
+
else
|
1924
|
+
raise "Don't know what to do with #{uuid} (#{type})"
|
1925
|
+
end
|
1926
|
+
end
|
1927
|
+
|
1928
|
+
# now fix the namespaces for mangled html documents
|
1929
|
+
root = doc.root
|
1930
|
+
if root.name == 'html'
|
1931
|
+
unless root.namespace
|
1932
|
+
# clear this off or it will be duplicated in the output
|
1933
|
+
root.remove_attribute('xmlns')
|
1934
|
+
# now generate a new ns object
|
1935
|
+
ns = root.add_namespace(nil, XHTMLNS)
|
1936
|
+
# *now* scan the document and add the namespace declaration
|
1937
|
+
root.traverse do |node|
|
1938
|
+
if node.element? && node.namespace.nil?
|
1939
|
+
# downcasing the name may be cargo culting; need to check
|
1940
|
+
# node.name = node.name.downcase # yup it is
|
1941
|
+
node.namespace = ns
|
1942
|
+
end
|
1943
|
+
end
|
1944
|
+
end
|
1945
|
+
|
1946
|
+
# also add the magic blank doctype declaration if it's missing
|
1947
|
+
unless doc.internal_subset
|
1948
|
+
doc.create_internal_subset('html', nil, nil)
|
1949
|
+
end
|
1950
|
+
end
|
1951
|
+
|
1952
|
+
# aaand set some more instance variables
|
1953
|
+
|
1954
|
+
@uri = URI(uri || @context.canonical_uri(uuid))
|
1955
|
+
|
1956
|
+
# voilà
|
1957
|
+
@doc = doc
|
1958
|
+
end
|
1959
|
+
|
1960
|
+
      # Proxy for the context's published? check on this document's UUID.
      #
      # @return [true, false]
      def published?
        @context.published? @uuid
      end
|
1964
|
+
|
1965
|
+
      # Determine the base URI in effect for a node: an absolute
      # <html:base href> for (X)HTML documents, otherwise the nearest
      # absolute @xml:base; falls back to this document's own URI.
      #
      # @param node [Nokogiri::XML::Node, nil] defaults to @doc
      #
      # @return [URI] the effective (absolute) base URI
      def base_for node = nil
        node ||= @doc
        doc = node.document
        base = @uri.to_s
        if doc.root.name.to_sym == :html
          b = doc.at_xpath(
            '(/html:html/html:head/html:base[@href])[1]/@href',
            { html: XHTMLNS }).to_s.strip
          # only an absolute base href overrides the document URI
          base = b if URI(b).absolute?
        elsif b = doc.at_xpath('ancestor-or-self::*[@xml:base][1]/@xml:base')
          # NOTE(review): this xpath is evaluated against the document
          # node rather than +node+ — confirm that is intentional
          b = b.to_s.strip
          base = b if URI(b).absolute?
        end

        URI(base)
      end
|
1981
|
+
|
1982
|
+
      # notice these are only RDFa attributes that take URIs
      RDFA_ATTR = [:about, :resource, :typeof].freeze
      # (X)HTML attributes whose values are links
      LINK_ATTR = [:href, :src, :data, :action, :longdesc].freeze
      # matches any non-<base> element carrying a link or RDFa URI attribute
      LINK_XPATH = ('.//html:*[not(self::html:base)][%s]' %
        (LINK_ATTR + RDFA_ATTR).map { |a| "@#{a.to_s}" }.join('|')).freeze
|
1987
|
+
|
1988
|
+
      # Rewrite every link-bearing attribute under +node+ so that it is
      # expressed relative to this document's URI, normalizing each
      # target through the context's UUID round-trip.
      #
      # @param node [Nokogiri::XML::Node] subtree to rewrite (default @doc)
      # @param uuids [Hash] shared URI-to-UUID cache, mutated in place
      # @param uris [Hash] NOTE(review): accepted but not referenced in
      #   this body — confirm whether it was meant to seed the cache
      # @yield [elem] each matched element after its attributes are handled
      #
      # @return [Integer] the number of attributes rewritten
      def rewrite_links node = @doc, uuids: {}, uris: {}, &block
        base = base_for node
        count = 0
        cache = {}
        node.xpath(LINK_XPATH, { html: XHTMLNS }).each do |elem|
          LINK_ATTR.each do |attr|
            attr = attr.to_s
            next unless elem.has_attribute? attr

            # resolve the attribute against the effective base
            abs = base.merge uri_pp(elem[attr].strip)

            # fix e.g. http->https
            if abs.host == @uri.host and abs.scheme != @uri.scheme
              tmp = @uri.dup
              tmp.path = abs.path
              tmp.query = abs.query
              tmp.fragment = abs.fragment
              abs = tmp
            end

            # harvest query string
            pp = split_pp abs, only: true

            abs = RDF::URI(abs.to_s)

            # round-trip to uuid and back if we can
            if uuid = uuids[abs] ||= @context.canonical_uuid(abs)
              abs = cache[abs] ||= @context.canonical_uri(uuid)
            else
              abs = cache[abs] ||= @context.canonical_uri(abs)
            end

            # reinstate the path parameters
            if !pp.empty? && split_pp(abs, only: true).empty?
              abs = abs.dup
              abs.path = ([abs.path] + pp).join(';')
            end


            # finally write the relativized form back into the attribute
            elem[attr] = @uri.route_to(abs.to_s).to_s
            count += 1
          end

          block.call elem if block
        end

        count
      end
|
2036
|
+
|
2037
|
+
      # sponge the document for rdfa
      #
      # NOTE(review): unimplemented stub — returns nil.
      def triples_for
      end
|
2040
|
+
|
2041
|
+
      # attributes whose value names an RDF object resource
      OBJS = [:href, :src].freeze

      # ancestor node always with (@property and not @content) and
      # not @resource|@href|@src unless @rel|@rev
      LITXP = ['(ancestor::*[@property][not(@content)]',
        '[not(@resource|@href|@src) or @rel|@rev])[1]' ].join('').freeze
      # note parentheses cause the index to be counted from the root
|
2048
|
+
|
2049
|
+
def vocab_for node
|
2050
|
+
if node[:vocab]
|
2051
|
+
vocab = node[:vocab].strip
|
2052
|
+
return nil if vocab == ''
|
2053
|
+
return vocab
|
2054
|
+
end
|
2055
|
+
parent = node.parent
|
2056
|
+
vocab_for parent if parent and parent.element?
|
2057
|
+
end
|
2058
|
+
|
2059
|
+
def prefixes_for node, prefixes = {}
|
2060
|
+
# start with namespaces
|
2061
|
+
pfx = node.namespaces.select do |k, _|
|
2062
|
+
k.start_with? 'xmlns:'
|
2063
|
+
end.transform_keys do |k|
|
2064
|
+
k.delete_prefix 'xmlns:'
|
2065
|
+
end
|
2066
|
+
|
2067
|
+
# then add @prefix overtop of the namespaces
|
2068
|
+
if node[:prefix]
|
2069
|
+
x = node[:prefix].strip.split(/\s+/)
|
2070
|
+
a = []
|
2071
|
+
b = []
|
2072
|
+
x.each_index { |i| (i % 2 == 0 ? a : b).push x[i] }
|
2073
|
+
# if the size is uneven the values will be nil, so w drop em
|
2074
|
+
pfx.merge! a.zip(b).to_h.reject { |_, v| v.nil? }
|
2075
|
+
end
|
2076
|
+
|
2077
|
+
# since we're ascending the tree, input takes precedence
|
2078
|
+
prefixes = pfx.merge prefixes
|
2079
|
+
|
2080
|
+
if node.parent and node.parent.element?
|
2081
|
+
prefixes_for(node.parent, prefixes)
|
2082
|
+
else
|
2083
|
+
prefixes
|
2084
|
+
end
|
2085
|
+
end
|
2086
|
+
|
2087
|
+
      # give us the rdf subject of the node itself
      #
      # Resolve the RDFa subject for a node, following (a simplified
      # form of) the RDFa subject-resolution rules: literal-content
      # ancestors restart the search; @about wins; head/body inherit
      # from <html>; otherwise @resource/@href/@src, bnode-producing
      # attributes, and finally the parent (checked "backwards" via
      # is_ancestor).
      #
      # @param node [Nokogiri::XML::Element, nil] defaults to the root
      # @param rdf [true, false] return RDF::URI instead of URI
      # @param is_ancestor [true, false] internal flag: the node is
      #   being inspected as an ancestor of the original node
      #
      # @return [RDF::URI, URI, RDF::Node] the resolved subject
      def subject_for node = nil, rdf: false, is_ancestor: false
        node ||= @doc.root
        raise 'Node must be an element' unless
          node.is_a? Nokogiri::XML::Element

        # first we check for an ancestor element with @property and no
        # @content; if we find one then we reevaluate with that
        # element as the starting point
        if n = node.at_xpath(LITXP)
          return subject_for n
        end

        # answer a bunch of helpful questions about this element
        subject = nil
        base = base_for node
        parent = node.parent
        ns_href = node.namespace.href if node.namespace
        up_ok = %i{rel rev}.none? { |a| node[a] }
        is_root = !parent or parent.document?
        special = /^(?:[^:]+:)?(?:head|body)$/i === node.name and
          (ns_href == 'http://www.w3.org/1999/xhtml' or
          /^(?:[^:]+:)?html$/xi === parent.name)

        # if the node is being inspected as an ancestor to the
        # original node, we have to check it backwards.
        if is_ancestor
          # ah right @resource gets special treatment
          if subject = node[:resource]
            subject.strip!
            # NOTE(review): this safe-CURIE branch is unfinished — the
            # match result is discarded and the bracketed form is
            # passed through verbatim
            if m = /^\[(.*?)\]$/.match(subject)
            end
          else
            OBJS.each do |attr|
              if node[attr]
                # merge with the root and return it
                subject = base + node[attr]
                break
              end
            end
          end

          # NOTE(review): this returns unconditionally, so subject may
          # be nil here (yielding RDF::URI("") when rdf is true); the
          # later `is_ancestor && !up_ok` branch below is unreachable
          return rdf ? RDF::URI(subject.to_s) : subject

          # note if we are being called with is_ancestor, that means
          # the original node (or indeed any of the nodes previously
          # tested) have anything resembling a resource in them. this
          # means @rel/@rev should be ignored, and we should keep
          # looking for a subject.
        end

        if node[:about]

          # a blank-node identifier short-circuits resolution entirely
          if m = /^_:(.*)$/.match(node[:about])
            return RDF::Node(m[1])
          end

          # XXX resolve @about against potential curie
          subject = base + node[:about]

        elsif is_root
          subject = base
        elsif special
          # head/body take their subject from the html element
          subject = subject_for parent
        elsif node[:resource]
          # XXX resolve @about against potential curie
          subject = base + node[:resource]
        elsif node[:href]
          subject = base + node[:href]
        elsif node[:src]
          subject = base + node[:src]
        elsif node[:typeof]
          # bnode the typeof attr

          # note we return bnodes irrespective of the rdf flag
          return RDF::Node('id-%016x' % node.attributes['typeof'].pointer_id)
        elsif node[:inlist]
          # bnode the inlist attr
          return RDF::Node('id-%016x' % node.attributes['inlist'].pointer_id)
        elsif (parent[:inlist] && OBJS.none? { |a| parent[a] }) ||
            (is_ancestor && !up_ok)
          # bnode the element
          return RDF::Node('id-%016x' % node.pointer_id)
        # elsif node[:id]
        else
          # nothing on this node; walk up, treating the parent as an
          # ancestor so its link attributes are interpreted backwards
          subject = subject_for parent, is_ancestor: true
        end

        rdf ? RDF::URI(subject.to_s) : URI(subject.to_s)

      end
|
2178
|
+
|
2179
|
+
      # backlink structure
      #
      # Delegate backlink generation for this document to the context.
      #
      # @param published [true, false] restrict to published referrers
      # @param ignore [#to_set, nil] subjects to exclude
      def generate_backlinks published: true, ignore: nil
        @context.generate_backlinks @uuid, published: published, ignore: ignore
      end
|
2183
|
+
|
2184
|
+
      # goofy twitter-specific metadata
      #
      # Delegate Twitter card metadata generation to the context.
      def generate_twitter_meta
        @context.generate_twitter_meta @uuid
      end
|
2188
|
+
|
2189
|
+
      # Transform this document into its final (X)HTML form: strip
      # comments, mine the RDF struct for links/meta/prefixes, rewrite
      # body links, rebuild the head (links, meta, style, twitter
      # metadata, backlinks), and regenerate the document via
      # xhtml_stub.
      #
      # @param published [true, false] passed through to backlink
      #   generation
      #
      # @return [Nokogiri::XML::Document, nil] nil when the source has
      #   no html body
      def transform_xhtml published: true
        # before we do any more work make sure this is html
        doc = @doc.dup 1
        body = doc.at_xpath('//html:body[1]', { html: XHTMLNS }) or return

        # eliminate comments
        doc.xpath('//comment()[not(ancestor::html:script)]',
          { html: XHTMLNS }).each { |c| c.unlink }

        # initial stuff
        struct = @context.struct_for @uuid, uuids: true, canon: true
        # rstruct = @context.struct_for @uuid, uuids: true, rev: true
        resources = {}
        literals = {}
        ufwd = {} # uuid -> uri
        urev = {} # uri -> uuid
        datatypes = Set.new
        types = Set.new
        authors = @context.authors_for(@uuid)
        title = @context.label_for @uuid, candidates: struct
        desc = @context.label_for @uuid, candidates: struct, desc: true

        # rewrite content
        # (label_for returns a [predicate, value] pair; keep the value)
        title = title[1] if title
        desc = desc[1] if desc

        # `struct` and `rstruct` will contain all the links and
        # metadata for forward and backward neighbours, respectively,
        # which we need to mine (predicates, classes, datatypes) for
        # prefixes among other things.

        struct.each do |p, v|
          v.each do |o|
            if o.literal?
              literals[o] ||= Set.new
              literals[o].add p

              # collect the datatype
              datatypes.add o.datatype if o.has_datatype?
            else
              # normalize URIs
              if o.to_s.start_with? 'urn:uuid:'
                ufwd[o] ||= @context.canonical_uri o
              elsif cu = @context.canonical_uuid(o)
                o = urev[o] ||= cu
              end


              # collect the resource
              resources[o] ||= Set.new
              resources[o].add p

              # add to type
              types.add o if p == RDF::RDFV.type
            end
          end
        end
        urev.merge! ufwd.invert

        labels = resources.keys.map do |k|
          # turn this into a pair which subsequently gets turned into a hash
          [k, @context.label_for(k) ]
        end.to_h

        #warn labels

        # handle the title
        title ||= RDF::Literal('')
        tm = { '#title' => title,
          property: @context.abbreviate(literals[title].to_a, vocab: XHV) }
        if tl = title.language
          tm['xml:lang'] = tl # if xmlns
          tm['lang'] = tl
        elsif tdt = title.datatype and tdt != RDF::XSD.string
          tm[:datatype] = @context.abbreviate(tdt)
        end

        # we accumulate a record of the links in the body so we know
        # which ones to skip in the head
        bodylinks = {}
        rewrite_links body, uuids: ufwd, uris: urev do |elem|
          vocab = elem.at_xpath('ancestor-or-self::*[@vocab][1]/@vocab')
          vocab = uri_pp(vocab.to_s) if vocab

          if elem.key?('href') or elem.key?('src')
            vu = uri_pp(elem['href'] || elem['src'])
            ru = RDF::URI(@uri.merge(vu))
            bodylinks[urev[ru] || ru] = true

            # annotate the element with the abbreviated predicates
            if rel = resources[urev[ru] || ru]
              elem['rel'] = (@context.abbreviate rel, vocab: vocab).join ' '
            end

            # fill in a missing/blank title from the target's label
            label = labels[urev[ru] || ru]
            if label and (!elem.key?('title') or elem['title'].strip == '')
              elem['title'] = label[1].to_s
            end
          end
        end

        # and now we do the head
        links = []
        resources.reject { |k, _| bodylinks[k] }.each do |k, v|
          # rdf:type assertions are expressed as @typeof, not links
          v = v.dup.delete RDF::RDFV.type
          next if v.empty?
          mts = @context.formats_for k

          # warn k, v.inspect

          # warn k, mts.inspect

          rel = @context.abbreviate v.to_a, vocab: XHV
          ru = @uri.route_to(uri_pp (ufwd[k] || k).to_s)
          ln = { nil => :link, rel: rel, href: ru.to_s }
          if (label = labels[urev[k] || k])
            ln[:title] = label[1].to_s
          end

          # add type=lol/wut
          ln[:type] = mts.first.to_s unless mts.empty?

          if !ln[:type] and v.include?(RDF::Vocab::XHV.stylesheet)
            ln[:type] = 'text/css'
          elsif ln[:type] =~ /(java|ecma)script/i or
              v.include?(RDF::Vocab::DC.requires)
            # scripts get rewritten from <link> to <script src>
            ln[nil] = :script
            ln[:src] = ln.delete :href
            ln[:type] ||= 'text/javascript'
          end
          links.push ln
        end

        links.sort! do |a, b|
          # sort by rel, then by href
          # warn a.inspect, b.inspect
          s = 0
          [nil, :rel, :rev, :href, :title].each do |k|
            s = a.fetch(k, '').to_s <=> b.fetch(k, '').to_s
            break if s != 0
          end
          s
        end

        # we want to duplicate links from particular subjects (eg the root)
        (@context.config[:duplicate] || {}).sort do |a, b|
          a.first <=> b.first
        end.each do |s, preds|

          o = {}
          u = ufwd[s] ||= @context.canonical_uuid s
          s = urev[u] ||= @context.canonical_uri u if u
          f = {}

          # do not include this subject as these links are already included!
          next if u == @uuid

          # gather up the objects, then gather up the predicates

          @context.objects_for u || s, preds, only: :resource do |obj, rel|
            # XXX do not know why += |= etc does not work
            x = @context.canonical_uuid(obj) || obj
            urev[x] ||= @context.canonical_uri x
            y = o[x] ||= Set.new
            o[x] = y | rel
            f[x] = @context.formats_for x
          end

          srel = @uri.route_to((u ? urev[u] || s : s).to_s)

          # now collect all the other predicates
          o.keys.each do |obj|
            hrel = @uri.route_to((urev[obj] || obj).to_s)
            o[obj] |= @context.graph.query([u || s, nil, obj]).predicates.to_set
            rels = @context.abbreviate o[obj].to_a, vocab: XHV
            ln = { nil => :link, about: srel, rel: rels, href: hrel }
            ln[:type] = f[obj].first if f[obj]

            # add to links
            links << ln
          end
        end

        meta = []

        # include author names as old school meta tags
        authors.each do |a|
          name = labels[urev[a] || a] or next
          datatypes.add name[0] # a convenient place to chuck this
          prop = @context.abbreviate(name[0])
          name = name[1]
          about = @uri.route_to((ufwd[a] || a).to_s)
          tag = { nil => :meta, about: about.to_s, name: :author,
            property: prop, content: name.to_s }

          if name.has_datatype? and name.datatype != RDF::XSD.string
            tag[:datatype] = @context.abbreviate(name.datatype)
          elsif name.has_language?
            tag['xml:lang'] = tag[:lang] = name.language
          end
          meta.push tag
        end

        literals.each do |k, v|
          next if k == title
          rel = @context.abbreviate v.to_a, vocab: XHV
          elem = { nil => :meta, property: rel, content: k.to_s }
          elem[:name] = :description if k == desc

          if k.has_datatype?
            datatypes.add k.datatype # so we get the prefix
            elem[:datatype] = @context.abbreviate k.datatype, vocab: XHV
          end

          meta.push(elem)
        end

        meta.sort! do |a, b|
          s = 0
          [:about, :property, :datatype, :content, :name].each do |k|
            # warn a.inspect, b.inspect
            s = a.fetch(k, '').to_s <=> b.fetch(k, '').to_s
            break if s != 0
          end
          s
        end

        # don't forget style tag
        style = doc.xpath('/html:html/html:head/html:style', { html: XHTMLNS })

        body = body.dup 1
        body = { '#body' => body.children.to_a, about: '' }
        body[:typeof] = @context.abbreviate(types.to_a, vocab: XHV) unless
          types.empty?

        # prepare only the prefixes we need to resolve the data we need
        rsc = @context.abbreviate(
          (struct.keys + resources.keys + datatypes.to_a + types.to_a).uniq,
          noop: false).map do |x|
          next if x.nil?
          x.split(?:)[0].to_sym
        end.select { |x| not x.nil? }.to_set

        pfx = @context.prefixes.select do |k, _|
          rsc.include? k
        end.transform_values { |v| v.to_s }

        # XXX deal with the qb:Observation separately (just nuke it for now)
        extra = generate_twitter_meta || []
        if bl = generate_backlinks(published: published,
          ignore: @context.graph.query(
            [nil, CI.document, @uuid]).subjects.to_set)
          extra << { [bl] => :object }
        end

        # and now for the document
        xf = @context.config[:transform]
        doc = xhtml_stub(
          base: @uri, prefix: pfx, vocab: XHV, lang: 'en', title: tm,
          link: links, meta: meta, style: style, transform: xf,
          extra: extra, body: body).document

        # goddamn script tags and text/html
        # (an empty text node keeps serializers from collapsing
        # <script/> into a self-closing tag, which breaks html parsers)
        doc.xpath('//html:script[@src][not(node())]',
          { html: XHTMLNS }).each do |script|
          script << doc.create_text_node('')
        end

        doc
      end
|
2458
|
+
|
2459
|
+
# Actually write the transformed document to the target
|
2460
|
+
#
|
2461
|
+
# @param published [true, false]
|
2462
|
+
#
|
2463
|
+
# @return [Array] pathname(s) written
|
2464
|
+
def write_to_target published: true
|
2465
|
+
|
2466
|
+
# in all cases we write to private target
|
2467
|
+
states = [false]
|
2468
|
+
# document has to be publishable
|
2469
|
+
states.push true if published && @context.published?(@uuid)
|
2470
|
+
|
2471
|
+
ok = []
|
2472
|
+
states.each do |state|
|
2473
|
+
target = @context.config[state ? :target : :private]
|
2474
|
+
|
2475
|
+
# XXX this is dumb; it should do something more robust if it
|
2476
|
+
# fails
|
2477
|
+
doc = transform_xhtml(published: state) or next
|
2478
|
+
|
2479
|
+
begin
|
2480
|
+
fh = Tempfile.create('xml-', target)
|
2481
|
+
path = Pathname(fh.path)
|
2482
|
+
|
2483
|
+
# write the doc to the target
|
2484
|
+
doc.write_to fh
|
2485
|
+
fh.close
|
2486
|
+
|
2487
|
+
uuid = URI(@uuid.to_s)
|
2488
|
+
newpath = path.dirname + "#{uuid.uuid}.xml"
|
2489
|
+
ok.push newpath
|
2490
|
+
|
2491
|
+
File.chmod(0644, path)
|
2492
|
+
File.rename(path, newpath)
|
2493
|
+
File.utime(@mtime, @mtime, newpath)
|
2494
|
+
rescue Exception => e
|
2495
|
+
# XXX this should only rescue a specific class of errors
|
2496
|
+
warn e.class, e
|
2497
|
+
File.unlink path if path.exist?
|
2498
|
+
end
|
2499
|
+
end
|
2500
|
+
|
2501
|
+
ok
|
2502
|
+
end
|
2503
|
+
|
2504
|
+
end
|
2505
|
+
end
|
2506
|
+
end
|