rdf-sak 0.1.5

@@ -0,0 +1,13 @@
+ require "bundler/gem_tasks"
+ require "rspec/core/rake_task"
+
+ RSpec::Core::RakeTask.new(:spec)
+
+ task :default => :spec
+
+ #desc 'Generate Vocabularies'
+ #task :gen_vocabs => %w(ci).map { |v| "lib/rdf/sak/#{v}.rb" }
+
+ # XXX turn this into a rake task at some point :P
+
+ # rdf serialize --uri 'https://privatealpha.com/ontology/content-inventory/1#' --output-format vocabulary --module-name RDF::SAK --class-name CI -o lib/rdf/sak/ci.rb --strict 'https://privatealpha.com/ontology/content-inventory/1#'
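+
+ # A sketch of what that rake task might look like (untested; it just
+ # shells out to the command above):
+ #
+ # task :gen_vocabs do
+ #   uri = 'https://privatealpha.com/ontology/content-inventory/1#'
+ #   sh "rdf serialize --uri '#{uri}' --output-format vocabulary " \
+ #      "--module-name RDF::SAK --class-name CI -o lib/rdf/sak/ci.rb " \
+ #      "--strict '#{uri}'"
+ # end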
@@ -0,0 +1,14 @@
+ #!/usr/bin/env ruby
+
+ require "bundler/setup"
+ require "rdf/sak"
+
+ # You can add fixtures and/or initialization code here to make experimenting
+ # with your gem easier. You can also use a different console, if you like.
+
+ # (If you use this, don't forget to add pry to your Gemfile!)
+ # require "pry"
+ # Pry.start
+
+ require "irb"
+ IRB.start
@@ -0,0 +1,8 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+ IFS=$'\n\t'
+ set -vx
+
+ bundle install
+
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,14 @@
+ <?xml version="1.0" encoding="utf-8"?>
+ <xsl:stylesheet version="1.0"
+                 xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+                 xmlns:html="http://www.w3.org/1999/xhtml"
+                 xmlns="http://www.w3.org/1999/xhtml"
+                 exclude-result-prefixes="html">
+
+   <xsl:key name="main" match="html:main" use="''"/>
+
+   <xsl:template match="/html:*">
+     <xsl:copy-of select="key('main', '')[1]"/>
+   </xsl:template>
+
+ </xsl:stylesheet>
@@ -0,0 +1,11 @@
+ <?xml version="1.0" encoding="utf-8"?>
+ <html xmlns="http://www.w3.org/1999/xhtml">
+   <head>
+     <title>i match lol</title>
+   </head>
+   <body>
+     <main>
+       <p>hooray you found the main element</p>
+     </main>
+   </body>
+ </html>
@@ -0,0 +1,58 @@
+ @prefix rdf:  <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
+ @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+ @prefix owl:  <http://www.w3.org/2002/07/owl#> .
+ @prefix xsd:  <http://www.w3.org/2001/XMLSchema#> .
+ @prefix dct:  <http://purl.org/dc/terms/> .
+ @prefix skos: <http://www.w3.org/2004/02/skos/core#> .
+ @prefix ci:   <https://privatealpha.com/ontology/content-inventory/1#> .
+ @prefix tfo:  <https://privatealpha.com/ontology/transformation/1#> .
+ @prefix xf:   <tag:makethingsmakesense.com,2020:transform/> .
+
+ xf:prefix a tfo:Parameter ;
+   skos:prefLabel "Prefix"@en ;
+   rdfs:comment "A compact prefix declaration of the form prefix:url"@en ;
+   dct:identifier "prefix"^^xsd:token ;
+   rdfs:range xsd:token .
+
+ xf:xpath a tfo:Parameter ;
+   skos:prefLabel "XPath"@en ;
+   rdfs:comment "An XPath expression"@en ;
+   dct:identifier "xpath"^^xsd:token ;
+   owl:cardinality 1 ;
+   rdfs:range xsd:string .
+
+ xf:reindent a tfo:Parameter ;
+   skos:prefLabel "Reindent"@en ;
+   rdfs:comment "Reindent the XML tree"@en ;
+   dct:identifier "reindent"^^xsd:token ;
+   tfo:default true ;
+   owl:cardinality 1 ;
+   rdfs:range xsd:boolean .
+
+ xf:subtree a tfo:Transform ;
+   skos:prefLabel "Subtree"@en ;
+   rdfs:comment "Isolate an X(HT)ML node using XPath."@en ;
+   tfo:implementation <urn:x-ruby:RDF::SAK::Transform::XPath> ;
+   tfo:accepts "application/xml"^^tfo:content-type ;
+   tfo:returns "application/xml"^^tfo:content-type ;
+   tfo:parameter xf:xpath, xf:prefix, xf:reindent ;
+   tfo:parameter-list ( xf:xpath xf:prefix xf:reindent ) .
+
+ xf:cleanup a tfo:Transform ;
+   skos:prefLabel "Cleanup"@en ;
+   rdfs:comment "Apply cleanup.xsl to the input."@en ;
+   tfo:implementation <file:example/cleanup.xsl> ;
+   tfo:accepts "application/xml"^^tfo:content-type ;
+   tfo:returns "application/xml"^^tfo:content-type .
+
+ <urn:uuid:78e6d8ce-a88a-4be0-8bfa-079136945816> a tfo:Partial ;
+   tfo:transform xf:subtree ;
+   xf:xpath "//html:main[1]"^^xsd:string ;
+   xf:prefix "html:http://www.w3.org/1999/xhtml"^^xsd:token .
+
+ <urn:uuid:4498eef5-1ca6-4034-937a-d50033dd6693> a tfo:Application ;
+   tfo:input <ni:///sha-256;0GHHmDtxh9CRZttXdr-cX78u72auS2P-O6tDXxvz2kU> ;
+   tfo:output <ni:///sha-256;_BbLbNSZl0TcQcjz-v3qF5fa5VL11rdha7c24K44pTc> ;
+   tfo:transform xf:subtree ;
+   xf:xpath "//html:main[1]"^^xsd:string ;
+   xf:prefix "html:http://www.w3.org/1999/xhtml"^^xsd:token .
@@ -0,0 +1 @@
+ require 'rdf/sak'
@@ -0,0 +1,2506 @@
+ # -*- coding: utf-8 -*-
+ require 'rdf/sak/version'
+
+ # basic stuff
+ require 'stringio'
+ require 'pathname'
+ require 'tempfile'
+
+ # rdf stuff
+ require 'uri'
+ require 'uri/urn'
+ require 'rdf'
+ require 'rdf/reasoner'
+ require 'linkeddata'
+
+ # my stuff
+ require 'xml-mixup'
+ require 'md-noko'
+ require 'uuid-ncname'
+ require 'rdf/sak/mimemagic'
+ require 'rdf/sak/util'
+
+ # ontologies, mine in particular
+ require 'rdf/sak/ci'
+ require 'rdf/sak/ibis'
+ # others not included in rdf.rb
+ require 'rdf/sak/pav'
+ require 'rdf/sak/qb'
+
+ module RDF::SAK
+
+   class Context
+     include XML::Mixup
+     include Util
+
+     private
+
+     # RDF::Reasoner.apply(:rdfs, :owl)
+
+     G_OK = [RDF::Repository, RDF::Dataset, RDF::Graph].freeze
+     C_OK = [Pathname, IO, String].freeze
+
+     def coerce_to_path_or_io obj
+       return obj if obj.is_a? IO
+       return obj.expand_path if obj.is_a? Pathname
+       raise "#{obj.inspect} is not stringable" unless obj.respond_to? :to_s
+       Pathname(obj.to_s).expand_path
+     end
+
+     def coerce_graph graph = nil, type: nil
+       # begin with empty graph
+       out = RDF::Repository.new
+
+       return out unless graph
+       return graph if G_OK.any? { |c| graph.is_a? c }
+
+       # now turn into an array
+       graph = [graph] unless graph.is_a? Array
+
+       graph.each do |g|
+         raise 'Graph must be some kind of RDF::Graph or RDF data file' unless
+           C_OK.any? { |c| g.is_a? c } || g.respond_to?(:to_s)
+
+         opts = {}
+         opts[:content_type] = type if type
+
+         if g.is_a? Pathname
+           opts[:filename] = g.expand_path.to_s
+           g = g.open
+         elsif g.is_a? File
+           opts[:filename] = g.path
+         end
+
+         g = StringIO.new(g.to_s) unless g.is_a? IO
+         reader = RDF::Reader.for(opts) do
+           g.rewind
+           sample = g.read 1000
+           g.rewind
+           sample
+         end or raise "Could not find an RDF::Reader for #{opts[:content_type]}"
+
+         reader = reader.new g, **opts
+         reader.each_statement do |stmt|
+           out << stmt
+         end
+       end
+
+       out
+     end
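+
+     # A rough usage sketch (this is a private helper, normally reached
+     # via Context#initialize): anything that is not already an
+     # RDF::Repository/Dataset/Graph is funnelled through RDF::Reader,
+     # so the type: hint only matters when the format cannot be sniffed.
+     # The file name here is hypothetical:
+     #
+     #   repo = coerce_graph 'content.ttl'
+     #   repo = coerce_graph StringIO.new(turtle), type: 'text/turtle'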
90
+
91
+ def normalize_hash h
92
+ return h unless h.is_a? Hash
93
+ out = {}
94
+ h.each do |k, v|
95
+ out[k.to_s.to_sym] = v.is_a?(Hash) ? normalize_hash(v) :
96
+ v.respond_to?(:to_a) ? v.to_a.map { |x| normalize_hash x } : v
97
+ end
98
+ out
99
+ end
100
+
101
+ def coerce_config config
102
+ # config must either be a hash or a file name/pathname/io object
103
+ unless config.respond_to? :to_h
104
+ # when in rome
105
+ require 'yaml'
106
+ config = if config.is_a? IO
107
+ YAML.load config
108
+ else
109
+ YAML.load_file Pathname.new(config).expand_path
110
+ end
111
+ end
112
+
113
+ config = normalize_hash config
114
+
115
+ # config MUST have source and target dirs
116
+ raise 'Config must have :source, :target, and :private directories' unless
117
+ ([:source, :target, :private] - config.keys).empty?
118
+ [:source, :target].each do |path|
119
+ dir = config[path] = Pathname.new(config[path]).expand_path
120
+ raise "#{dir} is not a readable directory" unless
121
+ dir.directory? && dir.readable?
122
+ end
123
+ raise "Target directory #{config[:target]} is not writable" unless
124
+ config[:target].writable?
125
+ raise "Source and target directories are the same: #{config[:source]}" if
126
+ config[:source] == config[:target]
127
+
128
+ # we try to create the private directory
129
+ config[:private] = config[:target] + config[:private]
130
+ if config[:private].exist?
131
+ raise "#{config[:private]} is not a readable/writable directory" unless
132
+ [:directory?, :readable?, :writable?].all? do |m|
133
+ config[:private].send m
134
+ end
135
+ else
136
+ config[:private].mkpath
137
+ end
138
+
139
+ # config MAY have graph location(s) but we can test this other
140
+ # ways, same goes for base URI
141
+ if config[:graph]
142
+ g = config[:graph]
143
+ g = [g] unless g.is_a? Array
144
+ config[:graph] = g.map { |x| Pathname.new(x).expand_path }
145
+ end
146
+
147
+ # deal with prefix map
148
+ if config[:prefixes]
149
+ config[:prefixes] = config[:prefixes].transform_values do |p|
150
+ # we have to wrap this in case it fails
151
+ begin
152
+ RDF::Vocabulary.find_term(p) || RDF::URI(p)
153
+ rescue
154
+ RDF::URI(p)
155
+ end
156
+ end
157
+ end
158
+
159
+ if dups = config[:duplicate]
160
+ pfx = config[:prefixes] || {}
161
+ base = URI(uri_pp config[:base])
162
+ if dups.is_a? Hash
163
+ config[:duplicate] = dups.map do |ruri, preds|
164
+ preds = [preds] unless preds.is_a? Array
165
+ preds.map! do |p|
166
+ resolve_curie p, prefixes: pfx, scalar: true, coerce: :rdf
167
+ end
168
+ [RDF::URI((base + ruri.to_s).to_s), Set.new(preds)]
169
+ end.to_h
170
+ end
171
+ end
172
+
173
+ # rewrite maps
174
+ config[:maps] = {} unless config[:maps].is_a? Hash
175
+ %w(rewrite redirect gone).each do |type|
176
+ config[:maps][type.to_sym] ||= ".#{type}.map"
177
+ end
178
+
179
+ config
180
+ end
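+
+     # The shape this expects, sketched as YAML; :source, :target, and
+     # :private are mandatory, the rest optional, and all paths here are
+     # made up:
+     #
+     #   source: ~/projects/content     # readable directory
+     #   target: /var/www/site          # writable directory
+     #   private: .private              # created under target if absent
+     #   base: https://example.com/
+     #   graph: content.ttl             # file or list of files
+     #   prefixes:
+     #     dct: http://purl.org/dc/terms/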
+
+     def cmp_label a, b, labels: nil, supplant: true, reverse: false
+       labels ||= {}
+
+       # try supplied label or fall back
+       pair = [a, b].map do |x|
+         if labels[x]
+           labels[x][1]
+         elsif supplant and y = label_for(x)
+           labels[x] = y
+           y[1]
+         else
+           x
+         end
+       end
+
+       pair.reverse! if reverse
+       # warn "#{pair[0]} <=> #{pair[1]}"
+       pair[0].to_s <=> pair[1].to_s
+     end
+
+     def term_list terms
+       return [] if terms.nil?
+       terms = terms.respond_to?(:to_a) ? terms.to_a : [terms]
+       terms.uniq.map { |t| RDF::Vocabulary.find_term t }.compact
+     end
+
+     def coerce_resource arg
+       super arg, @base
+     end
+
+     def coerce_uuid_urn arg
+       super arg, @base
+     end
+
+     public
+
+     attr_reader :config, :graph, :base
+
+     # Initialize a context.
+     #
+     # @param graph [RDF::Repository, Pathname, IO, String, Array] initial graph data
+     # @param base [RDF::URI, URI, String] base URI for the site
+     # @param config [Hash, Pathname, IO, String] configuration, or a YAML file to read it from
+     # @param type [String] content type hint for parsing the graph
+     #
+     # @return [RDF::SAK::Context] the new context object.
+
+     def initialize graph: nil, base: nil, config: nil, type: nil
+       # RDF::Reasoner.apply(:rdfs, :owl)
+
+       @config = coerce_config config
+
+       graph ||= @config[:graph] if @config[:graph]
+       base ||= @config[:base] if @config[:base]
+
+       @graph = coerce_graph graph, type: type
+       @base = RDF::URI.new base.to_s if base
+       @ucache = RDF::Util::Cache.new(-1)
+       @scache = {} # wtf rdf util cache doesn't like booleans
+     end
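+
+     # A minimal construction sketch, assuming a config file shaped like
+     # the YAML above (file name hypothetical):
+     #
+     #   ctx = RDF::SAK::Context.new config: 'sak.yml'
+     #   ctx.base # => RDF::URI("https://example.com/")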
+
+     # Get the prefix mappings from the configuration.
+     #
+     # @return [Hash]
+
+     def prefixes
+       @config[:prefixes] || {}
+     end
+
+     # Abbreviate a set of terms against the registered namespace
+     # prefixes and optional default vocabulary, or otherwise return a
+     # string representation of the original URI.
+     #
+     # @param term [RDF::Term]
+     # @param prefixes [Hash]
+     #
+     # @return [String]
+     #
+     def abbreviate term, prefixes: @config[:prefixes],
+         vocab: nil, noop: true, sort: true
+       super term, prefixes: prefixes || {}, vocab: vocab, noop: noop, sort: sort
+     end
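+
+     # For example (exact output depends on the registered prefixes;
+     # these assume a dct: mapping like the one sketched above):
+     #
+     #   abbreviate RDF::Vocab::DC.title        # => "dct:title"
+     #   abbreviate RDF::URI('urn:x-dummy:foo') # no match; noop returns a string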
+
+     # Obtain a key-value structure for the given subject, optionally
+     # constraining the result by node type (:resource, :uri/:iri,
+     # :blank/:bnode, :literal)
+     #
+     # @param subject of the inquiry
+     # @param rev map in reverse
+     # @param only one or more node types
+     # @param uuids coerce resources to UUIDs if possible
+     #
+     # @return [Hash]
+     #
+     def struct_for subject, rev: false, only: [], uuids: false, canon: false
+       Util.struct_for @graph, subject,
+         rev: rev, only: only, uuids: uuids, canon: canon
+     end
+
+     # Obtain everything in the graph that is an `rdf:type` of something.
+     #
+     # @return [Array]
+     #
+     def all_types
+       @graph.query([nil, RDF.type, nil]).objects.uniq
+     end
+
+     # Obtain every subject that is rdf:type the given type or its subtypes.
+     #
+     # @param rdftype [RDF::Term]
+     #
+     # @return [Array]
+     #
+     def all_of_type rdftype, exclude: []
+       exclude = term_list exclude
+       t = RDF::Vocabulary.find_term(rdftype) or raise "No type #{rdftype.to_s}"
+       out = []
+       (all_types & all_related(t) - exclude).each do |type|
+         out += @graph.query([nil, RDF.type, type]).subjects
+       end
+
+       out.uniq
+     end
+
+     # Obtain all and only the rdf:types directly asserted on the subject.
+     #
+     # @param subject [RDF::Resource]
+     # @param type [RDF::Term, :to_a]
+     #
+     # @return [Array]
+     #
+     def asserted_types subject, type = nil
+       Util.asserted_types @graph, subject, type
+     end
+
+     # Obtain the canonical UUID for the given URI
+     #
+     # @param uri [RDF::URI, URI, :to_s] the subject of the inquiry
+     # @param unique [true, false] return a single resource/nil or an array
+     # @param published [true, false] whether to restrict to published docs
+     #
+     # @return [RDF::URI, Array]
+     #
+     def canonical_uuid uri, unique: true, published: false
+       Util.canonical_uuid @graph, uri, unique: unique,
+         published: published, scache: @scache, ucache: @ucache, base: @base
+     end
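+
+     # e.g., resolving a (hypothetical) slug back to its stable identity:
+     #
+     #   canonical_uuid 'https://example.com/some-slug'
+     #   # => RDF::URI("urn:uuid:...") (or nil if the graph has no mapping)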
+
+     # Obtain the "best" dereferenceable URI for the subject.
+     # Optionally returns all candidates.
+     #
+     # @param subject [RDF::Resource]
+     # @param unique [true, false] flag for unique return value
+     # @param rdf [true, false] flag to specify RDF::URI vs URI
+     # @param slugs [true, false] flag to include slugs
+     # @param fragment [true, false] flag to include fragment URIs
+     #
+     # @return [RDF::URI, URI, Array]
+     #
+     def canonical_uri subject,
+         unique: true, rdf: true, slugs: false, fragment: false
+       Util.canonical_uri @graph, subject, base: @base,
+         unique: unique, rdf: rdf, slugs: slugs, fragment: fragment
+     end
+
+     # Returns subjects from the graph with entailment.
+     #
+     # @param predicate
+     # @param object
+     # @param entail
+     # @param only
+     #
+     # @return [RDF::Resource]
+     #
+     def subjects_for predicate, object, entail: true, only: []
+       Util.subjects_for @graph, predicate, object, entail: entail, only: only
+     end
+
+     # Returns objects from the graph with entailment.
+     #
+     # @param subject
+     # @param predicate
+     # @param entail
+     # @param only
+     # @param datatype
+     #
+     # @return [RDF::Term]
+     #
+     def objects_for subject, predicate, entail: true, only: [], datatype: nil
+       Util.objects_for @graph, subject, predicate,
+         entail: entail, only: only, datatype: datatype
+     end
+
+     # Find the terminal replacements for the given subject, if any exist.
+     #
+     # @param subject
+     # @param published indicate the context is published
+     #
+     # @return [Set]
+     #
+     def replacements_for subject, published: true
+       Util.replacements_for @graph, subject, published: published
+     end
+
+     # Obtain dates for the subject as instances of Date(Time). This is
+     # just shorthand for a common application of `objects_for`.
+     #
+     # @param subject
+     # @param predicate
+     # @param datatype
+     #
+     # @return [Array] of dates
+     def dates_for subject, predicate: RDF::Vocab::DC.date,
+         datatype: [RDF::XSD.date, RDF::XSD.dateTime]
+       Util.dates_for @graph, subject, predicate: predicate, datatype: datatype
+     end
+
+     # Obtain any specified MIME types for the subject. Just shorthand
+     # for a common application of `objects_for`.
+     #
+     # @param subject
+     # @param predicate
+     # @param datatype
+     #
+     # @return [Array] of internet media types
+     #
+     def formats_for subject, predicate: RDF::Vocab::DC.format,
+         datatype: [RDF::XSD.token]
+       Util.objects_for @graph, subject, predicate: predicate, datatype: datatype
+     end
+
+     # Assuming the subject is a thing that has authors, return the
+     # list of authors. Try bibo:authorList first for an explicit
+     # ordering, then continue to the various other predicates.
+     #
+     # @param subject [RDF::Resource]
+     # @param unique [false, true] only return the first author
+     # @param contrib [false, true] return contributors instead of authors
+     #
+     # @return [RDF::Value, Array]
+     #
+     def authors_for subject, unique: false, contrib: false
+       Util.authors_for @graph, subject, unique: unique, contrib: contrib
+     end
+
+     # Obtain the most appropriate label(s) for the subject's type(s).
+     # Returns one or more (depending on the `unique` flag)
+     # predicate-object pairs in order of preference.
+     #
+     # @param subject [RDF::Resource]
+     # @param unique [true, false] only return the first pair
+     # @param type [RDF::Term, Array] supply asserted types if already retrieved
+     # @param lang [nil] not currently implemented (will be conneg)
+     # @param desc [false, true] retrieve description instead of label
+     # @param alt [false, true] retrieve alternate instead of main
+     #
+     # @return [Array] either a predicate-object pair or an array of pairs.
+     #
+     def label_for subject, candidates: nil, unique: true, type: nil,
+         lang: nil, desc: false, alt: false
+       Util.label_for @graph, subject, candidates: candidates,
+         unique: unique, type: type, lang: lang, desc: desc, alt: alt
+     end
+
+     SKOS_HIER = [
+       {
+         element: :subject,
+         pattern: -> c, p { [nil, p, c] },
+         preds: [RDF::Vocab::SKOS.broader, RDF::Vocab::SKOS.broaderTransitive],
+       },
+       {
+         element: :object,
+         pattern: -> c, p { [c, p, nil] },
+         preds: [RDF::Vocab::SKOS.narrower, RDF::Vocab::SKOS.narrowerTransitive],
+       }
+     ]
+     SKOS_HIER.each do |struct|
+       # lol how many times are we gonna cart this thing around
+       preds = struct[:preds]
+       i = 0
+       loop do
+         equiv = preds[i].entail(:equivalentProperty) - preds
+         preds.insert(i + 1, *equiv) unless equiv.empty?
+         i += equiv.length + 1
+         break if i >= preds.length
+       end
+     end
+
+     def sub_concepts concept, extra: []
+       raise 'Concept must be exactly one concept' unless
+         concept.is_a? RDF::Resource
+       extra = term_list extra
+
+       # we need an array for a queue, and a set to accumulate the
+       # output as well as a separate 'seen' set
+       queue = [concept]
+       seen = Set.new queue.dup
+       out = seen.dup
+
+       # it turns out that the main SKOS hierarchy terms, while not
+       # being transitive themselves, are subproperties of transitive
+       # relations which means they are as good as being transitive.
+
+       while c = queue.shift
+         SKOS_HIER.each do |struct|
+           elem, pat, preds = struct.values_at(:element, :pattern, :preds)
+           preds.each do |p|
+             @graph.query(pat.call c, p).each do |stmt|
+               # obtain hierarchical element
+               hierc = stmt.send elem
+
+               # skip any further processing if we have seen this concept
+               next if seen.include? hierc
+               seen << hierc
+
+               next if !extra.empty? and !extra.any? do |t|
+                 @graph.has_statement? RDF::Statement.new(hierc, RDF.type, t)
+               end
+
+               queue << hierc
+               out << hierc
+             end
+           end
+         end
+       end
+
+       out.to_a.sort
+     end
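+
+     # e.g., the closure under skos:narrower and its equivalents, with
+     # invented concept URIs for illustration; note the input concept is
+     # included in the output:
+     #
+     #   sub_concepts RDF::URI('https://example.com/concept/metadata')
+     #   # => [<.../concept/metadata>, <.../concept/rdf>, ...]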
+
+     def audiences_for uuid, proximate: false, invert: false
+       p = invert ? CI['non-audience'] : RDF::Vocab::DC.audience
+       return @graph.query([uuid, p, nil]).objects if proximate
+
+       out = []
+       @graph.query([uuid, p, nil]).objects.each do |o|
+         out += sub_concepts o
+       end
+
+       out
+     end
+
+     # Get all "reachable" UUID-identified entities (subjects which are
+     # also objects)
+     def reachable published: false
+       p = published ? -> x { published?(x) } : -> x { true }
+       # now get the subjects which are also objects
+       @graph.subjects.select do |s|
+         s.uri? && s =~ /^urn:uuid:/ && @graph.has_object?(s) && p.call(s)
+       end
+     end
+
+     # holy cow this is actually a lot of stuff:
+
+     # turn markdown into xhtml (via md-noko)
+
+     # turn html into xhtml (trivial)
+
+     # generate triples from ordinary (x)html structure
+
+     # map vanilla (x)html metadata to existing graph (ie to get resource URIs)
+
+     # pull triples from rdfa
+
+     # stuff rdfa into rdfa-less xhtml
+
+     # basic nlp detection of terms + text-level markup (dfn, abbr...)
+
+     # markdown round-tripping (may as well store source in md if possible)
+
+     # add title attribute to all links
+
+     # add alt attribute to all images
+
+     # segmentation of composite documents into multiple files
+
+     # aggregation of simple documents into composites
+
+     # generate backlinks
+
+     # - resource (ie file) generation -
+
+     # generate indexes of people, groups, and organizations
+
+     # generate indexes of books, not-books, and other external links
+
+     def head_links subject, struct: nil, nodes: nil, prefixes: {},
+         ignore: [], uris: {}, labels: {}, vocab: nil
+
+       raise 'ignore must be Array or Set' unless
+         [Array, Set].any? { |c| ignore.is_a? c }
+
+       struct ||= struct_for subject
+       nodes ||= invert_struct struct
+
+       # make sure these are actually URI objects not RDF::URI
+       uris = uris.transform_values { |v| URI(uri_pp v.to_s) }
+       uri = uris[subject] || canonical_uri(subject, rdf: false)
+
+       ignore = ignore.to_set
+
+       # output
+       links = []
+
+       nodes.reject { |n, _| ignore.include?(n) || !n.uri? }.each do |k, v|
+         # first nuke rdf:type, that's never in there
+         v = v.dup.delete RDF::RDFV.type
+         next if v.empty?
+
+         unless uris[k]
+           cu = canonical_uri k
+           uris[k] = cu || uri_pp(k.to_s)
+         end
+
+         # munge the url and make the tag
+         rel = abbreviate v.to_a, vocab: vocab
+         ru = uri.route_to(uris[k])
+         ln = { nil => :link, rel: rel, href: ru.to_s }
+
+         # add the title
+         if lab = labels[k]
+           ln[:title] = lab[1].to_s
+         end
+
+         # add type attribute
+         unless (mts = formats_for k).empty?
+           ln[:type] = mts.first.to_s
+
+           if ln[:type] =~ /(java|ecma)script/i ||
+               !(v.to_set & Set[RDF::Vocab::DC.requires]).empty?
+             ln[:src] = ln.delete :href
+             # make sure we pass in an empty string so there is a closing tag
+             ln.delete nil
+             ln[['']] = :script
+           end
+         end
+
+         # finally add the link
+         links.push ln
+       end
+
+       links.sort! do |a, b|
+         # sort by rel, then by href
+         # warn a.inspect, b.inspect
+         s = 0
+         [nil, :rel, :rev, :href, :title].each do |k|
+           s = a.fetch(k, '').to_s <=> b.fetch(k, '').to_s
+           break if s != 0
+         end
+         s
+       end
+
+       links
+     end
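+
+     # Each entry is an XML::Mixup "spec": the nil (or ['']) key names
+     # the element and the remaining keys become attributes, so a
+     # plausible return value looks like:
+     #
+     #   [{ nil => :link, rel: "dct:references", href: "other-doc" },
+     #    { [''] => :script, rel: "dct:requires", src: "script.js",
+     #      type: "application/javascript" }]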
+
+     def head_meta subject, struct: nil, nodes: nil, prefixes: {},
+         ignore: [], meta_names: {}, vocab: nil, lang: nil, xhtml: true
+
+       raise 'ignore must be Array or Set' unless
+         [Array, Set].any? { |c| ignore.is_a? c }
+
+       struct ||= struct_for subject
+       nodes ||= invert_struct struct
+
+       ignore = ignore.to_set
+
+       meta = []
+       # n.b. destructure the pair so the test runs on the literal key
+       nodes.select { |n, _| n.literal? && !ignore.include?(n) }.each do |k, v|
+         rel = abbreviate v.to_a, vocab: vocab
+         tag = { nil => :meta, property: rel, content: k.to_s }
+
+         lang = (k.language? && k.language != lang ? k.language : nil) ||
+           (k.datatype == RDF::XSD.string && lang ? '' : nil)
+         if lang
+           tag['xml:lang'] = lang if xhtml
+           tag[:lang] = lang
+         end
+
+         tag[:datatype] = abbreviate k.datatype, vocab: XHV if k.datatype?
+         tag[:name] = meta_names[k] if meta_names[k]
+
+         meta << tag
+       end
+
+       meta.sort! do |a, b|
+         s = 0
+         [:about, :property, :datatype, :content, :name].each do |k|
+           # warn a.inspect, b.inspect
+           s = a.fetch(k, '').to_s <=> b.fetch(k, '').to_s
+           break if s != 0
+         end
+         s
+       end
+
+       meta
+     end
+
+     def generate_backlinks subject, published: true, ignore: nil
+       uri = canonical_uri(subject, rdf: false) || URI(uri_pp subject)
+       ignore ||= Set.new
+       raise 'ignore must be amenable to a set' unless ignore.respond_to? :to_set
+       ignore = ignore.to_set
+       nodes = {}
+       labels = {}
+       types = {}
+       @graph.query([nil, nil, subject]).each do |stmt|
+         next if ignore.include?(sj = stmt.subject)
+         preds = nodes[sj] ||= Set.new
+         preds << (pr = stmt.predicate)
+         types[sj] ||= asserted_types sj
+         labels[sj] ||= label_for sj
+         labels[pr] ||= label_for pr
+       end
+
+       # prune out
+       nodes.select! { |k, _| published? k } if published
+
+       return if nodes.empty?
+
+       li = nodes.sort do |a, b|
+         cmp_label a[0], b[0], labels: labels
+       end.map do |rsrc, preds|
+         cu = canonical_uri(rsrc, rdf: false) or next
+         lab = labels[rsrc] || [nil, rsrc]
+         lp = abbreviate(lab[0]) if lab[0]
+         ty = abbreviate(types[rsrc]) if types[rsrc]
+
+         { [{ [{ [lab[1].to_s] => :span, property: lp }] => :a,
+           href: uri.route_to(cu), typeof: ty, rev: abbreviate(preds) }] => :li }
+       end.compact
+
+       { [{ li => :ul }] => :nav }
+     end
+
+     def generate_twitter_meta subject
+       # get author
+       author = authors_for(subject, unique: true) or return
+
+       # get author's twitter account
+       twitter = objects_for(author, RDF::Vocab::FOAF.account,
+         only: :resource).select { |t| t.to_s =~ /twitter\.com/
+       }.sort.first or return
+       twitter = URI(twitter.to_s).path.split(/\/+/)[1]
+       twitter = ?@ + twitter unless twitter.start_with? ?@
+
+       # get title
+       title = label_for(subject) or return
+
+       out = [
+         { nil => :meta, name: 'twitter:card', content: :summary },
+         { nil => :meta, name: 'twitter:site', content: twitter },
+         { nil => :meta, name: 'twitter:title', content: title[1].to_s }
+       ]
+
+       # get abstract
+       if desc = label_for(subject, desc: true)
+         out.push({ nil => :meta, name: 'twitter:description',
+           content: desc[1].to_s })
+       end
+
+       # get image (foaf:depiction)
+       img = objects_for(subject, RDF::Vocab::FOAF.depiction, only: :resource)
+       unless img.empty?
+         img = img[0].to_s
+         out.push({ nil => :meta, name: 'twitter:image', content: img })
+         out[0][:content] = :summary_large_image
+       end
+
+       # return the appropriate xml-mixup structure
+       out
+     end
+
+     AUTHOR_SPEC = [
+       ['By:', [RDF::Vocab::BIBO.authorList, RDF::Vocab::DC.creator]],
+       ['With:', [RDF::Vocab::BIBO.contributorList, RDF::Vocab::DC.contributor]],
+       ['Edited by:', [RDF::Vocab::BIBO.editorList, RDF::Vocab::BIBO.editor]],
+       ['Translated by:', [RDF::Vocab::BIBO.translator]],
+     ].freeze
+
+     def generate_bibliography id, published: true
+       id = canonical_uuid id
+       uri = canonical_uri id
+       struct = struct_for id
+       nodes = Set[id] + smush_struct(struct)
+       bodynodes = Set.new
+       parts = {}
+       referents = {}
+       labels = { id => label_for(id, candidates: struct) }
+       canon = {}
+
+       # uggh put these somewhere
+       preds = {
+         hp: predicate_set(RDF::Vocab::DC.hasPart),
+         sa: predicate_set(RDF::RDFS.seeAlso),
+         canon: predicate_set([RDF::OWL.sameAs, CI.canonical]),
+         ref: predicate_set(RDF::Vocab::DC.references),
+         al: predicate_set(RDF::Vocab::BIBO.contributorList),
+         cont: predicate_set(RDF::Vocab::DC.contributor),
+       }
+
+       # collect up all the parts (as in dct:hasPart)
+       objects_for(id, preds[:hp], entail: false, only: :resource).each do |part|
+         bodynodes << part
+
+         # gather up all the possible alias urls this thing can have
+         sa = ([part] + objects_for(part,
+           preds[:sa], only: :uri, entail: false)).map do |x|
+           [x] + subjects_for(preds[:canon], x, only: :uri, entail: false)
+         end.flatten.uniq
+
+         # collect all the referents
+         reftmp = {}
+         sa.each do |u|
+           subjects_for preds[:ref], u, only: :uri, entail: false do |s, *p|
+             reftmp[s] ||= Set.new
+             reftmp[s] += p[0].to_set
+           end
+         end
+
+         # if we are producing a list of references identified by only
+         # published resources, prune out all the unpublished referents
+         reftmp.select! { |x, _| published? x } if published
+
+         # unconditionally skip this item if nothing references it
+         next if reftmp.empty?
+
+         referents[part] = reftmp
+
+         reftmp.each do |r, _|
+           labels[r] ||= label_for r
+           canon[r] ||= canonical_uri r
+         end
+
+         # collect all the authors and author lists
+
+         objects_for(part, preds[:al], only: :resource, entail: false) do |o|
+           RDF::List.new(subject: o, graph: @graph).each do |a|
+             labels[a] ||= label_for a
+           end
+         end
+
+         objects_for(part, preds[:cont], only: :uri, entail: false) do |a|
+           labels[a] ||= label_for a
+         end
+
+         ps = struct_for part
+         labels[part] = label_for part, candidates: ps
+         nodes |= smush_struct ps
+
+         parts[part] = ps
+       end
+
+       bmap = prepare_collation struct
+       pf = -> x { abbreviate bmap[x.literal? ? :literals : :resources][x] }
+
+       body = []
+       parts.sort { |a, b| cmp_label a[0], b[0], labels: labels }.each do |k, v|
+         mapping = prepare_collation v
+         p = -> x {
+           abbreviate mapping[x.literal? ? :literals : :resources][x] }
+         t = abbreviate mapping[:types]
+
+         lp = label_for k, candidates: v
+         h2c = [lp[1].to_s]
+         h2 = { h2c => :h2 }
+         cu = canonical_uri k
+         rel = nil
+         unless cu.scheme.downcase.start_with? 'http'
+           if sa = v[RDF::RDFS.seeAlso]
+             rel = p.call sa[0]
+             cu = canonical_uri sa[0]
+           else
+             cu = nil
+           end
+         end
+
+         if cu
+           h2c[0] = { [lp[1].to_s] => :a, rel: rel,
+             property: p.call(lp[1]), href: cu.to_s }
+         else
+           h2[:property] = p.call(lp[1])
+         end
+
+         # authors &c
+         # authors contributors editors translators
+         al = []
+         AUTHOR_SPEC.each do |label, pl|
+           dd = []
+           seen = Set.new
+           pl.each do |pred|
+             # first check if the struct has the predicate
+             next unless v[pred]
+             li = []
+             ul = { li => :ul, rel: abbreviate(pred) }
+             v[pred].sort { |a, b| cmp_label a, b, labels: labels }.each do |o|
+               # check if this is a list
+               tl = RDF::List.new subject: o, graph: @graph
+               if tl.empty? and !seen.include? o
+                 seen << o
+                 lab = labels[o] ? { [labels[o][1]] => :span,
+                   property: abbreviate(labels[o][0]) } : o
+                 li << { [lab] => :li, resource: o }
+               else
+                 # XXX this will actually not be right if there are
+                 # multiple lists but FINE FOR NOW
+                 ul[:inlist] ||= ''
+                 tl.each do |a|
+                   seen << a
+                   lab = labels[a] ? { [labels[a][1]] => :span,
+                     property: abbreviate(labels[a][0]) } : a
+                   li << { [lab] => :li, resource: a }
+                 end
+               end
+             end
+             dd << ul unless li.empty?
+           end
+           al += [{ [label] => :dt }, { dd => :dd }] unless dd.empty?
+         end
+
+         # ref list
+         rl = referents[k].sort do |a, b|
+           cmp_label a[0], b[0], labels: labels
+         end.map do |ref, pset|
+           lab = labels[ref] ? { [labels[ref][1]] => :span,
+             property: abbreviate(labels[ref][0]) } : ref
+
+           { [{ [lab] => :a, rev: abbreviate(pset), href: canon[ref] }] => :li }
+         end
+
+         contents = [h2, {
+           al + [{ ['Referenced in:'] => :dt },
+             { [{ rl => :ul }] => :dd }] => :dl }]
+
+         body << { contents => :section,
+           rel: pf.call(k), resource: k.to_s, typeof: t }
+       end
+
+       # prepend abstract to body if it exists
+       abs = label_for id, candidates: struct, desc: true
+       if abs
+         tag = { '#p' => abs[1], property: abbreviate(abs[0]) }
+         body.unshift tag
+       end
+
+       # add labels to nodes
+       nodes += smush_struct labels
+
+       # get prefixes
+       pfx = prefix_subset prefixes, nodes
+
+       # get title tag
+       title = title_tag labels[id][0], labels[id][1],
+         prefixes: prefixes, lang: 'en'
+
+       # get links
+       link = head_links id,
+         struct: struct, ignore: bodynodes, labels: labels, vocab: XHV
+
+       # get metas
+       mn = {}
+       mn[abs[1]] = :description if abs
+       mi = Set.new
+       mi << labels[id][1] if labels[id]
+       meta = head_meta id,
+         struct: struct, lang: 'en', ignore: mi, meta_names: mn, vocab: XHV
+
+       meta += generate_twitter_meta(id) || []
+
+       xhtml_stub(base: uri, prefix: pfx, lang: 'en', title: title, vocab: XHV,
+         link: link, meta: meta, transform: @config[:transform],
+         body: { body => :body, about: '',
+           typeof: abbreviate(struct[RDF::RDFV.type] || []) }).document
+     end
+
+     # generate skos concept schemes
+
+     CONCEPTS = Util.all_related(RDF::Vocab::SKOS.Concept).to_set
+
+     def generate_audience_csv file = nil, published: true
+       require 'csv'
+       file = coerce_to_path_or_io file if file
+       lab = {}
+
+       out = all_internal_docs(published: published,
+         exclude: RDF::Vocab::FOAF.Image).map do |s|
+         u = canonical_uri s
+         x = struct_for s
+         c = x[RDF::Vocab::DC.created] ? x[RDF::Vocab::DC.created][0] : nil
+         _, t = label_for s, candidates: x
+         _, d = label_for s, candidates: x, desc: true
+
+         # # audience(s)
+         # a = objects_for(s, RDF::Vocab::DC.audience).map do |au|
+         #   next lab[au] if lab[au]
+         #   _, al = label_for au
+         #   lab[au] = al
+         # end.map(&:to_s).sort.join '; '
+
+         # # explicit non-audience(s)
+         # n = objects_for(s, RDF::SAK::CI['non-audience']).map do |au|
+         #   next lab[au] if lab[au]
+         #   _, al = label_for au
+         #   lab[au] = al
+         # end.map(&:to_s).sort.join '; '
+
+         # audience and non-audience
+         a, n = [RDF::Vocab::DC.audience, CI['non-audience']].map do |ap|
+           objects_for(s, ap).map do |au|
+             next lab[au] if lab[au]
+             _, al = label_for au
+             lab[au] = al
+           end.map(&:to_s).sort.join '; '
+         end
+
+         # concepts???
+         concepts = [RDF::Vocab::DC.subject, CI.introduces,
+           CI.assumes, CI.mentions].map do |pred|
+           objects_for(s, pred, only: :resource).map do |o|
+             con = self.objects_for(o, RDF.type).to_set & CONCEPTS
+             next if con.empty?
+             next lab[o] if lab[o]
+             _, ol = label_for o
+             lab[o] = ol
+           end.compact.map(&:to_s).sort.join '; '
+         end
+
+         [s, u, c, t, d, a, n].map(&:to_s) + concepts
+       end.sort { |a, b| a[2] <=> b[2] }
+
+       out.unshift ['ID', 'URL', 'Created', 'Title', 'Description', 'Audience',
+         'Non-Audience', 'Subject', 'Introduces', 'Assumes', 'Mentions']
+
+       if file
+         # don't open until now
+         file = file.expand_path.open('wb') unless file.is_a? IO
+
+         csv = CSV.new file
+         out.each { |x| csv << x }
+         file.flush
+       end
+
+       out
+     end
+
+     CSV_PRED = {
+       audience: RDF::Vocab::DC.audience,
+       nonaudience: CI['non-audience'],
+       subject: RDF::Vocab::DC.subject,
+       assumes: CI.assumes,
+       introduces: CI.introduces,
+       mentions: CI.mentions,
+     }
+
+     def ingest_csv file
+       file = coerce_to_path_or_io file
+
+       require 'csv'
+
+       # key mapper
+       km = { uuid: :id, url: :uri }
+       kt = -> (k) { km[k] || k }
+
+       # grab all the concepts and audiences
+
+       audiences = {}
+       all_of_type(CI.Audience).map do |c|
+         s = struct_for c
+
+         # homogenize the labels
+         lab = [false, true].map do |b|
+           label_for(c, candidates: s, unique: false, alt: b).map { |x| x[1] }
+         end.flatten.map { |x| x.to_s.strip.downcase }
+
+         # we want all the keys to share the same set
+         set = nil
+         lab.each { |t| set = audiences[t] ||= set || Set.new }
+         set << c
+       end
+
+       concepts = {}
+       all_of_type(RDF::Vocab::SKOS.Concept).map do |c|
+         s = struct_for c
+
+         # homogenize the labels
+         lab = [false, true].map do |b|
+           label_for(c, candidates: s, unique: false, alt: b).map { |x| x[1] }
+         end.flatten.map { |x| x.to_s.strip.downcase }
+
+         # we want all the keys to share the same set
+         set = nil
+         lab.each { |t| set = concepts[t] ||= set || Set.new }
+         set << c
+       end
+
+       data = CSV.read(file, headers: true,
+         header_converters: :symbol).map do |o|
+         o = o.to_h.transform_keys(&kt)
+         s = canonical_uuid(o.delete :id) or next
+
+         # LOLOL wtf
+
+         # handle audience
+         [:audience, :nonaudience].each do |a|
+           if o[a]
+             o[a] = o[a].strip.split(/\s*[;,]+\s*/, -1).map do |t|
+               if t =~ /^[a-z+-]+:[^[:space:]]+$/
+                 u = RDF::URI(t)
+                 canonical_uuid(u) || u
+               elsif audiences[t.downcase]
+                 audiences[t.downcase].to_a
+               end
+             end.flatten.compact.uniq
+           else
+             o[a] = []
+           end
+         end
+
+         # handle concepts
+         [:subject, :introduces, :assumes, :mentions].each do |a|
+           if o[a]
+             o[a] = o[a].strip.split(/\s*[;,]+\s*/, -1).map do |t|
+               if t =~ /^[a-z+-]+:[^[:space:]]+$/
+                 u = RDF::URI(t)
+                 canonical_uuid(u) || u
+               elsif concepts[t.downcase]
+                 concepts[t.downcase].to_a
+               end
+             end.flatten.compact.uniq
+           else
+             o[a] = []
+           end
+
+         end
+
+         CSV_PRED.each do |sym, pred|
+           o[sym].each do |obj|
+             @graph << [s, pred, obj]
+           end
+         end
+
+         [s, o]
+       end.compact.to_h
+       data
+     end
+
+     def generate_sitemap published: true
+       urls = {}
+
+       # do feeds separately
+       feeds = all_of_type RDF::Vocab::DCAT.Distribution
+       # feeds.select! { |f| published? f } if published
+       feeds.each do |f|
+         uri = canonical_uri(f)
+         f = generate_atom_feed f, published: published, related: feeds
+         mt = f.at_xpath('/atom:feed/atom:updated[1]/text()',
+           { atom: 'http://www.w3.org/2005/Atom' })
+         urls[uri] = { [{ [uri.to_s] => :loc }, { [mt] => :lastmod }] => :url }
+       end
+
+       # build up hash of urls
+       all_internal_docs(published: published).each do |doc|
+         next if asserted_types(doc).include? RDF::Vocab::FOAF.Image
+         uri = canonical_uri(doc)
+         next unless uri.authority && @base && uri.authority == base.authority
+         mods = objects_for(doc, [RDF::Vocab::DC.created,
+           RDF::Vocab::DC.modified, RDF::Vocab::DC.issued],
+           datatype: RDF::XSD.dateTime).sort
+         nodes = [{ [uri.to_s] => :loc }]
+         nodes << { [mods[-1].to_s] => :lastmod } unless mods.empty?
+         urls[uri] = { nodes => :url }
+       end
+
+       urls = urls.sort.map { |_, v| v }
+
+       markup(spec: { urls => :urlset,
+         xmlns: 'http://www.sitemaps.org/schemas/sitemap/0.9' }).document
+     end
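+
+     # The spec above serializes to the standard sitemap vocabulary,
+     # roughly (URL and date invented):
+     #
+     #   <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+     #     <url>
+     #       <loc>https://example.com/some-slug</loc>
+     #       <lastmod>2020-05-01T00:00:00Z</lastmod>
+     #     </url>
+     #   </urlset>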
+
+     def write_sitemap published: true
+       sitemap = generate_sitemap published: published
+       file = @config[:sitemap] || '.well-known/sitemap.xml'
+       target = @config[published ? :target : :private]
+       target.mkpath unless target.directory?
+
+       fh = (target + file).open(?w)
+       sitemap.write_to fh
+       fh.close
+     end
+
+     # generate atom feed
+
+     #
+     def all_internal_docs published: true, exclude: []
+       # find all UUIDs that are documents
+       docs = all_of_type(RDF::Vocab::FOAF.Document,
+         exclude: exclude).select { |x| x =~ /^urn:uuid:/ }
+
+       # prune out all but the published documents if specified
+       if published
+         p = RDF::Vocab::BIBO.status
+         o = RDF::Vocabulary.find_term(
+           'http://purl.org/ontology/bibo/status/published')
+         docs = docs.select do |s|
+           @graph.has_statement? RDF::Statement(s, p, o)
+         end
+       end
+
+       docs
+     end
+
+     def generate_atom_feed id, published: true, related: []
+       raise 'ID must be a resource' unless id.is_a? RDF::Resource
+
+       # prepare relateds
+       raise 'related must be an array' unless related.is_a? Array
+       related -= [id]
+
+       # feed = struct_for id
+
+       faudy = audiences_for id
+       faudn = audiences_for id, invert: true
+       faudy -= faudn
+
+       docs = all_internal_docs published: published
+
+       # now we create a hash keyed by uuid containing the metadata
+       authors = {}
+       titles = {}
+       dates = {}
+       entries = {}
+       latest = nil
+       docs.each do |uu|
+         # basically make a jsonld-like structure
+         # rsrc = struct_for uu
+
+         indexed = objects_for uu, RDF::SAK::CI.indexed, only: :literal
+         next if !indexed.empty? and indexed.any? { |f| f == false }
+
+         # get id (got it already duh)
+
+         # get audiences
+         audy = audiences_for uu, proximate: true
+         audn = audiences_for uu, proximate: true, invert: true
+
+         # warn "#{faudy.to_s} & #{faud"
+
+         skip = false
+         if audy.empty?
+           # an unspecified audience implies "everybody", but if the
+           # feed's audience *is* specified, then it's not for everybody
+           skip = true unless faudy.empty?
+         else
+           # if document audience matches feed non-audience, disqualify
+           skip = true unless (faudn & audy).empty?
+
+           # absence of an explicit feed audience implies "everybody"
+           if faudy.empty?
+             # if document audience minus feed non-audience has
+             # members, re-qualify
+             skip = false unless (audy - faudn).empty?
+           else
+             # if document audience matches feed audience, re-qualify
+             skip = false unless (faudy & audy).empty?
+           end
+         end
+
+         # if document non-audience matches feed audience, re-disqualify
+         skip = true if !(audn.empty? || faudy.empty?) && !(faudy & audn).empty?
+
+         next if skip
+
+         canon = URI.parse(canonical_uri(uu).to_s)
+
+         xml = { '#entry' => [
+           { '#link' => nil, rel: :alternate, href: canon, type: 'text/html' },
+           { '#id' => uu.to_s }
+         ] }
+
+         # get published date first
+         published = (objects_for uu,
+           [RDF::Vocab::DC.issued, RDF::Vocab::DC.created],
+           datatype: RDF::XSD.dateTime)[0]
+
+         # get latest updated date
+         updated = (objects_for uu, RDF::Vocab::DC.modified,
+           datatype: RDF::XSD.dateTime).sort[-1]
+         updated ||= published || RDF::Literal::DateTime.new(DateTime.now)
+         updated = Time.parse(updated.to_s).utc
+         latest = updated if !latest or latest < updated
+
+         xml['#entry'].push({ '#updated' => updated.iso8601 })
+
+         if published
+           published = Time.parse(published.to_s).utc
+           xml['#entry'].push({ '#published' => published.iso8601 })
+           dates[uu] = [published, updated]
+         else
+           dates[uu] = [updated, updated]
+         end
+
+         # get author(s)
+         al = []
+         authors_for(uu).each do |a|
+           unless authors[a]
+             n = label_for a
+             x = authors[a] = { '#author' => [{ '#name' => n[1].to_s }] }
+
+             hp = @graph.first_object [a, RDF::Vocab::FOAF.homepage, nil]
+             hp ||= canonical_uri a
+
+             x['#author'].push({ '#uri' => hp.to_s }) if hp
+           end
+
+           al.push authors[a]
+         end
+
+         xml['#entry'] += al unless al.empty?
+
+         # get title (note unshift)
+         if (t = label_for uu)
+           titles[uu] = t[1].to_s
+           xml['#entry'].unshift({ '#title' => t[1].to_s })
+         else
+           titles[uu] = uu.to_s
+         end
+
+         # get abstract
+         if (d = label_for uu, desc: true)
+           xml['#entry'].push({ '#summary' => d[1].to_s })
+         end
+
+         entries[uu] = xml
+       end
+
+       # note we overwrite the entries hash here with a sorted array
+       entrycmp = -> a, b {
+         # first we sort by published date
+         p = dates[a][0] <=> dates[b][0]
+         # if the published dates are the same, sort by updated date
+         u = dates[a][1] <=> dates[b][1]
+         # to break any ties, finally sort by title
+         p == 0 ? u == 0 ? titles[a] <=> titles[b] : u : p }
+       entries = entries.values_at(
+         *entries.keys.sort { |a, b| entrycmp.call(a, b) })
+       # ugggh god forgot the asterisk and lost an hour
+
+       # now we punt out the doc
+
+       preamble = [
+         { '#id' => id.to_s },
+         { '#updated' => latest.iso8601 },
+         { '#generator' => 'RDF::SAK', version: RDF::SAK::VERSION,
+           uri: "https://github.com/doriantaylor/rb-rdf-sak" },
+         { nil => :link, rel: :self, type: 'application/atom+xml',
+           href: canonical_uri(id) },
+         { nil => :link, rel: :alternate, type: 'text/html',
+           href: @base },
+       ] + related.map do |r|
+         { nil => :link, rel: :related, type: 'application/atom+xml',
+           href: canonical_uri(r) }
+       end
+
+       if (t = label_for id)
+         preamble.unshift({ '#title' => t[1].to_s })
+       end
+
+       if (r = @graph.first_literal [id, RDF::Vocab::DC.rights, nil])
+         rh = { '#rights' => r.to_s, type: :text }
+         rh['xml:lang'] = r.language if r.has_language?
+         preamble.push rh
+       end
+
+       markup(spec: { '#feed' => preamble + entries,
+         xmlns: 'http://www.w3.org/2005/Atom' }).document
+     end
+
+     def write_feeds type: RDF::Vocab::DCAT.Distribution, published: true
+       feeds = all_of_type type
+       target = @config[published ? :target : :private]
+       feeds.each do |feed|
+         tu = URI(feed.to_s)
+         doc = generate_atom_feed feed, published: published, related: feeds
+         fh = (target + "#{tu.uuid}.xml").open('w')
+         doc.write_to fh
+         fh.close
+       end
+     end
+
+     # generate sass palettes
+
+     # generate rewrite map(s)
+     def generate_rewrite_map published: false, docs: nil
+       docs ||= reachable published: published
+       base = URI(@base.to_s)
+       rwm = {}
+       docs.each do |doc|
+         tu = URI(doc.to_s)
+         cu = canonical_uri doc, rdf: false
+         next unless tu.respond_to?(:uuid) and cu.respond_to?(:request_uri)
+
+         # skip external links obvs
+         next unless base.route_to(cu).relative?
+
+         # skip /uuid form
+         cp = cu.request_uri.delete_prefix '/'
+         next if cu.host == base.host and tu.uuid == cp
+
+         rwm[cp] = tu.uuid
+       end
+
+       rwm
+     end
+
+     # give me all UUIDs of all documents, filter for published if
+     # applicable
+     #
+     # find the "best" (relative) URL for the UUID and map the pair
+     # together
+     def generate_uuid_redirect_map published: false, docs: nil
+       docs ||= reachable published: published
+
+       base = URI(@base.to_s)
+
+       # keys are /uuid, values are the (absolute) canonical URIs
+       out = {}
+       docs.each do |doc|
+         tu = URI(doc.to_s)
+         cu = canonical_uri doc, rdf: false
+         next unless tu.respond_to?(:uuid) and cu.respond_to?(:request_uri)
+
+         # skip /uuid form
+         cp = cu.request_uri.delete_prefix '/'
+         next if cu.host == base.host && tu.uuid == cp
+
+         # all redirect links are absolute
+         out[tu.uuid] = cu.to_s
+       end
+       out
+     end
+
+     # find all URIs/slugs that are *not* canonical, map them to slugs
+     # that *are* canonical
+     def generate_slug_redirect_map published: false, docs: nil
+       docs ||= reachable published: published
+       base = URI(@base.to_s)
+
+       # for redirects we collect all the docs, plus all their URIs,
+       # separate canonical from the rest
+
+       # actually an easy way to do this is just harvest all the
+       # multi-addressed docs, remove the first one, then ask for the
+       # canonical uuid back,
+
+       fwd = {}
+       rev = {}
+       out = {}
+
+       docs.each do |doc|
+         uris = canonical_uri doc, unique: false, rdf: false
+         canon = uris.shift
+         next unless canon.respond_to? :request_uri
+
+         # cache the forward direction
+         fwd[doc] = canon
+
+         unless uris.empty?
+           uris.each do |uri|
+             next unless uri.respond_to? :request_uri
+             next if canon == uri
+             next unless base.route_to(uri).relative?
+
+             # warn "#{canon} <=> #{uri}"
+
+             requri = uri.request_uri.delete_prefix '/'
+             next if requri == '' ||
+               requri =~ /^[0-9a-f]{8}(?:-[0-9a-f]{4}){4}[0-9a-f]{8}$/
+
+             # cache the reverse direction
+             rev[uri] = requri
+           end
+         end
+       end
+
+       rev.each do |uri, requri|
+         if (doc = canonical_uuid(uri, published: published)) and
+             fwd[doc] and fwd[doc] != uri
+           out[requri] = fwd[doc].to_s
+         end
+       end
+
+       out
+     end
+
+     # you know what, it's entirely possible that these ought never be
+     # called individually and the work to get one would duplicate the
+     # work of getting the other, so maybe just do 'em both at once
+
+     def generate_redirect_map published: false, docs: nil
+       generate_uuid_redirect_map(published: published, docs: docs).merge(
+         generate_slug_redirect_map(published: published, docs: docs))
+     end
+
+     def generate_gone_map published: false, docs: nil
+       # published is a no-op for this one because these docs are by
+       # definition not published
+       docs ||= reachable published: false
+       p = RDF::Vocab::BIBO.status
+       base = URI(@base.to_s)
+       out = {}
+       docs.select { |s|
+         @graph.has_statement? RDF::Statement(s, p, CI.retired) }.each do |doc|
+         canon = canonical_uri doc, rdf: false
+         next unless base.route_to(canon).relative?
+         canon = canon.request_uri.delete_prefix '/'
+         # value of the gone map doesn't matter
+         out[canon] = canon
+       end
+
+       out
+     end
+
+     # private?
+
+     def map_location type
+       # find file name in config
+       fn = @config[:maps][type] or return
+
+       # concatenate to target directory
+       @config[:target] + fn
+     end
+
+     # private?
+
+     def write_map_file location, data
+       # open file
+       fh = File.new location, 'w'
+       data.sort.each { |k, v| fh.write "#{k}\t#{v}\n" }
+       fh.close # return value is return value from close
+     end
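+
+     # One tab-separated pair per line is the text map format Apache's
+     # mod_rewrite consumes, so the output can be wired in like this
+     # (paths hypothetical):
+     #
+     #   RewriteMap rewrite "txt:/var/www/site/.rewrite.map"
+     #   RewriteRule "^/(.+)$" "/${rewrite:$1|$1}" [PT]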
+
+     # public again
+
+     def write_rewrite_map published: false, docs: nil
+       data = generate_rewrite_map published: published, docs: docs
+       loc = map_location :rewrite
+       write_map_file loc, data
+     end
+
+     def write_redirect_map published: false, docs: nil
+       data = generate_redirect_map published: published, docs: docs
+       loc = map_location :redirect
+       write_map_file loc, data
+     end
+
+     def write_gone_map published: false, docs: nil
+       data = generate_gone_map published: published, docs: docs
+       loc = map_location :gone
+       write_map_file loc, data
+     end
+
+     def write_maps published: true, docs: nil
+       docs ||= reachable published: false
+       # slug to uuid (internal)
+       write_rewrite_map docs: docs
+       # uuid/slug to canonical slug (308)
+       write_redirect_map docs: docs
+       # retired slugs/uuids (410)
+       write_gone_map docs: docs
+       true
+     end
+
+     # whoops lol we forgot the book list
+
+     def reading_lists published: true
+       out = all_of_type RDF::Vocab::SiocTypes.ReadingList
+       return out unless published
+       out.select { |r| published? r }
+     end
+
+     def generate_reading_list subject, published: true
+       # struct = struct_for subject
+
+       # find all the books, sort them by title
+
+       # for each book, give title, authors, inbound references
+
+       # punt out xhtml
+     end
+
+     def write_reading_lists published: true
+       # same output-directory selection as write_feeds
+       target = @config[published ? :target : :private]
+       reading_lists(published: published).each do |rl|
+         tu = URI(rl.to_s)
+         doc = generate_reading_list rl, published: published
+         fh = (target + "#{tu.uuid}.xml").open('w')
+         doc.write_to fh
+         fh.close
+       end
+     end
1579
+
1580
+ DSD_SEQ = %i[characters words blocks sections
1581
+ min low-quartile median high-quartile max mean sd].freeze
1582
+ TH_SEQ = %w[Document Abstract Created Modified Characters Words Blocks
1583
+ Sections Min Q1 Median Q3 Max Mean SD].map { |t| { [t] => :th } }
1584
+
1585
+ def generate_stats published: true
1586
+ out = {}
1587
+ all_of_type(QB.DataSet).map do |s|
1588
+ base = canonical_uri s, rdf: false
1589
+ types = abbreviate asserted_types(s)
1590
+ title = if t = label_for(s)
1591
+ [t[1].to_s, abbreviate(t[0])]
1592
+ end
1593
+ cache = {}
1594
+ subjects_for(QB.dataSet, s, only: :resource).each do |o|
1595
+ if d = objects_for(o, CI.document, only: :resource).first
1596
+ if !published or published?(d)
1597
+ # include a "sort" time that defaults to epoch zero
1598
+ c = cache[o] ||= {
1599
+ doc: d, stime: Time.at(0).getgm, struct: struct_for(o) }
1600
+
1601
+ if t = label_for(d)
1602
+ c[:title] = t
1603
+ end
1604
+ if a = label_for(d, desc: true)
1605
+ c[:abstract] = a
1606
+ end
1607
+ if ct = objects_for(d,
1608
+ RDF::Vocab::DC.created, datatype: RDF::XSD.dateTime).first
1609
+ c[:stime] = c[:ctime] = ct.object.to_time.getgm
1610
+ end
1611
+ if mt = objects_for(d,
1612
+ RDF::Vocab::DC.modified, datatype:RDF::XSD.dateTime)
1613
+ c[:mtime] = mt.map { |m| m.object.to_time.getgm }.sort
1614
+ c[:stime] = c[:mtime].last unless mt.empty?
1615
+ end
1616
+ end
1617
+ end
1618
+ end
1619
+
1620
+ # sort lambda closure
1621
+ sl = -> a, b do
1622
+ x = cache[b][:stime] <=> cache[a][:stime]
1623
+ return x unless x == 0
1624
+ x = cache[b][:ctime] <=> cache[a][:ctime]
1625
+ return x unless x == 0
1626
+ ta = cache[a][:title] || Array.new(2, cache[a][:uri])
1627
+ tb = cache[b][:title] || Array.new(2, cache[b][:uri])
1628
+ ta[1].to_s <=> tb[1].to_s
1629
+ end
1630
+
1631
+ rows = []
1632
+ cache.keys.sort(&sl).each do |k|
1633
+ c = cache[k]
1634
+ href = base.route_to canonical_uri(c[:doc], rdf: false)
1635
+ dt = abbreviate asserted_types(c[:doc])
1636
+ uu = URI(k.to_s).uuid
1637
+ nc = UUID::NCName.to_ncname uu, version: 1
1638
+ tp, tt = c[:title] || []
1639
+ ab = if c[:abstract]
1640
+ { [c[:abstract][1].to_s] => :th, about: href,
1641
+ property: abbreviate(c[:abstract].first) }
1642
+ else
1643
+ { [] => :th }
1644
+ end
1645
+
1646
+ td = [{ { { [tt.to_s] => :span, property: abbreviate(tp) } => :a,
1647
+ rel: 'ci:document', href: href } => :th },
1648
+ ab,
1649
+ { [c[:ctime].iso8601] => :th, property: 'dct:created',
1650
+ datatype: 'xsd:dateTime', about: href, typeof: dt },
1651
+ { c[:mtime].reverse.map { |m| { [m.iso8601] => :span,
1652
+ property: 'dct:modified', datatype: 'xsd:dateTime' } } => :th,
1653
+ about: href
1654
+ },
1655
+ ] + DSD_SEQ.map do |f|
1656
+ h = []
1657
+ x = { h => :td }
1658
+ p = CI[f]
1659
+ if y = c[:struct][p] and !y.empty?
1660
+ h << y = y.first
1661
+ x[:property] = abbreviate p
1662
+ x[:datatype] = abbreviate y.datatype if y.datatype?
1663
+ end
1664
+ x
1665
+ end
1666
+ rows << { td => :tr, id: nc, about: "##{nc}",
1667
+ typeof: 'qb:Observation' }
1668
+ end
1669
+
1670
+ out[s] = xhtml_stub(base: base, title: title,
1671
+ transform: config[:transform], attr: { about: '', typeof: types },
1672
+ prefix: prefixes, content: {
1673
+ [{ [{ [{ ['About'] => :th, colspan: 4 },
1674
+ { ['Counts'] => :th, colspan: 4 },
1675
+ { ['Words per Block'] => :th, colspan: 7 }] => :tr },
1676
+ { TH_SEQ => :tr } ] => :thead },
1677
+ { rows => :tbody, rev: 'qb:dataSet' }] => :table }).document
1678
+ end
1679
+
1680
+ out
1681
+ end
1682
+
1683
+     def write_stats published: true
+       target = @config[published ? :target : :private]
+       target.mkpath unless target.directory?
+       generate_stats(published: published).each do |uu, doc|
+         bn = URI(uu.to_s).uuid + '.xml'
+         fh = (target + bn).open('w')
+         doc.write_to fh
+         fh.flush
+         fh.close
+       end
+     end
+
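+     # A minimal usage sketch (hypothetical setup; the constructor
+     # arguments depend on how the surrounding Context is configured):
+     #
+     #   ctx = RDF::SAK::Context.new graph: graph, config: config
+     #   ctx.write_stats published: true # one XML document per qb:DataSet
+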
+     # - io stuff -
+
+     # Locate the file in the source directory associated with the given URI.
+     #
+     # @param uri [RDF::URI, URI, #to_s] the URI requested
+     #
+     # @return [Pathname, nil] the corresponding file, or nil if none found
+
+     def locate uri
+       uri = coerce_resource uri
+
+       base = URI(@base.to_s)
+
+       tu = URI(uri) # copy of uri for testing content
+       unless tu.scheme == 'urn' and tu.nid == 'uuid'
+         raise "could not find UUID for #{uri}" unless uuid = canonical_uuid(uri)
+         tu = URI(uri = uuid)
+       end
+
+       # XXX bail if the uri isn't a subject in the graph
+
+       candidates = [@config[:source] + tu.uuid]
+
+       # try all canonical URIs
+       (canonical_uri uri, unique: false, slugs: true).each do |u|
+         u = URI(u.to_s)
+         next unless u.hostname == base.hostname
+         # NB: URI.unescape is gone as of Ruby 3; the parser method is
+         # the closest drop-in replacement
+         p = URI::DEFAULT_PARSER.unescape u.path[/^\/*(.*?)$/, 1]
+         candidates.push(@config[:source] + p)
+       end
+
+       # warn candidates
+
+       files = candidates.uniq.map do |c|
+         Pathname.glob(c.to_s + '{,.*,/index{,.*}}')
+       end.reduce(:+).reject do |x|
+         x.directory? or RDF::SAK::MimeMagic.by_path(x).to_s !~
+           /.*(?:markdown|(?:x?ht|x)ml).*/i
+       end.uniq
+
+       # warn files
+
+       # XXX implement negotiation algorithm; until then just take the
+       # first candidate (or nil if there are none)
+       files.first
+     end
+
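+     # e.g., with purely hypothetical data:
+     #
+     #   ctx.locate 'urn:uuid:...'
+     #   # => #<Pathname content/some-document.md>
+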
+     # Visit (open) the document at the given URI.
+     #
+     # @param uri [RDF::URI, URI, #to_s]
+     #
+     # @return [RDF::SAK::Context::Document, nil]
+
+     def visit uri
+       uri = canonical_uuid uri
+       return unless uri # nothing to visit if there is no UUID for it
+       path = locate uri
+       return unless path
+       Document.new self, uri, uri: canonical_uri(uri), doc: path
+     end
+
+     # resolve documents from source
+     def resolve_documents
+       src = @config[:source]
+       out = []
+       src.find do |f|
+         Find.prune if f.basename.to_s[0] == ?.
+         next if f.directory?
+         out << f
+       end
+
+       out
+     end
+
+     def resolve_file path
+       return unless path.file?
+       path = Pathname('/') + path.relative_path_from(@config[:source])
+       base = URI(@base.to_s)
+       uri  = base + path.to_s
+
+       # warn "trying #{uri}"
+
+       until (out = canonical_uuid uri)
+         # iteratively strip off path components until we find a UUID
+         break if uri.path.end_with? '/'
+
+         dn = path.dirname
+         bn = path.basename '.*'
+
+         # try index first
+         if bn.to_s == 'index'
+           p = dn.to_s
+           p << '/' unless p.end_with? '/'
+           uri = base + p
+         elsif bn == path.basename
+           break
+         else
+           path = dn + bn
+           uri = base + path.to_s
+         end
+
+         # warn "trying #{uri}"
+       end
+
+       out
+     end
+
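+     # e.g. (hypothetical tree): source/a/b/index.xml is tried as
+     # /a/b/index.xml, then /a/b/index, then /a/b/, with each candidate
+     # URI run through canonical_uuid until one of them resolves.
+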
+     # Determine whether the URI represents a published document.
+     #
+     # @param uri [RDF::URI, URI, #to_s]
+     # @param circulated [true, false]
+     #
+     # @return [true, false]
+     def published? uri, circulated: false
+       RDF::SAK::Util.published? @graph, uri,
+         circulated: circulated, base: @base
+     end
+
+     # Find a destination pathname for the document.
+     #
+     # @param uri [RDF::URI, URI, #to_s]
+     # @param published [true, false]
+     #
+     # @return [Pathname]
+     def target_for uri, published: false
+       uri = coerce_resource uri
+       uri = canonical_uuid uri
+       target = @config[published?(uri) && published ? :target : :private]
+
+       # target is a pathname so this makes a pathname
+       target + "#{URI(uri.to_s).uuid}.xml"
+     end
+
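+     # i.e., a document that is published (and requested as such) lands
+     # in @config[:target] as <uuid>.xml; everything else goes to
+     # @config[:private].
+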
+     # read from source
+
+     # write (manipulated (x|x?ht)ml) back to source
+
+     # write public and private variants to target
+
+     def write_xhtml published: true
+     end
+
+     # write modified rdf
+
+     # - internet stuff -
+
+     # verify external links for upness
+
+     # collect triples for external links
+
+     # fetch references for people/companies/concepts/etc from dbpedia/wikidata
+
+     # - document context class -
+
+     class Document
+       include XML::Mixup
+       include Util
+
+       private
+
+       C_OK = [Nokogiri::XML::Node, IO, Pathname].freeze
+
+       public
+
+       attr_reader :doc, :uuid, :uri
+
+       def initialize context, uuid, doc: nil, uri: nil, mtime: nil
+         raise 'context must be an RDF::SAK::Context' unless
+           context.is_a? RDF::SAK::Context
+         raise 'uuid must be an RDF::URI' unless
+           uuid.is_a? RDF::URI and uuid.to_s.start_with? 'urn:uuid:'
+
+         doc ||= context.locate uuid
+         raise 'doc must be Pathname, IO, or Nokogiri node' unless
+           C_OK.any? { |c| doc.is_a? c } || doc.respond_to?(:to_s)
+
+         # set some instance variables; note the parentheses, without
+         # which an explicit mtime: argument would be ignored
+         @context = context
+         @uuid    = uuid
+         @mtime   = mtime || (doc.respond_to?(:mtime) ? doc.mtime : Time.now)
+         @target  = context.target_for uuid
+
+         # now process the document
+
+         # turn the document into an XML::Document
+         if doc.is_a? Nokogiri::XML::Node
+           # a node that is not a document should be wrapped with one
+           unless doc.is_a? Nokogiri::XML::Document
+             d = doc.dup 1
+             doc = Nokogiri::XML::Document.new
+             doc << d
+           end
+         else
+           type = nil
+
+           # pathnames turned into IO objects
+           if doc.is_a? Pathname
+             type = RDF::SAK::MimeMagic.by_path doc
+             doc  = doc.open # this may raise if the file isn't there
+           end
+
+           # squash everything else to a string
+           doc = doc.to_s unless doc.is_a? IO
+
+           # check type by content
+           type ||= RDF::SAK::MimeMagic.by_magic(doc)
+
+           # can you believe there is a special bookmarks mime type good grief
+           type = 'text/html' if type == 'application/x-mozilla-bookmarks'
+
+           # now we try to parse the blob
+           if type.to_s =~ /xml/i
+             doc = Nokogiri.XML doc
+           elsif type == 'text/html'
+             # if the detected type is html, try it as strict xml first
+             # (NONET is the named constant for (1 << 11))
+             attempt = nil
+             begin
+               attempt = Nokogiri.XML doc, nil, nil,
+                 Nokogiri::XML::ParseOptions::NONET
+             rescue Nokogiri::XML::SyntaxError
+               # do not wrap this a second time; let it fail if it's gonna
+               tmp = Nokogiri.HTML doc
+               attempt = Nokogiri::XML::Document.new
+               attempt << tmp.root.dup(1)
+             end
+             doc = attempt
+           elsif type.to_s =~ /^text\/(?:plain|(?:x-)?markdown)/i
+             # just assume plain text is markdown
+             doc = ::MD::Noko.new.ingest doc
+           else
+             raise "Don't know what to do with #{uuid} (#{type})"
+           end
+         end
+
+         # now fix the namespaces for mangled html documents
+         root = doc.root
+         if root.name == 'html'
+           unless root.namespace
+             # clear this off or it will be duplicated in the output
+             root.remove_attribute('xmlns')
+             # now generate a new ns object
+             ns = root.add_namespace(nil, XHTMLNS)
+             # *now* scan the document and add the namespace declaration
+             root.traverse do |node|
+               if node.element? && node.namespace.nil?
+                 # downcasing the name may be cargo culting; need to check
+                 # node.name = node.name.downcase # yup it is
+                 node.namespace = ns
+               end
+             end
+           end
+
+           # also add the magic blank doctype declaration if it's missing
+           unless doc.internal_subset
+             doc.create_internal_subset('html', nil, nil)
+           end
+         end
+
+         # aaand set some more instance variables
+
+         @uri = URI(uri || @context.canonical_uri(uuid))
+
+         # voilà
+         @doc = doc
+       end
+
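+       # Construction looks something like this (values hypothetical):
+       #
+       #   doc = RDF::SAK::Context::Document.new ctx,
+       #     RDF::URI('urn:uuid:...'), doc: Pathname('source/whatever.md')
+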
+       # proxy for context published
+       def published?
+         @context.published? @uuid
+       end
+
+       def base_for node = nil
+         node ||= @doc
+         doc  = node.document
+         base = @uri.to_s
+         if doc.root.name.to_sym == :html
+           b = doc.at_xpath(
+             '(/html:html/html:head/html:base[@href])[1]/@href',
+             { html: XHTMLNS }).to_s.strip
+           base = b if URI(b).absolute?
+         elsif b = node.at_xpath('ancestor-or-self::*[@xml:base][1]/@xml:base')
+           # (note this has to be evaluated from the node, not the
+           # document, or the ancestor axis selects nothing)
+           b = b.to_s.strip
+           base = b if URI(b).absolute?
+         end
+
+         URI(base)
+       end
+
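+       # e.g. a <base href="https://example.club/"/> in the head of an
+       # (X)HTML document overrides @uri, but only if it is absolute;
+       # for other XML vocabularies the nearest absolute @xml:base wins.
+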
+       # notice these are only RDFa attributes that take URIs
+       RDFA_ATTR  = [:about, :resource, :typeof].freeze
+       LINK_ATTR  = [:href, :src, :data, :action, :longdesc].freeze
+       LINK_XPATH = ('.//html:*[not(self::html:base)][%s]' %
+         (LINK_ATTR + RDFA_ATTR).map { |a| "@#{a}" }.join('|')).freeze
+
+       def rewrite_links node = @doc, uuids: {}, uris: {}, &block
+         base  = base_for node
+         count = 0
+         cache = {}
+         node.xpath(LINK_XPATH, { html: XHTMLNS }).each do |elem|
+           LINK_ATTR.each do |attr|
+             attr = attr.to_s
+             next unless elem.has_attribute? attr
+
+             abs = base.merge uri_pp(elem[attr].strip)
+
+             # fix e.g. http->https
+             if abs.host == @uri.host and abs.scheme != @uri.scheme
+               tmp          = @uri.dup
+               tmp.path     = abs.path
+               tmp.query    = abs.query
+               tmp.fragment = abs.fragment
+               abs          = tmp
+             end
+
+             # harvest query string
+             pp = split_pp abs, only: true
+
+             abs = RDF::URI(abs.to_s)
+
+             # round-trip to uuid and back if we can
+             if uuid = uuids[abs] ||= @context.canonical_uuid(abs)
+               abs = cache[abs] ||= @context.canonical_uri(uuid)
+             else
+               abs = cache[abs] ||= @context.canonical_uri(abs)
+             end
+
+             # reinstate the path parameters
+             if !pp.empty? && split_pp(abs, only: true).empty?
+               abs = abs.dup
+               abs.path = ([abs.path] + pp).join(';')
+             end
+
+             elem[attr] = @uri.route_to(abs.to_s).to_s
+             count += 1
+           end
+
+           block.call elem if block
+         end
+
+         count
+       end
+
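+       # e.g.: doc.rewrite_links { |elem| elem.remove_attribute 'style' }
+       # relativizes every URI-bearing attribute against the document
+       # URI and hands each touched element to the (optional) block.
+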
+       # sponge the document for rdfa
+       def triples_for
+       end
+
+       OBJS = [:href, :src].freeze
+
+       # ancestor node always with (@property and not @content) and
+       # not @resource|@href|@src unless @rel|@rev
+       LITXP = ['(ancestor::*[@property][not(@content)]',
+         '[not(@resource|@href|@src) or @rel|@rev])[1]'].join('').freeze
+       # note parentheses cause the index to be counted from the root
+
+       def vocab_for node
+         if node[:vocab]
+           vocab = node[:vocab].strip
+           return nil if vocab == ''
+           return vocab
+         end
+         parent = node.parent
+         vocab_for parent if parent and parent.element?
+       end
+
+       def prefixes_for node, prefixes = {}
+         # start with namespaces
+         pfx = node.namespaces.select do |k, _|
+           k.start_with? 'xmlns:'
+         end.transform_keys do |k|
+           k.delete_prefix 'xmlns:'
+         end
+
+         # then add @prefix overtop of the namespaces
+         if node[:prefix]
+           x = node[:prefix].strip.split(/\s+/)
+           a = []
+           b = []
+           x.each_index { |i| (i.even? ? a : b).push x[i] }
+           # if the token count is uneven the last value will be nil,
+           # so we drop those
+           pfx.merge! a.zip(b).to_h.reject { |_, v| v.nil? }
+         end
+
+         # since we're ascending the tree, input takes precedence
+         prefixes = pfx.merge prefixes
+
+         if node.parent and node.parent.element?
+           prefixes_for(node.parent, prefixes)
+         else
+           prefixes
+         end
+       end
+
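+       # e.g. both xmlns:dct="http://purl.org/dc/terms/" and
+       # prefix="dct: http://purl.org/dc/terms/" on an ancestor surface
+       # here, with declarations nearer the starting node winning.
+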
+       # give us the rdf subject of the node itself
+       def subject_for node = nil, rdf: false, is_ancestor: false
+         node ||= @doc.root
+         raise 'Node must be an element' unless
+           node.is_a? Nokogiri::XML::Element
+
+         # first we check for an ancestor element with @property and no
+         # @content; if we find one then we reevaluate with that
+         # element as the starting point
+         if n = node.at_xpath(LITXP)
+           return subject_for n
+         end
+
+         # answer a bunch of helpful questions about this element
+         subject = nil
+         base    = base_for node
+         parent  = node.parent
+         ns_href = node.namespace.href if node.namespace
+         up_ok   = %i{rel rev}.none? { |a| node[a] }
+         is_root = !parent || parent.document?
+         special = /^(?:[^:]+:)?(?:head|body)$/i === node.name &&
+           (ns_href == 'http://www.w3.org/1999/xhtml' ||
+            /^(?:[^:]+:)?html$/xi === parent.name)
+         # (nb: these last two previously used `or`/`and`, whose low
+         # precedence silently cut the assignments short)
+
+         # if the node is being inspected as an ancestor to the
+         # original node, we have to check it backwards.
+         if is_ancestor
+           # ah right @resource gets special treatment
+           if subject = node[:resource]
+             subject.strip!
+             if m = /^\[(.*?)\]$/.match(subject)
+               # XXX resolve safe CURIEs like [dct:subject]; currently
+               # unhandled
+             end
+           else
+             OBJS.each do |attr|
+               if node[attr]
+                 # merge with the root and return it
+                 subject = base + node[attr]
+                 break
+               end
+             end
+           end
+
+           # note that if we are called with is_ancestor, neither the
+           # original node nor any node tested along the way had
+           # anything resembling a resource in it; @rel/@rev should
+           # therefore be ignored and we should keep looking for a
+           # subject, hence we only return here if we found one.
+           return rdf ? RDF::URI(subject.to_s) : subject if subject
+         end
+
+         if node[:about]
+
+           if m = /^_:(.*)$/.match(node[:about])
+             return RDF::Node(m[1])
+           end
+
+           # XXX resolve @about against potential curie
+           subject = base + node[:about]
+
+         elsif is_root
+           subject = base
+         elsif special
+           subject = subject_for parent
+         elsif node[:resource]
+           # XXX resolve @resource against potential curie
+           subject = base + node[:resource]
+         elsif node[:href]
+           subject = base + node[:href]
+         elsif node[:src]
+           subject = base + node[:src]
+         elsif node[:typeof]
+           # bnode the typeof attr
+
+           # note we return bnodes irrespective of the rdf flag
+           return RDF::Node('id-%016x' % node.attributes['typeof'].pointer_id)
+         elsif node[:inlist]
+           # bnode the inlist attr
+           return RDF::Node('id-%016x' % node.attributes['inlist'].pointer_id)
+         elsif (parent[:inlist] && OBJS.none? { |a| parent[a] }) ||
+             (is_ancestor && !up_ok)
+           # bnode the element
+           return RDF::Node('id-%016x' % node.pointer_id)
+         # elsif node[:id]
+         else
+           subject = subject_for parent, is_ancestor: true
+         end
+
+         rdf ? RDF::URI(subject.to_s) : URI(subject.to_s)
+       end
+
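+       # e.g. <section about="#chapter-1"> resolves to base plus
+       # '#chapter-1', while a plain element with no RDFa attributes
+       # anywhere up the tree bottoms out at the document base itself.
+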
+       # backlink structure
+       def generate_backlinks published: true, ignore: nil
+         @context.generate_backlinks @uuid, published: published, ignore: ignore
+       end
+
+       # goofy twitter-specific metadata
+       def generate_twitter_meta
+         @context.generate_twitter_meta @uuid
+       end
+
+       def transform_xhtml published: true
+         # before we do any more work make sure this is html
+         doc  = @doc.dup 1
+         body = doc.at_xpath('//html:body[1]', { html: XHTMLNS }) or return
+
+         # eliminate comments
+         doc.xpath('//comment()[not(ancestor::html:script)]',
+           { html: XHTMLNS }).each { |c| c.unlink }
+
+         # initial stuff
+         struct = @context.struct_for @uuid, uuids: true, canon: true
+         # rstruct = @context.struct_for @uuid, uuids: true, rev: true
+         resources = {}
+         literals  = {}
+         ufwd      = {} # uuid -> uri
+         urev      = {} # uri -> uuid
+         datatypes = Set.new
+         types     = Set.new
+         authors   = @context.authors_for(@uuid)
+         title     = @context.label_for @uuid, candidates: struct
+         desc      = @context.label_for @uuid, candidates: struct, desc: true
+
+         # rewrite content
+         title = title[1] if title
+         desc  = desc[1]  if desc
+
+         # `struct` (and `rstruct`, when enabled) contains all the links
+         # and metadata for forward (and backward) neighbours, which we
+         # need to mine (predicates, classes, datatypes) for prefixes
+         # among other things.
+
+         struct.each do |p, v|
+           v.each do |o|
+             if o.literal?
+               literals[o] ||= Set.new
+               literals[o].add p
+
+               # collect the datatype
+               datatypes.add o.datatype if o.has_datatype?
+             else
+               # normalize URIs
+               if o.to_s.start_with? 'urn:uuid:'
+                 ufwd[o] ||= @context.canonical_uri o
+               elsif cu = @context.canonical_uuid(o)
+                 o = urev[o] ||= cu
+               end
+
+               # collect the resource
+               resources[o] ||= Set.new
+               resources[o].add p
+
+               # add to type
+               types.add o if p == RDF::RDFV.type
+             end
+           end
+         end
+         urev.merge! ufwd.invert
+
+         labels = resources.keys.map do |k|
+           # turn this into a pair which subsequently gets turned into a hash
+           [k, @context.label_for(k)]
+         end.to_h
+
+         # warn labels
+
+         # handle the title
+         title ||= RDF::Literal('')
+         tm = { '#title' => title,
+           property: @context.abbreviate(literals[title].to_a, vocab: XHV) }
+         if tl = title.language
+           tm['xml:lang'] = tl # if xmlns
+           tm['lang'] = tl
+         elsif tdt = title.datatype and tdt != RDF::XSD.string
+           tm[:datatype] = @context.abbreviate(tdt)
+         end
+
+         # we accumulate a record of the links in the body so we know
+         # which ones to skip in the head
+         bodylinks = {}
+         rewrite_links body, uuids: ufwd, uris: urev do |elem|
+           vocab = elem.at_xpath('ancestor-or-self::*[@vocab][1]/@vocab')
+           vocab = uri_pp(vocab.to_s) if vocab
+
+           if elem.key?('href') or elem.key?('src')
+             vu = uri_pp(elem['href'] || elem['src'])
+             ru = RDF::URI(@uri.merge(vu))
+             bodylinks[urev[ru] || ru] = true
+
+             if rel = resources[urev[ru] || ru]
+               elem['rel'] = (@context.abbreviate rel, vocab: vocab).join ' '
+             end
+
+             label = labels[urev[ru] || ru]
+             if label and (!elem.key?('title') or elem['title'].strip == '')
+               elem['title'] = label[1].to_s
+             end
+           end
+         end
+
+         # and now we do the head
+         links = []
+         resources.reject { |k, _| bodylinks[k] }.each do |k, v|
+           v = v.dup.delete RDF::RDFV.type
+           next if v.empty?
+           mts = @context.formats_for k
+
+           rel = @context.abbreviate v.to_a, vocab: XHV
+           ru  = @uri.route_to(uri_pp((ufwd[k] || k).to_s))
+           ln  = { nil => :link, rel: rel, href: ru.to_s }
+           if (label = labels[urev[k] || k])
+             ln[:title] = label[1].to_s
+           end
+
+           # add the media type if we have one
+           ln[:type] = mts.first.to_s unless mts.empty?
+
+           if !ln[:type] and v.include?(RDF::Vocab::XHV.stylesheet)
+             ln[:type] = 'text/css'
+           elsif ln[:type] =~ /(java|ecma)script/i or
+               v.include?(RDF::Vocab::DC.requires)
+             ln[nil]   = :script
+             ln[:src]  = ln.delete :href
+             ln[:type] ||= 'text/javascript'
+           end
+           links.push ln
+         end
+
+         # sort by rel, then by href
+         links.sort! do |a, b|
+           s = 0
+           [nil, :rel, :rev, :href, :title].each do |k|
+             s = a.fetch(k, '').to_s <=> b.fetch(k, '').to_s
+             break if s != 0
+           end
+           s
+         end
+
+         # we want to duplicate links from particular subjects (eg the root)
+         (@context.config[:duplicate] || {}).sort do |a, b|
+           a.first <=> b.first
+         end.each do |s, preds|
+
+           o = {}
+           u = ufwd[s] ||= @context.canonical_uuid s
+           s = urev[u] ||= @context.canonical_uri u if u
+           f = {}
+
+           # do not include this subject as these links are already included!
+           next if u == @uuid
+
+           # gather up the objects, then gather up the predicates
+
+           @context.objects_for u || s, preds, only: :resource do |obj, rel|
+             # XXX do not know why += |= etc does not work
+             x = @context.canonical_uuid(obj) || obj
+             urev[x] ||= @context.canonical_uri x
+             y = o[x] ||= Set.new
+             o[x] = y | rel
+             f[x] = @context.formats_for x
+           end
+
+           srel = @uri.route_to((u ? urev[u] || s : s).to_s)
+
+           # now collect all the other predicates
+           o.keys.each do |obj|
+             hrel = @uri.route_to((urev[obj] || obj).to_s)
+             o[obj] |= @context.graph.query([u || s, nil, obj]).predicates.to_set
+             rels = @context.abbreviate o[obj].to_a, vocab: XHV
+             ln = { nil => :link, about: srel, rel: rels, href: hrel }
+             ln[:type] = f[obj].first if f[obj]
+
+             # add to links
+             links << ln
+           end
+         end
+
+         meta = []
+
+         # include author names as old school meta tags
+         authors.each do |a|
+           name = labels[urev[a] || a] or next
+           datatypes.add name[0] # a convenient place to chuck this
+           prop  = @context.abbreviate(name[0])
+           name  = name[1]
+           about = @uri.route_to((ufwd[a] || a).to_s)
+           tag   = { nil => :meta, about: about.to_s, name: :author,
+             property: prop, content: name.to_s }
+
+           if name.has_datatype? and name.datatype != RDF::XSD.string
+             tag[:datatype] = @context.abbreviate(name.datatype)
+           elsif name.has_language?
+             tag['xml:lang'] = tag[:lang] = name.language
+           end
+           meta.push tag
+         end
+
+         literals.each do |k, v|
+           next if k == title
+           rel  = @context.abbreviate v.to_a, vocab: XHV
+           elem = { nil => :meta, property: rel, content: k.to_s }
+           elem[:name] = :description if k == desc
+
+           if k.has_datatype?
+             datatypes.add k.datatype # so we get the prefix
+             elem[:datatype] = @context.abbreviate k.datatype, vocab: XHV
+           end
+
+           meta.push(elem)
+         end
+
+         meta.sort! do |a, b|
+           s = 0
+           [:about, :property, :datatype, :content, :name].each do |k|
+             s = a.fetch(k, '').to_s <=> b.fetch(k, '').to_s
+             break if s != 0
+           end
+           s
+         end
+
+         # don't forget style tag
+         style = doc.xpath('/html:html/html:head/html:style', { html: XHTMLNS })
+
+         body = body.dup 1
+         body = { '#body' => body.children.to_a, about: '' }
+         body[:typeof] = @context.abbreviate(types.to_a, vocab: XHV) unless
+           types.empty?
+
+         # prepare only the prefixes we need to resolve the data we need
+         rsc = @context.abbreviate(
+           (struct.keys + resources.keys + datatypes.to_a + types.to_a).uniq,
+           noop: false).compact.map { |x| x.split(?:).first.to_sym }.to_set
+
+         pfx = @context.prefixes.select do |k, _|
+           rsc.include? k
+         end.transform_values { |v| v.to_s }
+
+         # XXX deal with the qb:Observation separately (just nuke it for now)
+         extra = generate_twitter_meta || []
+         if bl = generate_backlinks(published: published,
+           ignore: @context.graph.query(
+             [nil, CI.document, @uuid]).subjects.to_set)
+           extra << { [bl] => :object }
+         end
+
+         # and now for the document
+         xf  = @context.config[:transform]
+         doc = xhtml_stub(
+           base: @uri, prefix: pfx, vocab: XHV, lang: 'en', title: tm,
+           link: links, meta: meta, style: style, transform: xf,
+           extra: extra, body: body).document
+
+         # goddamn script tags and text/html: a self-closing <script/>
+         # breaks browsers, so make sure there is a text node
+         doc.xpath('//html:script[@src][not(node())]',
+           { html: XHTMLNS }).each do |script|
+           script << doc.create_text_node('')
+         end
+
+         doc
+       end
+
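+       # A rough sketch of the round trip (names hypothetical):
+       #
+       #   doc   = ctx.visit 'https://example.club/some-page'
+       #   xhtml = doc.transform_xhtml published: true
+       #   # => Nokogiri::XML::Document with rewritten links, <head>
+       #   #    metadata, and RDFa prefixes
+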
+       # Actually write the transformed document to the target.
+       #
+       # @param published [true, false]
+       #
+       # @return [Array] pathname(s) written
+       def write_to_target published: true
+
+         # in all cases we write to private target
+         states = [false]
+         # document has to be publishable
+         states.push true if published && @context.published?(@uuid)
+
+         ok = []
+         states.each do |state|
+           target = @context.config[state ? :target : :private]
+
+           # XXX this is dumb; it should do something more robust if it
+           # fails
+           doc = transform_xhtml(published: state) or next
+
+           begin
+             fh   = Tempfile.create('xml-', target)
+             path = Pathname(fh.path)
+
+             # write the doc to the target
+             doc.write_to fh
+             fh.close
+
+             uuid = URI(@uuid.to_s)
+             newpath = path.dirname + "#{uuid.uuid}.xml"
+             ok.push newpath
+
+             File.chmod(0644, path)
+             File.rename(path, newpath)
+             File.utime(@mtime, @mtime, newpath)
+           rescue StandardError => e
+             # XXX this should really rescue a more specific class of
+             # errors; for now at least don't swallow Exception wholesale
+             warn e.class, e
+             File.unlink path if path and path.exist?
+           end
+         end
+
+         ok
+       end
+
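+       # e.g. (hypothetical): doc.write_to_target published: true
+       # # => [#<Pathname .../private/<uuid>.xml>,
+       # #     #<Pathname .../target/<uuid>.xml>]
+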
+     end
+   end
+ end