rdf-sak 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
+ require "bundler/gem_tasks"
+ require "rspec/core/rake_task"
+
+ RSpec::Core::RakeTask.new(:spec)
+
+ task :default => :spec
+
+ #desc 'Generate Vocabularies'
+ #task :gen_vocabs => %w(ci).map { |v| "lib/rdf/sak/#{v}.rb" }
+
+ # XXX turn this into a rake task at some point :P
+
+ # rdf serialize --uri 'https://privatealpha.com/ontology/content-inventory/1#' --output-format vocabulary --module-name RDF::SAK --class-name CI -o lib/rdf/sak/ci.rb --strict 'https://privatealpha.com/ontology/content-inventory/1#'
@@ -0,0 +1,14 @@
+ #!/usr/bin/env ruby
+
+ require "bundler/setup"
+ require "rdf/sak"
+
+ # You can add fixtures and/or initialization code here to make experimenting
+ # with your gem easier. You can also use a different console, if you like.
+
+ # (If you use this, don't forget to add pry to your Gemfile!)
+ # require "pry"
+ # Pry.start
+
+ require "irb"
+ IRB.start
@@ -0,0 +1,8 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+ IFS=$'\n\t'
+ set -vx
+
+ bundle install
+
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,14 @@
+ <?xml version="1.0" encoding="utf-8"?>
+ <xsl:stylesheet version="1.0"
+                 xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+                 xmlns:html="http://www.w3.org/1999/xhtml"
+                 xmlns="http://www.w3.org/1999/xhtml"
+                 exclude-result-prefixes="html">
+
+ <xsl:key name="main" match="html:main" use="''"/>
+
+ <xsl:template match="/html:*">
+   <xsl:copy-of select="key('main', '')[1]"/>
+ </xsl:template>
+
+ </xsl:stylesheet>
@@ -0,0 +1,11 @@
+ <?xml version="1.0" encoding="utf-8"?>
+ <html xmlns="http://www.w3.org/1999/xhtml">
+   <head>
+     <title>i match lol</title>
+   </head>
+   <body>
+     <main>
+       <p>hooray you found the main element</p>
+     </main>
+   </body>
+ </html>
@@ -0,0 +1,58 @@
+ @prefix rdf:  <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
+ @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+ @prefix owl:  <http://www.w3.org/2002/07/owl#> .
+ @prefix xsd:  <http://www.w3.org/2001/XMLSchema#> .
+ @prefix dct:  <http://purl.org/dc/terms/> .
+ @prefix skos: <http://www.w3.org/2004/02/skos/core#> .
+ @prefix ci:   <https://privatealpha.com/ontology/content-inventory/1#> .
+ @prefix tfo:  <https://privatealpha.com/ontology/transformation/1#> .
+ @prefix xf:   <tag:makethingsmakesense.com,2020:transform/> .
+
+ xf:prefix a tfo:Parameter ;
+   skos:prefLabel "Prefix"@en ;
+   rdfs:comment "A compact prefix declaration of the form prefix:url"@en ;
+   dct:identifier "prefix"^^xsd:token ;
+   rdfs:range xsd:token .
+
+ xf:xpath a tfo:Parameter ;
+   skos:prefLabel "XPath"@en ;
+   rdfs:comment "An XPath expression"@en ;
+   dct:identifier "xpath"^^xsd:token ;
+   owl:cardinality 1 ;
+   rdfs:range xsd:string .
+
+ xf:reindent a tfo:Parameter ;
+   skos:prefLabel "Reindent"@en ;
+   rdfs:comment "Reindent the XML tree"@en ;
+   dct:identifier "reindent"^^xsd:token ;
+   tfo:default true ;
+   owl:cardinality 1 ;
+   rdfs:range xsd:boolean .
+
+ xf:subtree a tfo:Transform ;
+   skos:prefLabel "Subtree"@en ;
+   rdfs:comment "Isolate an X(HT)ML node using XPath."@en ;
+   tfo:implementation <urn:x-ruby:RDF::SAK::Transform::XPath> ;
+   tfo:accepts "application/xml"^^tfo:content-type ;
+   tfo:returns "application/xml"^^tfo:content-type ;
+   tfo:parameter xf:xpath, xf:prefix, xf:reindent ;
+   tfo:parameter-list ( xf:xpath xf:prefix xf:reindent ) .
+
+ xf:cleanup a tfo:Transform ;
+   skos:prefLabel "Cleanup"@en ;
+   rdfs:comment "Apply cleanup.xsl to the input."@en ;
+   tfo:implementation <file:example/cleanup.xsl> ;
+   tfo:accepts "application/xml"^^tfo:content-type ;
+   tfo:returns "application/xml"^^tfo:content-type .
+
+ <urn:uuid:78e6d8ce-a88a-4be0-8bfa-079136945816> a tfo:Partial ;
+   tfo:transform xf:subtree ;
+   xf:xpath "//html:main[1]"^^xsd:string ;
+   xf:prefix "html:http://www.w3.org/1999/xhtml"^^xsd:token .
+
+ <urn:uuid:4498eef5-1ca6-4034-937a-d50033dd6693> a tfo:Application ;
+   tfo:input <ni:///sha-256;0GHHmDtxh9CRZttXdr-cX78u72auS2P-O6tDXxvz2kU> ;
+   tfo:output <ni:///sha-256;_BbLbNSZl0TcQcjz-v3qF5fa5VL11rdha7c24K44pTc> ;
+   tfo:transform xf:subtree ;
+   xf:xpath "//html:main[1]"^^xsd:string ;
+   xf:prefix "html:http://www.w3.org/1999/xhtml"^^xsd:token .
@@ -0,0 +1 @@
+ require 'rdf/sak'
@@ -0,0 +1,2506 @@
+ # -*- coding: utf-8 -*-
+ require 'rdf/sak/version'
+
+ # basic stuff
+ require 'stringio'
+ require 'pathname'
+ require 'tempfile'
+
+ # rdf stuff
+ require 'uri'
+ require 'uri/urn'
+ require 'rdf'
+ require 'rdf/reasoner'
+ require 'linkeddata'
+
+ # my stuff
+ require 'xml-mixup'
+ require 'md-noko'
+ require 'uuid-ncname'
+ require 'rdf/sak/mimemagic'
+ require 'rdf/sak/util'
+
+ # ontologies, mine in particular
+ require 'rdf/sak/ci'
+ require 'rdf/sak/ibis'
+ # others not included in rdf.rb
+ require 'rdf/sak/pav'
+ require 'rdf/sak/qb'
+
+ module RDF::SAK
+
+   class Context
+     include XML::Mixup
+     include Util
+
+     private
+
+     # RDF::Reasoner.apply(:rdfs, :owl)
+
+     G_OK = [RDF::Repository, RDF::Dataset, RDF::Graph].freeze
+     C_OK = [Pathname, IO, String].freeze
+
+     def coerce_to_path_or_io obj
+       return obj if obj.is_a? IO
+       return obj.expand_path if obj.is_a? Pathname
+       raise "#{obj.inspect} is not stringable" unless obj.respond_to? :to_s
+       Pathname(obj.to_s).expand_path
+     end
+
+     def coerce_graph graph = nil, type: nil
+       # begin with empty graph
+       out = RDF::Repository.new
+
+       return out unless graph
+       return graph if G_OK.any? { |c| graph.is_a? c }
+
+       # now turn into an array
+       graph = [graph] unless graph.is_a? Array
+
+       graph.each do |g|
+         raise 'Graph must be some kind of RDF::Graph or RDF data file' unless
+           C_OK.any? { |c| g.is_a? c } || g.respond_to?(:to_s)
+
+         opts = {}
+         opts[:content_type] = type if type
+
+         if g.is_a? Pathname
+           opts[:filename] = g.expand_path.to_s
+           g = g.open
+         elsif g.is_a? File
+           opts[:filename] = g.path
+         end
+
+         g = StringIO.new(g.to_s) unless g.is_a? IO
+         reader = RDF::Reader.for(opts) do
+           g.rewind
+           sample = g.read 1000
+           g.rewind
+           sample
+         end or raise "Could not find an RDF::Reader for #{opts[:content_type]}"
+
+         reader = reader.new g, **opts
+         reader.each_statement do |stmt|
+           out << stmt
+         end
+       end
+
+       out
+     end
+
+     def normalize_hash h
+       return h unless h.is_a? Hash
+       out = {}
+       h.each do |k, v|
+         out[k.to_s.to_sym] = v.is_a?(Hash) ? normalize_hash(v) :
+           v.respond_to?(:to_a) ? v.to_a.map { |x| normalize_hash x } : v
+       end
+       out
+     end
+
+     def coerce_config config
+       # config must either be a hash or a file name/pathname/io object
+       unless config.respond_to? :to_h
+         # when in rome
+         require 'yaml'
+         config = if config.is_a? IO
+           YAML.load config
+         else
+           YAML.load_file Pathname.new(config).expand_path
+         end
+       end
+
+       config = normalize_hash config
+
+       # config MUST have source and target dirs
+       raise 'Config must have :source, :target, and :private directories' unless
+         ([:source, :target, :private] - config.keys).empty?
+       [:source, :target].each do |path|
+         dir = config[path] = Pathname.new(config[path]).expand_path
+         raise "#{dir} is not a readable directory" unless
+           dir.directory? && dir.readable?
+       end
+       raise "Target directory #{config[:target]} is not writable" unless
+         config[:target].writable?
+       raise "Source and target directories are the same: #{config[:source]}" if
+         config[:source] == config[:target]
+
+       # we try to create the private directory
+       config[:private] = config[:target] + config[:private]
+       if config[:private].exist?
+         raise "#{config[:private]} is not a readable/writable directory" unless
+           [:directory?, :readable?, :writable?].all? do |m|
+             config[:private].send m
+           end
+       else
+         config[:private].mkpath
+       end
+
+       # config MAY have graph location(s) but we can test this other
+       # ways, same goes for base URI
+       if config[:graph]
+         g = config[:graph]
+         g = [g] unless g.is_a? Array
+         config[:graph] = g.map { |x| Pathname.new(x).expand_path }
+       end
+
+       # deal with prefix map
+       if config[:prefixes]
+         config[:prefixes] = config[:prefixes].transform_values do |p|
+           # we have to wrap this in case it fails
+           begin
+             RDF::Vocabulary.find_term(p) || RDF::URI(p)
+           rescue
+             RDF::URI(p)
+           end
+         end
+       end
+
+       if dups = config[:duplicate]
+         pfx = config[:prefixes] || {}
+         base = URI(uri_pp config[:base])
+         if dups.is_a? Hash
+           config[:duplicate] = dups.map do |ruri, preds|
+             preds = [preds] unless preds.is_a? Array
+             preds.map! do |p|
+               resolve_curie p, prefixes: pfx, scalar: true, coerce: :rdf
+             end
+             [RDF::URI((base + ruri.to_s).to_s), Set.new(preds)]
+           end.to_h
+         end
+       end
+
+       # rewrite maps
+       config[:maps] = {} unless config[:maps].is_a? Hash
+       %w(rewrite redirect gone).each do |type|
+         config[:maps][type.to_sym] ||= ".#{type}.map"
+       end
+
+       config
+     end
+
+     def cmp_label a, b, labels: nil, supplant: true, reverse: false
+       labels ||= {}
+
+       # try supplied label or fall back
+       pair = [a, b].map do |x|
+         if labels[x]
+           labels[x][1]
+         elsif supplant and y = label_for(x)
+           labels[x] = y
+           y[1]
+         else
+           x
+         end
+       end
+
+       pair.reverse! if reverse
+       # warn "#{pair[0]} <=> #{pair[1]}"
+       pair[0].to_s <=> pair[1].to_s
+     end
+
+     def term_list terms
+       return [] if terms.nil?
+       terms = terms.respond_to?(:to_a) ? terms.to_a : [terms]
+       terms.uniq.map { |t| RDF::Vocabulary.find_term t }.compact
+     end
+
+     def coerce_resource arg
+       super arg, @base
+     end
+
+     def coerce_uuid_urn arg
+       super arg, @base
+     end
+
+     public
+
+     attr_reader :config, :graph, :base
+
+     # Initialize a context.
+     #
+     # @param graph
+     # @param base
+     # @param config
+     # @param type
+     #
+     # @return [RDF::SAK::Context] the new context object.
+
+     def initialize graph: nil, base: nil, config: nil, type: nil
+       # RDF::Reasoner.apply(:rdfs, :owl)
+
+       @config = coerce_config config
+
+       graph ||= @config[:graph] if @config[:graph]
+       base ||= @config[:base] if @config[:base]
+
+       @graph = coerce_graph graph, type: type
+       @base = RDF::URI.new base.to_s if base
+       @ucache = RDF::Util::Cache.new(-1)
+       @scache = {} # wtf rdf util cache doesn't like booleans
+     end
+
+     # Get the prefix mappings from the configuration.
+     #
+     # @return [Hash]
+
+     def prefixes
+       @config[:prefixes] || {}
+     end
+
+     # Abbreviate a set of terms against the registered namespace
+     # prefixes and optional default vocabulary, or otherwise return a
+     # string representation of the original URI.
+
+     # @param term [RDF::Term]
+     # @param prefixes [Hash]
+     #
+     # @return [String]
+     #
+     def abbreviate term, prefixes: @config[:prefixes],
+         vocab: nil, noop: true, sort: true
+       super term, prefixes: prefixes || {}, vocab: vocab, noop: noop, sort: sort
+     end
+
+     # Obtain a key-value structure for the given subject, optionally
+     # constraining the result by node type (:resource, :uri/:iri,
+     # :blank/:bnode, :literal)
+     #
+     # @param subject of the inquiry
+     # @param rev map in reverse
+     # @param only one or more node types
+     # @param uuids coerce resources to if possible
+     #
+     # @return [Hash]
+     #
+     def struct_for subject, rev: false, only: [], uuids: false, canon: false
+       Util.struct_for @graph, subject,
+         rev: rev, only: only, uuids: uuids, canon: canon
+     end
+
+     # Obtain everything in the graph that is an `rdf:type` of something.
+     #
+     # @return [Array]
+     #
+     def all_types
+       @graph.query([nil, RDF.type, nil]).objects.uniq
+     end
+
+     # Obtain every subject that is rdf:type the given type or its subtypes.
+     #
+     # @param rdftype [RDF::Term]
+     #
+     # @return [Array]
+     #
+     def all_of_type rdftype, exclude: []
+       exclude = term_list exclude
+       t = RDF::Vocabulary.find_term(rdftype) or raise "No type #{rdftype.to_s}"
+       out = []
+       (all_types & all_related(t) - exclude).each do |type|
+         out += @graph.query([nil, RDF.type, type]).subjects
+       end
+
+       out.uniq
+     end
+
+     # Obtain all and only the rdf:types directly asserted on the subject.
+     #
+     # @param subject [RDF::Resource]
+     # @param type [RDF::Term, :to_a]
+     #
+     # @return [Array]
+     #
+     def asserted_types subject, type = nil
+       Util.asserted_types @graph, subject, type
+     end
+
+     # Obtain the canonical UUID for the given URI
+     #
+     # @param uri [RDF::URI, URI, to_s] the subject of the inquiry
+     # @param unique [true, false] return a single resource/nil or an array
+     # @param published [true, false] whether to restrict to published docs
+     #
+     # @return [RDF::URI, Array]
+     #
+     def canonical_uuid uri, unique: true, published: false
+       Util.canonical_uuid @graph, uri, unique: unique,
+         published: published, scache: @scache, ucache: @ucache, base: @base
+     end
+
+     # Obtain the "best" dereferenceable URI for the subject.
+     # Optionally returns all candidates.
+     #
+     # @param subject [RDF::Resource]
+     # @param unique [true, false] flag for unique return value
+     # @param rdf [true, false] flag to specify RDF::URI vs URI
+     # @param slugs [true, false] flag to include slugs
+     # @param fragment [true, false] flag to include fragment URIs
+     #
+     # @return [RDF::URI, URI, Array]
+     #
+     def canonical_uri subject,
+         unique: true, rdf: true, slugs: false, fragment: false
+       Util.canonical_uri @graph, subject, base: @base,
+         unique: unique, rdf: rdf, slugs: slugs, fragment: fragment
+     end
+
+     # Returns subjects from the graph with entailment.
+     #
+     # @param predicate
+     # @param object
+     # @param entail
+     # @param only
+     #
+     # @return [RDF::Resource]
+     #
+     def subjects_for predicate, object, entail: true, only: []
+       Util.subjects_for @graph, predicate, object, entail: entail, only: only
+     end
+
+     # Returns objects from the graph with entailment.
+     #
+     # @param subject
+     # @param predicate
+     # @param entail
+     # @param only
+     # @param datatype
+     #
+     # @return [RDF::Term]
+     #
+     def objects_for subject, predicate, entail: true, only: [], datatype: nil
+       Util.objects_for @graph, subject, predicate,
+         entail: entail, only: only, datatype: datatype
+     end
+
+     # Find the terminal replacements for the given subject, if any exist.
+     #
+     # @param subject
+     # @param published indicate the context is published
+     #
+     # @return [Set]
+     #
+     def replacements_for subject, published: true
+       Util.replacements_for @graph, subject, published: published
+     end
+
+     # Obtain dates for the subject as instances of Date(Time). This is
+     # just shorthand for a common application of `objects_for`.
+     #
+     # @param subject
+     # @param predicate
+     # @param datatype
+     #
+     # @return [Array] of dates
+     def dates_for subject, predicate: RDF::Vocab::DC.date,
+         datatype: [RDF::XSD.date, RDF::XSD.dateTime]
+       Util.dates_for @graph, subject, predicate: predicate, datatype: datatype
+     end
+
+     # Obtain any specified MIME types for the subject. Just shorthand
+     # for a common application of `objects_for`.
+     #
+     # @param subject
+     # @param predicate
+     # @param datatype
+     #
+     # @return [Array] of internet media types
+     #
+     def formats_for subject, predicate: RDF::Vocab::DC.format,
+         datatype: [RDF::XSD.token]
+       Util.objects_for @graph, subject, predicate: predicate, datatype: datatype
+     end
+
+     # Assuming the subject is a thing that has authors, return the
+     # list of authors. Try bibo:authorList first for an explicit
+     # ordering, then continue to the various other predicates.
+     #
+     # @param subject [RDF::Resource]
+     # @param unique [false, true] only return the first author
+     # @param contrib [false, true] return contributors instead of authors
+     #
+     # @return [RDF::Value, Array]
+     #
+     def authors_for subject, unique: false, contrib: false
+       Util.authors_for @graph, subject, unique: unique, contrib: contrib
+     end
+
+     # Obtain the most appropriate label(s) for the subject's type(s).
+     # Returns one or more (depending on the `unique` flag)
+     # predicate-object pairs in order of preference.
+     #
+     # @param subject [RDF::Resource]
+     # @param unique [true, false] only return the first pair
+     # @param type [RDF::Term, Array] supply asserted types if already retrieved
+     # @param lang [nil] not currently implemented (will be conneg)
+     # @param desc [false, true] retrieve description instead of label
+     # @param alt [false, true] retrieve alternate instead of main
+     #
+     # @return [Array] either a predicate-object pair or an array of pairs.
+     #
+     def label_for subject, candidates: nil, unique: true, type: nil,
+         lang: nil, desc: false, alt: false
+       Util.label_for @graph, subject, candidates: candidates,
+         unique: unique, type: type, lang: lang, desc: desc, alt: alt
+     end
+
+     SKOS_HIER = [
+       {
+         element: :subject,
+         pattern: -> c, p { [nil, p, c] },
+         preds: [RDF::Vocab::SKOS.broader, RDF::Vocab::SKOS.broaderTransitive],
+       },
+       {
+         element: :object,
+         pattern: -> c, p { [c, p, nil] },
+         preds: [RDF::Vocab::SKOS.narrower, RDF::Vocab::SKOS.narrowerTransitive],
+       }
+     ]
+     SKOS_HIER.each do |struct|
+       # lol how many times are we gonna cart this thing around
+       preds = struct[:preds]
+       i = 0
+       loop do
+         equiv = preds[i].entail(:equivalentProperty) - preds
+         preds.insert(i + 1, *equiv) unless equiv.empty?
+         i += equiv.length + 1;
+         break if i >= preds.length
+       end
+     end
+
+     def sub_concepts concept, extra: []
+       raise 'Concept must be exactly one concept' unless
+         concept.is_a? RDF::Resource
+       extra = term_list extra
+
+       # we need an array for a queue, and a set to accumulate the
+       # output as well as a separate 'seen' set
+       queue = [concept]
+       seen = Set.new queue.dup
+       out = seen.dup
+
+       # it turns out that the main SKOS hierarchy terms, while not
+       # being transitive themselves, are subproperties of transitive
+       # relations which means they are as good as being transitive.
+
+       while c = queue.shift
+         SKOS_HIER.each do |struct|
+           elem, pat, preds = struct.values_at(:element, :pattern, :preds)
+           preds.each do |p|
+             @graph.query(pat.call c, p).each do |stmt|
+               # obtain hierarchical element
+               hierc = stmt.send elem
+
+               # skip any further processing if we have seen this concept
+               next if seen.include? hierc
+               seen << hierc
+
+               next if !extra.empty? and !extra.any? do |t|
+                 @graph.has_statement? RDF::Statement.new(hierc, RDF.type, t)
+               end
+
+               queue << hierc
+               out << hierc
+             end
+           end
+         end
+       end
+
+       out.to_a.sort
+     end
+
+     def audiences_for uuid, proximate: false, invert: false
+       p = invert ? CI['non-audience'] : RDF::Vocab::DC.audience
+       return @graph.query([uuid, p, nil]).objects if proximate
+
+       out = []
+       @graph.query([uuid, p, nil]).objects.each do |o|
+         out += sub_concepts o
+       end
+
+       out
+     end
+
+     # Get all "reachable" UUID-identified entities (subjects which are
+     # also objects)
+     def reachable published: false
+       p = published ? -> x { published?(x) } : -> x { true }
+       # now get the subjects which are also objects
+       @graph.subjects.select do |s|
+         s.uri? && s =~ /^urn:uuid:/ && @graph.has_object?(s) && p.call(s)
+       end
+     end
+
+     # holy cow this is actually a lot of stuff:
+
+     # turn markdown into xhtml (via md-noko)
+
+     # turn html into xhtml (trivial)
+
+     # generate triples from ordinary (x)html structure
+
+     # map vanilla (x)html metadata to existing graph (ie to get resource URIs)
+
+     # pull triples from rdfa
+
+     # stuff rdfa into rdfa-less xhtml
+
+     # basic nlp detection of terms + text-level markup (dfn, abbr...)
+
+     # markdown round-tripping (may as well store source in md if possible)
+
+     # add title attribute to all links
+
+     # add alt attribute to all images
+
+     # segmentation of composite documents into multiple files
+
+     # aggregation of simple documents into composites
+
+     # generate backlinks
+
+     # - resource (ie file) generation -
+
+     # generate indexes of people, groups, and organizations
+
+     # generate indexes of books, not-books, and other external links
+
+     def head_links subject, struct: nil, nodes: nil, prefixes: {},
+         ignore: [], uris: {}, labels: {}, vocab: nil
+
+       raise 'ignore must be Array or Set' unless
+         [Array, Set].any? { |c| ignore.is_a? c }
+
+       struct ||= struct_for subject
+       nodes ||= invert_struct struct
+
+       # make sure these are actually URI objects not RDF::URI
+       uris = uris.transform_values { |v| URI(uri_pp v.to_s) }
+       uri = uris[subject] || canonical_uri(subject, rdf: false)
+
+       ignore = ignore.to_set
+
+       # output
+       links = []
+
+       nodes.reject { |n, _| ignore.include?(n) || !n.uri? }.each do |k, v|
+         # first nuke rdf:type, that's never in there
+         v = v.dup.delete RDF::RDFV.type
+         next if v.empty?
+
+         unless uris[k]
+           cu = canonical_uri k
+           uris[k] = cu || uri_pp(k.to_s)
+         end
+
+         # munge the url and make the tag
+         rel = abbreviate v.to_a, vocab: vocab
+         ru = uri.route_to(uris[k])
+         ln = { nil => :link, rel: rel, href: ru.to_s }
+
+         # add the title
+         if lab = labels[k]
+           ln[:title] = lab[1].to_s
+         end
+
+         # add type attribute
+         unless (mts = formats_for k).empty?
+           ln[:type] = mts.first.to_s
+
+           if ln[:type] =~ /(java|ecma)script/i ||
+               !(v.to_set & Set[RDF::Vocab::DC.requires]).empty?
+             ln[:src] = ln.delete :href
+             # make sure we pass in an empty string so there is a closing tag
+             ln.delete nil
+             ln[['']] = :script
+           end
+         end
+
+         # finally add the link
+         links.push ln
+       end
+
+       links.sort! do |a, b|
+         # sort by rel, then by href
+         # warn a.inspect, b.inspect
+         s = 0
+         [nil, :rel, :rev, :href, :title].each do |k|
+           s = a.fetch(k, '').to_s <=> b.fetch(k, '').to_s
+           break if s != 0
+         end
+         s
+       end
+
+       links
+     end
+
+     def head_meta subject, struct: nil, nodes: nil, prefixes: {},
+         ignore: [], meta_names: {}, vocab: nil, lang: nil, xhtml: true
+
+       raise 'ignore must be Array or Set' unless
+         [Array, Set].any? { |c| ignore.is_a? c }
+
+       struct ||= struct_for subject
+       nodes ||= invert_struct struct
+
+       ignore = ignore.to_set
+
+       meta = []
+       nodes.select { |n| n.literal? && !ignore.include?(n) }.each do |k, v|
+         rel = abbreviate v.to_a, vocab: vocab
+         tag = { nil => :meta, property: rel, content: k.to_s }
+
+         lang = (k.language? && k.language != lang ? k.language : nil) ||
+           (k.datatype == RDF::XSD.string && lang ? '' : nil)
+         if lang
+           tag['xml:lang'] = lang if xhtml
+           tag[:lang] = lang
+         end
+
+         tag[:datatype] = abbreviate k.datatype, vocab: XHV if k.datatype?
+         tag[:name] = meta_names[k] if meta_names[k]
+
+         meta << tag
+       end
+
+       meta.sort! do |a, b|
+         s = 0
+         [:about, :property, :datatype, :content, :name].each do |k|
+           # warn a.inspect, b.inspect
+           s = a.fetch(k, '').to_s <=> b.fetch(k, '').to_s
+           break if s != 0
+         end
+         s
+       end
+
+       meta
+     end
+
+     def generate_backlinks subject, published: true, ignore: nil
+       uri = canonical_uri(subject, rdf: false) || URI(uri_pp subject)
+       ignore ||= Set.new
+       raise 'ignore must be amenable to a set' unless ignore.respond_to? :to_set
+       ignore = ignore.to_set
+       nodes = {}
+       labels = {}
+       types = {}
+       @graph.query([nil, nil, subject]).each do |stmt|
+         next if ignore.include?(sj = stmt.subject)
+         preds = nodes[sj] ||= Set.new
+         preds << (pr = stmt.predicate)
+         types[sj] ||= asserted_types sj
+         labels[sj] ||= label_for sj
+         labels[pr] ||= label_for pr
+       end
+
+       # prune out
+       nodes.select! { |k, _| published? k } if published
+
+       return if nodes.empty?
+
+       li = nodes.sort do |a, b|
+         cmp_label a[0], b[0], labels: labels
+       end.map do |rsrc, preds|
+         cu = canonical_uri(rsrc, rdf: false) or next
+         lab = labels[rsrc] || [nil, rsrc]
+         lp = abbreviate(lab[0]) if lab[0]
+         ty = abbreviate(types[rsrc]) if types[rsrc]
+
+         { [{ [{ [lab[1].to_s] => :span, property: lp }] => :a,
+           href: uri.route_to(cu), typeof: ty, rev: abbreviate(preds) }] => :li }
+       end.compact
+
+       { [{ li => :ul }] => :nav }
+     end
+
+     def generate_twitter_meta subject
+       # get author
+       author = authors_for(subject, unique: true) or return
+
+       # get author's twitter account
+       twitter = objects_for(author, RDF::Vocab::FOAF.account,
+         only: :resource).select { |t| t.to_s =~ /twitter\.com/
+       }.sort.first or return
+       twitter = URI(twitter.to_s).path.split(/\/+/)[1]
+       twitter = ?@ + twitter unless twitter.start_with? ?@
+
+       # get title
+       title = label_for(subject) or return
+
+       out = [
+         { nil => :meta, name: 'twitter:card', content: :summary },
+         { nil => :meta, name: 'twitter:site', content: twitter },
+         { nil => :meta, name: 'twitter:title', content: title[1].to_s }
+       ]
+
+       # get abstract
+       if desc = label_for(subject, desc: true)
+         out.push({ nil => :meta, name: 'twitter:description',
+           content: desc[1].to_s })
+       end
+
+       # get image (foaf:depiction)
+       img = objects_for(subject, RDF::Vocab::FOAF.depiction, only: :resource)
+       unless img.empty?
+         img = img[0].to_s
+         out.push({ nil => :meta, name: 'twitter:image', content: img })
+         out[0][:content] = :summary_large_image
+       end
+
+       # return the appropriate xml-mixup structure
+       out
+     end
+
+     AUTHOR_SPEC = [
+       ['By:', [RDF::Vocab::BIBO.authorList, RDF::Vocab::DC.creator]],
+       ['With:', [RDF::Vocab::BIBO.contributorList, RDF::Vocab::DC.contributor]],
+       ['Edited by:', [RDF::Vocab::BIBO.editorList, RDF::Vocab::BIBO.editor]],
+       ['Translated by:', [RDF::Vocab::BIBO.translator]],
+     ].freeze
+
+     def generate_bibliography id, published: true
+       id = canonical_uuid id
+       uri = canonical_uri id
+       struct = struct_for id
+       nodes = Set[id] + smush_struct(struct)
+       bodynodes = Set.new
+       parts = {}
+       referents = {}
+       labels = { id => label_for(id, candidates: struct) }
+       canon = {}
+
+       # uggh put these somewhere
+       preds = {
+         hp: predicate_set(RDF::Vocab::DC.hasPart),
+         sa: predicate_set(RDF::RDFS.seeAlso),
+         canon: predicate_set([RDF::OWL.sameAs, CI.canonical]),
+         ref: predicate_set(RDF::Vocab::DC.references),
+         al: predicate_set(RDF::Vocab::BIBO.contributorList),
+         cont: predicate_set(RDF::Vocab::DC.contributor),
+       }
+
+       # collect up all the parts (as in dct:hasPart)
+       objects_for(id, preds[:hp], entail: false, only: :resource).each do |part|
+         bodynodes << part
+
+         # gather up all the possible alias urls this thing can have
+         sa = ([part] + objects_for(part,
+           preds[:sa], only: :uri, entail: false)).map do |x|
+           [x] + subjects_for(preds[:canon], x, only: :uri, entail: false)
+         end.flatten.uniq
+
+         # collect all the referents
+         reftmp = {}
+         sa.each do |u|
+           subjects_for preds[:ref], u, only: :uri, entail: false do |s, *p|
+             reftmp[s] ||= Set.new
+             reftmp[s] += p[0].to_set
+           end
+         end
+
+         # if we are producing a list of references identified by only
+         # published resources, prune out all the unpublished referents
+         reftmp.select! { |x, _| published? x } if published
+
+         # unconditionally skip this item if nothing references it
+         next if reftmp.empty?
+
+         referents[part] = reftmp
+
+         reftmp.each do |r, _|
+           labels[r] ||= label_for r
+           canon[r] ||= canonical_uri r
+         end
+
+         # collect all the authors and author lists
+
+         objects_for(part, preds[:al], only: :resource, entail: false) do |o|
+           RDF::List.new(subject: o, graph: @graph).each do |a|
+             labels[a] ||= label_for a
+           end
+         end
+
+         objects_for(part, preds[:cont], only: :uri, entail: false) do |a|
+           labels[a] ||= label_for a
+         end
+
+         ps = struct_for part
+         labels[part] = label_for part, candidates: ps
+         nodes |= smush_struct ps
+
+         parts[part] = ps
+       end
+
+       bmap = prepare_collation struct
+       pf = -> x { abbreviate bmap[x.literal? ? :literals : :resources][x] }
+
+       body = []
+       parts.sort { |a, b| cmp_label a[0], b[0], labels: labels }.each do |k, v|
+         mapping = prepare_collation v
+         p = -> x {
+           abbreviate mapping[x.literal? ? :literals : :resources][x] }
+         t = abbreviate mapping[:types]
+
+         lp = label_for k, candidates: v
+         h2c = [lp[1].to_s]
+         h2 = { h2c => :h2 }
+         cu = canonical_uri k
+         rel = nil
+         unless cu.scheme.downcase.start_with? 'http'
+           if sa = v[RDF::RDFS.seeAlso]
+             rel = p.call sa[0]
+             cu = canonical_uri sa[0]
+           else
+             cu = nil
+           end
+         end
+
+         if cu
+           h2c[0] = { [lp[1].to_s] => :a, rel: rel,
+             property: p.call(lp[1]), href: cu.to_s }
+         else
+           h2[:property] = p.call(lp[1])
+         end
+
+         # authors &c
+         # authors contributors editors translators
+         al = []
+         AUTHOR_SPEC.each do |label, pl|
+           dd = []
+           seen = Set.new
+           pl.each do |pred|
+             # first check if the struct has the predicate
+             next unless v[pred]
+             li = []
+             ul = { li => :ul, rel: abbreviate(pred) }
+             v[pred].sort { |a, b| cmp_label a, b, labels: labels }.each do |o|
+               # check if this is a list
+               tl = RDF::List.new subject: o, graph: @graph
+               if tl.empty? and !seen.include? o
+                 seen << o
+                 lab = labels[o] ? { [labels[o][1]] => :span,
+                   property: abbreviate(labels[o][0]) } : o
+                 li << { [lab] => :li, resource: o }
+               else
+                 # XXX this will actually not be right if there are
+                 # multiple lists but FINE FOR NOW
+                 ul[:inlist] ||= ''
+                 tl.each do |a|
+                   seen << a
+                   lab = labels[a] ? { [labels[a][1]] => :span,
+                     property: abbreviate(labels[a][0]) } : a
+                   li << { [lab] => :li, resource: a }
+                 end
+               end
+             end
+             dd << ul unless li.empty?
+           end
+           al += [{ [label] => :dt }, { dd => :dd }] unless dd.empty?
+         end
+
+         # ref list
+         rl = referents[k].sort do |a, b|
+           cmp_label a[0], b[0], labels: labels
+         end.map do |ref, pset|
+           lab = labels[ref] ? { [labels[ref][1]] => :span,
+             property: abbreviate(labels[ref][0]) } : ref
+
+           { [{ [lab] => :a, rev: abbreviate(pset), href: canon[ref] }] => :li }
+         end
+
+         contents = [h2, {
+           al + [{ ['Referenced in:'] => :dt },
+             { [{ rl => :ul }] => :dd }] => :dl }]
+
+         body << { contents => :section,
+           rel: pf.call(k), resource: k.to_s, typeof: t }
+       end
+
+       # prepend abstract to body if it exists
+       abs = label_for id, candidates: struct, desc: true
+       if abs
+         tag = { '#p' => abs[1], property: abbreviate(abs[0]) }
+         body.unshift tag
+       end
+
+       # add labels to nodes
+       nodes += smush_struct labels
+
+       # get prefixes
+       pfx = prefix_subset prefixes, nodes
+
+       # get title tag
+       title = title_tag labels[id][0], labels[id][1],
+         prefixes: prefixes, lang: 'en'
+
+       # get links
+       link = head_links id,
+         struct: struct, ignore: bodynodes, labels: labels, vocab: XHV
+
+       # get metas
+       mn = {}
+       mn[abs[1]] = :description if abs
+       mi = Set.new
+       mi << labels[id][1] if labels[id]
+       meta = head_meta id,
+         struct: struct, lang: 'en', ignore: mi, meta_names: mn, vocab: XHV
+
+       meta += generate_twitter_meta(id) || []
+
+       xhtml_stub(base: uri, prefix: pfx, lang: 'en', title: title, vocab: XHV,
+         link: link, meta: meta, transform: @config[:transform],
+         body: { body => :body, about: '',
+           typeof: abbreviate(struct[RDF::RDFV.type] || []) }).document
+     end
+
+     # generate skos concept schemes
+
+     CONCEPTS = Util.all_related(RDF::Vocab::SKOS.Concept).to_set
+
+     def generate_audience_csv file = nil, published: true
+       require 'csv'
+       file = coerce_to_path_or_io file if file
+       lab = {}
+
+       out = all_internal_docs(published: published,
+         exclude: RDF::Vocab::FOAF.Image).map do |s|
+         u = canonical_uri s
+         x = struct_for s
+         c = x[RDF::Vocab::DC.created] ? x[RDF::Vocab::DC.created][0] : nil
+         _, t = label_for s, candidates: x
+         _, d = label_for s, candidates: x, desc: true
+
+         # # audience(s)
+         # a = objects_for(s, RDF::Vocab::DC.audience).map do |au|
+         #   next lab[au] if lab[au]
+         #   _, al = label_for au
+         #   lab[au] = al
+         # end.map(&:to_s).sort.join '; '
+
+         # # explicit non-audience(s)
+         # n = objects_for(s, RDF::SAK::CI['non-audience']).map do |au|
+         #   next lab[au] if lab[au]
+         #   _, al = label_for au
+         #   lab[au] = al
+         # end.map(&:to_s).sort.join '; '
+
+         # audience and non-audience
+         a, n = [RDF::Vocab::DC.audience, CI['non-audience']].map do |ap|
+           objects_for(s, ap).map do |au|
+             next lab[au] if lab[au]
+             _, al = label_for au
+             lab[au] = al
+           end.map(&:to_s).sort.join '; '
+         end
+
+         # concepts???
+         concepts = [RDF::Vocab::DC.subject, CI.introduces,
+           CI.assumes, CI.mentions].map do |pred|
+           objects_for(s, pred, only: :resource).map do |o|
+             con = self.objects_for(o, RDF.type).to_set & CONCEPTS
+             next if con.empty?
+             next lab[o] if lab[o]
+             _, ol = label_for o
+             lab[o] = ol
+           end.compact.map(&:to_s).sort.join '; '
+         end
+
+         [s, u, c, t, d, a, n].map(&:to_s) + concepts
+       end.sort { |a, b| a[2] <=> b[2] }
+
+       out.unshift ['ID', 'URL', 'Created', 'Title', 'Description', 'Audience',
+         'Non-Audience', 'Subject', 'Introduces', 'Assumes', 'Mentions']
+
+       if file
+         # don't open until now
+         file = file.expand_path.open('wb') unless file.is_a? IO
+
+         csv = CSV.new file
+         out.each { |x| csv << x }
+         file.flush
+       end
+
+       out
+     end
+
+     CSV_PRED = {
+       audience: RDF::Vocab::DC.audience,
+       nonaudience: CI['non-audience'],
+       subject: RDF::Vocab::DC.subject,
+       assumes: CI.assumes,
+       introduces: CI.introduces,
+       mentions: CI.mentions,
+     }
+
+     def ingest_csv file
+       file = coerce_to_path_or_io file
+
+       require 'csv'
+
+       # key mapper
+       km = { uuid: :id, url: :uri }
+       kt = -> (k) { km[k] || k }
+
+       # grab all the concepts and audiences
+
+       audiences = {}
+       all_of_type(CI.Audience).map do |c|
+         s = struct_for c
+
+         # homogenize the labels
+         lab = [false, true].map do |b|
+           label_for(c, candidates: s, unique: false, alt: b).map { |x| x[1] }
+         end.flatten.map { |x| x.to_s.strip.downcase }
+
+         # we want all the keys to share the same set
+         set = nil
+         lab.each { |t| set = audiences[t] ||= set || Set.new }
+         set << c
+       end
+
+       concepts = {}
+       all_of_type(RDF::Vocab::SKOS.Concept).map do |c|
+         s = struct_for c
+
+         # homogenize the labels
+         lab = [false, true].map do |b|
+           label_for(c, candidates: s, unique: false, alt: b).map { |x| x[1] }
+         end.flatten.map { |x| x.to_s.strip.downcase }
+
+         # we want all the keys to share the same set
+         set = nil
+         lab.each { |t| set = concepts[t] ||= set || Set.new }
+         set << c
+       end
+
+       data = CSV.read(file, headers: true,
+         header_converters: :symbol).map do |o|
+         o = o.to_h.transform_keys(&kt)
+         s = canonical_uuid(o.delete :id) or next
+
+         # LOLOL wtf
+
+         # handle audience
+         [:audience, :nonaudience].each do |a|
+           if o[a]
+             o[a] = o[a].strip.split(/\s*[;,]+\s*/, -1).map do |t|
+               if t =~ /^[a-z+-]+:[^[:space:]]+$/
+                 u = RDF::URI(t)
+                 canonical_uuid(u) || u
+               elsif audiences[t.downcase]
+                 audiences[t.downcase].to_a
+               end
+             end.flatten.compact.uniq
+           else
+             o[a] = []
+           end
+         end
+
+         # handle concepts
+         [:subject, :introduces, :assumes, :mentions].each do |a|
+           if o[a]
+             o[a] = o[a].strip.split(/\s*[;,]+\s*/, -1).map do |t|
+               if t =~ /^[a-z+-]+:[^[:space:]]+$/
+                 u = RDF::URI(t)
+                 canonical_uuid(u) || u
+               elsif concepts[t.downcase]
+                 concepts[t.downcase].to_a
+               end
+             end.flatten.compact.uniq
+           else
+             o[a] = []
+           end
+
+         end
+
+         CSV_PRED.each do |sym, pred|
+           o[sym].each do |obj|
+             @graph << [s, pred, obj]
+           end
+         end
+
+         [s, o]
+       end.compact.to_h
+       data
+     end
+
+     def generate_sitemap published: true
+       urls = {}
+
+       # do feeds separately
+       feeds = all_of_type RDF::Vocab::DCAT.Distribution
+       #feeds.select! { |f| published? f } if published
+       feeds.each do |f|
+         uri = canonical_uri(f)
+         f = generate_atom_feed f, published: published, related: feeds
+         mt = f.at_xpath('/atom:feed/atom:updated[1]/text()',
+           { atom: 'http://www.w3.org/2005/Atom' })
+         urls[uri] = { [{ [uri.to_s] => :loc }, { [mt] => :lastmod }] => :url }
+       end
+
+       # build up hash of urls
+       all_internal_docs(published: published).each do |doc|
+         next if asserted_types(doc).include? RDF::Vocab::FOAF.Image
+         uri = canonical_uri(doc)
+         next unless uri.authority && @base && uri.authority == base.authority
+         mods = objects_for(doc, [RDF::Vocab::DC.created,
+           RDF::Vocab::DC.modified, RDF::Vocab::DC.issued],
+           datatype: RDF::XSD.dateTime).sort
+         nodes = [{ [uri.to_s] => :loc }]
+         nodes << { [mods[-1].to_s] => :lastmod } unless mods.empty?
+         urls[uri] = { nodes => :url }
+       end
+
+       urls = urls.sort.map { |_, v| v }
+
+       markup(spec: { urls => :urlset,
+         xmlns: 'http://www.sitemaps.org/schemas/sitemap/0.9' }).document
+     end
+
+     def write_sitemap published: true
+       sitemap = generate_sitemap published: published
+       file = @config[:sitemap] || '.well-known/sitemap.xml'
+       target = @config[published ? :target : :private]
+       target.mkpath unless target.directory?
+
+       fh = (target + file).open(?w)
+       sitemap.write_to fh
+       fh.close
+     end
+
+     # generate atom feed
+
+     #
+     def all_internal_docs published: true, exclude: []
+       # find all UUIDs that are documents
+       docs = all_of_type(RDF::Vocab::FOAF.Document,
+         exclude: exclude).select { |x| x =~ /^urn:uuid:/ }
+
+       # prune out all but the published documents if specified
+       if published
+         p = RDF::Vocab::BIBO.status
+         o = RDF::Vocabulary.find_term(
+           'http://purl.org/ontology/bibo/status/published')
+         docs = docs.select do |s|
+           @graph.has_statement? RDF::Statement(s, p, o)
+         end
+       end
+
+       docs
+     end
+
+     def generate_atom_feed id, published: true, related: []
+       raise 'ID must be a resource' unless id.is_a? RDF::Resource
+
+       # prepare relateds
+       raise 'related must be an array' unless related.is_a? Array
+       related -= [id]
+
+       # feed = struct_for id
+
+       faudy = audiences_for id
+       faudn = audiences_for id, invert: true
+       faudy -= faudn
+
+       docs = all_internal_docs published: published
+
+       # now we create a hash keyed by uuid containing the metadata
+       authors = {}
+       titles = {}
+       dates = {}
+       entries = {}
+       latest = nil
+       docs.each do |uu|
+         # basically make a jsonld-like structure
+         #rsrc = struct_for uu
+
+         indexed = objects_for uu, RDF::SAK::CI.indexed, only: :literal
+         next if !indexed.empty? and indexed.any? { |f| f == false }
+
+         # get id (got it already duh)
+
+         # get audiences
+         audy = audiences_for uu, proximate: true
+         audn = audiences_for uu, proximate: true, invert: true
+
+         #warn "#{faudy.to_s} & #{faud"
+
+         skip = false
+         if audy.empty?
+           # an unspecified audience implies "everybody", but if the
+           # feed's audience *is* specified, then it's not for everybody
+           skip = true unless faudy.empty?
+         else
+           # if document audience matches feed non-audience, disqualify
+           skip = true unless (faudn & audy).empty?
+
+           # absence of an explicit feed audience implies "everybody"
+           if faudy.empty?
+             # if document audience minus feed non-audience has
+             # members, re-qualify
+             skip = false unless (audy - faudn).empty?
+           else
+             # if document audience matches feed audience, re-qualify
+             skip = false unless (faudy & audy).empty?
+           end
+         end
+
+         # if document non-audience matches feed audience, re-disqualify
+         skip = true if !(audn.empty? || faudy.empty?) && !(faudy & audn).empty?
+
+         next if skip
+
+         canon = URI.parse(canonical_uri(uu).to_s)
+
+         xml = { '#entry' => [
+           { '#link' => nil, rel: :alternate, href: canon, type: 'text/html' },
+           { '#id' => uu.to_s }
+         ] }
+
+         # get published date first
+         published = (objects_for uu,
+           [RDF::Vocab::DC.issued, RDF::Vocab::DC.created],
+           datatype: RDF::XSD.dateTime)[0]
+
+         # get latest updated date
+         updated = (objects_for uu, RDF::Vocab::DC.modified,
+           datatype: RDF::XSD.dateTime).sort[-1]
+         updated ||= published || RDF::Literal::DateTime.new(DateTime.now)
+         updated = Time.parse(updated.to_s).utc
+         latest = updated if !latest or latest < updated
+
+         xml['#entry'].push({ '#updated' => updated.iso8601 })
+
+         if published
+           published = Time.parse(published.to_s).utc
+           xml['#entry'].push({ '#published' => published.iso8601 })
+           dates[uu] = [published, updated]
+         else
+           dates[uu] = [updated, updated]
+         end
+
+         # get author(s)
+         al = []
+         authors_for(uu).each do |a|
+           unless authors[a]
+             n = label_for a
+             x = authors[a] = { '#author' => [{ '#name' => n[1].to_s }] }
+
+             hp = @graph.first_object [a, RDF::Vocab::FOAF.homepage, nil]
+             hp ||= canonical_uri a
+
+             x['#author'].push({ '#uri' => hp.to_s }) if hp
+           end
+
+           al.push authors[a]
+         end
+
+         xml['#entry'] += al unless al.empty?
+
+         # get title (note unshift)
+         if (t = label_for uu)
+           titles[uu] = t[1].to_s
+           xml['#entry'].unshift({ '#title' => t[1].to_s })
+         else
+           titles[uu] = uu.to_s
+         end
+
+         # get abstract
+         if (d = label_for uu, desc: true)
+           xml['#entry'].push({ '#summary' => d[1].to_s })
+         end
+
+         entries[uu] = xml
+       end
+
+       # note we overwrite the entries hash here with a sorted array
+       entrycmp = -> a, b {
+         # first we sort by published date
+         p = dates[a][0] <=> dates[b][0]
+         # if the published dates are the same, sort by updated date
+         u = dates[a][1] <=> dates[b][1]
+         # to break any ties, finally sort by title
+         p == 0 ? u == 0 ? titles[a] <=> titles[b] : u : p }
+       entries = entries.values_at(
+         *entries.keys.sort { |a, b| entrycmp.call(a, b) })
+       # ugggh god forgot the asterisk and lost an hour
+
+       # now we punt out the doc
+
+       preamble = [
+         { '#id' => id.to_s },
+         { '#updated' => latest.iso8601 },
+         { '#generator' => 'RDF::SAK', version: RDF::SAK::VERSION,
+           uri: "https://github.com/doriantaylor/rb-rdf-sak" },
+         { nil => :link, rel: :self, type: 'application/atom+xml',
+           href: canonical_uri(id) },
+         { nil => :link, rel: :alternate, type: 'text/html',
+           href: @base },
+       ] + related.map do |r|
+         { nil => :link, rel: :related, type: 'application/atom+xml',
+           href: canonical_uri(r) }
+       end
+
+       if (t = label_for id)
+         preamble.unshift({ '#title' => t[1].to_s })
+       end
+
+       if (r = @graph.first_literal [id, RDF::Vocab::DC.rights, nil])
+         rh = { '#rights' => r.to_s, type: :text }
+         rh['xml:lang'] = r.language if r.has_language?
+         preamble.push rh
+       end
+
+       markup(spec: { '#feed' => preamble + entries,
+         xmlns: 'http://www.w3.org/2005/Atom' }).document
+     end
+
+     def write_feeds type: RDF::Vocab::DCAT.Distribution, published: true
+       feeds = all_of_type type
+       target = @config[published ? :target : :private]
+       feeds.each do |feed|
+         tu = URI(feed.to_s)
+         doc = generate_atom_feed feed, published: published, related: feeds
+         fh = (target + "#{tu.uuid}.xml").open('w')
+         doc.write_to fh
+         fh.close
+       end
+     end
+
+     # generate sass palettes
+
+     # generate rewrite map(s)
+     def generate_rewrite_map published: false, docs: nil
+       docs ||= reachable published: published
+       base = URI(@base.to_s)
+       rwm = {}
+       docs.each do |doc|
+         tu = URI(doc.to_s)
+         cu = canonical_uri doc, rdf: false
+         next unless tu.respond_to?(:uuid) and cu.respond_to?(:request_uri)
+
+         # skip external links obvs
+         next unless base.route_to(cu).relative?
+
+         # skip /uuid form
+         cp = cu.request_uri.delete_prefix '/'
+         next if cu.host == base.host and tu.uuid == cp
+
+         rwm[cp] = tu.uuid
+       end
+
+       rwm
+     end
+
+     # give me all UUIDs of all documents, filter for published if
+     # applicable
+     #
+     # find the "best" (relative) URL for the UUID and map the pair
+     # together
+     def generate_uuid_redirect_map published: false, docs: nil
+       docs ||= reachable published: published
+
+       base = URI(@base.to_s)
+
+       # keys are /uuid, values are
+       out = {}
+       docs.each do |doc|
+         tu = URI(doc.to_s)
+         cu = canonical_uri doc, rdf: false
+         next unless tu.respond_to?(:uuid) and cu.respond_to?(:request_uri)
+
+         # skip /uuid form
+         cp = cu.request_uri.delete_prefix '/'
+         next if cu.host == base.host && tu.uuid == cp
+
+         # all redirect links are absolute
+         out[tu.uuid] = cu.to_s
+       end
+       out
+     end
+
+     # find all URIs/slugs that are *not* canonical, map them to slugs
+     # that *are* canonical
+     def generate_slug_redirect_map published: false, docs: nil
+       docs ||= reachable published: published
+       base = URI(@base.to_s)
+
+       # for redirects we collect all the docs, plus all their URIs,
+       # separate canonical from the rest
+
+       # actually an easy way to do this is just harvest all the
+       # multi-addressed docs, remove the first one, then ask for the
+       # canonical uuid back,
+
+       fwd = {}
+       rev = {}
+       out = {}
+
+       docs.each do |doc|
+         uris = canonical_uri doc, unique: false, rdf: false
+         canon = uris.shift
+         next unless canon.respond_to? :request_uri
+
+         # cache the forward direction
+         fwd[doc] = canon
+
+         unless uris.empty?
+           uris.each do |uri|
+             next unless uri.respond_to? :request_uri
+             next if canon == uri
+             next unless base.route_to(uri).relative?
+
+             # warn "#{canon} <=> #{uri}"
+
+             requri = uri.request_uri.delete_prefix '/'
+             next if requri == '' ||
+               requri =~ /^[0-9a-f]{8}(?:-[0-9a-f]{4}){4}[0-9a-f]{8}$/
+
+             # cache the reverse direction
+             rev[uri] = requri
+           end
+         end
+       end
+
+       rev.each do |uri, requri|
+         if (doc = canonical_uuid(uri, published: published)) and
+             fwd[doc] and fwd[doc] != uri
+           out[requri] = fwd[doc].to_s
+         end
+       end
+
+       out
+     end
+
+     # you know what, it's entirely possible that these ought never be
+     # called individually and the work to get one would duplicate the
+     # work of getting the other, so maybe just do 'em both at once
+
+     def generate_redirect_map published: false, docs: nil
+       generate_uuid_redirect_map(published: published, docs: docs).merge(
+         generate_slug_redirect_map(published: published, docs: docs))
+     end
+
+     def generate_gone_map published: false, docs: nil
+       # published is a no-op for this one because these docs are by
+       # definition not published
+       docs ||= reachable published: false
+       p = RDF::Vocab::BIBO.status
+       base = URI(@base.to_s)
+       out = {}
+       docs.select { |s|
+         @graph.has_statement? RDF::Statement(s, p, CI.retired) }.each do |doc|
+         canon = canonical_uri doc, rdf: false
+         next unless base.route_to(canon).relative?
+         canon = canon.request_uri.delete_prefix '/'
+         # value of the gone map doesn't matter
+         out[canon] = canon
+       end
+
+       out
+     end
+
+     # private?
+
+     def map_location type
+       # find file name in config
+       fn = @config[:maps][type] or return
+
+       # concatenate to target directory
+       @config[:target] + fn
+     end
+
+     # private?
+
+     def write_map_file location, data
+       # open file
+       fh = File.new location, 'w'
+       data.sort.each { |k, v| fh.write "#{k}\t#{v}\n" }
+       fh.close # return value is return value from close
+     end
+
+     # public again
+
+     def write_rewrite_map published: false, docs: nil
+       data = generate_rewrite_map published: published, docs: docs
+       loc = map_location :rewrite
+       write_map_file loc, data
+     end
+
+     def write_redirect_map published: false, docs: nil
+       data = generate_redirect_map published: published, docs: docs
+       loc = map_location :redirect
+       write_map_file loc, data
+     end
+
+     def write_gone_map published: false, docs: nil
+       data = generate_gone_map published: published, docs: docs
+       loc = map_location :gone
+       write_map_file loc, data
+     end
+
+     def write_maps published: true, docs: nil
+       docs ||= reachable published: false
+       # slug to uuid (internal)
+       write_rewrite_map docs: docs
+       # uuid/slug to canonical slug (308)
+       write_redirect_map docs: docs
+       # retired slugs/uuids (410)
+       write_gone_map docs: docs
+       true
+     end
1551
+
1552
+ # whoops lol we forgot the book list
1553
+
1554
+ def reading_lists published: true
1555
+ out = all_of_type RDF::Vocab::SiocTypes.ReadingList
1556
+ return out unless published
1557
+ out.select { |r| published? r }
1558
+ end
1559
+
1560
+ def generate_reading_list subject, published: true
1561
+ # struct = struct_for subject
1562
+
1563
+ # find all the books, sort them by title
1564
+
1565
+ # for each book, give title, authors, inbound references
1566
+
1567
+ # punt out xhtml
1568
+ end
1569
+
1570
+ def write_reading_lists published: true
1571
+ reading_lists(published: published).each do |rl|
1572
+ tu = URI(rl.to_s)
1573
+ doc = generate_reading_list rl, published: published
1574
+ fh = (target + "#{tu.uuid}.xml").open('w')
1575
+ doc.write_to fh
1576
+ fh.close
1577
+ end
1578
+ end
1579
+
1580
+ DSD_SEQ = %i[characters words blocks sections
1581
+ min low-quartile median high-quartile max mean sd].freeze
1582
+ TH_SEQ = %w[Document Abstract Created Modified Characters Words Blocks
1583
+ Sections Min Q1 Median Q3 Max Mean SD].map { |t| { [t] => :th } }
1584
+
1585
+ def generate_stats published: true
1586
+ out = {}
1587
+ all_of_type(QB.DataSet).map do |s|
1588
+ base = canonical_uri s, rdf: false
1589
+ types = abbreviate asserted_types(s)
1590
+ title = if t = label_for(s)
1591
+ [t[1].to_s, abbreviate(t[0])]
1592
+ end
1593
+ cache = {}
1594
+ subjects_for(QB.dataSet, s, only: :resource).each do |o|
1595
+ if d = objects_for(o, CI.document, only: :resource).first
1596
+ if !published or published?(d)
1597
+ # include a "sort" time that defaults to epoch zero
1598
+ c = cache[o] ||= {
1599
+ doc: d, stime: Time.at(0).getgm, struct: struct_for(o) }
1600
+
1601
+ if t = label_for(d)
1602
+ c[:title] = t
1603
+ end
1604
+ if a = label_for(d, desc: true)
1605
+ c[:abstract] = a
1606
+ end
1607
+ if ct = objects_for(d,
1608
+ RDF::Vocab::DC.created, datatype: RDF::XSD.dateTime).first
1609
+ c[:stime] = c[:ctime] = ct.object.to_time.getgm
1610
+ end
1611
+ if mt = objects_for(d,
1612
+ RDF::Vocab::DC.modified, datatype:RDF::XSD.dateTime)
1613
+ c[:mtime] = mt.map { |m| m.object.to_time.getgm }.sort
1614
+ c[:stime] = c[:mtime].last unless mt.empty?
1615
+ end
1616
+ end
1617
+ end
1618
+ end
1619
+
1620
+ # sort lambda closure
1621
+ sl = -> a, b do
1622
+ x = cache[b][:stime] <=> cache[a][:stime]
1623
+ return x unless x == 0
1624
+ x = cache[b][:ctime] <=> cache[a][:ctime]
1625
+ return x unless x == 0
1626
+ ta = cache[a][:title] || Array.new(2, cache[a][:uri])
1627
+ tb = cache[b][:title] || Array.new(2, cache[b][:uri])
1628
+ ta[1].to_s <=> tb[1].to_s
1629
+ end
1630
+
1631
+ rows = []
1632
+ cache.keys.sort(&sl).each do |k|
1633
+ c = cache[k]
1634
+ href = base.route_to canonical_uri(c[:doc], rdf: false)
1635
+ dt = abbreviate asserted_types(c[:doc])
1636
+ uu = URI(k.to_s).uuid
1637
+ nc = UUID::NCName.to_ncname uu, version: 1
1638
+ tp, tt = c[:title] || []
1639
+ ab = if c[:abstract]
1640
+ { [c[:abstract][1].to_s] => :th, about: href,
1641
+ property: abbreviate(c[:abstract].first) }
1642
+ else
1643
+ { [] => :th }
1644
+ end
1645
+
1646
+ td = [{ { { [tt.to_s] => :span, property: abbreviate(tp) } => :a,
1647
+ rel: 'ci:document', href: href } => :th },
1648
+ ab,
1649
+ { [c[:ctime].iso8601] => :th, property: 'dct:created',
1650
+ datatype: 'xsd:dateTime', about: href, typeof: dt },
1651
+ { c[:mtime].reverse.map { |m| { [m.iso8601] => :span,
1652
+ property: 'dct:modified', datatype: 'xsd:dateTime' } } => :th,
1653
+ about: href
1654
+ },
1655
+ ] + DSD_SEQ.map do |f|
1656
+ h = []
1657
+ x = { h => :td }
1658
+ p = CI[f]
1659
+ if y = c[:struct][p] and !y.empty?
1660
+ h << y = y.first
1661
+ x[:property] = abbreviate p
1662
+ x[:datatype] = abbreviate y.datatype if y.datatype?
1663
+ end
1664
+ x
1665
+ end
1666
+ rows << { td => :tr, id: nc, about: "##{nc}",
1667
+ typeof: 'qb:Observation' }
1668
+ end
1669
+
1670
+ out[s] = xhtml_stub(base: base, title: title,
1671
+ transform: config[:transform], attr: { about: '', typeof: types },
1672
+ prefix: prefixes, content: {
1673
+ [{ [{ [{ ['About'] => :th, colspan: 4 },
1674
+ { ['Counts'] => :th, colspan: 4 },
1675
+ { ['Words per Block'] => :th, colspan: 7 }] => :tr },
1676
+ { TH_SEQ => :tr } ] => :thead },
1677
+ { rows => :tbody, rev: 'qb:dataSet' }] => :table }).document
1678
+ end
1679
+
1680
+ out
1681
+ end
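+
+ # a rough usage sketch ("ctx" here is a hypothetical configured context):
+ #
+ # ctx.generate_stats(published: true).each do |subject, doc|
+ # # subject is the qb:DataSet URI; doc is a complete Nokogiri document
+ # end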
1682
+
1683
+ def write_stats published: true
1684
+ target = @config[published ? :target : :private]
1685
+ target.mkpath unless target.directory?
1686
+ generate_stats(published: published).each do |uu, doc|
1687
+ bn = URI(uu.to_s).uuid + '.xml'
1688
+ fh = (target + bn).open('w')
1689
+ doc.write_to fh
1690
+ fh.flush
1691
+ fh.close
1692
+ end
1693
+ end
1694
+
1695
+ # - io stuff -
1696
+
1697
+ # Locate the file in the source directory associated with the given URI.
1698
+ #
1699
+ # @param uri [RDF::URI, URI, #to_s] the URI requested
1700
+ #
1701
+ # @return [Pathname, nil] the corresponding file, or nil if none was found
1702
+
1703
+ def locate uri
1704
+ uri = coerce_resource uri
1705
+
1706
+ base = URI(@base.to_s)
1707
+
1708
+ tu = URI(uri) # copy of uri for testing content
1709
+ unless tu.scheme == 'urn' and tu.nid == 'uuid'
1710
+ raise "could not find UUID for #{uri}" unless uuid = canonical_uuid(uri)
1711
+ tu = URI(uri = uuid)
1712
+ end
1713
+
1714
+ # xxx bail if the uri isn't a subject in the graph
1715
+
1716
+ candidates = [@config[:source] + tu.uuid]
1717
+
1718
+ # try all canonical URIs
1719
+ (canonical_uri uri, unique: false, slugs: true).each do |u|
1720
+ u = URI(u.to_s)
1721
+ next unless u.hostname == base.hostname
1722
+ p = URI::DEFAULT_PARSER.unescape u.path[/^\/*(.*?)$/, 1] # URI.unescape was removed in Ruby 3
1723
+ candidates.push(@config[:source] + p)
1724
+ end
1725
+
1726
+ # warn candidates
1727
+
1728
+ files = candidates.uniq.map do |c|
1729
+ Pathname.glob(c.to_s + '{,.*,/index{,.*}}')
1730
+ end.reduce(:+).reject do |x|
1731
+ x.directory? or RDF::SAK::MimeMagic.by_path(x).to_s !~
1732
+ /.*(?:markdown|(?:x?ht|x)ml).*/i
1733
+ end.uniq
1734
+
1735
+ #warn files
1736
+
1737
+ # XXX implement negotiation algorithm
1738
+ return files[0]
1739
+
1740
+ # return the filename from the source
1741
+ # nil
1742
+ end
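+
+ # e.g., with a hypothetical "ctx" and a UUID known to the graph:
+ #
+ # ctx.locate 'urn:uuid:...' # => Pathname under @config[:source], or nil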
1743
+
1744
+ # Visit (open) the document at the given URI.
1745
+ #
1746
+ # @param uri [RDF::URI, URI, #to_s]
1747
+ #
1748
+ # @return [RDF::SAK::Context::Document, nil]
1749
+
1750
+ def visit uri
1751
+ uri = canonical_uuid(uri) or return # nil when the UUID can't be resolved
1752
+ path = locate uri
1753
+ return unless path
1754
+ Document.new self, uri, uri: canonical_uri(uri), doc: path
1755
+ end
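+
+ # e.g., visiting a subject and transforming it in one go (a sketch;
+ # "ctx" hypothetical as above):
+ #
+ # if doc = ctx.visit('urn:uuid:...')
+ # doc.transform_xhtml published: true
+ # end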
1756
+
1757
+ # resolve documents from source
1758
+ def resolve_documents
1759
+ src = @config[:source]
1760
+ out = []
1761
+ src.find do |f|
1762
+ Find.prune if f.basename.to_s[0] == ?.
1763
+ next if f.directory?
1764
+ out << f
1765
+ end
1766
+
1767
+ out
1768
+ end
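+
+ # e.g. ctx.resolve_documents.map { |f| ctx.resolve_file f }.compact
+ # yields the UUIDs of everything under the source tree the graph knows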
1769
+
1770
+ def resolve_file path
1771
+ return unless path.file?
1772
+ path = Pathname('/') + path.relative_path_from(@config[:source])
1773
+ base = URI(@base.to_s)
1774
+ uri = base + path.to_s
1775
+
1776
+ #warn "trying #{uri}"
1777
+
1778
+ until (out = canonical_uuid uri)
1779
+ # iteratively strip off the extension (or, for index files, the last path segment)
1780
+ break if uri.path.end_with? '/'
1781
+
1782
+ dn = path.dirname
1783
+ bn = path.basename '.*'
1784
+
1785
+ # try index first
1786
+ if bn.to_s == 'index'
1787
+ p = dn.to_s
1788
+ p << '/' unless p.end_with? '/'
1789
+ uri = base + p
1790
+ elsif bn == path.basename
1791
+ break
1792
+ else
1793
+ path = dn + bn
1794
+ uri = base + path.to_s
1795
+ end
1796
+
1797
+ # warn "trying #{uri}"
1798
+ end
1799
+
1800
+ out
1801
+ end
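+
+ # e.g. $source/foo/bar.xml is tried as /foo/bar.xml then /foo/bar,
+ # while $source/foo/index.xml falls back to /foo/ before giving up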
1802
+
1803
+ # Determine whether the URI represents a published document.
1804
+ #
1805
+ # @param uri [RDF::URI, URI, #to_s]
+ # @param circulated [true, false]
1806
+ #
1807
+ # @return [true, false]
1808
+ def published? uri, circulated: false
1809
+ RDF::SAK::Util.published? @graph, uri,
1810
+ circulated: circulated, base: @base
1811
+ end
1812
+
1813
+ # Find a destination pathname for the document
1814
+ #
1815
+ # @param uri [RDF::URI, URI, #to_s]
1816
+ # @param published [true, false]
1817
+ #
1818
+ # @return [Pathname]
1819
+ def target_for uri, published: false
1820
+ uri = coerce_resource uri
1821
+ uri = canonical_uuid uri
1822
+ target = @config[published?(uri) && published ? :target : :private]
1823
+
1824
+ # target is a pathname so this makes a pathname
1825
+ target + "#{URI(uri.to_s).uuid}.xml"
1826
+ end
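+
+ # e.g. (hypothetical uuid): target_for('urn:uuid:...') yields
+ # $target/<uuid>.xml when published, $private/<uuid>.xml otherwise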
1827
+
1828
+ # read from source
1829
+
1830
+ # write (manipulated (x|x?ht)ml) back to source
1831
+
1832
+ # write public and private variants to target
1833
+
1834
+ def write_xhtml published: true
1835
+ end
1836
+
1837
+ # write modified rdf
1838
+
1839
+ # - internet stuff -
1840
+
1841
+ # verify external links for upness
1842
+
1843
+ # collect triples for external links
1844
+
1845
+ # fetch references for people/companies/concepts/etc from dbpedia/wikidata
1846
+
1847
+ # - document context class -
1848
+
1849
+ class Document
1850
+ include XML::Mixup
1851
+ include Util
1852
+
1853
+ private
1854
+
1855
+ C_OK = [Nokogiri::XML::Node, IO, Pathname].freeze
1856
+
1857
+ public
1858
+
1859
+ attr_reader :doc, :uuid, :uri
1860
+
1861
+ def initialize context, uuid, doc: nil, uri: nil, mtime: nil
1862
+ raise 'context must be an RDF::SAK::Context' unless
1863
+ context.is_a? RDF::SAK::Context
1864
+ raise 'uuid must be an RDF::URI' unless
1865
+ uuid.is_a? RDF::URI and uuid.to_s.start_with? 'urn:uuid:'
1866
+
1867
+ doc ||= context.locate uuid
1868
+ raise 'doc must be Pathname, IO, or Nokogiri node' unless
1869
+ C_OK.any? { |c| doc.is_a? c } || doc.respond_to?(:to_s)
1870
+
1871
+ # set some instance variables
1872
+ @context = context
1873
+ @uuid = uuid
1874
+ @mtime = mtime || (doc.respond_to?(:mtime) ? doc.mtime : Time.now)
1875
+ @target = context.target_for uuid
1876
+
1877
+ # now process the document
1878
+
1879
+ # turn the document into an XML::Document
1880
+ if doc.is_a? Nokogiri::XML::Node
1881
+ # a node that is not a document should be wrapped with one
1882
+ unless doc.is_a? Nokogiri::XML::Document
1883
+ d = doc.dup 1
1884
+ doc = Nokogiri::XML::Document.new
1885
+ doc << d
1886
+ end
1887
+ else
1888
+ type = nil
1889
+
1890
+ # pathnames turned into IO objects
1891
+ if doc.is_a? Pathname
1892
+ type = RDF::SAK::MimeMagic.by_path doc
1893
+ doc = doc.open # this may raise if the file isn't there
1894
+ end
1895
+
1896
+ # squash everything else to a string
1897
+ doc = doc.to_s unless doc.is_a? IO
1898
+
1899
+ # check type by content
1900
+ type ||= RDF::SAK::MimeMagic.by_magic(doc)
1901
+
1902
+ # can you believe there is a special bookmarks mime type good grief
1903
+ type = 'text/html' if type == 'application/x-mozilla-bookmarks'
1904
+
1905
+ # now we try to parse the blob
1906
+ if type.to_s =~ /xml/i
1907
+ doc = Nokogiri.XML doc
1908
+ elsif type == 'text/html'
1909
+ # if the detected type is html, try it as strict xml first
1910
+ attempt = nil
1911
+ begin
1912
+ attempt = Nokogiri.XML doc, nil, nil, (1 << 11) # NONET
1913
+ rescue Nokogiri::XML::SyntaxError
1914
+ # do not wrap this a second time; let it fail if it's gonna
1915
+ tmp = Nokogiri.HTML doc
1916
+ attempt = Nokogiri::XML::Document.new
1917
+ attempt << tmp.root.dup(1)
1918
+ end
1919
+ doc = attempt
1920
+ elsif type.to_s =~ /^text\/(?:plain|(?:x-)?markdown)/i
1921
+ # just assume plain text is markdown
1922
+ doc = ::MD::Noko.new.ingest doc
1923
+ else
1924
+ raise "Don't know what to do with #{uuid} (#{type})"
1925
+ end
1926
+ end
1927
+
1928
+ # now fix the namespaces for mangled html documents
1929
+ root = doc.root
1930
+ if root.name == 'html'
1931
+ unless root.namespace
1932
+ # clear this off or it will be duplicated in the output
1933
+ root.remove_attribute('xmlns')
1934
+ # now generate a new ns object
1935
+ ns = root.add_namespace(nil, XHTMLNS)
1936
+ # *now* scan the document and add the namespace declaration
1937
+ root.traverse do |node|
1938
+ if node.element? && node.namespace.nil?
1939
+ # downcasing the name may be cargo culting; need to check
1940
+ # node.name = node.name.downcase # yup it is
1941
+ node.namespace = ns
1942
+ end
1943
+ end
1944
+ end
1945
+
1946
+ # also add the magic blank doctype declaration if it's missing
1947
+ unless doc.internal_subset
1948
+ doc.create_internal_subset('html', nil, nil)
1949
+ end
1950
+ end
1951
+
1952
+ # aaand set some more instance variables
1953
+
1954
+ @uri = URI(uri || @context.canonical_uri(uuid))
1955
+
1956
+ # voilà
1957
+ @doc = doc
1958
+ end
1959
+
1960
+ # proxy for context published
1961
+ def published?
1962
+ @context.published? @uuid
1963
+ end
1964
+
1965
+ def base_for node = nil
1966
+ node ||= @doc
1967
+ doc = node.document
1968
+ base = @uri.to_s
1969
+ if doc.root.name.to_sym == :html
1970
+ b = doc.at_xpath(
1971
+ '(/html:html/html:head/html:base[@href])[1]/@href',
1972
+ { html: XHTMLNS }).to_s.strip
1973
+ base = b if URI(b).absolute?
1974
+ elsif b = node.at_xpath('ancestor-or-self::*[@xml:base][1]/@xml:base')
1975
+ b = b.to_s.strip
1976
+ base = b if URI(b).absolute?
1977
+ end
1978
+
1979
+ URI(base)
1980
+ end
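+
+ # i.e., honour <base href="..."/> for (X)HTML and the nearest
+ # @xml:base for anything else, but only when absolute; otherwise
+ # fall back to the document's own URI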
1981
+
1982
+ # notice these are only RDFa attributes that take URIs
1983
+ RDFA_ATTR = [:about, :resource, :typeof].freeze
1984
+ LINK_ATTR = [:href, :src, :data, :action, :longdesc].freeze
1985
+ LINK_XPATH = ('.//html:*[not(self::html:base)][%s]' %
1986
+ (LINK_ATTR + RDFA_ATTR).map { |a| "@#{a.to_s}" }.join('|')).freeze
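+
+ # which expands to .//html:*[not(self::html:base)]
+ # [@href|@src|@data|@action|@longdesc|@about|@resource|@typeof]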
1987
+
1988
+ def rewrite_links node = @doc, uuids: {}, uris: {}, &block
1989
+ base = base_for node
1990
+ count = 0
1991
+ cache = {}
1992
+ node.xpath(LINK_XPATH, { html: XHTMLNS }).each do |elem|
1993
+ LINK_ATTR.each do |attr|
1994
+ attr = attr.to_s
1995
+ next unless elem.has_attribute? attr
1996
+
1997
+ abs = base.merge uri_pp(elem[attr].strip)
1998
+
1999
+ # fix e.g. http->https
2000
+ if abs.host == @uri.host and abs.scheme != @uri.scheme
2001
+ tmp = @uri.dup
2002
+ tmp.path = abs.path
2003
+ tmp.query = abs.query
2004
+ tmp.fragment = abs.fragment
2005
+ abs = tmp
2006
+ end
2007
+
2008
+ # harvest query string
2009
+ pp = split_pp abs, only: true
2010
+
2011
+ abs = RDF::URI(abs.to_s)
2012
+
2013
+ # round-trip to uuid and back if we can
2014
+ if uuid = uuids[abs] ||= @context.canonical_uuid(abs)
2015
+ abs = cache[abs] ||= @context.canonical_uri(uuid)
2016
+ else
2017
+ abs = cache[abs] ||= @context.canonical_uri(abs)
2018
+ end
2019
+
2020
+ # reinstate the path parameters
2021
+ if !pp.empty? && split_pp(abs, only: true).empty?
2022
+ abs = abs.dup
2023
+ abs.path = ([abs.path] + pp).join(';')
2024
+ end
2025
+
2026
+
2027
+ elem[attr] = @uri.route_to(abs.to_s).to_s
2028
+ count += 1
2029
+ end
2030
+
2031
+ block.call elem if block
2032
+ end
2033
+
2034
+ count
2035
+ end
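+
+ # sketch: rewrite every URI-bearing attribute relative to the document
+ # ("doc" being one of these Document instances):
+ #
+ # doc.rewrite_links do |elem|
+ # warn elem['href'] if elem.key? 'href' # peek at each link element
+ # end # => number of attributes rewritten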
2036
+
2037
+ # sponge the document for rdfa
2038
+ def triples_for
2039
+ end
2040
+
2041
+ OBJS = [:href, :src].freeze
2042
+
2043
+ # ancestor node always with (@property and not @content) and
2044
+ # not @resource|@href|@src unless @rel|@rev
2045
+ LITXP = ['(ancestor::*[@property][not(@content)]',
2046
+ '[not(@resource|@href|@src) or @rel|@rev])[1]' ].join('').freeze
2047
+ # note parentheses cause the index to be counted from the root
2048
+
2049
+ def vocab_for node
2050
+ if node[:vocab]
2051
+ vocab = node[:vocab].strip
2052
+ return nil if vocab == ''
2053
+ return vocab
2054
+ end
2055
+ parent = node.parent
2056
+ vocab_for parent if parent and parent.element?
2057
+ end
2058
+
2059
+ def prefixes_for node, prefixes = {}
2060
+ # start with namespaces
2061
+ pfx = node.namespaces.select do |k, _|
2062
+ k.start_with? 'xmlns:'
2063
+ end.transform_keys do |k|
2064
+ k.delete_prefix 'xmlns:'
2065
+ end
2066
+
2067
+ # then add @prefix overtop of the namespaces
2068
+ if node[:prefix]
2069
+ x = node[:prefix].strip.split(/\s+/)
2070
+ a = []
2071
+ b = []
2072
+ x.each_index { |i| (i % 2 == 0 ? a : b).push x[i] }
2073
+ # if the size is uneven the values will be nil, so we drop them
2074
+ pfx.merge! a.zip(b).to_h.reject { |_, v| v.nil? }
2075
+ end
2076
+
2077
+ # since we're ascending the tree, input takes precedence
2078
+ prefixes = pfx.merge prefixes
2079
+
2080
+ if node.parent and node.parent.element?
2081
+ prefixes_for(node.parent, prefixes)
2082
+ else
2083
+ prefixes
2084
+ end
2085
+ end
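+
+ # this ascends from node to the root, merging each element's xmlns:*
+ # declarations with its RDFa @prefix pairs; nearer (descendant)
+ # declarations win over farther ones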
2086
+
2087
+ # give us the rdf subject of the node itself
2088
+ def subject_for node = nil, rdf: false, is_ancestor: false
2089
+ node ||= @doc.root
2090
+ raise 'Node must be an element' unless
2091
+ node.is_a? Nokogiri::XML::Element
2092
+
2093
+ # first we check for an ancestor element with @property and no
2094
+ # @content; if we find one then we reevaluate with that
2095
+ # element as the starting point
2096
+ if n = node.at_xpath(LITXP)
2097
+ return subject_for n
2098
+ end
2099
+
2100
+ # answer a bunch of helpful questions about this element
2101
+ subject = nil
2102
+ base = base_for node
2103
+ parent = node.parent
2104
+ ns_href = node.namespace.href if node.namespace
2105
+ up_ok = %i{rel rev}.none? { |a| node[a] }
2106
+ is_root = !parent || parent.document?
2107
+ special = (/^(?:[^:]+:)?(?:head|body)$/i === node.name) &&
2108
+ (ns_href == 'http://www.w3.org/1999/xhtml' ||
2109
+ /^(?:[^:]+:)?html$/xi === parent.name)
2110
+
2111
+ # if the node is being inspected as an ancestor to the
2112
+ # original node, we have to check it backwards.
2113
+ if is_ancestor
2114
+ # ah right @resource gets special treatment
2115
+ if subject = node[:resource]
2116
+ subject.strip!
2117
+ if m = /^\[(.*?)\]$/.match(subject) # XXX SafeCURIEs (e.g. [_:foo]) unhandled
2118
+ end
2119
+ else
2120
+ OBJS.each do |attr|
2121
+ if node[attr]
2122
+ # merge with the root and return it
2123
+ subject = base + node[attr]
2124
+ break
2125
+ end
2126
+ end
2127
+ end
2128
+
2129
+ return rdf ? RDF::URI(subject.to_s) : subject
2130
+
2131
+ # note if we are being called with is_ancestor, that means
2132
+ # neither the original node (nor any of the nodes previously
2133
+ # tested) had anything resembling a resource in it. this
2134
+ # means @rel/@rev should be ignored, and we should keep
2135
+ # looking for a subject.
2136
+ end
2137
+
2138
+ if node[:about]
2139
+
2140
+ if m = /^_:(.*)$/.match(node[:about])
2141
+ return RDF::Node(m[1])
2142
+ end
2143
+
2144
+ # XXX resolve @about against potential curie
2145
+ subject = base + node[:about]
2146
+
2147
+ elsif is_root
2148
+ subject = base
2149
+ elsif special
2150
+ subject = subject_for parent
2151
+ elsif node[:resource]
2152
+ # XXX resolve @resource against potential curie
2153
+ subject = base + node[:resource]
2154
+ elsif node[:href]
2155
+ subject = base + node[:href]
2156
+ elsif node[:src]
2157
+ subject = base + node[:src]
2158
+ elsif node[:typeof]
2159
+ # bnode the typeof attr
2160
+
2161
+ # note we return bnodes irrespective of the rdf flag
2162
+ return RDF::Node('id-%016x' % node.attributes['typeof'].pointer_id)
2163
+ elsif node[:inlist]
2164
+ # bnode the inlist attr
2165
+ return RDF::Node('id-%016x' % node.attributes['inlist'].pointer_id)
2166
+ elsif (parent[:inlist] && OBJS.none? { |a| parent[a] }) ||
2167
+ (is_ancestor && !up_ok)
2168
+ # bnode the element
2169
+ return RDF::Node('id-%016x' % node.pointer_id)
2170
+ # elsif node[:id]
2171
+ else
2172
+ subject = subject_for parent, is_ancestor: true
2173
+ end
2174
+
2175
+ rdf ? RDF::URI(subject.to_s) : URI(subject.to_s)
2176
+
2177
+ end
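+
+ # roughly, the RDFa resolution rules this implements:
+ # <x about="#foo"/> -> base + "#foo"
+ # <head>/<body> -> the subject of the parent <html>
+ # the root element -> the base URI itself
+ # @typeof/@inlist -> a freshly minted bnode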
2178
+
2179
+ # backlink structure
2180
+ def generate_backlinks published: true, ignore: nil
2181
+ @context.generate_backlinks @uuid, published: published, ignore: ignore
2182
+ end
2183
+
2184
+ # goofy twitter-specific metadata
2185
+ def generate_twitter_meta
2186
+ @context.generate_twitter_meta @uuid
2187
+ end
2188
+
2189
+ def transform_xhtml published: true
2190
+ # before we do any more work make sure this is html
2191
+ doc = @doc.dup 1
2192
+ body = doc.at_xpath('//html:body[1]', { html: XHTMLNS }) or return
2193
+
2194
+ # eliminate comments
2195
+ doc.xpath('//comment()[not(ancestor::html:script)]',
2196
+ { html: XHTMLNS }).each { |c| c.unlink }
2197
+
2198
+ # initial stuff
2199
+ struct = @context.struct_for @uuid, uuids: true, canon: true
2200
+ # rstruct = @context.struct_for @uuid, uuids: true, rev: true
2201
+ resources = {}
2202
+ literals = {}
2203
+ ufwd = {} # uuid -> uri
2204
+ urev = {} # uri -> uuid
2205
+ datatypes = Set.new
2206
+ types = Set.new
2207
+ authors = @context.authors_for(@uuid)
2208
+ title = @context.label_for @uuid, candidates: struct
2209
+ desc = @context.label_for @uuid, candidates: struct, desc: true
2210
+
2211
+ # rewrite content
2212
+ title = title[1] if title
2213
+ desc = desc[1] if desc
2214
+
2215
+ # `struct` (and `rstruct`, currently commented out) contain the
2216
+ # links and metadata for forward and backward neighbours,
2217
+ # respectively, which we need to mine (predicates, classes,
2218
+ # datatypes) for prefixes among other things.
2219
+
2220
+ struct.each do |p, v|
2221
+ v.each do |o|
2222
+ if o.literal?
2223
+ literals[o] ||= Set.new
2224
+ literals[o].add p
2225
+
2226
+ # collect the datatype
2227
+ datatypes.add o.datatype if o.has_datatype?
2228
+ else
2229
+ # normalize URIs
2230
+ if o.to_s.start_with? 'urn:uuid:'
2231
+ ufwd[o] ||= @context.canonical_uri o
2232
+ elsif cu = @context.canonical_uuid(o)
2233
+ o = urev[o] ||= cu
2234
+ end
2235
+
2236
+
2237
+ # collect the resource
2238
+ resources[o] ||= Set.new
2239
+ resources[o].add p
2240
+
2241
+ # add to type
2242
+ types.add o if p == RDF::RDFV.type
2243
+ end
2244
+ end
2245
+ end
2246
+ urev.merge! ufwd.invert
2247
+
2248
+ labels = resources.keys.map do |k|
2249
+ # turn this into a pair which subsequently gets turned into a hash
2250
+ [k, @context.label_for(k) ]
2251
+ end.to_h
2252
+
2253
+ #warn labels
2254
+
2255
+ # handle the title
2256
+ title ||= RDF::Literal('')
2257
+ tm = { '#title' => title,
2258
+ property: @context.abbreviate(literals[title].to_a, vocab: XHV) }
2259
+ if tl = title.language
2260
+ tm['xml:lang'] = tl # if xmlns
2261
+ tm['lang'] = tl
2262
+ elsif tdt = title.datatype and tdt != RDF::XSD.string
2263
+ tm[:datatype] = @context.abbreviate(tdt)
2264
+ end
2265
+
2266
+ # we accumulate a record of the links in the body so we know
2267
+ # which ones to skip in the head
2268
+ bodylinks = {}
2269
+ rewrite_links body, uuids: ufwd, uris: urev do |elem|
2270
+ vocab = elem.at_xpath('ancestor-or-self::*[@vocab][1]/@vocab')
2271
+ vocab = uri_pp(vocab.to_s) if vocab
2272
+
2273
+ if elem.key?('href') or elem.key?('src')
2274
+ vu = uri_pp(elem['href'] || elem['src'])
2275
+ ru = RDF::URI(@uri.merge(vu))
2276
+ bodylinks[urev[ru] || ru] = true
2277
+
2278
+ if rel = resources[urev[ru] || ru]
2279
+ elem['rel'] = (@context.abbreviate rel, vocab: vocab).join ' '
2280
+ end
2281
+
2282
+ label = labels[urev[ru] || ru]
2283
+ if label and (!elem.key?('title') or elem['title'].strip == '')
2284
+ elem['title'] = label[1].to_s
2285
+ end
2286
+ end
2287
+ end
2288
+
2289
+ # and now we do the head
2290
+ links = []
2291
+ resources.reject { |k, _| bodylinks[k] }.each do |k, v|
2292
+ v = v.dup.delete RDF::RDFV.type
2293
+ next if v.empty?
2294
+ mts = @context.formats_for k
2295
+
2296
+ # warn k, v.inspect
2297
+
2298
+ # warn k, mts.inspect
2299
+
2300
+ rel = @context.abbreviate v.to_a, vocab: XHV
2301
+ ru = @uri.route_to(uri_pp((ufwd[k] || k).to_s))
2302
+ ln = { nil => :link, rel: rel, href: ru.to_s }
2303
+ if (label = labels[urev[k] || k])
2304
+ ln[:title] = label[1].to_s
2305
+ end
2306
+
2307
+ # add type=lol/wut
2308
+ ln[:type] = mts.first.to_s unless mts.empty?
2309
+
2310
+ if !ln[:type] and v.include?(RDF::Vocab::XHV.stylesheet)
2311
+ ln[:type] = 'text/css'
2312
+ elsif ln[:type] =~ /(java|ecma)script/i or
2313
+ v.include?(RDF::Vocab::DC.requires)
2314
+ ln[nil] = :script
2315
+ ln[:src] = ln.delete :href
2316
+ ln[:type] ||= 'text/javascript'
2317
+ end
2318
+ links.push ln
2319
+ end
2320
+
2321
+ links.sort! do |a, b|
2322
+ # sort by rel, then by href
2323
+ # warn a.inspect, b.inspect
2324
+ s = 0
2325
+ [nil, :rel, :rev, :href, :title].each do |k|
2326
+ s = a.fetch(k, '').to_s <=> b.fetch(k, '').to_s
2327
+ break if s != 0
2328
+ end
2329
+ s
2330
+ end
2331
+
2332
+ # we want to duplicate links from particular subjects (eg the root)
2333
+ (@context.config[:duplicate] || {}).sort do |a, b|
2334
+ a.first <=> b.first
2335
+ end.each do |s, preds|
2336
+
2337
+ o = {}
2338
+ u = ufwd[s] ||= @context.canonical_uuid s
2339
+ s = urev[u] ||= @context.canonical_uri u if u
2340
+ f = {}
2341
+
2342
+ # do not include this subject as these links are already included!
2343
+ next if u == @uuid
2344
+
2345
+ # gather up the objects, then gather up the predicates
2346
+
2347
+ @context.objects_for u || s, preds, only: :resource do |obj, rel|
2348
+ # XXX do not know why += |= etc does not work
2349
+ x = @context.canonical_uuid(obj) || obj
2350
+ urev[x] ||= @context.canonical_uri x
2351
+ y = o[x] ||= Set.new
2352
+ o[x] = y | rel
2353
+ f[x] = @context.formats_for x
2354
+ end
2355
+
2356
+ srel = @uri.route_to((u ? urev[u] || s : s).to_s)
2357
+
2358
+ # now collect all the other predicates
2359
+ o.keys.each do |obj|
2360
+ hrel = @uri.route_to((urev[obj] || obj).to_s)
2361
+ o[obj] |= @context.graph.query([u || s, nil, obj]).predicates.to_set
2362
+ rels = @context.abbreviate o[obj].to_a, vocab: XHV
2363
+ ln = { nil => :link, about: srel, rel: rels, href: hrel }
2364
+ ln[:type] = f[obj].first if f[obj]
2365
+
2366
+ # add to links
2367
+ links << ln
2368
+ end
2369
+ end
2370
+
2371
+ meta = []
2372
+
2373
+ # include author names as old school meta tags
2374
+ authors.each do |a|
2375
+ name = labels[urev[a] || a] or next
2376
+ datatypes.add name[0] # a convenient place to chuck this
2377
+ prop = @context.abbreviate(name[0])
2378
+ name = name[1]
2379
+ about = @uri.route_to((ufwd[a] || a).to_s)
2380
+ tag = { nil => :meta, about: about.to_s, name: :author,
2381
+ property: prop, content: name.to_s }
2382
+
2383
+ if name.has_datatype? and name.datatype != RDF::XSD.string
2384
+ tag[:datatype] = @context.abbreviate(name.datatype)
2385
+ elsif name.has_language?
2386
+ tag['xml:lang'] = tag[:lang] = name.language
2387
+ end
2388
+ meta.push tag
2389
+ end
2390
+
2391
+ literals.each do |k, v|
2392
+ next if k == title
2393
+ rel = @context.abbreviate v.to_a, vocab: XHV
2394
+ elem = { nil => :meta, property: rel, content: k.to_s }
2395
+ elem[:name] = :description if k == desc
2396
+
2397
+ if k.has_datatype?
2398
+ datatypes.add k.datatype # so we get the prefix
2399
+ elem[:datatype] = @context.abbreviate k.datatype, vocab: XHV
2400
+ end
2401
+
2402
+ meta.push(elem)
2403
+ end
2404
+
2405
+ meta.sort! do |a, b|
2406
+ s = 0
2407
+ [:about, :property, :datatype, :content, :name].each do |k|
2408
+ # warn a.inspect, b.inspect
2409
+ s = a.fetch(k, '').to_s <=> b.fetch(k, '').to_s
2410
+ break if s != 0
2411
+ end
2412
+ s
2413
+ end
2414
+
2415
+ # don't forget style tag
2416
+ style = doc.xpath('/html:html/html:head/html:style', { html: XHTMLNS })
2417
+
2418
+ body = body.dup 1
2419
+ body = { '#body' => body.children.to_a, about: '' }
2420
+ body[:typeof] = @context.abbreviate(types.to_a, vocab: XHV) unless
2421
+ types.empty?
2422
+
2423
+ # prepare only the prefixes we actually need to resolve the data
2424
+ rsc = @context.abbreviate(
2425
+ (struct.keys + resources.keys + datatypes.to_a + types.to_a).uniq,
2426
+ noop: false).map do |x|
2427
+ next if x.nil?
2428
+ x.split(?:)[0].to_sym
2429
+ end.select { |x| not x.nil? }.to_set
2430
+
2431
+ pfx = @context.prefixes.select do |k, _|
2432
+ rsc.include? k
2433
+ end.transform_values { |v| v.to_s }
2434
+
2435
+ # XXX deal with the qb:Observation separately (just nuke it for now)
2436
+ extra = generate_twitter_meta || []
2437
+ if bl = generate_backlinks(published: published,
2438
+ ignore: @context.graph.query(
2439
+ [nil, CI.document, @uuid]).subjects.to_set)
2440
+ extra << { [bl] => :object }
2441
+ end
2442
+
2443
+ # and now for the document
2444
+ xf = @context.config[:transform]
2445
+ doc = xhtml_stub(
2446
+ base: @uri, prefix: pfx, vocab: XHV, lang: 'en', title: tm,
2447
+ link: links, meta: meta, style: style, transform: xf,
2448
+ extra: extra, body: body).document
2449
+
2450
+ # goddamn script tags and text/html
2451
+ doc.xpath('//html:script[@src][not(node())]',
2452
+ { html: XHTMLNS }).each do |script|
2453
+ script << doc.create_text_node('')
2454
+ end
2455
+
2456
+ doc
2457
+ end
2458
+
2459
+ # Actually write the transformed document to the target
2460
+ #
2461
+ # @param published [true, false]
2462
+ #
2463
+ # @return [Array] pathname(s) written
2464
+ def write_to_target published: true
2465
+
2466
+ # in all cases we write to private target
2467
+ states = [false]
2468
+ # document has to be publishable
2469
+ states.push true if published && @context.published?(@uuid)
2470
+
2471
+ ok = []
2472
+ states.each do |state|
2473
+ target = @context.config[state ? :target : :private]
2474
+
2475
+ # XXX this is dumb; it should do something more robust if it
2476
+ # fails
2477
+ doc = transform_xhtml(published: state) or next
2478
+
2479
+ begin
2480
+ fh = Tempfile.create('xml-', target)
2481
+ path = Pathname(fh.path)
2482
+
2483
+ # write the doc to the target
2484
+ doc.write_to fh
2485
+ fh.close
2486
+
2487
+ uuid = URI(@uuid.to_s)
2488
+ newpath = path.dirname + "#{uuid.uuid}.xml"
2489
+ ok.push newpath
2490
+
2491
+ File.chmod(0644, path)
2492
+ File.rename(path, newpath)
2493
+ File.utime(@mtime, @mtime, newpath)
2494
+ rescue Exception => e
2495
+ # XXX this should only rescue a specific class of errors
2496
+ warn e.class, e
2497
+ File.unlink path if path.exist?
2498
+ end
2499
+ end
2500
+
2501
+ ok
2502
+ end
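+
+ # sketch: a publishable document gets written to both trees, e.g.
+ #
+ # doc.write_to_target published: true
+ # # => [$private/<uuid>.xml, $target/<uuid>.xml] (as Pathnames)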
2503
+
2504
+ end
2505
+ end
2506
+ end